1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#include "kmp_utils.h"
28#if KMP_USE_HIER_SCHED
29#include "kmp_dispatch_hier.h"
30#endif
31
32#if OMPT_SUPPORT
33#include "ompt-specific.h"
34#endif
35#if OMPD_SUPPORT
36#include "ompd-specific.h"
37#endif
38
39#if OMP_PROFILING_SUPPORT
40#include "llvm/Support/TimeProfiler.h"
41static char *ProfileTraceFile = nullptr;
42#endif
43
44/* these are temporary issues to be dealt with */
45#define KMP_USE_PRCTL 0
46
47#if KMP_OS_WINDOWS
48#include <process.h>
49#endif
50
51#ifndef KMP_USE_SHM
52// Windows and WASI do not need these include files as they don't use shared
53// memory.
54#else
55#include <sys/mman.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#define SHM_SIZE 1024
59#endif
60
61#if defined(KMP_GOMP_COMPAT)
62char const __kmp_version_alt_comp[] =
63 KMP_VERSION_PREFIX "alternative compiler support: yes";
64#endif /* defined(KMP_GOMP_COMPAT) */
65
66char const __kmp_version_omp_api[] =
67 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
69#ifdef KMP_DEBUG
70char const __kmp_version_lock[] =
71 KMP_VERSION_PREFIX "lock type: run time selectable";
72#endif /* KMP_DEBUG */
73
74#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76/* ------------------------------------------------------------------------ */
77
78#if KMP_USE_MONITOR
79kmp_info_t __kmp_monitor;
80#endif
81
82/* Forward declarations */
83
84void __kmp_cleanup(void);
85
86static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87 int gtid);
88static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89 kmp_internal_control_t *new_icvs,
90 ident_t *loc);
91#if KMP_AFFINITY_SUPPORTED
92static void __kmp_partition_places(kmp_team_t *team,
93 int update_master_only = 0);
94#endif
95static void __kmp_do_serial_initialize(void);
96void __kmp_fork_barrier(int gtid, int tid);
97void __kmp_join_barrier(int gtid);
98void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99 kmp_internal_control_t *new_icvs, ident_t *loc);
100
101#ifdef USE_LOAD_BALANCE
102static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103#endif
104
105static int __kmp_expand_threads(int nNeed);
106#if KMP_OS_WINDOWS
107static int __kmp_unregister_root_other_thread(int gtid);
108#endif
109static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113 int new_nthreads);
114void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
116/* Calculate the identifier of the current thread */
117/* fast (and somewhat portable) way to get unique identifier of executing
118 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
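/* Note on the lookup strategy (descriptive only): depending on __kmp_gtid_mode
   the gtid is read from the native thread-local variable __kmp_gtid
   (mode >= 3), from keyed thread-specific storage via
   __kmp_gtid_get_specific() (mode >= 2), or, as a fallback, derived by
   searching the registered threads' stack extents for the address of a local
   variable, as implemented below. */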
119int __kmp_get_global_thread_id() {
120 int i;
121 kmp_info_t **other_threads;
122 size_t stack_data;
123 char *stack_addr;
124 size_t stack_size;
125 char *stack_base;
126
127 KA_TRACE(
128 1000,
129 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
130 __kmp_nth, __kmp_all_nth));
131
132 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
133 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
134 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
135 __kmp_init_gtid for this to work. */
136
137 if (!TCR_4(__kmp_init_gtid))
138 return KMP_GTID_DNE;
139
140#ifdef KMP_TDATA_GTID
141 if (TCR_4(__kmp_gtid_mode) >= 3) {
142 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
143 return __kmp_gtid;
144 }
145#endif
146 if (TCR_4(__kmp_gtid_mode) >= 2) {
147 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
148 return __kmp_gtid_get_specific();
149 }
150 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
151
152 stack_addr = (char *)&stack_data;
153 other_threads = __kmp_threads;
154
155 /* ATT: The code below is a source of potential bugs due to unsynchronized
156 access to __kmp_threads array. For example:
157 1. Current thread loads other_threads[i] to thr and checks it, it is
158 non-NULL.
159 2. Current thread is suspended by OS.
160 3. Another thread unregisters and finishes (debug versions of free()
161 may fill memory with something like 0xEF).
162 4. Current thread is resumed.
163 5. Current thread reads junk from *thr.
164 TODO: Fix it. --ln */
165
166 for (i = 0; i < __kmp_threads_capacity; i++) {
167
168 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
169 if (!thr)
170 continue;
171
172 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
173 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
174
175 /* stack grows down -- search through all of the active threads */
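    /* Illustrative example (hypothetical addresses): with stack_base =
       0x7f0000800000 and stack_size = 0x100000, a local address of
       0x7f00007f8000 gives stack_diff = 0x8000 <= stack_size, so the address
       lies inside thread i's stack and i is our gtid. */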
176
177 if (stack_addr <= stack_base) {
178 size_t stack_diff = stack_base - stack_addr;
179
180 if (stack_diff <= stack_size) {
181 /* The only way we can be closer than the allocated */
182 /* stack size is if we are running on this thread. */
183 // __kmp_gtid_get_specific can return negative value because this
184 // function can be called by thread destructor. However, before the
185 // thread destructor is called, the value of the corresponding
186 // thread-specific data will be reset to NULL.
187 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
188 __kmp_gtid_get_specific() == i);
189 return i;
190 }
191 }
192 }
193
194 /* get specific to try and determine our gtid */
195 KA_TRACE(1000,
196 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
197 "thread, using TLS\n"));
198 i = __kmp_gtid_get_specific();
199
200 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
201
  /* if we haven't been assigned a gtid, then return the error code */
203 if (i < 0)
204 return i;
205
  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could already have been destroyed. This can happen when this
  // function is called from the library shutdown routine.
209 if (!TCR_SYNC_PTR(other_threads[i]))
210 return i;
211
212 /* dynamically updated stack window for uber threads to avoid get_specific
213 call */
214 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
215 KMP_FATAL(StackOverflow, i);
216 }
217
218 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219 if (stack_addr > stack_base) {
220 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
221 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
222 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
223 stack_base);
224 } else {
225 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
226 stack_base - stack_addr);
227 }
228
229 /* Reprint stack bounds for ubermaster since they have been refined */
230 if (__kmp_storage_map) {
231 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
232 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
236 }
237 return i;
238}
239
240int __kmp_get_global_thread_id_reg() {
241 int gtid;
242
243 if (!__kmp_init_serial) {
244 gtid = KMP_GTID_DNE;
245 } else
246#ifdef KMP_TDATA_GTID
247 if (TCR_4(__kmp_gtid_mode) >= 3) {
248 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
249 gtid = __kmp_gtid;
250 } else
251#endif
252 if (TCR_4(__kmp_gtid_mode) >= 2) {
253 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
254 gtid = __kmp_gtid_get_specific();
255 } else {
256 KA_TRACE(1000,
257 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
258 gtid = __kmp_get_global_thread_id();
259 }
260
261 /* we must be a new uber master sibling thread */
262 if (gtid == KMP_GTID_DNE) {
263 KA_TRACE(10,
264 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
265 "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
267 if (!__kmp_init_serial) {
268 __kmp_do_serial_initialize();
269 gtid = __kmp_gtid_get_specific();
270 } else {
271 gtid = __kmp_register_root(FALSE);
272 }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
274 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
275 }
276
277 KMP_DEBUG_ASSERT(gtid >= 0);
278
279 return gtid;
280}
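/* Usage sketch (illustrative, not part of the runtime): callers that may run
   before the current thread has registered typically do something like
     int gtid = __kmp_get_global_thread_id_reg();
     kmp_info_t *this_thr = __kmp_threads[gtid];
   since, unlike __kmp_get_global_thread_id(), this variant registers a new
   root (or performs serial initialization) instead of returning KMP_GTID_DNE. */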
281
282/* caller must hold forkjoin_lock */
283void __kmp_check_stack_overlap(kmp_info_t *th) {
284 int f;
285 char *stack_beg = NULL;
286 char *stack_end = NULL;
287 int gtid;
288
289 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
290 if (__kmp_storage_map) {
291 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
292 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
293
    gtid = __kmp_gtid_from_thread(th);
295
296 if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
306 }
307 }
308
309 /* No point in checking ubermaster threads since they use refinement and
310 * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
312 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
313 KA_TRACE(10,
314 ("__kmp_check_stack_overlap: performing extensive checking\n"));
315 if (stack_beg == NULL) {
316 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
317 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
318 }
319
320 for (f = 0; f < __kmp_threads_capacity; f++) {
321 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
322
323 if (f_th && f_th != th) {
324 char *other_stack_end =
325 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
326 char *other_stack_beg =
327 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
328 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
329 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
330
331 /* Print the other stack values before the abort */
332 if (__kmp_storage_map)
          __kmp_print_storage_map_gtid(
              -1, other_stack_beg, other_stack_end,
              (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
              "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
337
338 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
339 __kmp_msg_null);
340 }
341 }
342 }
343 }
344 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
345}
346
347/* ------------------------------------------------------------------------ */
348
349void __kmp_infinite_loop(void) {
350 static int done = FALSE;
351
352 while (!done) {
353 KMP_YIELD(TRUE);
354 }
355}
356
357#define MAX_MESSAGE 512
358
359void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
360 char const *format, ...) {
361 char buffer[MAX_MESSAGE];
362 va_list ap;
363
364 va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
369#if KMP_PRINT_DATA_PLACEMENT
370 int node;
371 if (gtid >= 0) {
372 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
373 if (__kmp_storage_map_verbose) {
374 node = __kmp_get_host_node(p1);
375 if (node < 0) /* doesn't work, so don't try this next time */
376 __kmp_storage_map_verbose = FALSE;
377 else {
378 char *last;
379 int lastNode;
380 int localProc = __kmp_get_cpu_from_gtid(gtid);
381
382 const int page_size = KMP_GET_PAGE_SIZE();
383
384 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
385 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
386 if (localProc >= 0)
387 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
388 localProc >> 1);
389 else
390 __kmp_printf_no_lock(" GTID %d\n", gtid);
391#if KMP_USE_PRCTL
392 /* The more elaborate format is disabled for now because of the prctl
393 * hanging bug. */
394 do {
395 last = p1;
396 lastNode = node;
397 /* This loop collates adjacent pages with the same host node. */
398 do {
399 (char *)p1 += page_size;
400 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
401 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
402 lastNode);
403 } while (p1 <= p2);
404#else
405 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
406 (char *)p1 + (page_size - 1),
407 __kmp_get_host_node(p1));
408 if (p1 < p2) {
409 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
410 (char *)p2 + (page_size - 1),
411 __kmp_get_host_node(p2));
412 }
413#endif
414 }
415 }
416 } else
417 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
418 }
419#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
421
422 va_end(ap);
423}
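/* Example of the resulting output line (addresses are hypothetical):
     OMP storage map: 0x7f1234560000 0x7f1234562000    8192 th_0 stack (refinement)
   When KMP_PRINT_DATA_PLACEMENT is enabled and host-node queries succeed,
   additional per-page "memNode" lines may follow. */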
424
425void __kmp_warn(char const *format, ...) {
426 char buffer[MAX_MESSAGE];
427 va_list ap;
428
429 if (__kmp_generate_warnings == kmp_warnings_off) {
430 return;
431 }
432
433 va_start(ap, format);
434
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
439
440 va_end(ap);
441}
442
443void __kmp_abort_process() {
444 // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
446
447 if (__kmp_debug_buf) {
448 __kmp_dump_debug_buffer();
449 }
450
451#if KMP_OS_WINDOWS
452 // Let other threads know of abnormal termination and prevent deadlock
453 // if abort happened during library initialization or shutdown
454 __kmp_global.g.g_abort = SIGABRT;
455
456 /* On Windows* OS by default abort() causes pop-up error box, which stalls
457 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
458 boxes. _set_abort_behavior() works well, but this function is not
459 available in VS7 (this is not problem for DLL, but it is a problem for
460 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
461 help, at least in some versions of MS C RTL.
462
463 It seems following sequence is the only way to simulate abort() and
464 avoid pop-up error box. */
465 raise(SIGABRT);
466 _exit(3); // Just in case, if signal ignored, exit anyway.
467#else
468 __kmp_unregister_library();
469 abort();
470#endif
471
472 __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
474
475} // __kmp_abort_process
476
477void __kmp_abort_thread(void) {
478 // TODO: Eliminate g_abort global variable and this function.
479 // In case of abort just call abort(), it will kill all the threads.
480 __kmp_infinite_loop();
481} // __kmp_abort_thread
482
483/* Print out the storage map for the major kmp_info_t thread data structures
484 that are allocated together. */
485
486static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
515#endif // KMP_FAST_REDUCTION_BARRIER
516}
517
518/* Print out the storage map for the major kmp_team_t team data structures
519 that are allocated together. */
520
521static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
522 int team_id, int num_thr) {
523 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
561}
562
563static void __kmp_init_allocator() {
564 __kmp_init_memkind();
565 __kmp_init_target_mem();
566}
567static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
568
569/* ------------------------------------------------------------------------ */
570
571#if ENABLE_LIBOMPTARGET
572static void __kmp_init_omptarget() {
573 __kmp_init_target_task();
574}
575#endif
576
577/* ------------------------------------------------------------------------ */
578
579#if KMP_DYNAMIC_LIB
580#if KMP_OS_WINDOWS
581
582BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
583 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
584
585 switch (fdwReason) {
586
587 case DLL_PROCESS_ATTACH:
588 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
589
590 return TRUE;
591
592 case DLL_PROCESS_DETACH:
593 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
594
595 // According to Windows* documentation for DllMain entry point:
596 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
597 // lpReserved == NULL when FreeLibrary() is called,
598 // lpReserved != NULL when the process is terminated.
599 // When FreeLibrary() is called, worker threads remain alive. So the
600 // runtime's state is consistent and executing proper shutdown is OK.
601 // When the process is terminated, worker threads have exited or been
602 // forcefully terminated by the OS and only the shutdown thread remains.
603 // This can leave the runtime in an inconsistent state.
604 // Hence, only attempt proper cleanup when FreeLibrary() is called.
605 // Otherwise, rely on OS to reclaim resources.
606 if (lpReserved == NULL)
607 __kmp_internal_end_library(__kmp_gtid_get_specific());
608
609 return TRUE;
610
611 case DLL_THREAD_ATTACH:
612 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
613
614 /* if we want to register new siblings all the time here call
615 * __kmp_get_gtid(); */
616 return TRUE;
617
618 case DLL_THREAD_DETACH:
619 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
620
621 __kmp_internal_end_thread(__kmp_gtid_get_specific());
622 return TRUE;
623 }
624
625 return TRUE;
626}
627
628#endif /* KMP_OS_WINDOWS */
629#endif /* KMP_DYNAMIC_LIB */
630
631/* __kmp_parallel_deo -- Wait until it's our turn. */
632void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633 int gtid = *gtid_ref;
634#ifdef BUILD_PARALLEL_ORDERED
635 kmp_team_t *team = __kmp_team_from_gtid(gtid);
636#endif /* BUILD_PARALLEL_ORDERED */
637
638 if (__kmp_env_consistency_check) {
639 if (__kmp_threads[gtid]->th.th_root->r.r_active)
640#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
642#else
643 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
644#endif
645 }
646#ifdef BUILD_PARALLEL_ORDERED
647 if (!team->t.t_serialized) {
648 KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
651 KMP_MB();
652 }
653#endif /* BUILD_PARALLEL_ORDERED */
654}
655
656/* __kmp_parallel_dxo -- Signal the next task. */
657void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
658 int gtid = *gtid_ref;
659#ifdef BUILD_PARALLEL_ORDERED
660 int tid = __kmp_tid_from_gtid(gtid);
661 kmp_team_t *team = __kmp_team_from_gtid(gtid);
662#endif /* BUILD_PARALLEL_ORDERED */
663
664 if (__kmp_env_consistency_check) {
665 if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
667 }
668#ifdef BUILD_PARALLEL_ORDERED
669 if (!team->t.t_serialized) {
670 KMP_MB(); /* Flush all pending memory write invalidates. */
671
672 /* use the tid of the next thread in this team */
673 /* TODO replace with general release procedure */
674 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
675
676 KMP_MB(); /* Flush all pending memory write invalidates. */
677 }
678#endif /* BUILD_PARALLEL_ORDERED */
679}
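/* Illustrative behavior of the deo/dxo pair: with t_nproc == 4, t_value cycles
   0 -> 1 -> 2 -> 3 -> 0, so each thread's __kmp_parallel_deo() spins until
   t_value equals its own tid, and __kmp_parallel_dxo() then passes the turn to
   (tid + 1) % 4, serializing the ordered regions in thread-id order. */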
680
681/* ------------------------------------------------------------------------ */
682/* The BARRIER for a SINGLE process section is always explicit */
683
684int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
685 int status;
686 kmp_info_t *th;
687 kmp_team_t *team;
688
689 if (!TCR_4(__kmp_init_parallel))
690 __kmp_parallel_initialize();
691 __kmp_resume_if_soft_paused();
692
693 th = __kmp_threads[gtid];
694 team = th->th.th_team;
695 status = 0;
696
697 th->th.th_ident = id_ref;
698
699 if (team->t.t_serialized) {
700 status = 1;
701 } else {
702 kmp_int32 old_this = th->th.th_local.this_construct;
703
704 ++th->th.th_local.this_construct;
705 /* try to set team count to thread count--success means thread got the
706 single block */
707 /* TODO: Should this be acquire or release? */
708 if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
711 }
712#if USE_ITT_BUILD
713 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
714 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
715 team->t.t_active_level == 1) {
716 // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
718 }
719#endif /* USE_ITT_BUILD */
720 }
721
722 if (__kmp_env_consistency_check) {
723 if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
727 }
728 }
729#if USE_ITT_BUILD
730 if (status) {
731 __kmp_itt_single_start(gtid);
732 }
733#endif /* USE_ITT_BUILD */
734 return status;
735}
736
737void __kmp_exit_single(int gtid) {
738#if USE_ITT_BUILD
739 __kmp_itt_single_end(gtid);
740#endif /* USE_ITT_BUILD */
741 if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
743}
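/* Sketch of how these are used (simplified; the real callers live elsewhere in
   the runtime and may differ in detail):
     if (__kmp_enter_single(gtid, loc, TRUE)) {
       // ... body of the #pragma omp single ...
       __kmp_exit_single(gtid);
     }
   Only the thread that wins the compare-and-store on t_construct executes the
   body; the other threads simply skip it. */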
744
/* Determine whether we can go parallel or must use a serialized parallel
 * region, and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use.
 * The forkjoin lock is held by the caller. */
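/* Order of adjustments applied below (summary): dynamic-mode adjustment
   (load balance / thread limit / random), then KMP_ALL_THREADS /
   KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then OMP_THREAD_LIMIT
   (cg_thread_limit), and finally the capacity of the __kmp_threads array.
   For example, if honoring the request would push the device thread count
   past __kmp_max_nth, the reservation is clamped and a one-time warning may
   be issued when dyn-var is false. */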
751static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
752 int master_tid, int set_nthreads,
753 int enter_teams) {
754 int capacity;
755 int new_nthreads;
756 KMP_DEBUG_ASSERT(__kmp_init_serial);
757 KMP_DEBUG_ASSERT(root && parent_team);
758 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
759
760 // If dyn-var is set, dynamically adjust the number of desired threads,
761 // according to the method specified by dynamic_mode.
762 new_nthreads = set_nthreads;
763 if (!get__dynamic_2(parent_team, master_tid)) {
764 ;
765 }
766#ifdef USE_LOAD_BALANCE
767 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
769 if (new_nthreads == 1) {
770 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
771 "reservation to 1 thread\n",
772 master_tid));
773 return 1;
774 }
775 if (new_nthreads < set_nthreads) {
776 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
777 "reservation to %d threads\n",
778 master_tid, new_nthreads));
779 }
780 }
781#endif /* USE_LOAD_BALANCE */
782 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
783 new_nthreads = __kmp_avail_proc - __kmp_nth +
784 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
785 if (new_nthreads <= 1) {
786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
787 "reservation to 1 thread\n",
788 master_tid));
789 return 1;
790 }
791 if (new_nthreads < set_nthreads) {
792 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
793 "reservation to %d threads\n",
794 master_tid, new_nthreads));
795 } else {
796 new_nthreads = set_nthreads;
797 }
798 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
799 if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
801 new_nthreads = (new_nthreads % set_nthreads) + 1;
802 if (new_nthreads == 1) {
803 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
804 "reservation to 1 thread\n",
805 master_tid));
806 return 1;
807 }
808 if (new_nthreads < set_nthreads) {
809 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
810 "reservation to %d threads\n",
811 master_tid, new_nthreads));
812 }
813 }
814 } else {
815 KMP_ASSERT(0);
816 }
817
818 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
819 if (__kmp_nth + new_nthreads -
820 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
821 __kmp_max_nth) {
822 int tl_nthreads = __kmp_max_nth - __kmp_nth +
823 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
824 if (tl_nthreads <= 0) {
825 tl_nthreads = 1;
826 }
827
828 // If dyn-var is false, emit a 1-time warning.
829 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
830 __kmp_reserve_warn = 1;
831 __kmp_msg(kmp_ms_warning,
832 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
833 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
834 }
835 if (tl_nthreads == 1) {
836 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
837 "reduced reservation to 1 thread\n",
838 master_tid));
839 return 1;
840 }
841 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
842 "reservation to %d threads\n",
843 master_tid, tl_nthreads));
844 new_nthreads = tl_nthreads;
845 }
846
847 // Respect OMP_THREAD_LIMIT
848 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
849 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
850 if (cg_nthreads + new_nthreads -
851 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
852 max_cg_threads) {
853 int tl_nthreads = max_cg_threads - cg_nthreads +
854 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
855 if (tl_nthreads <= 0) {
856 tl_nthreads = 1;
857 }
858
859 // If dyn-var is false, emit a 1-time warning.
860 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
861 __kmp_reserve_warn = 1;
862 __kmp_msg(kmp_ms_warning,
863 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
864 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
865 }
866 if (tl_nthreads == 1) {
867 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
868 "reduced reservation to 1 thread\n",
869 master_tid));
870 return 1;
871 }
872 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
873 "reservation to %d threads\n",
874 master_tid, tl_nthreads));
875 new_nthreads = tl_nthreads;
876 }
877
878 // Check if the threads array is large enough, or needs expanding.
879 // See comment in __kmp_register_root() about the adjustment if
880 // __kmp_threads[0] == NULL.
881 capacity = __kmp_threads_capacity;
882 if (TCR_PTR(__kmp_threads[0]) == NULL) {
883 --capacity;
884 }
885 // If it is not for initializing the hidden helper team, we need to take
886 // __kmp_hidden_helper_threads_num out of the capacity because it is included
887 // in __kmp_threads_capacity.
888 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
889 capacity -= __kmp_hidden_helper_threads_num;
890 }
891 if (__kmp_nth + new_nthreads -
892 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
893 capacity) {
894 // Expand the threads array.
895 int slotsRequired = __kmp_nth + new_nthreads -
896 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
897 capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
899 if (slotsAdded < slotsRequired) {
900 // The threads array was not expanded enough.
901 new_nthreads -= (slotsRequired - slotsAdded);
902 KMP_ASSERT(new_nthreads >= 1);
903
904 // If dyn-var is false, emit a 1-time warning.
905 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
906 __kmp_reserve_warn = 1;
907 if (__kmp_tp_cached) {
908 __kmp_msg(kmp_ms_warning,
909 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
910 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
911 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
912 } else {
913 __kmp_msg(kmp_ms_warning,
914 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
915 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
916 }
917 }
918 }
919 }
920
921#ifdef KMP_DEBUG
922 if (new_nthreads == 1) {
923 KC_TRACE(10,
924 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
925 "dead roots and rechecking; requested %d threads\n",
926 __kmp_get_gtid(), set_nthreads));
927 } else {
928 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
929 " %d threads\n",
930 __kmp_get_gtid(), new_nthreads, set_nthreads));
931 }
932#endif // KMP_DEBUG
933 return new_nthreads;
934}
935
/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked for that
   earlier while holding the forkjoin lock. */
939static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
940 kmp_info_t *master_th, int master_gtid,
941 int fork_teams_workers) {
942 int i;
943 int use_hot_team;
944
945 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
946 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
947 KMP_MB();
948
949 /* first, let's setup the primary thread */
950 master_th->th.th_info.ds.ds_tid = 0;
951 master_th->th.th_team = team;
952 master_th->th.th_team_nproc = team->t.t_nproc;
953 master_th->th.th_team_master = master_th;
954 master_th->th.th_team_serialized = FALSE;
955 master_th->th.th_dispatch = &team->t.t_dispatch[0];
956
957/* make sure we are not the optimized hot team */
958#if KMP_NESTED_HOT_TEAMS
959 use_hot_team = 0;
960 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
961 if (hot_teams) { // hot teams array is not allocated if
962 // KMP_HOT_TEAMS_MAX_LEVEL=0
963 int level = team->t.t_active_level - 1; // index in array of hot teams
964 if (master_th->th.th_teams_microtask) { // are we inside the teams?
965 if (master_th->th.th_teams_size.nteams > 1) {
966 ++level; // level was not increased in teams construct for
967 // team_of_masters
968 }
969 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
970 master_th->th.th_teams_level == team->t.t_level) {
971 ++level; // level was not increased in teams construct for
972 // team_of_workers before the parallel
973 } // team->t.t_level will be increased inside parallel
974 }
975 if (level < __kmp_hot_teams_max_level) {
976 if (hot_teams[level].hot_team) {
977 // hot team has already been allocated for given level
978 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
979 use_hot_team = 1; // the team is ready to use
980 } else {
981 use_hot_team = 0; // AC: threads are not allocated yet
982 hot_teams[level].hot_team = team; // remember new hot team
983 hot_teams[level].hot_team_nth = team->t.t_nproc;
984 }
985 } else {
986 use_hot_team = 0;
987 }
988 }
989#else
990 use_hot_team = team == root->r.r_hot_team;
991#endif
992 if (!use_hot_team) {
993
994 /* install the primary thread */
995 team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);
997
998 /* now, install the worker threads */
999 for (i = 1; i < team->t.t_nproc; i++) {
1000
1001 /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1003 team->t.t_threads[i] = thr;
1004 KMP_DEBUG_ASSERT(thr);
1005 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1006 /* align team and thread arrived states */
1007 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1008 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1009 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1010 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1011 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1012 team->t.t_bar[bs_plain_barrier].b_arrived));
1013 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1014 thr->th.th_teams_level = master_th->th.th_teams_level;
1015 thr->th.th_teams_size = master_th->th.th_teams_size;
1016 { // Initialize threads' barrier data.
1017 int b;
1018 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1019 for (b = 0; b < bs_last_barrier; ++b) {
1020 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1021 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1022#if USE_DEBUGGER
1023 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1024#endif
1025 }
1026 }
1027 }
1028
1029#if KMP_AFFINITY_SUPPORTED
1030 // Do not partition the places list for teams construct workers who
1031 // haven't actually been forked to do real work yet. This partitioning
1032 // will take place in the parallel region nested within the teams construct.
1033 if (!fork_teams_workers) {
1034 __kmp_partition_places(team);
1035 }
1036#endif
1037
1038 if (team->t.t_nproc > 1 &&
1039 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      team->t.b->update_num_threads(team->t.t_nproc);
      __kmp_add_threads_to_team(team, team->t.t_nproc);
1042 }
1043 }
1044
1045 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1046 for (i = 0; i < team->t.t_nproc; i++) {
1047 kmp_info_t *thr = team->t.t_threads[i];
1048 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1049 thr->th.th_prev_level != team->t.t_level) {
1050 team->t.t_display_affinity = 1;
1051 break;
1052 }
1053 }
1054 }
1055
1056 KMP_MB();
1057}
1058
1059#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
1063inline static void propagateFPControl(kmp_team_t *team) {
1064 if (__kmp_inherit_fp_control) {
1065 kmp_int16 x87_fpu_control_word;
1066 kmp_uint32 mxcsr;
1067
1068 // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
1071 mxcsr &= KMP_X86_MXCSR_MASK;
1072
1073 // There is no point looking at t_fp_control_saved here.
1074 // If it is TRUE, we still have to update the values if they are different
1075 // from those we now have. If it is FALSE we didn't save anything yet, but
1076 // our objective is the same. We have to ensure that the values in the team
1077 // are the same as those we have.
1078 // So, this code achieves what we need whether or not t_fp_control_saved is
1079 // true. By checking whether the value needs updating we avoid unnecessary
1080 // writes that would put the cache-line into a written state, causing all
1081 // threads in the team to have to read it again.
1082 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1083 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1084 // Although we don't use this value, other code in the runtime wants to know
1085 // whether it should restore them. So we must ensure it is correct.
1086 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1087 } else {
1088 // Similarly here. Don't write to this cache-line in the team structure
1089 // unless we have to.
1090 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1091 }
1092}
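// Note: KMP_CHECK_UPDATE(dst, src) only performs the store when the values
// differ (conceptually "if (dst != src) dst = src"), which is what keeps the
// team's cache line clean when nothing has changed.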
1093
1094// Do the opposite, setting the hardware registers to the updated values from
1095// the team.
1096inline static void updateHWFPControl(kmp_team_t *team) {
1097 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
1100 kmp_int16 x87_fpu_control_word;
1101 kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
1104 mxcsr &= KMP_X86_MXCSR_MASK;
1105
1106 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1107 __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1109 }
1110
1111 if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
1113 }
1114 }
1115}
1116#else
1117#define propagateFPControl(x) ((void)0)
1118#define updateHWFPControl(x) ((void)0)
1119#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1120
1121static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1122 int realloc); // forward declaration
1123
/* Run a parallel region that has been serialized, so it runs in a team of only
   the single primary thread. */
1126void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1127 kmp_info_t *this_thr;
1128 kmp_team_t *serial_team;
1129
1130 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1131
1132 /* Skip all this code for autopar serialized loops since it results in
1133 unacceptable overhead */
1134 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1135 return;
1136
1137 if (!TCR_4(__kmp_init_parallel))
1138 __kmp_parallel_initialize();
1139 __kmp_resume_if_soft_paused();
1140
1141 this_thr = __kmp_threads[global_tid];
1142 serial_team = this_thr->th.th_serial_team;
1143
1144 /* utilize the serialized team held by this thread */
1145 KMP_DEBUG_ASSERT(serial_team);
1146 KMP_MB();
1147
1148 if (__kmp_tasking_mode != tskm_immediate_exec) {
1149 KMP_DEBUG_ASSERT(
1150 this_thr->th.th_task_team ==
1151 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1152 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1153 NULL);
1154 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1155 "team %p, new task_team = NULL\n",
1156 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1157 this_thr->th.th_task_team = NULL;
1158 }
1159
1160 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1161 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1162 proc_bind = proc_bind_false;
1163 } else if (proc_bind == proc_bind_default) {
1164 // No proc_bind clause was specified, so use the current value
1165 // of proc-bind-var for this parallel region.
1166 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1167 }
1168 // Reset for next parallel region
1169 this_thr->th.th_set_proc_bind = proc_bind_default;
1170
1171 // Reset num_threads for next parallel region
1172 this_thr->th.th_set_nproc = 0;
1173
1174#if OMPT_SUPPORT
1175 ompt_data_t ompt_parallel_data = ompt_data_none;
1176 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1177 if (ompt_enabled.enabled &&
1178 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1179
1180 ompt_task_info_t *parent_task_info;
1181 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1182
1183 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1184 if (ompt_enabled.ompt_callback_parallel_begin) {
1185 int team_size = 1;
1186
1187 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1188 &(parent_task_info->task_data), &(parent_task_info->frame),
1189 &ompt_parallel_data, team_size,
1190 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1191 }
1192 }
1193#endif // OMPT_SUPPORT
1194
1195 if (this_thr->th.th_team != serial_team) {
1196 // Nested level will be an index in the nested nthreads array
1197 int level = this_thr->th.th_team->t.t_level;
1198
1199 if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
1202 kmp_team_t *new_team;
1203
      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1205
      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1214 KMP_ASSERT(new_team);
1215
1216 /* setup new serialized team and install it */
1217 new_team->t.t_threads[0] = this_thr;
1218 new_team->t.t_parent = this_thr->th.th_team;
1219 serial_team = new_team;
1220 this_thr->th.th_serial_team = serial_team;
1221
1222 KF_TRACE(
1223 10,
1224 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1225 global_tid, serial_team));
1226
1227 /* TODO the above breaks the requirement that if we run out of resources,
1228 then we can still guarantee that serialized teams are ok, since we may
1229 need to allocate a new one */
1230 } else {
1231 KF_TRACE(
1232 10,
1233 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1234 global_tid, serial_team));
1235 }
1236
1237 /* we have to initialize this serial team */
1238 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1239 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1240 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1241 serial_team->t.t_ident = loc;
1242 serial_team->t.t_serialized = 1;
1243 serial_team->t.t_nproc = 1;
1244 serial_team->t.t_parent = this_thr->th.th_team;
1245 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1246 this_thr->th.th_team = serial_team;
1247 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1248
1249 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1250 this_thr->th.th_current_task));
1251 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1252 this_thr->th.th_current_task->td_flags.executing = 0;
1253
    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1255
1256 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1257 implicit task for each serialized task represented by
1258 team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);
1261
1262 // Thread value exists in the nested nthreads array for the next nested
1263 // level
1264 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1265 this_thr->th.th_current_task->td_icvs.nproc =
1266 __kmp_nested_nth.nth[level + 1];
1267 }
1268
1269 if (__kmp_nested_proc_bind.used &&
1270 (level + 1 < __kmp_nested_proc_bind.used)) {
1271 this_thr->th.th_current_task->td_icvs.proc_bind =
1272 __kmp_nested_proc_bind.bind_types[level + 1];
1273 }
1274
1275#if USE_DEBUGGER
1276 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1277#endif
1278 this_thr->th.th_info.ds.ds_tid = 0;
1279
1280 /* set thread cache values */
1281 this_thr->th.th_team_nproc = 1;
1282 this_thr->th.th_team_master = this_thr;
1283 this_thr->th.th_team_serialized = 1;
1284
1285 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1286 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1287 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1288
    propagateFPControl(serial_team);
1290
1291 /* check if we need to allocate dispatch buffers stack */
1292 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1293 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1294 serial_team->t.t_dispatch->th_disp_buffer =
1295 (dispatch_private_info_t *)__kmp_allocate(
1296 sizeof(dispatch_private_info_t));
1297 }
1298 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1299
1300 KMP_MB();
1301
1302 } else {
1303 /* this serialized team is already being used,
1304 * that's fine, just add another nested level */
1305 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1306 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1307 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1308 ++serial_team->t.t_serialized;
1309 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1310
1311 // Nested level will be an index in the nested nthreads array
1312 int level = this_thr->th.th_team->t.t_level;
1313 // Thread value exists in the nested nthreads array for the next nested
1314 // level
1315 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1316 this_thr->th.th_current_task->td_icvs.nproc =
1317 __kmp_nested_nth.nth[level + 1];
1318 }
1319 serial_team->t.t_level++;
1320 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1321 "of serial team %p to %d\n",
1322 global_tid, serial_team, serial_team->t.t_level));
1323
1324 /* allocate/push dispatch buffers stack */
1325 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1326 {
1327 dispatch_private_info_t *disp_buffer =
1328 (dispatch_private_info_t *)__kmp_allocate(
1329 sizeof(dispatch_private_info_t));
1330 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1331 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1332 }
1333 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334
1335 KMP_MB();
1336 }
1337 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1338
1339 // Perform the display affinity functionality for
1340 // serialized parallel regions
1341 if (__kmp_display_affinity) {
1342 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1343 this_thr->th.th_prev_num_threads != 1) {
1344 // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
1346 this_thr->th.th_prev_level = serial_team->t.t_level;
1347 this_thr->th.th_prev_num_threads = 1;
1348 }
1349 }
1350
1351 if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
1353#if OMPT_SUPPORT
1354 serial_team->t.ompt_team_info.master_return_address = codeptr;
1355 if (ompt_enabled.enabled &&
1356 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1357 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1358 OMPT_GET_FRAME_ADDRESS(0);
1359
1360 ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped
1366
1367 /* OMPT implicit task begin */
1368 if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
1375 }
1376
1377 /* OMPT state */
1378 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1379 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1380 OMPT_GET_FRAME_ADDRESS(0);
1381 }
1382#endif
1383}
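/* Illustrative trigger (user code): a region such as
     #pragma omp parallel if(0)
     { ... }
   or an inner parallel when nesting is not active typically ends up here; the
   body executes on the encountering thread only, while a serialized team,
   ICVs, and dispatch buffers are still maintained so that API calls inside the
   region behave correctly. */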
1384
1385// Test if this fork is for a team closely nested in a teams construct
1386static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1387 microtask_t microtask, int level,
1388 int teams_level, kmp_va_list ap) {
1389 return (master_th->th.th_teams_microtask && ap &&
1390 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1391}
1392
1393// Test if this fork is for the teams construct, i.e. to form the outer league
1394// of teams
1395static inline bool __kmp_is_entering_teams(int active_level, int level,
1396 int teams_level, kmp_va_list ap) {
1397 return ((ap == NULL && active_level == 0) ||
1398 (ap && teams_level > 0 && teams_level == level));
1399}
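// Example of the distinction (descriptive): for "#pragma omp teams" followed
// by a nested "#pragma omp parallel", __kmp_is_entering_teams() is true for
// the fork that creates the league of teams, while __kmp_is_fork_in_teams()
// is true for the parallel region forked by each team's primary thread inside
// the teams region.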
1400
1401// AC: This is start of parallel that is nested inside teams construct.
1402// The team is actual (hot), all workers are ready at the fork barrier.
1403// No lock needed to initialize the team a bit, then free workers.
1404static inline int
1405__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1406 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1407 enum fork_context_e call_context, microtask_t microtask,
1408 launch_t invoker, int master_set_numthreads, int level,
1409#if OMPT_SUPPORT
1410 ompt_data_t ompt_parallel_data, void *return_address,
1411#endif
1412 kmp_va_list ap) {
1413 void **argv;
1414 int i;
1415
1416 parent_team->t.t_ident = loc;
  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1418 parent_team->t.t_argc = argc;
1419 argv = (void **)parent_team->t.t_argv;
1420 for (i = argc - 1; i >= 0; --i) {
1421 *argv++ = va_arg(kmp_va_deref(ap), void *);
1422 }
  // Increment our nested depth levels, but do not increase the serialization
1424 if (parent_team == master_th->th.th_serial_team) {
1425 // AC: we are in serialized parallel
    __kmpc_serialized_parallel(loc, gtid);
1427 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1428
1429 if (call_context == fork_context_gnu) {
1430 // AC: need to decrement t_serialized for enquiry functions to work
1431 // correctly, will restore at join time
1432 parent_team->t.t_serialized--;
1433 return TRUE;
1434 }
1435
1436#if OMPD_SUPPORT
1437 parent_team->t.t_pkfn = microtask;
1438#endif
1439
1440#if OMPT_SUPPORT
1441 void *dummy;
1442 void **exit_frame_p;
1443 ompt_data_t *implicit_task_data;
1444 ompt_lw_taskteam_t lw_taskteam;
1445
1446 if (ompt_enabled.enabled) {
      __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                              &ompt_parallel_data, return_address);
1449 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1450
      __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1452 // Don't use lw_taskteam after linking. Content was swapped.
1453
1454 /* OMPT implicit task begin */
1455 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1456 if (ompt_enabled.ompt_callback_implicit_task) {
1457 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1458 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1459 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1460 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1461 }
1462
1463 /* OMPT state */
1464 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1465 } else {
1466 exit_frame_p = &dummy;
1467 }
1468#endif
1469
1470 // AC: need to decrement t_serialized for enquiry functions to work
1471 // correctly, will restore at join time
1472 parent_team->t.t_serialized--;
1473
1474 {
1475 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1476 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
      __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                             ,
                             exit_frame_p
#endif
      );
1483 }
1484
1485#if OMPT_SUPPORT
1486 if (ompt_enabled.enabled) {
1487 *exit_frame_p = NULL;
1488 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1489 if (ompt_enabled.ompt_callback_implicit_task) {
1490 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1491 ompt_scope_end, NULL, implicit_task_data, 1,
1492 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1493 }
1494 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
1496 if (ompt_enabled.ompt_callback_parallel_end) {
1497 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1498 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1499 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1500 }
1501 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1502 }
1503#endif
1504 return TRUE;
1505 }
1506
1507 parent_team->t.t_pkfn = microtask;
1508 parent_team->t.t_invoke = invoker;
1509 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1510 parent_team->t.t_active_level++;
1511 parent_team->t.t_level++;
1512 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1513
1514 // If the threads allocated to the team are less than the thread limit, update
1515 // the thread limit here. th_teams_size.nth is specific to this team nested
1516 // in a teams construct, the team is fully created, and we're about to do
1517 // the actual fork. Best to do this here so that the subsequent uses below
1518 // and in the join have the correct value.
1519 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1520
1521#if OMPT_SUPPORT
1522 if (ompt_enabled.enabled) {
1523 ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
                            return_address);
    __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1527 }
1528#endif
1529
1530 /* Change number of threads in the team if requested */
1531 if (master_set_numthreads) { // The parallel has num_threads clause
1532 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1533 // AC: only can reduce number of threads dynamically, can't increase
1534 kmp_info_t **other_threads = parent_team->t.t_threads;
1535 // NOTE: if using distributed barrier, we need to run this code block
1536 // even when the team size appears not to have changed from the max.
1537 int old_proc = master_th->th.th_teams_size.nth;
1538 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
        __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1541 }
1542 parent_team->t.t_nproc = master_set_numthreads;
1543 for (i = 0; i < master_set_numthreads; ++i) {
1544 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1545 }
1546 }
1547 // Keep extra threads hot in the team for possible next parallels
1548 master_th->th.th_set_nproc = 0;
1549 }
1550
1551#if USE_DEBUGGER
1552 if (__kmp_debugging) { // Let debugger override number of threads.
1553 int nth = __kmp_omp_num_threads(loc);
1554 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1555 master_set_numthreads = nth;
1556 }
1557 }
1558#endif
1559
1560 // Figure out the proc_bind policy for the nested parallel within teams
1561 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1562 // proc_bind_default means don't update
1563 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1564 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1565 proc_bind = proc_bind_false;
1566 } else {
1567 // No proc_bind clause specified; use current proc-bind-var
1568 if (proc_bind == proc_bind_default) {
1569 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1570 }
1571 /* else: The proc_bind policy was specified explicitly on parallel clause.
1572 This overrides proc-bind-var for this parallel region, but does not
1573 change proc-bind-var. */
1574 // Figure the value of proc-bind-var for the child threads.
1575 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1576 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1577 master_th->th.th_current_task->td_icvs.proc_bind)) {
1578 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1579 }
1580 }
1581 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1582 // Need to change the bind-var ICV to correct value for each implicit task
1583 if (proc_bind_icv != proc_bind_default &&
1584 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1585 kmp_info_t **other_threads = parent_team->t.t_threads;
1586 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1587 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1588 }
1589 }
1590 // Reset for next parallel region
1591 master_th->th.th_set_proc_bind = proc_bind_default;
1592
1593#if USE_ITT_BUILD && USE_ITT_NOTIFY
1594 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1595 KMP_ITT_DEBUG) &&
1596 __kmp_forkjoin_frames_mode == 3 &&
1597 parent_team->t.t_active_level == 1 // only report frames at level 1
1598 && master_th->th.th_teams_size.nteams == 1) {
1599 kmp_uint64 tmp_time = __itt_get_timestamp();
1600 master_th->th.th_frame_time = tmp_time;
1601 parent_team->t.t_region_time = tmp_time;
1602 }
1603 if (__itt_stack_caller_create_ptr) {
1604 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1605 // create new stack stitching id before entering fork barrier
1606 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1607 }
1608#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1609#if KMP_AFFINITY_SUPPORTED
1610  __kmp_partition_places(parent_team);
1611#endif
1612
1613 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1614 "master_th=%p, gtid=%d\n",
1615 root, parent_team, master_th, gtid));
1616  __kmp_internal_fork(loc, gtid, parent_team);
1617 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1618 "master_th=%p, gtid=%d\n",
1619 root, parent_team, master_th, gtid));
1620
1621 if (call_context == fork_context_gnu)
1622 return TRUE;
1623
1624 /* Invoke microtask for PRIMARY thread */
1625 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1626 parent_team->t.t_id, parent_team->t.t_pkfn));
1627
1628 if (!parent_team->t.t_invoke(gtid)) {
1629 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1630 }
1631 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1632 parent_team->t.t_id, parent_team->t.t_pkfn));
1633 KMP_MB(); /* Flush all pending memory write invalidates. */
1634
1635 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1636
1637 return TRUE;
1638}
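// Illustrative sketch (user code, not part of the runtime): the path above
// handles a parallel region closely nested inside a teams construct, e.g.
//
//   #pragma omp teams num_teams(2) thread_limit(4)
//   #pragma omp parallel num_threads(2) // forked via __kmp_fork_in_teams()
//   { ... }
//
// A num_threads clause on such a parallel can only shrink the team below the
// size reserved by the teams construct (th_teams_size.nth), never grow it.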
1639
1640// Create a serialized parallel region
1641static inline int
1642__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1643 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1644 kmp_info_t *master_th, kmp_team_t *parent_team,
1645#if OMPT_SUPPORT
1646 ompt_data_t *ompt_parallel_data, void **return_address,
1647 ompt_data_t **parent_task_data,
1648#endif
1649 kmp_va_list ap) {
1650 kmp_team_t *team;
1651 int i;
1652 void **argv;
1653
1654/* josh todo: hypothetical question: what do we do for OS X*? */
1655#if KMP_OS_LINUX && \
1656 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1657 SimpleVLA<void *> args(argc);
1658#else
1659 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1660#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1661 KMP_ARCH_AARCH64) */
1662
1663 KA_TRACE(
1664 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1665
1666  __kmpc_serialized_parallel(loc, gtid);
1667
1668#if OMPD_SUPPORT
1669 master_th->th.th_serial_team->t.t_pkfn = microtask;
1670#endif
1671
1672 if (call_context == fork_context_intel) {
1673 /* TODO this sucks, use the compiler itself to pass args! :) */
1674 master_th->th.th_serial_team->t.t_ident = loc;
1675 if (!ap) {
1676 // revert change made in __kmpc_serialized_parallel()
1677 master_th->th.th_serial_team->t.t_level--;
1678// Get args from parent team for teams construct
1679
1680#if OMPT_SUPPORT
1681 void *dummy;
1682 void **exit_frame_p;
1683 ompt_task_info_t *task_info;
1684 ompt_lw_taskteam_t lw_taskteam;
1685
1686 if (ompt_enabled.enabled) {
1687        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1688                                ompt_parallel_data, *return_address);
1689
1690        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1691        // don't use lw_taskteam after linking. content was swapped
1692 task_info = OMPT_CUR_TASK_INFO(master_th);
1693 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1694 if (ompt_enabled.ompt_callback_implicit_task) {
1695 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1696 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1697 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1698 &(task_info->task_data), 1,
1699 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1700 }
1701
1702 /* OMPT state */
1703 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1704 } else {
1705 exit_frame_p = &dummy;
1706 }
1707#endif
1708
1709 {
1710 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1711 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1712        __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1713#if OMPT_SUPPORT
1714 ,
1715                               exit_frame_p
1716#endif
1717 );
1718 }
1719
1720#if OMPT_SUPPORT
1721 if (ompt_enabled.enabled) {
1722 *exit_frame_p = NULL;
1723 if (ompt_enabled.ompt_callback_implicit_task) {
1724 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1725 ompt_scope_end, NULL, &(task_info->task_data), 1,
1726 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1727 }
1728 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1729        __ompt_lw_taskteam_unlink(master_th);
1730 if (ompt_enabled.ompt_callback_parallel_end) {
1731 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1732 ompt_parallel_data, *parent_task_data,
1733 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1734 }
1735 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1736 }
1737#endif
1738 } else if (microtask == (microtask_t)__kmp_teams_master) {
1739 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1740 team = master_th->th.th_team;
1741 // team->t.t_pkfn = microtask;
1742 team->t.t_invoke = invoker;
1743 __kmp_alloc_argv_entries(argc, team, TRUE);
1744 team->t.t_argc = argc;
1745 argv = (void **)team->t.t_argv;
1746 for (i = argc - 1; i >= 0; --i)
1747 *argv++ = va_arg(kmp_va_deref(ap), void *);
1748 // AC: revert change made in __kmpc_serialized_parallel()
1749 // because initial code in teams should have level=0
1750 team->t.t_level--;
1751 // AC: call special invoker for outer "parallel" of teams construct
1752 invoker(gtid);
1753#if OMPT_SUPPORT
1754 if (ompt_enabled.enabled) {
1755 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1756 if (ompt_enabled.ompt_callback_implicit_task) {
1757 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1758 ompt_scope_end, NULL, &(task_info->task_data), 0,
1759 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1760 }
1761 if (ompt_enabled.ompt_callback_parallel_end) {
1762 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1763 ompt_parallel_data, *parent_task_data,
1764 OMPT_INVOKER(call_context) | ompt_parallel_league,
1765 *return_address);
1766 }
1767 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1768 }
1769#endif
1770 } else {
1771 argv = args;
1772 for (i = argc - 1; i >= 0; --i)
1773 *argv++ = va_arg(kmp_va_deref(ap), void *);
1774 KMP_MB();
1775
1776#if OMPT_SUPPORT
1777 void *dummy;
1778 void **exit_frame_p;
1779 ompt_task_info_t *task_info;
1780 ompt_lw_taskteam_t lw_taskteam;
1781 ompt_data_t *implicit_task_data;
1782
1783 if (ompt_enabled.enabled) {
1784        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1785                                ompt_parallel_data, *return_address);
1786        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1787        // don't use lw_taskteam after linking. content was swapped
1788 task_info = OMPT_CUR_TASK_INFO(master_th);
1789 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1790
1791 /* OMPT implicit task begin */
1792 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1793 if (ompt_enabled.ompt_callback_implicit_task) {
1794 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1795 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1796 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1797 ompt_task_implicit);
1798 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1799 }
1800
1801 /* OMPT state */
1802 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1803 } else {
1804 exit_frame_p = &dummy;
1805 }
1806#endif
1807
1808 {
1809 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1810 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1811        __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1812#if OMPT_SUPPORT
1813 ,
1814                               exit_frame_p
1815#endif
1816 );
1817 }
1818
1819#if OMPT_SUPPORT
1820 if (ompt_enabled.enabled) {
1821 *exit_frame_p = NULL;
1822 if (ompt_enabled.ompt_callback_implicit_task) {
1823 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1824 ompt_scope_end, NULL, &(task_info->task_data), 1,
1825 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1826 }
1827
1828 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1829        __ompt_lw_taskteam_unlink(master_th);
1830 if (ompt_enabled.ompt_callback_parallel_end) {
1831 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1832 ompt_parallel_data, *parent_task_data,
1833 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1834 }
1835 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1836 }
1837#endif
1838 }
1839 } else if (call_context == fork_context_gnu) {
1840#if OMPT_SUPPORT
1841 if (ompt_enabled.enabled) {
1842 ompt_lw_taskteam_t lwt;
1843      __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1844                              *return_address);
1845
1846 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1847      __ompt_lw_taskteam_link(&lwt, master_th, 1);
1848 }
1849// don't use lw_taskteam after linking. content was swapped
1850#endif
1851
1852 // we were called from GNU native code
1853 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1854 return FALSE;
1855 } else {
1856 KMP_ASSERT2(call_context < fork_context_last,
1857 "__kmp_serial_fork_call: unknown fork_context parameter");
1858 }
1859
1860 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1861 KMP_MB();
1862 return FALSE;
1863}
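// Illustrative sketch (user code, not part of the runtime): this routine is
// reached whenever the fork resolves to a single thread, for example when a
// nested region exceeds max-active-levels:
//
//   omp_set_max_active_levels(1);
//   #pragma omp parallel // may become active
//   #pragma omp parallel // inactive: serialized through the code above
//   { ... }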
1864
1865/* most of the work for a fork */
1866/* return true if we really went parallel, false if serialized */
1867int __kmp_fork_call(ident_t *loc, int gtid,
1868 enum fork_context_e call_context, // Intel, GNU, ...
1869 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1870 kmp_va_list ap) {
1871 void **argv;
1872 int i;
1873 int master_tid;
1874 int master_this_cons;
1875 kmp_team_t *team;
1876 kmp_team_t *parent_team;
1877 kmp_info_t *master_th;
1878 kmp_root_t *root;
1879 int nthreads;
1880 int master_active;
1881 int master_set_numthreads;
1882 int task_thread_limit = 0;
1883 int level;
1884 int active_level;
1885 int teams_level;
1886#if KMP_NESTED_HOT_TEAMS
1887 kmp_hot_team_ptr_t **p_hot_teams;
1888#endif
1889 { // KMP_TIME_BLOCK
1890 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1891 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1892
1893 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1894 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1895 /* Some systems prefer the stack for the root thread(s) to start with */
1896 /* some gap from the parent stack to prevent false sharing. */
1897 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1898 /* These 2 lines below are so this does not get optimized out */
1899 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1900 __kmp_stkpadding += (short)((kmp_int64)dummy);
1901 }
1902
1903 /* initialize if needed */
1904 KMP_DEBUG_ASSERT(
1905 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1906 if (!TCR_4(__kmp_init_parallel))
1907 __kmp_parallel_initialize();
1908 __kmp_resume_if_soft_paused();
1909
1910 /* setup current data */
1911 // AC: potentially unsafe, not in sync with library shutdown,
1912 // __kmp_threads can be freed
1913 master_th = __kmp_threads[gtid];
1914
1915 parent_team = master_th->th.th_team;
1916 master_tid = master_th->th.th_info.ds.ds_tid;
1917 master_this_cons = master_th->th.th_local.this_construct;
1918 root = master_th->th.th_root;
1919 master_active = root->r.r_active;
1920 master_set_numthreads = master_th->th.th_set_nproc;
1921 task_thread_limit =
1922 master_th->th.th_current_task->td_icvs.task_thread_limit;
1923
1924#if OMPT_SUPPORT
1925 ompt_data_t ompt_parallel_data = ompt_data_none;
1926 ompt_data_t *parent_task_data;
1927 ompt_frame_t *ompt_frame;
1928 void *return_address = NULL;
1929
1930 if (ompt_enabled.enabled) {
1931      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1932 NULL, NULL);
1933 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1934 }
1935#endif
1936
1937 // Assign affinity to root thread if it hasn't happened yet
1938 __kmp_assign_root_init_mask();
1939
1940 // Nested level will be an index in the nested nthreads array
1941 level = parent_team->t.t_level;
1942 // used to launch non-serial teams even if nested is not allowed
1943 active_level = parent_team->t.t_active_level;
1944 // needed to check nesting inside the teams
1945 teams_level = master_th->th.th_teams_level;
1946#if KMP_NESTED_HOT_TEAMS
1947 p_hot_teams = &master_th->th.th_hot_teams;
1948 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1949 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1950 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1951 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1952 // it is either actual or not needed (when active_level > 0)
1953 (*p_hot_teams)[0].hot_team_nth = 1;
1954 }
1955#endif
1956
1957#if OMPT_SUPPORT
1958 if (ompt_enabled.enabled) {
1959 if (ompt_enabled.ompt_callback_parallel_begin) {
1960 int team_size = master_set_numthreads
1961 ? master_set_numthreads
1962 : get__nproc_2(parent_team, master_tid);
1963 int flags = OMPT_INVOKER(call_context) |
1964 ((microtask == (microtask_t)__kmp_teams_master)
1965 ? ompt_parallel_league
1966 : ompt_parallel_team);
1967 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1968 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1969 return_address);
1970 }
1971 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1972 }
1973#endif
1974
1975 master_th->th.th_ident = loc;
1976
1977 // Parallel closely nested in teams construct:
1978 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1979 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1980 call_context, microtask, invoker,
1981 master_set_numthreads, level,
1982#if OMPT_SUPPORT
1983 ompt_parallel_data, return_address,
1984#endif
1985 ap);
1986 } // End parallel closely nested in teams construct
1987
1988#if KMP_DEBUG
1989 if (__kmp_tasking_mode != tskm_immediate_exec) {
1990 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1991 parent_team->t.t_task_team[master_th->th.th_task_state]);
1992 }
1993#endif
1994
1995 // Need this to happen before we determine the number of threads, not while
1996 // we are allocating the team
1997 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1998
1999 // Determine the number of threads
2000 int enter_teams =
2001 __kmp_is_entering_teams(active_level, level, teams_level, ap);
2002 if ((!enter_teams &&
2003 (parent_team->t.t_active_level >=
2004 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2005 (__kmp_library == library_serial)) {
2006 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2007 nthreads = 1;
2008 } else {
2009 nthreads = master_set_numthreads
2010 ? master_set_numthreads
2011 // TODO: get nproc directly from current task
2012 : get__nproc_2(parent_team, master_tid);
2013      // Use the thread_limit set for the current target task if it exists,
2014      // otherwise go with the deduced nthreads
2015 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2016 ? task_thread_limit
2017 : nthreads;
2018      // Check whether we need to take the forkjoin lock (no need for a
2019      // serialized parallel outside of a teams construct).
2020 if (nthreads > 1) {
2021 /* determine how many new threads we can use */
2022        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2023 /* AC: If we execute teams from parallel region (on host), then teams
2024 should be created but each can only have 1 thread if nesting is
2025 disabled. If teams called from serial region, then teams and their
2026 threads should be created regardless of the nesting setting. */
2027 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2028                                         nthreads, enter_teams);
2029 if (nthreads == 1) {
2030 // Free lock for single thread execution here; for multi-thread
2031 // execution it will be freed later after team of threads created
2032 // and initialized
2033          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2034 }
2035 }
2036 }
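    // In short: nthreads comes from the num_threads clause if one was given,
    // otherwise from the nproc ICV of the current task; it is then clamped by
    // the target task's thread_limit (if set) and finally by
    // __kmp_reserve_threads() against the thread pool and global limits. A
    // result of 1 means the region is serialized below.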
2037 KMP_DEBUG_ASSERT(nthreads > 0);
2038
2039 // If we temporarily changed the set number of threads then restore it now
2040 master_th->th.th_set_nproc = 0;
2041
2042 if (nthreads == 1) {
2043 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2044 invoker, master_th, parent_team,
2045#if OMPT_SUPPORT
2046                                    &ompt_parallel_data, &return_address,
2047                                    &parent_task_data,
2048#endif
2049 ap);
2050 } // if (nthreads == 1)
2051
2052 // GEH: only modify the executing flag in the case when not serialized
2053 // serialized case is handled in kmpc_serialized_parallel
2054 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2055 "curtask=%p, curtask_max_aclevel=%d\n",
2056 parent_team->t.t_active_level, master_th,
2057 master_th->th.th_current_task,
2058 master_th->th.th_current_task->td_icvs.max_active_levels));
2059 // TODO: GEH - cannot do this assertion because root thread not set up as
2060 // executing
2061 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2062 master_th->th.th_current_task->td_flags.executing = 0;
2063
2064 if (!master_th->th.th_teams_microtask || level > teams_level) {
2065 /* Increment our nested depth level */
2066 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2067 }
2068
2069 // See if we need to make a copy of the ICVs.
2070 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2071 if ((level + 1 < __kmp_nested_nth.used) &&
2072 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2073 nthreads_icv = __kmp_nested_nth.nth[level + 1];
2074 } else {
2075 nthreads_icv = 0; // don't update
2076 }
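    // A nonzero nthreads_icv means the OMP_NUM_THREADS list supplied a value
    // for the next nesting level (__kmp_nested_nth) that differs from the
    // current nproc ICV, so a fresh ICV block is built further below before
    // the team is allocated; zero means the children inherit the ICV as is.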
2077
2078 // Figure out the proc_bind_policy for the new team.
2079 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2080 // proc_bind_default means don't update
2081 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2082 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2083 proc_bind = proc_bind_false;
2084 } else {
2085 // No proc_bind clause specified; use current proc-bind-var for this
2086 // parallel region
2087 if (proc_bind == proc_bind_default) {
2088 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2089 }
2090 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2091 if (master_th->th.th_teams_microtask &&
2092 microtask == (microtask_t)__kmp_teams_master) {
2093 proc_bind = __kmp_teams_proc_bind;
2094 }
2095 /* else: The proc_bind policy was specified explicitly on parallel clause.
2096 This overrides proc-bind-var for this parallel region, but does not
2097 change proc-bind-var. */
2098 // Figure the value of proc-bind-var for the child threads.
2099 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2100 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2101 master_th->th.th_current_task->td_icvs.proc_bind)) {
2102        // Do not modify the proc-bind ICV for the two teams construct forks;
2103        // they just let the proc-bind ICV pass through.
2104 if (!master_th->th.th_teams_microtask ||
2105 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2106 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2107 }
2108 }
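    // Resolution order used above: a proc_bind clause on this parallel wins;
    // otherwise the current proc-bind-var ICV applies; the teams-master
    // microtask instead takes __kmp_teams_proc_bind (KMP_TEAMS_PROC_BIND).
    // proc_bind_icv, when not proc_bind_default, becomes the child threads'
    // proc-bind-var, taken from the next entry of the OMP_PROC_BIND list.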
2109
2110 // Reset for next parallel region
2111 master_th->th.th_set_proc_bind = proc_bind_default;
2112
2113 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2114 kmp_internal_control_t new_icvs;
2115      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2116 new_icvs.next = NULL;
2117 if (nthreads_icv > 0) {
2118 new_icvs.nproc = nthreads_icv;
2119 }
2120 if (proc_bind_icv != proc_bind_default) {
2121 new_icvs.proc_bind = proc_bind_icv;
2122 }
2123
2124 /* allocate a new parallel team */
2125 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2126      team = __kmp_allocate_team(root, nthreads, nthreads,
2127#if OMPT_SUPPORT
2128 ompt_parallel_data,
2129#endif
2130                                 proc_bind, &new_icvs,
2131 argc USE_NESTED_HOT_ARG(master_th));
2132 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2133        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2134 } else {
2135 /* allocate a new parallel team */
2136 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2137      team = __kmp_allocate_team(root, nthreads, nthreads,
2138#if OMPT_SUPPORT
2139 ompt_parallel_data,
2140#endif
2141 proc_bind,
2142                                 &master_th->th.th_current_task->td_icvs,
2143 argc USE_NESTED_HOT_ARG(master_th));
2144 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2145        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2146                  &master_th->th.th_current_task->td_icvs);
2147 }
2148 KF_TRACE(
2149 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2150
2151 /* setup the new team */
2152 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2153 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2154 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2155 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2156 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2157#if OMPT_SUPPORT
2158 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2159 return_address);
2160#endif
2161 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2162 // TODO: parent_team->t.t_level == INT_MAX ???
2163 if (!master_th->th.th_teams_microtask || level > teams_level) {
2164 int new_level = parent_team->t.t_level + 1;
2165 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2166 new_level = parent_team->t.t_active_level + 1;
2167 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2168 } else {
2169 // AC: Do not increase parallel level at start of the teams construct
2170 int new_level = parent_team->t.t_level;
2171 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2172 new_level = parent_team->t.t_active_level;
2173 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2174 }
2175 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2176 // set primary thread's schedule as new run-time schedule
2177 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2178
2179 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2180 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2181
2182 // Update the floating point rounding in the team if required.
2183 propagateFPControl(team);
2184#if OMPD_SUPPORT
2185 if (ompd_state & OMPD_ENABLE_BP)
2186 ompd_bp_parallel_begin();
2187#endif
2188
2189 if (__kmp_tasking_mode != tskm_immediate_exec) {
2190 // Set primary thread's task team to team's task team. Unless this is hot
2191 // team, it should be NULL.
2192 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2193 parent_team->t.t_task_team[master_th->th.th_task_state]);
2194 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2195 "%p, new task_team %p / team %p\n",
2196 __kmp_gtid_from_thread(master_th),
2197 master_th->th.th_task_team, parent_team,
2198 team->t.t_task_team[master_th->th.th_task_state], team));
2199
2200 if (active_level || master_th->th.th_task_team) {
2201 // Take a memo of primary thread's task_state
2202 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2203 if (master_th->th.th_task_state_top >=
2204 master_th->th.th_task_state_stack_sz) { // increase size
2205 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2206 kmp_uint8 *old_stack, *new_stack;
2207 kmp_uint32 i;
2208 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2209 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2210 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2211 }
2212 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2213 ++i) { // zero-init rest of stack
2214 new_stack[i] = 0;
2215 }
2216 old_stack = master_th->th.th_task_state_memo_stack;
2217 master_th->th.th_task_state_memo_stack = new_stack;
2218 master_th->th.th_task_state_stack_sz = new_size;
2219 __kmp_free(old_stack);
2220 }
2221 // Store primary thread's task_state on stack
2222 master_th->th
2223 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2224 master_th->th.th_task_state;
2225 master_th->th.th_task_state_top++;
2226#if KMP_NESTED_HOT_TEAMS
2227 if (master_th->th.th_hot_teams &&
2228 active_level < __kmp_hot_teams_max_level &&
2229 team == master_th->th.th_hot_teams[active_level].hot_team) {
2230 // Restore primary thread's nested state if nested hot team
2231 master_th->th.th_task_state =
2232 master_th->th
2233 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2234 } else {
2235#endif
2236 master_th->th.th_task_state = 0;
2237#if KMP_NESTED_HOT_TEAMS
2238 }
2239#endif
2240 }
2241#if !KMP_NESTED_HOT_TEAMS
2242 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2243 (team == root->r.r_hot_team));
2244#endif
2245 }
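    // Note: th_task_state_memo_stack is a small per-thread byte stack; the
    // primary thread's current task_state is pushed onto it when entering a
    // nested active region (the array doubles on overflow above) and popped
    // back in __kmp_join_call() when the region ends.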
2246
2247 KA_TRACE(
2248 20,
2249 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2250 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2251 team->t.t_nproc));
2252 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2253 (team->t.t_master_tid == 0 &&
2254 (team->t.t_parent == root->r.r_root_team ||
2255 team->t.t_parent->t.t_serialized)));
2256 KMP_MB();
2257
2258 /* now, setup the arguments */
2259 argv = (void **)team->t.t_argv;
2260 if (ap) {
2261 for (i = argc - 1; i >= 0; --i) {
2262 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2263 KMP_CHECK_UPDATE(*argv, new_argv);
2264 argv++;
2265 }
2266 } else {
2267 for (i = 0; i < argc; ++i) {
2268 // Get args from parent team for teams construct
2269 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2270 }
2271 }
2272
2273 /* now actually fork the threads */
2274 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2275 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2276 root->r.r_active = TRUE;
2277
2278    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2279    __kmp_setup_icv_copy(team, nthreads,
2280                         &master_th->th.th_current_task->td_icvs, loc);
2281
2282#if OMPT_SUPPORT
2283 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2284#endif
2285
2286    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2287
2288#if USE_ITT_BUILD
2289 if (team->t.t_active_level == 1 // only report frames at level 1
2290 && !master_th->th.th_teams_microtask) { // not in teams construct
2291#if USE_ITT_NOTIFY
2292 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2293 (__kmp_forkjoin_frames_mode == 3 ||
2294 __kmp_forkjoin_frames_mode == 1)) {
2295 kmp_uint64 tmp_time = 0;
2296 if (__itt_get_timestamp_ptr)
2297 tmp_time = __itt_get_timestamp();
2298 // Internal fork - report frame begin
2299 master_th->th.th_frame_time = tmp_time;
2300 if (__kmp_forkjoin_frames_mode == 3)
2301 team->t.t_region_time = tmp_time;
2302 } else
2303// only one notification scheme (either "submit" or "forking/joined", not both)
2304#endif /* USE_ITT_NOTIFY */
2305 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2306 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2307 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2308        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2309 }
2310 }
2311#endif /* USE_ITT_BUILD */
2312
2313 /* now go on and do the work */
2314 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2315 KMP_MB();
2316 KF_TRACE(10,
2317 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2318 root, team, master_th, gtid));
2319
2320#if USE_ITT_BUILD
2321 if (__itt_stack_caller_create_ptr) {
2322 // create new stack stitching id before entering fork barrier
2323 if (!enter_teams) {
2324 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2325 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2326 } else if (parent_team->t.t_serialized) {
2327 // keep stack stitching id in the serialized parent_team;
2328 // current team will be used for parallel inside the teams;
2329 // if parent_team is active, then it already keeps stack stitching id
2330 // for the league of teams
2331 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2332 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2333 }
2334 }
2335#endif /* USE_ITT_BUILD */
2336
2337 // AC: skip __kmp_internal_fork at teams construct, let only primary
2338 // threads execute
2339 if (ap) {
2340      __kmp_internal_fork(loc, gtid, team);
2341 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2342 "master_th=%p, gtid=%d\n",
2343 root, team, master_th, gtid));
2344 }
2345
2346 if (call_context == fork_context_gnu) {
2347 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2348 return TRUE;
2349 }
2350
2351 /* Invoke microtask for PRIMARY thread */
2352 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2353 team->t.t_id, team->t.t_pkfn));
2354 } // END of timer KMP_fork_call block
2355
2356#if KMP_STATS_ENABLED
2357 // If beginning a teams construct, then change thread state
2358 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2359 if (!ap) {
2360 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2361 }
2362#endif
2363
2364 if (!team->t.t_invoke(gtid)) {
2365 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2366 }
2367
2368#if KMP_STATS_ENABLED
2369 // If was beginning of a teams construct, then reset thread state
2370 if (!ap) {
2371 KMP_SET_THREAD_STATE(previous_state);
2372 }
2373#endif
2374
2375 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2376 team->t.t_id, team->t.t_pkfn));
2377 KMP_MB(); /* Flush all pending memory write invalidates. */
2378
2379 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2380#if OMPT_SUPPORT
2381 if (ompt_enabled.enabled) {
2382 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2383 }
2384#endif
2385
2386 return TRUE;
2387}
2388
2389#if OMPT_SUPPORT
2390static inline void __kmp_join_restore_state(kmp_info_t *thread,
2391 kmp_team_t *team) {
2392 // restore state outside the region
2393 thread->th.ompt_thread_info.state =
2394 ((team->t.t_serialized) ? ompt_state_work_serial
2395 : ompt_state_work_parallel);
2396}
2397
2398static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2399 kmp_team_t *team, ompt_data_t *parallel_data,
2400 int flags, void *codeptr) {
2401  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2402 if (ompt_enabled.ompt_callback_parallel_end) {
2403 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2404 parallel_data, &(task_info->task_data), flags, codeptr);
2405 }
2406
2407 task_info->frame.enter_frame = ompt_data_none;
2408 __kmp_join_restore_state(thread, team);
2409}
2410#endif
2411
2412void __kmp_join_call(ident_t *loc, int gtid
2413#if OMPT_SUPPORT
2414 ,
2415 enum fork_context_e fork_context
2416#endif
2417 ,
2418 int exit_teams) {
2419 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2420 kmp_team_t *team;
2421 kmp_team_t *parent_team;
2422 kmp_info_t *master_th;
2423 kmp_root_t *root;
2424 int master_active;
2425
2426 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2427
2428 /* setup current data */
2429 master_th = __kmp_threads[gtid];
2430 root = master_th->th.th_root;
2431 team = master_th->th.th_team;
2432 parent_team = team->t.t_parent;
2433
2434 master_th->th.th_ident = loc;
2435
2436#if OMPT_SUPPORT
2437 void *team_microtask = (void *)team->t.t_pkfn;
2438 // For GOMP interface with serialized parallel, need the
2439 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2440 // and end-parallel events.
2441 if (ompt_enabled.enabled &&
2442 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2443 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2444 }
2445#endif
2446
2447#if KMP_DEBUG
2448 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2449 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2450 "th_task_team = %p\n",
2451 __kmp_gtid_from_thread(master_th), team,
2452 team->t.t_task_team[master_th->th.th_task_state],
2453 master_th->th.th_task_team));
2454 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2455 team->t.t_task_team[master_th->th.th_task_state]);
2456 }
2457#endif
2458
2459 if (team->t.t_serialized) {
2460 if (master_th->th.th_teams_microtask) {
2461 // We are in teams construct
2462 int level = team->t.t_level;
2463 int tlevel = master_th->th.th_teams_level;
2464 if (level == tlevel) {
2465 // AC: we haven't incremented it earlier at start of teams construct,
2466 // so do it here - at the end of teams construct
2467 team->t.t_level++;
2468 } else if (level == tlevel + 1) {
2469 // AC: we are exiting parallel inside teams, need to increment
2470 // serialization in order to restore it in the next call to
2471 // __kmpc_end_serialized_parallel
2472 team->t.t_serialized++;
2473 }
2474 }
2475    __kmpc_end_serialized_parallel(loc, gtid);
2476
2477#if OMPT_SUPPORT
2478 if (ompt_enabled.enabled) {
2479 if (fork_context == fork_context_gnu) {
2480        __ompt_lw_taskteam_unlink(master_th);
2481 }
2482      __kmp_join_restore_state(master_th, parent_team);
2483 }
2484#endif
2485
2486 return;
2487 }
2488
2489 master_active = team->t.t_master_active;
2490
2491 if (!exit_teams) {
2492 // AC: No barrier for internal teams at exit from teams construct.
2493 // But there is barrier for external team (league).
2494    __kmp_internal_join(loc, gtid, team);
2495#if USE_ITT_BUILD
2496 if (__itt_stack_caller_create_ptr) {
2497 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2498 // destroy the stack stitching id after join barrier
2499 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2500 team->t.t_stack_id = NULL;
2501 }
2502#endif
2503 } else {
2504 master_th->th.th_task_state =
2505 0; // AC: no tasking in teams (out of any parallel)
2506#if USE_ITT_BUILD
2507 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2508 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2509 // destroy the stack stitching id on exit from the teams construct
2510 // if parent_team is active, then the id will be destroyed later on
2511 // by master of the league of teams
2512 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2513 parent_team->t.t_stack_id = NULL;
2514 }
2515#endif
2516 }
2517
2518 KMP_MB();
2519
2520#if OMPT_SUPPORT
2521 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2522 void *codeptr = team->t.ompt_team_info.master_return_address;
2523#endif
2524
2525#if USE_ITT_BUILD
2526 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2527 if (team->t.t_active_level == 1 &&
2528 (!master_th->th.th_teams_microtask || /* not in teams construct */
2529 master_th->th.th_teams_size.nteams == 1)) {
2530 master_th->th.th_ident = loc;
2531 // only one notification scheme (either "submit" or "forking/joined", not
2532 // both)
2533 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2534 __kmp_forkjoin_frames_mode == 3)
2535      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2536                             master_th->th.th_frame_time, 0, loc,
2537                             master_th->th.th_team_nproc, 1);
2538 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2539 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2540 __kmp_itt_region_joined(gtid);
2541 } // active_level == 1
2542#endif /* USE_ITT_BUILD */
2543
2544#if KMP_AFFINITY_SUPPORTED
2545 if (!exit_teams) {
2546 // Restore master thread's partition.
2547 master_th->th.th_first_place = team->t.t_first_place;
2548 master_th->th.th_last_place = team->t.t_last_place;
2549 }
2550#endif // KMP_AFFINITY_SUPPORTED
2551
2552 if (master_th->th.th_teams_microtask && !exit_teams &&
2553 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2554 team->t.t_level == master_th->th.th_teams_level + 1) {
2555// AC: We need to leave the team structure intact at the end of parallel
2556// inside the teams construct, so that at the next parallel same (hot) team
2557// works, only adjust nesting levels
2558#if OMPT_SUPPORT
2559 ompt_data_t ompt_parallel_data = ompt_data_none;
2560 if (ompt_enabled.enabled) {
2561      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2562 if (ompt_enabled.ompt_callback_implicit_task) {
2563 int ompt_team_size = team->t.t_nproc;
2564 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2565 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2566 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2567 }
2568 task_info->frame.exit_frame = ompt_data_none;
2569 task_info->task_data = ompt_data_none;
2570 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2571      __ompt_lw_taskteam_unlink(master_th);
2572 }
2573#endif
2574 /* Decrement our nested depth level */
2575 team->t.t_level--;
2576 team->t.t_active_level--;
2577 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2578
2579 // Restore number of threads in the team if needed. This code relies on
2580 // the proper adjustment of th_teams_size.nth after the fork in
2581 // __kmp_teams_master on each teams primary thread in the case that
2582 // __kmp_reserve_threads reduced it.
2583 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2584 int old_num = master_th->th.th_team_nproc;
2585 int new_num = master_th->th.th_teams_size.nth;
2586 kmp_info_t **other_threads = team->t.t_threads;
2587 team->t.t_nproc = new_num;
2588 for (int i = 0; i < old_num; ++i) {
2589 other_threads[i]->th.th_team_nproc = new_num;
2590 }
2591      // Adjust the states of the unused threads of the team
2592 for (int i = old_num; i < new_num; ++i) {
2593 // Re-initialize thread's barrier data.
2594 KMP_DEBUG_ASSERT(other_threads[i]);
2595 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2596 for (int b = 0; b < bs_last_barrier; ++b) {
2597 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2598 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2599#if USE_DEBUGGER
2600 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2601#endif
2602 }
2603 if (__kmp_tasking_mode != tskm_immediate_exec) {
2604 // Synchronize thread's task state
2605 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2606 }
2607 }
2608 }
2609
2610#if OMPT_SUPPORT
2611 if (ompt_enabled.enabled) {
2612      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2613                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2614 }
2615#endif
2616
2617 return;
2618 }
2619
2620 /* do cleanup and restore the parent team */
2621 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2622 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2623
2624 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2625
2626 /* jc: The following lock has instructions with REL and ACQ semantics,
2627 separating the parallel user code called in this parallel region
2628 from the serial user code called after this function returns. */
2629  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2630
2631 if (!master_th->th.th_teams_microtask ||
2632 team->t.t_level > master_th->th.th_teams_level) {
2633 /* Decrement our nested depth level */
2634 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2635 }
2636 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2637
2638#if OMPT_SUPPORT
2639 if (ompt_enabled.enabled) {
2640    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2641 if (ompt_enabled.ompt_callback_implicit_task) {
2642 int flags = (team_microtask == (void *)__kmp_teams_master)
2643 ? ompt_task_initial
2644 : ompt_task_implicit;
2645 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2646 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2647 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2648 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2649 }
2650 task_info->frame.exit_frame = ompt_data_none;
2651 task_info->task_data = ompt_data_none;
2652 }
2653#endif
2654
2655 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2656 master_th, team));
2657  __kmp_pop_current_task_from_thread(master_th);
2658
2659 master_th->th.th_def_allocator = team->t.t_def_allocator;
2660
2661#if OMPD_SUPPORT
2662 if (ompd_state & OMPD_ENABLE_BP)
2663 ompd_bp_parallel_end();
2664#endif
2665 updateHWFPControl(team);
2666
2667 if (root->r.r_active != master_active)
2668 root->r.r_active = master_active;
2669
2670 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2671 master_th)); // this will free worker threads
2672
2673 /* this race was fun to find. make sure the following is in the critical
2674 region otherwise assertions may fail occasionally since the old team may be
2675 reallocated and the hierarchy appears inconsistent. it is actually safe to
2676 run and won't cause any bugs, but will cause those assertion failures. it's
2677 only one deref&assign so might as well put this in the critical region */
2678 master_th->th.th_team = parent_team;
2679 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2680 master_th->th.th_team_master = parent_team->t.t_threads[0];
2681 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2682
2683 /* restore serialized team, if need be */
2684 if (parent_team->t.t_serialized &&
2685 parent_team != master_th->th.th_serial_team &&
2686 parent_team != root->r.r_root_team) {
2687 __kmp_free_team(root,
2688 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2689 master_th->th.th_serial_team = parent_team;
2690 }
2691
2692 if (__kmp_tasking_mode != tskm_immediate_exec) {
2693 if (master_th->th.th_task_state_top >
2694 0) { // Restore task state from memo stack
2695 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2696 // Remember primary thread's state if we re-use this nested hot team
2697 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2698 master_th->th.th_task_state;
2699 --master_th->th.th_task_state_top; // pop
2700 // Now restore state at this level
2701 master_th->th.th_task_state =
2702 master_th->th
2703 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2704 } else if (team != root->r.r_hot_team) {
2705      // Reset the task state of the primary thread if this is not the hot team,
2706      // because in this case all the worker threads will be freed and their
2707      // task state reset. If the primary's is not reset as well, the task state
2708      // becomes inconsistent.
2709 master_th->th.th_task_state = 0;
2710 }
2711 // Copy the task team from the parent team to the primary thread
2712 master_th->th.th_task_team =
2713 parent_team->t.t_task_team[master_th->th.th_task_state];
2714 KA_TRACE(20,
2715 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2716 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2717 parent_team));
2718 }
2719
2720 // TODO: GEH - cannot do this assertion because root thread not set up as
2721 // executing
2722 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2723 master_th->th.th_current_task->td_flags.executing = 1;
2724
2725  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2726
2727#if KMP_AFFINITY_SUPPORTED
2728 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2729 __kmp_reset_root_init_mask(gtid);
2730 }
2731#endif
2732#if OMPT_SUPPORT
2733 int flags =
2734 OMPT_INVOKER(fork_context) |
2735 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2736 : ompt_parallel_team);
2737 if (ompt_enabled.enabled) {
2738    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2739                    codeptr);
2740 }
2741#endif
2742
2743 KMP_MB();
2744 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2745}
2746
2747/* Check whether we should push an internal control record onto the
2748 serial team stack. If so, do it. */
2749void __kmp_save_internal_controls(kmp_info_t *thread) {
2750
2751 if (thread->th.th_team != thread->th.th_serial_team) {
2752 return;
2753 }
2754 if (thread->th.th_team->t.t_serialized > 1) {
2755 int push = 0;
2756
2757 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2758 push = 1;
2759 } else {
2760 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2761 thread->th.th_team->t.t_serialized) {
2762 push = 1;
2763 }
2764 }
2765 if (push) { /* push a record on the serial team's stack */
2766 kmp_internal_control_t *control =
2767 (kmp_internal_control_t *)__kmp_allocate(
2768 sizeof(kmp_internal_control_t));
2769
2770      copy_icvs(control, &thread->th.th_current_task->td_icvs);
2771
2772 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2773
2774 control->next = thread->th.th_team->t.t_control_stack_top;
2775 thread->th.th_team->t.t_control_stack_top = control;
2776 }
2777 }
2778}
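/* Note: the push above only happens while running on the serial team with
   t_serialized > 1, i.e. inside nested serialized regions, and at most once
   per nesting level (guarded by serial_nesting_level). The record is intended
   to be popped, and the saved ICVs restored, when the corresponding serialized
   region ends (see __kmpc_end_serialized_parallel). */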
2779
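/* Illustrative sketch (user code, not part of the runtime), assuming the usual
   mapping of the standard API onto this entry point:

     omp_set_num_threads(4); // typically routed to __kmp_set_num_threads()
     #pragma omp parallel    // the next region defaults to 4 threads

   If the root thread is idle and the hot team is larger than the new value,
   the surplus hot-team threads are released right away (see below). */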
2780/* Changes set_nproc */
2781void __kmp_set_num_threads(int new_nth, int gtid) {
2782 kmp_info_t *thread;
2783 kmp_root_t *root;
2784
2785 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2786 KMP_DEBUG_ASSERT(__kmp_init_serial);
2787
2788 if (new_nth < 1)
2789 new_nth = 1;
2790 else if (new_nth > __kmp_max_nth)
2791 new_nth = __kmp_max_nth;
2792
2793 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2794 thread = __kmp_threads[gtid];
2795 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2796 return; // nothing to do
2797
2798 __kmp_save_internal_controls(thread);
2799
2800 set__nproc(thread, new_nth);
2801
2802 // If this omp_set_num_threads() call will cause the hot team size to be
2803 // reduced (in the absence of a num_threads clause), then reduce it now,
2804 // rather than waiting for the next parallel region.
2805 root = thread->th.th_root;
2806 if (__kmp_init_parallel && (!root->r.r_active) &&
2807 (root->r.r_hot_team->t.t_nproc > new_nth)
2808#if KMP_NESTED_HOT_TEAMS
2809 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2810#endif
2811 ) {
2812 kmp_team_t *hot_team = root->r.r_hot_team;
2813 int f;
2814
2815    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2816
2817 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2818      __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2819 }
2820 // Release the extra threads we don't need any more.
2821 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2822 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2823 if (__kmp_tasking_mode != tskm_immediate_exec) {
2824 // When decreasing team size, threads no longer in the team should unref
2825 // task team.
2826 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2827 }
2828 __kmp_free_thread(hot_team->t.t_threads[f]);
2829 hot_team->t.t_threads[f] = NULL;
2830 }
2831 hot_team->t.t_nproc = new_nth;
2832#if KMP_NESTED_HOT_TEAMS
2833 if (thread->th.th_hot_teams) {
2834 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2835 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2836 }
2837#endif
2838
2839 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2840      hot_team->t.b->update_num_threads(new_nth);
2841      __kmp_add_threads_to_team(hot_team, new_nth);
2842 }
2843
2844    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2845
2846 // Update the t_nproc field in the threads that are still active.
2847 for (f = 0; f < new_nth; f++) {
2848 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2849 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2850 }
2851    // Special flag to indicate an omp_set_num_threads() call
2852 hot_team->t.t_size_changed = -1;
2853 }
2854}
2855
2856/* Changes max_active_levels */
2857void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2858 kmp_info_t *thread;
2859
2860 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2861 "%d = (%d)\n",
2862 gtid, max_active_levels));
2863 KMP_DEBUG_ASSERT(__kmp_init_serial);
2864
2865 // validate max_active_levels
2866 if (max_active_levels < 0) {
2867 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2868 // We ignore this call if the user has specified a negative value.
2869 // The current setting won't be changed. The last valid setting will be
2870 // used. A warning will be issued (if warnings are allowed as controlled by
2871 // the KMP_WARNINGS env var).
2872 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2873 "max_active_levels for thread %d = (%d)\n",
2874 gtid, max_active_levels));
2875 return;
2876 }
2877 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2878 // it's OK, the max_active_levels is within the valid range: [ 0;
2879 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2880 // We allow a zero value. (implementation defined behavior)
2881 } else {
2882 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2883 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2884 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2885 // Current upper limit is MAX_INT. (implementation defined behavior)
2886 // If the input exceeds the upper limit, we correct the input to be the
2887 // upper limit. (implementation defined behavior)
2888 // Actually, the flow should never get here until we use MAX_INT limit.
2889 }
2890 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2891 "max_active_levels for thread %d = (%d)\n",
2892 gtid, max_active_levels));
2893
2894 thread = __kmp_threads[gtid];
2895
2896 __kmp_save_internal_controls(thread);
2897
2898 set__max_active_levels(thread, max_active_levels);
2899}
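/* Illustrative sketch (user code, not part of the runtime):

     omp_set_max_active_levels(2);  // allow two nested active levels
     omp_set_max_active_levels(-1); // negative: ignored with a warning,
                                    // the previous setting stays in effect */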
2900
2901/* Gets max_active_levels */
2902int __kmp_get_max_active_levels(int gtid) {
2903 kmp_info_t *thread;
2904
2905 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2906 KMP_DEBUG_ASSERT(__kmp_init_serial);
2907
2908 thread = __kmp_threads[gtid];
2909 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2910 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2911 "curtask_maxaclevel=%d\n",
2912 gtid, thread->th.th_current_task,
2913 thread->th.th_current_task->td_icvs.max_active_levels));
2914 return thread->th.th_current_task->td_icvs.max_active_levels;
2915}
2916
2917// nteams-var per-device ICV
2918void __kmp_set_num_teams(int num_teams) {
2919 if (num_teams > 0)
2920 __kmp_nteams = num_teams;
2921}
2922int __kmp_get_max_teams(void) { return __kmp_nteams; }
2923// teams-thread-limit-var per-device ICV
2924void __kmp_set_teams_thread_limit(int limit) {
2925 if (limit > 0)
2926 __kmp_teams_thread_limit = limit;
2927}
2928int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2929
2930KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2931KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2932
2933/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2934void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2935 kmp_info_t *thread;
2936 kmp_sched_t orig_kind;
2937 // kmp_team_t *team;
2938
2939 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2940 gtid, (int)kind, chunk));
2941 KMP_DEBUG_ASSERT(__kmp_init_serial);
2942
2943 // Check if the kind parameter is valid, correct if needed.
2944 // Valid parameters should fit in one of two intervals - standard or extended:
2945 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2946 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2947 orig_kind = kind;
2948 kind = __kmp_sched_without_mods(kind);
2949
2950 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2951 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2952 // TODO: Hint needs attention in case we change the default schedule.
2953 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2954 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2955 __kmp_msg_null);
2956 kind = kmp_sched_default;
2957 chunk = 0; // ignore chunk value in case of bad kind
2958 }
2959
2960 thread = __kmp_threads[gtid];
2961
2962 __kmp_save_internal_controls(thread);
2963
2964 if (kind < kmp_sched_upper_std) {
2965 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2966 // differ static chunked vs. unchunked: chunk should be invalid to
2967 // indicate unchunked schedule (which is the default)
2968 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2969 } else {
2970 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2971 __kmp_sch_map[kind - kmp_sched_lower - 1];
2972 }
2973 } else {
2974 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2975 // kmp_sched_lower - 2 ];
2976 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2977 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2978 kmp_sched_lower - 2];
2979 }
2980 __kmp_sched_apply_mods_intkind(
2981      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2982 if (kind == kmp_sched_auto || chunk < 1) {
2983 // ignore parameter chunk for schedule auto
2984 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2985 } else {
2986 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2987 }
2988}
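/* Illustrative sketch (user code, not part of the runtime):

     omp_set_schedule(omp_sched_dynamic, 4); // dynamic, chunk 4
     omp_set_schedule(omp_sched_static, 0);  // chunk < 1: default chunk, i.e.
                                             // unchunked kmp_sch_static

   Schedule modifiers (e.g. monotonic) are stripped for range checking by
   __kmp_sched_without_mods() and re-applied by __kmp_sched_apply_mods_intkind()
   in the routine above. */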
2989
2990/* Gets def_sched_var ICV values */
2991void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2992 kmp_info_t *thread;
2993 enum sched_type th_type;
2994
2995 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2996 KMP_DEBUG_ASSERT(__kmp_init_serial);
2997
2998 thread = __kmp_threads[gtid];
2999
3000 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3001 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3002 case kmp_sch_static:
3003 case kmp_sch_static_greedy:
3004 case kmp_sch_static_balanced:
3005 *kind = kmp_sched_static;
3006    __kmp_sched_apply_mods_stdkind(kind, th_type);
3007 *chunk = 0; // chunk was not set, try to show this fact via zero value
3008 return;
3009 case kmp_sch_static_chunked:
3010 *kind = kmp_sched_static;
3011 break;
3012 case kmp_sch_dynamic_chunked:
3013 *kind = kmp_sched_dynamic;
3014 break;
3015 case kmp_sch_guided_chunked:
3016 case kmp_sch_guided_iterative_chunked:
3017 case kmp_sch_guided_analytical_chunked:
3018 *kind = kmp_sched_guided;
3019 break;
3020 case kmp_sch_auto:
3021 *kind = kmp_sched_auto;
3022 break;
3023 case kmp_sch_trapezoidal:
3024 *kind = kmp_sched_trapezoidal;
3025 break;
3026#if KMP_STATIC_STEAL_ENABLED
3027 case kmp_sch_static_steal:
3028 *kind = kmp_sched_static_steal;
3029 break;
3030#endif
3031 default:
3032 KMP_FATAL(UnknownSchedulingType, th_type);
3033 }
3034
  __kmp_sched_apply_mods_stdkind(kind, th_type);
3036 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3037}
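// Example of the reverse mapping above (derived from the switch): an ICV
// schedule of kmp_sch_guided_iterative_chunked with chunk == 8 is reported as
// (kmp_sched_guided, 8), while the unchunked static variants are reported as
// (kmp_sched_static, 0).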
3038
3039int __kmp_get_ancestor_thread_num(int gtid, int level) {
3040
3041 int ii, dd;
3042 kmp_team_t *team;
3043 kmp_info_t *thr;
3044
3045 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3046 KMP_DEBUG_ASSERT(__kmp_init_serial);
3047
3048 // validate level
3049 if (level == 0)
3050 return 0;
3051 if (level < 0)
3052 return -1;
3053 thr = __kmp_threads[gtid];
3054 team = thr->th.th_team;
3055 ii = team->t.t_level;
3056 if (level > ii)
3057 return -1;
3058
3059 if (thr->th.th_teams_microtask) {
3060 // AC: we are in teams region where multiple nested teams have same level
3061 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3062 if (level <=
3063 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3064 KMP_DEBUG_ASSERT(ii >= tlevel);
3065 // AC: As we need to pass by the teams league, we need to artificially
3066 // increase ii
3067 if (ii == tlevel) {
3068 ii += 2; // three teams have same level
3069 } else {
3070 ii++; // two teams have same level
3071 }
3072 }
3073 }
3074
3075 if (ii == level)
3076 return __kmp_tid_from_gtid(gtid);
3077
3078 dd = team->t.t_serialized;
3079 level++;
3080 while (ii > level) {
3081 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3082 }
3083 if ((team->t.t_serialized) && (!dd)) {
3084 team = team->t.t_parent;
3085 continue;
3086 }
3087 if (ii > level) {
3088 team = team->t.t_parent;
3089 dd = team->t.t_serialized;
3090 ii--;
3091 }
3092 }
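  // The loop above walks outward through the team hierarchy: serialized
  // nestings inside one team are consumed via t_serialized (dd) before moving
  // to t_parent, so "level" counts inactive (serialized) parallel regions as
  // well as active ones.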
3093
3094 return (dd > 1) ? (0) : (team->t.t_master_tid);
3095}
3096
3097int __kmp_get_team_size(int gtid, int level) {
3098
3099 int ii, dd;
3100 kmp_team_t *team;
3101 kmp_info_t *thr;
3102
3103 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3104 KMP_DEBUG_ASSERT(__kmp_init_serial);
3105
3106 // validate level
3107 if (level == 0)
3108 return 1;
3109 if (level < 0)
3110 return -1;
3111 thr = __kmp_threads[gtid];
3112 team = thr->th.th_team;
3113 ii = team->t.t_level;
3114 if (level > ii)
3115 return -1;
3116
3117 if (thr->th.th_teams_microtask) {
3118 // AC: we are in teams region where multiple nested teams have same level
3119 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3120 if (level <=
3121 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3122 KMP_DEBUG_ASSERT(ii >= tlevel);
3123 // AC: As we need to pass by the teams league, we need to artificially
3124 // increase ii
3125 if (ii == tlevel) {
3126 ii += 2; // three teams have same level
3127 } else {
3128 ii++; // two teams have same level
3129 }
3130 }
3131 }
3132
3133 while (ii > level) {
3134 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3135 }
3136 if (team->t.t_serialized && (!dd)) {
3137 team = team->t.t_parent;
3138 continue;
3139 }
3140 if (ii > level) {
3141 team = team->t.t_parent;
3142 ii--;
3143 }
3144 }
3145
3146 return team->t.t_nproc;
3147}
3148
3149kmp_r_sched_t __kmp_get_schedule_global() {
  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
  // independently, so the updated schedule can be obtained here.
3153
3154 kmp_r_sched_t r_sched;
3155
3156 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3157 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3158 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3159 // different roots (even in OMP 2.5)
3160 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3161 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3162 if (s == kmp_sch_static) {
3163 // replace STATIC with more detailed schedule (balanced or greedy)
3164 r_sched.r_sched_type = __kmp_static;
3165 } else if (s == kmp_sch_guided_chunked) {
3166 // replace GUIDED with more detailed schedule (iterative or analytical)
3167 r_sched.r_sched_type = __kmp_guided;
3168 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3169 r_sched.r_sched_type = __kmp_sched;
3170 }
3171 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3172
3173 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3174 // __kmp_chunk may be wrong here (if it was not ever set)
3175 r_sched.chunk = KMP_DEFAULT_CHUNK;
3176 } else {
3177 r_sched.chunk = __kmp_chunk;
3178 }
3179
3180 return r_sched;
3181}
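// Worked example (derived from the logic above): if __kmp_sched is
// kmp_sch_static and __kmp_static is kmp_sch_static_greedy, the returned
// r_sched_type is kmp_sch_static_greedy; the chunk is __kmp_chunk when it was
// explicitly set and KMP_DEFAULT_CHUNK otherwise.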
3182
/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
   at least argc number of *t_argv entries for the requested team. */
3185static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3186
3187 KMP_DEBUG_ASSERT(team);
3188 if (!realloc || argc > team->t.t_max_argc) {
3189
3190 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3191 "current entries=%d\n",
3192 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3193 /* if previously allocated heap space for args, free them */
3194 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3195 __kmp_free((void *)team->t.t_argv);
3196
3197 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3198 /* use unused space in the cache line for arguments */
3199 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3200 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3201 "argv entries\n",
3202 team->t.t_id, team->t.t_max_argc));
3203 team->t.t_argv = &team->t.t_inline_argv[0];
3204 if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            -1, &team->t.t_inline_argv[0],
            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
            team->t.t_id);
3210 }
3211 } else {
3212 /* allocate space for arguments in the heap */
3213 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3214 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3215 : 2 * argc;
3216 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3217 "argv entries\n",
3218 team->t.t_id, team->t.t_max_argc));
3219 team->t.t_argv =
3220 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3221 if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
                                     &team->t.t_argv[team->t.t_max_argc],
                                     sizeof(void *) * team->t.t_max_argc,
                                     "team_%d.t_argv", team->t.t_id);
3226 }
3227 }
3228 }
3229}
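// Sizing example (derived from the code above): when argc exceeds
// KMP_INLINE_ARGV_ENTRIES, a request for argc == 100 gets a heap array of
// max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * 100) entries, so later requests up to
// that size reuse the same allocation without reallocating.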
3230
3231static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3232 int i;
3233 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3234 team->t.t_threads =
3235 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3236 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3237 sizeof(dispatch_shared_info_t) * num_disp_buff);
3238 team->t.t_dispatch =
3239 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3240 team->t.t_implicit_task_taskdata =
3241 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3242 team->t.t_max_nproc = max_nth;
3243
3244 /* setup dispatch buffers */
3245 for (i = 0; i < num_disp_buff; ++i) {
3246 team->t.t_disp_buffer[i].buffer_index = i;
3247 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3248 }
3249}
3250
3251static void __kmp_free_team_arrays(kmp_team_t *team) {
3252 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3253 int i;
3254 for (i = 0; i < team->t.t_max_nproc; ++i) {
3255 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3256 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3257 team->t.t_dispatch[i].th_disp_buffer = NULL;
3258 }
3259 }
3260#if KMP_USE_HIER_SCHED
3261 __kmp_dispatch_free_hierarchies(team);
3262#endif
3263 __kmp_free(team->t.t_threads);
3264 __kmp_free(team->t.t_disp_buffer);
3265 __kmp_free(team->t.t_dispatch);
3266 __kmp_free(team->t.t_implicit_task_taskdata);
3267 team->t.t_threads = NULL;
3268 team->t.t_disp_buffer = NULL;
3269 team->t.t_dispatch = NULL;
3270 team->t.t_implicit_task_taskdata = 0;
3271}
3272
3273static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3274 kmp_info_t **oldThreads = team->t.t_threads;
3275
3276 __kmp_free(team->t.t_disp_buffer);
3277 __kmp_free(team->t.t_dispatch);
3278 __kmp_free(team->t.t_implicit_task_taskdata);
3279 __kmp_allocate_team_arrays(team, max_nth);
3280
  KMP_MEMCPY(team->t.t_threads, oldThreads,
             team->t.t_nproc * sizeof(kmp_info_t *));
3283
3284 __kmp_free(oldThreads);
3285}
3286
3287static kmp_internal_control_t __kmp_get_global_icvs(void) {
3288
3289 kmp_r_sched_t r_sched =
3290 __kmp_get_schedule_global(); // get current state of scheduling globals
3291
3292 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3293
  kmp_internal_control_t g_icvs = {
      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
      // adjustment of threads (per thread)
      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
      // whether blocktime is explicitly set
      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
      // next parallel region (per thread)
      // (use a max ub on value if __kmp_parallel_initialize not called yet)
      __kmp_cg_max_nth, // int thread_limit;
      __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
      // on task. This is used in the case of target thread_limit
      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
      // for max_active_levels
      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
      // {sched,chunk} pair
      __kmp_nested_proc_bind.bind_types[0],
      __kmp_default_device,
      NULL // struct kmp_internal_control *next;
  };
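  // NOTE: the initializers above are positional (aggregate initialization) and
  // must stay in the same order as the fields of kmp_internal_control_t; the
  // trailing comments name the field each value corresponds to.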
3319
3320 return g_icvs;
3321}
3322
3323static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3324
3325 kmp_internal_control_t gx_icvs;
3326 gx_icvs.serial_nesting_level =
3327 0; // probably =team->t.t_serial like in save_inter_controls
  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3329 gx_icvs.next = NULL;
3330
3331 return gx_icvs;
3332}
3333
3334static void __kmp_initialize_root(kmp_root_t *root) {
3335 int f;
3336 kmp_team_t *root_team;
3337 kmp_team_t *hot_team;
3338 int hot_team_max_nth;
3339 kmp_r_sched_t r_sched =
3340 __kmp_get_schedule_global(); // get current state of scheduling globals
3341 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3342 KMP_DEBUG_ASSERT(root);
3343 KMP_ASSERT(!root->r.r_begin);
3344
3345 /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
3347 root->r.r_begin = FALSE;
3348 root->r.r_active = FALSE;
3349 root->r.r_in_parallel = 0;
3350 root->r.r_blocktime = __kmp_dflt_blocktime;
3351#if KMP_AFFINITY_SUPPORTED
3352 root->r.r_affinity_assigned = FALSE;
3353#endif
3354
3355 /* setup the root team for this task */
3356 /* allocate the root team structure */
3357 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3358
  root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
3370#if USE_DEBUGGER
3371 // Non-NULL value should be assigned to make the debugger display the root
3372 // team.
3373 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3374#endif
3375
3376 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3377
3378 root->r.r_root_team = root_team;
3379 root_team->t.t_control_stack_top = NULL;
3380
3381 /* initialize root team */
3382 root_team->t.t_threads[0] = NULL;
3383 root_team->t.t_nproc = 1;
3384 root_team->t.t_serialized = 1;
3385 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3386 root_team->t.t_sched.sched = r_sched.sched;
3387 KA_TRACE(
3388 20,
3389 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3390 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3391
3392 /* setup the hot team for this task */
3393 /* allocate the hot team structure */
3394 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3395
  hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
      );
3407 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3408
3409 root->r.r_hot_team = hot_team;
3410 root_team->t.t_control_stack_top = NULL;
3411
3412 /* first-time initialization */
3413 hot_team->t.t_parent = root_team;
3414
3415 /* initialize hot team */
3416 hot_team_max_nth = hot_team->t.t_max_nproc;
3417 for (f = 0; f < hot_team_max_nth; ++f) {
3418 hot_team->t.t_threads[f] = NULL;
3419 }
3420 hot_team->t.t_nproc = 1;
3421 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3422 hot_team->t.t_sched.sched = r_sched.sched;
3423 hot_team->t.t_size_changed = 0;
3424}
3425
3426#ifdef KMP_DEBUG
3427
3428typedef struct kmp_team_list_item {
3429 kmp_team_p const *entry;
3430 struct kmp_team_list_item *next;
3431} kmp_team_list_item_t;
3432typedef kmp_team_list_item_t *kmp_team_list_t;
3433
3434static void __kmp_print_structure_team_accum( // Add team to list of teams.
3435 kmp_team_list_t list, // List of teams.
3436 kmp_team_p const *team // Team to add.
3437) {
3438
3439 // List must terminate with item where both entry and next are NULL.
3440 // Team is added to the list only once.
3441 // List is sorted in ascending order by team id.
3442 // Team id is *not* a key.
3443
3444 kmp_team_list_t l;
3445
3446 KMP_DEBUG_ASSERT(list != NULL);
3447 if (team == NULL) {
3448 return;
3449 }
3450
  __kmp_print_structure_team_accum(list, team->t.t_parent);
  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3453
3454 // Search list for the team.
3455 l = list;
3456 while (l->next != NULL && l->entry != team) {
3457 l = l->next;
3458 }
3459 if (l->next != NULL) {
3460 return; // Team has been added before, exit.
3461 }
3462
3463 // Team is not found. Search list again for insertion point.
3464 l = list;
3465 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3466 l = l->next;
3467 }
3468
3469 // Insert team.
3470 {
3471 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3472 sizeof(kmp_team_list_item_t));
3473 *item = *l;
3474 l->entry = team;
3475 l->next = item;
3476 }
3477}
3478
3479static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3480
3481) {
  __kmp_printf("%s", title);
  if (team != NULL) {
    __kmp_printf("%2x %p\n", team->t.t_id, team);
  } else {
    __kmp_printf(" - (nil)\n");
  }
3488}
3489
3490static void __kmp_print_structure_thread(char const *title,
3491 kmp_info_p const *thread) {
  __kmp_printf("%s", title);
  if (thread != NULL) {
    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
  } else {
    __kmp_printf(" - (nil)\n");
  }
3498}
3499
3500void __kmp_print_structure(void) {
3501
3502 kmp_team_list_t list;
3503
3504 // Initialize list of teams.
3505 list =
3506 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3507 list->entry = NULL;
3508 list->next = NULL;
3509
  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
    }
  }
3525
3526 // Print out __kmp_threads array.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf(" Our Root: %p\n", thread->th.th_root);
        __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
        __kmp_print_structure_team(" Serial Team: ",
                                   thread->th.th_serial_team);
        __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread(" Primary: ",
                                     thread->th.th_team_master);
        __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
        __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
        __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread(" Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }
3555
3556 // Print out __kmp_root array.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
        __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
        __kmp_print_structure_thread(" Uber Thread: ",
                                     root->r.r_uber_thread);
        __kmp_printf(" Active?: %2d\n", root->r.r_active);
        __kmp_printf(" In Parallel: %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }
3580
  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
    __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
    __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
    __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf(" Thread %2d: ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }
3600
3601 // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool: ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool: ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");
3609
3610 // Free team list.
3611 while (list != NULL) {
3612 kmp_team_list_item_t *item = list;
3613 list = list->next;
3614 KMP_INTERNAL_FREE(item);
3615 }
3616}
3617
3618#endif
3619
3620//---------------------------------------------------------------------------
3621// Stuff for per-thread fast random number generator
3622// Table of primes
3623static const unsigned __kmp_primes[] = {
3624 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3625 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3626 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3627 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3628 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3629 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3630 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3631 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3632 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3633 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3634 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3635
3636//---------------------------------------------------------------------------
3637// __kmp_get_random: Get a random number using a linear congruential method.
3638unsigned short __kmp_get_random(kmp_info_t *thread) {
3639 unsigned x = thread->th.th_x;
3640 unsigned short r = (unsigned short)(x >> 16);
3641
3642 thread->th.th_x = x * thread->th.th_a + 1;
3643
3644 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3645 thread->th.th_info.ds.ds_tid, r));
3646
3647 return r;
3648}
3649//--------------------------------------------------------
3650// __kmp_init_random: Initialize a random number generator
3651void __kmp_init_random(kmp_info_t *thread) {
3652 unsigned seed = thread->th.th_info.ds.ds_tid;
3653
3654 thread->th.th_a =
3655 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3656 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3657 KA_TRACE(30,
3658 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3659}
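// Together these two routines implement a per-thread linear congruential
// generator: th_x advances as x_{n+1} = a * x_n + 1 (mod 2^32) with the
// multiplier a taken from __kmp_primes[] keyed by the thread id, and
// __kmp_get_random() returns the high 16 bits of the previous state, which are
// better distributed than the low-order bits of an LCG.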
3660
3661#if KMP_OS_WINDOWS
3662/* reclaim array entries for root threads that are already dead, returns number
3663 * reclaimed */
3664static int __kmp_reclaim_dead_roots(void) {
3665 int i, r = 0;
3666
3667 for (i = 0; i < __kmp_threads_capacity; ++i) {
3668 if (KMP_UBER_GTID(i) &&
3669 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3670 !__kmp_root[i]
3671 ->r.r_active) { // AC: reclaim only roots died in non-active state
3672 r += __kmp_unregister_root_other_thread(i);
3673 }
3674 }
3675 return r;
3676}
3677#endif
3678
3679/* This function attempts to create free entries in __kmp_threads and
3680 __kmp_root, and returns the number of free entries generated.
3681
3682 For Windows* OS static library, the first mechanism used is to reclaim array
3683 entries for root threads that are already dead.
3684
   On all platforms, expansion is attempted on the arrays __kmp_threads and
3686 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3687 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3688 threadprivate cache array has been created. Synchronization with
3689 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3690
3691 After any dead root reclamation, if the clipping value allows array expansion
3692 to result in the generation of a total of nNeed free slots, the function does
3693 that expansion. If not, nothing is done beyond the possible initial root
3694 thread reclamation.
3695
3696 If any argument is negative, the behavior is undefined. */
3697static int __kmp_expand_threads(int nNeed) {
3698 int added = 0;
3699 int minimumRequiredCapacity;
3700 int newCapacity;
3701 kmp_info_t **newThreads;
3702 kmp_root_t **newRoot;
3703
3704 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3705 // resizing __kmp_threads does not need additional protection if foreign
3706 // threads are present
3707
3708#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3709 /* only for Windows static library */
3710 /* reclaim array entries for root threads that are already dead */
3711 added = __kmp_reclaim_dead_roots();
3712
3713 if (nNeed) {
3714 nNeed -= added;
3715 if (nNeed < 0)
3716 nNeed = 0;
3717 }
3718#endif
3719 if (nNeed <= 0)
3720 return added;
3721
3722 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3723 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3724 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3725 // > __kmp_max_nth in one of two ways:
3726 //
3727 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3728 // may not be reused by another thread, so we may need to increase
3729 // __kmp_threads_capacity to __kmp_max_nth + 1.
3730 //
3731 // 2) New foreign root(s) are encountered. We always register new foreign
3732 // roots. This may cause a smaller # of threads to be allocated at
3733 // subsequent parallel regions, but the worker threads hang around (and
3734 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3735 //
3736 // Anyway, that is the reason for moving the check to see if
3737 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3738 // instead of having it performed here. -BB
3739
3740 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3741
3742 /* compute expansion headroom to check if we can expand */
3743 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3744 /* possible expansion too small -- give up */
3745 return added;
3746 }
3747 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3748
3749 newCapacity = __kmp_threads_capacity;
3750 do {
3751 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3752 : __kmp_sys_max_nth;
3753 } while (newCapacity < minimumRequiredCapacity);
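  // Example: with __kmp_threads_capacity == 64 and nNeed == 100, the minimum
  // required capacity is 164, so the loop doubles 64 -> 128 -> 256 (clipping at
  // __kmp_sys_max_nth) and newCapacity becomes 256.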
3754 newThreads = (kmp_info_t **)__kmp_allocate(
3755 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3756 newRoot =
3757 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));
3762 // Put old __kmp_threads array on a list. Any ongoing references to the old
3763 // list will be valid. This list is cleaned up at library shutdown.
3764 kmp_old_threads_list_t *node =
3765 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3766 node->threads = __kmp_threads;
3767 node->next = __kmp_old_threads_list;
3768 __kmp_old_threads_list = node;
3769
3770 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3771 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3772 added += newCapacity - __kmp_threads_capacity;
3773 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3774
3775 if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3777 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3778 __kmp_threadprivate_resize_cache(newCapacity);
3779 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3780 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3781 }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3783 }
3784
3785 return added;
3786}
3787
3788/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE only if we are the
3790 thread that calls from __kmp_do_serial_initialize() */
3791int __kmp_register_root(int initial_thread) {
3792 kmp_info_t *root_thread;
3793 kmp_root_t *root;
3794 int gtid;
3795 int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3797 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3798 KMP_MB();
3799
3800 /* 2007-03-02:
3801 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3802 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3803 work as expected -- it may return false (that means there is at least one
3804 empty slot in __kmp_threads array), but it is possible the only free slot
3805 is #0, which is reserved for initial thread and so cannot be used for this
     one. The following code works around this bug.
3807
     However, the right solution seems to be not reserving slot #0 for the
     initial thread because:
3810 (1) there is no magic in slot #0,
3811 (2) we cannot detect initial thread reliably (the first thread which does
     serial initialization may not be a real initial thread).
3813 */
3814 capacity = __kmp_threads_capacity;
3815 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3816 --capacity;
3817 }
3818
3819 // If it is not for initializing the hidden helper team, we need to take
3820 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3821 // in __kmp_threads_capacity.
3822 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3823 capacity -= __kmp_hidden_helper_threads_num;
3824 }
3825
3826 /* see if there are too many threads */
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3828 if (__kmp_tp_cached) {
3829 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3830 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3831 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3832 } else {
3833 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3834 __kmp_msg_null);
3835 }
3836 }
3837
3838 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3839 // 0: initial thread, also a regular OpenMP thread.
3840 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3841 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3842 // regular OpenMP threads.
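  // For example, with __kmp_hidden_helper_threads_num == 8, gtids 1..8 are
  // reserved for hidden helper threads, slot 0 stays with the initial thread,
  // and any other root registered here receives gtid 9 or higher.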
3843 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3844 // Find an available thread slot for hidden helper thread. Slots for hidden
3845 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3846 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3847 gtid <= __kmp_hidden_helper_threads_num;
3848 gtid++)
3849 ;
3850 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3851 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3852 "hidden helper thread: T#%d\n",
3853 gtid));
3854 } else {
3855 /* find an available thread slot */
3856 // Don't reassign the zero slot since we need that to only be used by
3857 // initial thread. Slots for hidden helper threads should also be skipped.
3858 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3859 gtid = 0;
3860 } else {
3861 for (gtid = __kmp_hidden_helper_threads_num + 1;
3862 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3863 ;
3864 }
3865 KA_TRACE(
3866 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3867 KMP_ASSERT(gtid < __kmp_threads_capacity);
3868 }
3869
3870 /* update global accounting */
3871 __kmp_all_nth++;
3872 TCW_4(__kmp_nth, __kmp_nth + 1);
3873
3874 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3875 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3876 if (__kmp_adjust_gtid_mode) {
3877 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3878 if (TCR_4(__kmp_gtid_mode) != 2) {
3879 TCW_4(__kmp_gtid_mode, 2);
3880 }
3881 } else {
3882 if (TCR_4(__kmp_gtid_mode) != 1) {
3883 TCW_4(__kmp_gtid_mode, 1);
3884 }
3885 }
3886 }
3887
3888#ifdef KMP_ADJUST_BLOCKTIME
3889 /* Adjust blocktime to zero if necessary */
3890 /* Middle initialization might not have occurred yet */
3891 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3892 if (__kmp_nth > __kmp_avail_proc) {
3893 __kmp_zero_bt = TRUE;
3894 }
3895 }
3896#endif /* KMP_ADJUST_BLOCKTIME */
3897
3898 /* setup this new hierarchy */
3899 if (!(root = __kmp_root[gtid])) {
3900 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3901 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3902 }
3903
3904#if KMP_STATS_ENABLED
3905 // Initialize stats as soon as possible (right after gtid assignment).
3906 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3907 __kmp_stats_thread_ptr->startLife();
3908 KMP_SET_THREAD_STATE(SERIAL_REGION);
3909 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3910#endif
3911 __kmp_initialize_root(root);
3912
3913 /* setup new root thread structure */
3914 if (root->r.r_uber_thread) {
3915 root_thread = root->r.r_uber_thread;
3916 } else {
3917 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3918 if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(root_thread, gtid);
3920 }
3921 root_thread->th.th_info.ds.ds_gtid = gtid;
3922#if OMPT_SUPPORT
3923 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3924#endif
3925 root_thread->th.th_root = root;
3926 if (__kmp_env_consistency_check) {
3927 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3928 }
3929#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(root_thread);
3931#endif /* USE_FAST_MEMORY */
3932
3933#if KMP_USE_BGET
3934 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(root_thread);
3936#endif
    __kmp_init_random(root_thread); // Initialize random number generator
3938 }
3939
3940 /* setup the serial team held in reserve by the root thread */
3941 if (!root_thread->th.th_serial_team) {
3942 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3943 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team = __kmp_allocate_team(
        root, 1, 1,
#if OMPT_SUPPORT
        ompt_data_none, // root parallel id
#endif
        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3950 }
3951 KMP_ASSERT(root_thread->th.th_serial_team);
3952 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3953 root_thread->th.th_serial_team));
3954
3955 /* drop root_thread into place */
3956 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3957
3958 root->r.r_root_team->t.t_threads[0] = root_thread;
3959 root->r.r_hot_team->t.t_threads[0] = root_thread;
3960 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3961 // AC: the team created in reserve, not for execution (it is unused for now).
3962 root_thread->th.th_serial_team->t.t_serialized = 0;
3963 root->r.r_uber_thread = root_thread;
3964
3965 /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3967 TCW_4(__kmp_init_gtid, TRUE);
3968
3969 /* prepare the primary thread for get_gtid() */
3970 __kmp_gtid_set_specific(gtid);
3971
3972#if USE_ITT_BUILD
3973 __kmp_itt_thread_name(gtid);
3974#endif /* USE_ITT_BUILD */
3975
3976#ifdef KMP_TDATA_GTID
3977 __kmp_gtid = gtid;
3978#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3980 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3981
3982 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3983 "plain=%u\n",
3984 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3985 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3986 KMP_INIT_BARRIER_STATE));
3987 { // Initialize barrier data.
3988 int b;
3989 for (b = 0; b < bs_last_barrier; ++b) {
3990 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3991#if USE_DEBUGGER
3992 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3993#endif
3994 }
3995 }
3996 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3997 KMP_INIT_BARRIER_STATE);
3998
3999#if KMP_AFFINITY_SUPPORTED
4000 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4001 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4002 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4003 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4004#endif /* KMP_AFFINITY_SUPPORTED */
4005 root_thread->th.th_def_allocator = __kmp_def_allocator;
4006 root_thread->th.th_prev_level = 0;
4007 root_thread->th.th_prev_num_threads = 1;
4008
4009 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4010 tmp->cg_root = root_thread;
4011 tmp->cg_thread_limit = __kmp_cg_max_nth;
4012 tmp->cg_nthreads = 1;
4013 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4014 " cg_nthreads init to 1\n",
4015 root_thread, tmp));
4016 tmp->up = NULL;
4017 root_thread->th.th_cg_roots = tmp;
4018
4019 __kmp_root_counter++;
4020
4021#if OMPT_SUPPORT
4022 if (!initial_thread && ompt_enabled.enabled) {
4023
4024 kmp_info_t *root_thread = ompt_get_thread();
4025
    ompt_set_thread_state(root_thread, ompt_state_overhead);
4027
4028 if (ompt_enabled.ompt_callback_thread_begin) {
4029 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4030 ompt_thread_initial, __ompt_get_thread_data_internal());
4031 }
4032 ompt_data_t *task_data;
4033 ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
4036 if (ompt_enabled.ompt_callback_implicit_task) {
4037 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4038 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4039 }
4040
    ompt_set_thread_state(root_thread, ompt_state_work_serial);
4042 }
4043#endif
4044#if OMPD_SUPPORT
4045 if (ompd_state & OMPD_ENABLE_BP)
4046 ompd_bp_thread_begin();
4047#endif
4048
4049 KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4051
4052 return gtid;
4053}
4054
4055#if KMP_NESTED_HOT_TEAMS
4056static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4057 const int max_level) {
4058 int i, n, nth;
4059 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4060 if (!hot_teams || !hot_teams[level].hot_team) {
4061 return 0;
4062 }
4063 KMP_DEBUG_ASSERT(level < max_level);
4064 kmp_team_t *team = hot_teams[level].hot_team;
4065 nth = hot_teams[level].hot_team_nth;
4066 n = nth - 1; // primary thread is not freed
4067 if (level < max_level - 1) {
4068 for (i = 0; i < nth; ++i) {
4069 kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4071 if (i > 0 && th->th.th_hot_teams) {
4072 __kmp_free(th->th.th_hot_teams);
4073 th->th.th_hot_teams = NULL;
4074 }
4075 }
4076 }
4077 __kmp_free_team(root, team, NULL);
4078 return n;
4079}
4080#endif
4081
// Resets a root thread and clears its root and hot teams.
4083// Returns the number of __kmp_threads entries directly and indirectly freed.
4084static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4085 kmp_team_t *root_team = root->r.r_root_team;
4086 kmp_team_t *hot_team = root->r.r_hot_team;
4087 int n = hot_team->t.t_nproc;
4088 int i;
4089
4090 KMP_DEBUG_ASSERT(!root->r.r_active);
4091
4092 root->r.r_root_team = NULL;
4093 root->r.r_hot_team = NULL;
4094 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4095 // before call to __kmp_free_team().
4096 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4097#if KMP_NESTED_HOT_TEAMS
4098 if (__kmp_hot_teams_max_level >
4099 0) { // need to free nested hot teams and their threads if any
4100 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4101 kmp_info_t *th = hot_team->t.t_threads[i];
4102 if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4104 }
4105 if (th->th.th_hot_teams) {
4106 __kmp_free(th->th.th_hot_teams);
4107 th->th.th_hot_teams = NULL;
4108 }
4109 }
4110 }
4111#endif
4112 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4113
4114 // Before we can reap the thread, we need to make certain that all other
4115 // threads in the teams that had this root as ancestor have stopped trying to
4116 // steal tasks.
4117 if (__kmp_tasking_mode != tskm_immediate_exec) {
4118 __kmp_wait_to_unref_task_teams();
4119 }
4120
4121#if KMP_OS_WINDOWS
4122 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4123 KA_TRACE(
4124 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4125 "\n",
4126 (LPVOID) & (root->r.r_uber_thread->th),
4127 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4128 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4129#endif /* KMP_OS_WINDOWS */
4130
4131#if OMPD_SUPPORT
4132 if (ompd_state & OMPD_ENABLE_BP)
4133 ompd_bp_thread_end();
4134#endif
4135
4136#if OMPT_SUPPORT
4137 ompt_data_t *task_data;
4138 ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
4141 if (ompt_enabled.ompt_callback_implicit_task) {
4142 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4143 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4144 }
4145 if (ompt_enabled.ompt_callback_thread_end) {
4146 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4147 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4148 }
4149#endif
4150
4151 TCW_4(__kmp_nth,
4152 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4153 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4154 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4155 " to %d\n",
4156 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4157 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4158 if (i == 1) {
4159 // need to free contention group structure
4160 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4161 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4162 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4163 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4164 root->r.r_uber_thread->th.th_cg_roots = NULL;
4165 }
  __kmp_reap_thread(root->r.r_uber_thread, 1);
4167
  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing it.
4170 root->r.r_uber_thread = NULL;
4171 /* mark root as no longer in use */
4172 root->r.r_begin = FALSE;
4173
4174 return n;
4175}
4176
4177void __kmp_unregister_root_current_thread(int gtid) {
4178 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4179 /* this lock should be ok, since unregister_root_current_thread is never
4180 called during an abort, only during a normal close. furthermore, if you
4181 have the forkjoin lock, you should never try to get the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4183 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4184 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4185 "exiting T#%d\n",
4186 gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4188 return;
4189 }
4190 kmp_root_t *root = __kmp_root[gtid];
4191
4192 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4193 KMP_ASSERT(KMP_UBER_GTID(gtid));
4194 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4195 KMP_ASSERT(root->r.r_active == FALSE);
4196
4197 KMP_MB();
4198
4199 kmp_info_t *thread = __kmp_threads[gtid];
4200 kmp_team_t *team = thread->th.th_team;
4201 kmp_task_team_t *task_team = thread->th.th_task_team;
4202
4203 // we need to wait for the proxy tasks before finishing the thread
4204 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4205 task_team->tt.tt_hidden_helper_task_encountered)) {
4206#if OMPT_SUPPORT
4207 // the runtime is shutting down so we won't report any events
4208 thread->th.ompt_thread_info.state = ompt_state_undefined;
4209#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4211 }
4212
4213 __kmp_reset_root(gtid, root);
4214
4215 KMP_MB();
4216 KC_TRACE(10,
4217 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4218
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4220}
4221
4222#if KMP_OS_WINDOWS
4223/* __kmp_forkjoin_lock must be already held
4224 Unregisters a root thread that is not the current thread. Returns the number
4225 of __kmp_threads entries freed as a result. */
4226static int __kmp_unregister_root_other_thread(int gtid) {
4227 kmp_root_t *root = __kmp_root[gtid];
4228 int r;
4229
4230 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4231 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4232 KMP_ASSERT(KMP_UBER_GTID(gtid));
4233 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4234 KMP_ASSERT(root->r.r_active == FALSE);
4235
4236 r = __kmp_reset_root(gtid, root);
4237 KC_TRACE(10,
4238 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4239 return r;
4240}
4241#endif
4242
4243#if KMP_DEBUG
4244void __kmp_task_info() {
4245
4246 kmp_int32 gtid = __kmp_entry_gtid();
4247 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4248 kmp_info_t *this_thr = __kmp_threads[gtid];
4249 kmp_team_t *steam = this_thr->th.th_serial_team;
4250 kmp_team_t *team = this_thr->th.th_team;
4251
4252 __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4254 "ptask=%p\n",
4255 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4256 team->t.t_implicit_task_taskdata[tid].td_parent);
4257}
4258#endif // KMP_DEBUG
4259
4260/* TODO optimize with one big memclr, take out what isn't needed, split
4261 responsibility to workers as much as possible, and delay initialization of
4262 features as much as possible */
4263static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4264 int tid, int gtid) {
4265 /* this_thr->th.th_info.ds.ds_gtid is setup in
4266 kmp_allocate_thread/create_worker.
4267 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4268 KMP_DEBUG_ASSERT(this_thr != NULL);
4269 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4270 KMP_DEBUG_ASSERT(team);
4271 KMP_DEBUG_ASSERT(team->t.t_threads);
4272 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4273 kmp_info_t *master = team->t.t_threads[0];
4274 KMP_DEBUG_ASSERT(master);
4275 KMP_DEBUG_ASSERT(master->th.th_root);
4276
4277 KMP_MB();
4278
4279 TCW_SYNC_PTR(this_thr->th.th_team, team);
4280
4281 this_thr->th.th_info.ds.ds_tid = tid;
4282 this_thr->th.th_set_nproc = 0;
4283 if (__kmp_tasking_mode != tskm_immediate_exec)
4284 // When tasking is possible, threads are not safe to reap until they are
4285 // done tasking; this will be set when tasking code is exited in wait
4286 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4287 else // no tasking --> always safe to reap
4288 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4289 this_thr->th.th_set_proc_bind = proc_bind_default;
4290#if KMP_AFFINITY_SUPPORTED
4291 this_thr->th.th_new_place = this_thr->th.th_current_place;
4292#endif
4293 this_thr->th.th_root = master->th.th_root;
4294
4295 /* setup the thread's cache of the team structure */
4296 this_thr->th.th_team_nproc = team->t.t_nproc;
4297 this_thr->th.th_team_master = master;
4298 this_thr->th.th_team_serialized = team->t.t_serialized;
4299
4300 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4301
4302 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4303 tid, gtid, this_thr, this_thr->th.th_current_task));
4304
  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);
4307
4308 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4309 tid, gtid, this_thr, this_thr->th.th_current_task));
4310 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4311 // __kmp_initialize_team()?
4312
4313 /* TODO no worksharing in speculative threads */
4314 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4315
4316 this_thr->th.th_local.this_construct = 0;
4317
4318 if (!this_thr->th.th_pri_common) {
4319 this_thr->th.th_pri_common =
4320 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4321 if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4325 }
4326 this_thr->th.th_pri_head = NULL;
4327 }
4328
4329 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4330 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4331 // Make new thread's CG root same as primary thread's
4332 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4333 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4334 if (tmp) {
4335 // worker changes CG, need to check if old CG should be freed
4336 int i = tmp->cg_nthreads--;
4337 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4338 " on node %p of thread %p to %d\n",
4339 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4340 if (i == 1) {
4341 __kmp_free(tmp); // last thread left CG --> free it
4342 }
4343 }
4344 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4345 // Increment new thread's CG root's counter to add the new thread
4346 this_thr->th.th_cg_roots->cg_nthreads++;
4347 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4348 " node %p of thread %p to %d\n",
4349 this_thr, this_thr->th.th_cg_roots,
4350 this_thr->th.th_cg_roots->cg_root,
4351 this_thr->th.th_cg_roots->cg_nthreads));
4352 this_thr->th.th_current_task->td_icvs.thread_limit =
4353 this_thr->th.th_cg_roots->cg_thread_limit;
4354 }
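  // Contention-group bookkeeping example (derived from the code above): a
  // pooled worker joining a new team leaves its old CG (freeing the node if it
  // held the last reference), adopts the primary thread's CG root, and
  // inherits that CG's cg_thread_limit as its thread_limit ICV.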
4355
4356 /* Initialize dynamic dispatch */
4357 {
4358 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4359 // Use team max_nproc since this will never change for the team.
4360 size_t disp_size =
4361 sizeof(dispatch_private_info_t) *
4362 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4363 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4364 team->t.t_max_nproc));
4365 KMP_ASSERT(dispatch);
4366 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4367 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4368
4369 dispatch->th_disp_index = 0;
4370 dispatch->th_doacross_buf_idx = 0;
4371 if (!dispatch->th_disp_buffer) {
4372 dispatch->th_disp_buffer =
4373 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4374
4375 if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size,
            "th_%d.th_dispatch.th_disp_buffer "
            "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
4385 }
4386 } else {
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4388 }
4389
4390 dispatch->th_dispatch_pr_current = 0;
4391 dispatch->th_dispatch_sh_current = 0;
4392
4393 dispatch->th_deo_fcn = 0; /* ORDERED */
4394 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4395 }
4396
4397 this_thr->th.th_next_pool = NULL;
4398
4399 if (!this_thr->th.th_task_state_memo_stack) {
4400 size_t i;
4401 this_thr->th.th_task_state_memo_stack =
4402 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4403 this_thr->th.th_task_state_top = 0;
4404 this_thr->th.th_task_state_stack_sz = 4;
4405 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4406 ++i) // zero init the stack
4407 this_thr->th.th_task_state_memo_stack[i] = 0;
4408 }
4409
4410 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4411 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4412
4413 KMP_MB();
4414}
4415
4416/* allocate a new thread for the requesting team. this is only called from
4417 within a forkjoin critical section. we will first try to get an available
4418 thread from the thread pool. if none is available, we will fork a new one
4419 assuming we are able to create a new one. this should be assured, as the
4420 caller should check on this first. */
4421kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4422 int new_tid) {
4423 kmp_team_t *serial_team;
4424 kmp_info_t *new_thr;
4425 int new_gtid;
4426
4427 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4428 KMP_DEBUG_ASSERT(root && team);
4429#if !KMP_NESTED_HOT_TEAMS
4430 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4431#endif
4432 KMP_MB();
4433
4434 /* first, try to get one from the thread pool unless allocating thread is
4435 * the main hidden helper thread. The hidden helper team should always
4436 * allocate new OS threads. */
4437 if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4438 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4439 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4440 if (new_thr == __kmp_thread_pool_insert_pt) {
4441 __kmp_thread_pool_insert_pt = NULL;
4442 }
4443 TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    __kmp_lock_suspend_mx(new_thr);
4446 if (new_thr->th.th_active_in_pool == TRUE) {
4447 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4448 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4449 new_thr->th.th_active_in_pool = FALSE;
4450 }
    __kmp_unlock_suspend_mx(new_thr);
4452
4453 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4454 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4455 KMP_ASSERT(!new_thr->th.th_team);
4456 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4457
4458 /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
4461 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4462
4463 TCW_4(__kmp_nth, __kmp_nth + 1);
4464
4465 new_thr->th.th_task_state = 0;
4466 new_thr->th.th_task_state_top = 0;
4467 new_thr->th.th_task_state_stack_sz = 4;
4468
4469 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4470 // Make sure pool thread has transitioned to waiting on own thread struct
4471 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4472 // Thread activated in __kmp_allocate_team when increasing team size
4473 }
4474
4475#ifdef KMP_ADJUST_BLOCKTIME
4476 /* Adjust blocktime back to zero if necessary */
4477 /* Middle initialization might not have occurred yet */
4478 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4479 if (__kmp_nth > __kmp_avail_proc) {
4480 __kmp_zero_bt = TRUE;
4481 }
4482 }
4483#endif /* KMP_ADJUST_BLOCKTIME */
4484
4485#if KMP_DEBUG
4486 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4487 // KMP_BARRIER_PARENT_FLAG.
4488 int b;
4489 kmp_balign_t *balign = new_thr->th.th_bar;
4490 for (b = 0; b < bs_last_barrier; ++b)
4491 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4492#endif
4493
4494 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4495 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4496
4497 KMP_MB();
4498 return new_thr;
4499 }
4500
  /* no, we'll fork a new one */
4502 KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4503 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4504
4505#if KMP_USE_MONITOR
4506 // If this is the first worker thread the RTL is creating, then also
4507 // launch the monitor thread. We try to do this as early as possible.
4508 if (!TCR_4(__kmp_init_monitor)) {
4509 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4510 if (!TCR_4(__kmp_init_monitor)) {
4511 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4512 TCW_4(__kmp_init_monitor, 1);
4513 __kmp_create_monitor(&__kmp_monitor);
4514 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4515#if KMP_OS_WINDOWS
4516 // AC: wait until monitor has started. This is a fix for CQ232808.
4517 // The reason is that if the library is loaded/unloaded in a loop with
4518 // small (parallel) work in between, then there is a high probability that
4519 // the monitor thread starts after the library shutdown. At shutdown it is
4520 // too late to cope with the problem, because when the primary thread is
4521 // in DllMain (process detach) the monitor has no chance to start (it is
4522 // blocked), and the primary thread has no means to inform the monitor that
4523 // the library has gone, because all the memory which the monitor can
4524 // access is going to be released/reset.
4525 while (TCR_4(__kmp_init_monitor) < 2) {
4526 KMP_YIELD(TRUE);
4527 }
4528 KF_TRACE(10, ("after monitor thread has started\n"));
4529#endif
4530 }
4531 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4532 }
4533#endif
4534
4535 KMP_MB();
4536
4537 {
4538 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4539 ? 1
4540 : __kmp_hidden_helper_threads_num + 1;
4541
4542 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4543 ++new_gtid) {
4544 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4545 }
4546
4547 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4548 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4549 }
4550 }
4551
4552 /* allocate space for it. */
4553 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4554
4555 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4556
4557#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4558 // suppress race conditions detection on synchronization flags in debug mode
4559 // this helps to analyze library internals eliminating false positives
4560 __itt_suppress_mark_range(
4561 __itt_suppress_range, __itt_suppress_threading_errors,
4562 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4563 __itt_suppress_mark_range(
4564 __itt_suppress_range, __itt_suppress_threading_errors,
4565 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4566#if KMP_OS_WINDOWS
4567 __itt_suppress_mark_range(
4568 __itt_suppress_range, __itt_suppress_threading_errors,
4569 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4570#else
4571 __itt_suppress_mark_range(__itt_suppress_range,
4572 __itt_suppress_threading_errors,
4573 &new_thr->th.th_suspend_init_count,
4574 sizeof(new_thr->th.th_suspend_init_count));
4575#endif
4576 // TODO: check if we need to also suppress b_arrived flags
4577 __itt_suppress_mark_range(__itt_suppress_range,
4578 __itt_suppress_threading_errors,
4579 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4580 sizeof(new_thr->th.th_bar[0].bb.b_go));
4581 __itt_suppress_mark_range(__itt_suppress_range,
4582 __itt_suppress_threading_errors,
4583 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4584 sizeof(new_thr->th.th_bar[1].bb.b_go));
4585 __itt_suppress_mark_range(__itt_suppress_range,
4586 __itt_suppress_threading_errors,
4587 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4588 sizeof(new_thr->th.th_bar[2].bb.b_go));
4589#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4590 if (__kmp_storage_map) {
4591 __kmp_print_thread_storage_map(new_thr, new_gtid);
4592 }
4593
4594 // add the reserve serialized team, initialized from the team's primary thread
4595 {
4596 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4597 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4598 new_thr->th.th_serial_team = serial_team =
4599 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4600#if OMPT_SUPPORT
4601 ompt_data_none, // root parallel id
4602#endif
4603 proc_bind_default, &r_icvs,
4604 0 USE_NESTED_HOT_ARG(NULL));
4605 }
4606 KMP_ASSERT(serial_team);
4607 serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for
4608 // execution (it is unused for now).
4609 serial_team->t.t_threads[0] = new_thr;
4610 KF_TRACE(10,
4611 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4612 new_thr));
4613
4614 /* setup the thread structures */
4615 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4616
4617#if USE_FAST_MEMORY
4618 __kmp_initialize_fast_memory(new_thr);
4619#endif /* USE_FAST_MEMORY */
4620
4621#if KMP_USE_BGET
4622 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4623 __kmp_initialize_bget(new_thr);
4624#endif
4625
4626 __kmp_init_random(new_thr); // Initialize random number generator
4627
4628 /* Initialize these only once when thread is grabbed for a team allocation */
4629 KA_TRACE(20,
4630 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4631 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4632
4633 int b;
4634 kmp_balign_t *balign = new_thr->th.th_bar;
4635 for (b = 0; b < bs_last_barrier; ++b) {
4636 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4637 balign[b].bb.team = NULL;
4638 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4639 balign[b].bb.use_oncore_barrier = 0;
4640 }
4641
4642 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4643 new_thr->th.th_sleep_loc_type = flag_unset;
4644
4645 new_thr->th.th_spin_here = FALSE;
4646 new_thr->th.th_next_waiting = 0;
4647#if KMP_OS_UNIX
4648 new_thr->th.th_blocking = false;
4649#endif
4650
4651#if KMP_AFFINITY_SUPPORTED
4652 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4653 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4654 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4655 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4656#endif
4657 new_thr->th.th_def_allocator = __kmp_def_allocator;
4658 new_thr->th.th_prev_level = 0;
4659 new_thr->th.th_prev_num_threads = 1;
4660
4661 TCW_4(new_thr->th.th_in_pool, FALSE);
4662 new_thr->th.th_active_in_pool = FALSE;
4663 TCW_4(new_thr->th.th_active, TRUE);
4664
4665 /* adjust the global counters */
4666 __kmp_all_nth++;
4667 __kmp_nth++;
4668
4669 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4670 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4671 if (__kmp_adjust_gtid_mode) {
4672 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4673 if (TCR_4(__kmp_gtid_mode) != 2) {
4674 TCW_4(__kmp_gtid_mode, 2);
4675 }
4676 } else {
4677 if (TCR_4(__kmp_gtid_mode) != 1) {
4678 TCW_4(__kmp_gtid_mode, 1);
4679 }
4680 }
4681 }
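// Illustrative note (added, not in the original source): mode 1 resolves the
// gtid by searching thread stack addresses, which is cheap for a handful of
// threads, while mode 2 uses the keyed TLS API, which scales better; the
// check above simply flips __kmp_gtid_mode once __kmp_all_nth crosses
// __kmp_tls_gtid_min.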
4682
4683#ifdef KMP_ADJUST_BLOCKTIME
4684 /* Adjust blocktime back to zero if necessary */
4685 /* Middle initialization might not have occurred yet */
4686 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4687 if (__kmp_nth > __kmp_avail_proc) {
4688 __kmp_zero_bt = TRUE;
4689 }
4690 }
4691#endif /* KMP_ADJUST_BLOCKTIME */
4692
4693#if KMP_AFFINITY_SUPPORTED
4694 // Set the affinity and topology information for new thread
4695 __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4696#endif
4697
4698 /* actually fork it and create the new worker thread */
4699 KF_TRACE(
4700 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4701 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4702 KF_TRACE(10,
4703 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4704
4705 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4706 new_gtid));
4707 KMP_MB();
4708 return new_thr;
4709}
4710
4711/* Reinitialize team for reuse.
4712 The hot team code calls this routine at every fork barrier, so EPCC barrier
4713 tests are extremely sensitive to changes in it, esp. writes to the team
4714 struct, which cause a cache invalidation in all threads.
4715 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4716static void __kmp_reinitialize_team(kmp_team_t *team,
4717 kmp_internal_control_t *new_icvs,
4718 ident_t *loc) {
4719 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4720 team->t.t_threads[0], team));
4721 KMP_DEBUG_ASSERT(team && new_icvs);
4722 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4723 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4724
4725 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4726 // Copy ICVs to the primary thread's implicit taskdata
4727 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4728 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4729
4730 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4731 team->t.t_threads[0], team));
4732}
4733
4734/* Initialize the team data structure.
4735 This assumes the t_threads and t_max_nproc are already set.
4736 Also, we don't touch the arguments */
4737static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4738 kmp_internal_control_t *new_icvs,
4739 ident_t *loc) {
4740 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4741
4742 /* verify */
4743 KMP_DEBUG_ASSERT(team);
4744 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4745 KMP_DEBUG_ASSERT(team->t.t_threads);
4746 KMP_MB();
4747
4748 team->t.t_master_tid = 0; /* not needed */
4749 /* team->t.t_master_bar; not needed */
4750 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4751 team->t.t_nproc = new_nproc;
4752
4753 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4754 team->t.t_next_pool = NULL;
4755 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4756 * up hot team */
4757
4758 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4759 team->t.t_invoke = NULL; /* not needed */
4760
4761 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4762 team->t.t_sched.sched = new_icvs->sched.sched;
4763
4764#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4765 team->t.t_fp_control_saved = FALSE; /* not needed */
4766 team->t.t_x87_fpu_control_word = 0; /* not needed */
4767 team->t.t_mxcsr = 0; /* not needed */
4768#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4769
4770 team->t.t_construct = 0;
4771
4772 team->t.t_ordered.dt.t_value = 0;
4773 team->t.t_master_active = FALSE;
4774
4775#ifdef KMP_DEBUG
4776 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4777#endif
4778#if KMP_OS_WINDOWS
4779 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4780#endif
4781
4782 team->t.t_control_stack_top = NULL;
4783
4784 __kmp_reinitialize_team(team, new_icvs, loc);
4785
4786 KMP_MB();
4787 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4788}
4789
4790#if KMP_AFFINITY_SUPPORTED
4791static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4792 int first, int last, int newp) {
4793 th->th.th_first_place = first;
4794 th->th.th_last_place = last;
4795 th->th.th_new_place = newp;
4796 if (newp != th->th.th_current_place) {
4797 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4798 team->t.t_display_affinity = 1;
4799 // Copy topology information associated with the new place
4800 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4801 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4802 }
4803}
4804
4805// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4806// It calculates the worker + primary thread's partition based upon the parent
4807// thread's partition, and binds each worker thread to a place in that partition.
4808// The primary thread's partition should already include its current binding.
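// Illustrative example (assumed configuration, not from the original source):
// with 4 places, the primary thread bound to place 1, and a team of 4 threads,
// proc_bind_primary puts every worker on place 1, proc_bind_close assigns
// places 1,2,3,0 to threads 0..3, and proc_bind_spread gives each thread its
// own disjoint sub-partition; the switch below implements these policies.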
4809static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4810 // Do not partition places for the hidden helper team
4811 if (KMP_HIDDEN_HELPER_TEAM(team))
4812 return;
4813 // Copy the primary thread's place partition to the team struct
4814 kmp_info_t *master_th = team->t.t_threads[0];
4815 KMP_DEBUG_ASSERT(master_th != NULL);
4816 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4817 int first_place = master_th->th.th_first_place;
4818 int last_place = master_th->th.th_last_place;
4819 int masters_place = master_th->th.th_current_place;
4820 int num_masks = __kmp_affinity.num_masks;
4821 team->t.t_first_place = first_place;
4822 team->t.t_last_place = last_place;
4823
4824 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4825 "bound to place %d partition = [%d,%d]\n",
4826 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4827 team->t.t_id, masters_place, first_place, last_place));
4828
4829 switch (proc_bind) {
4830
4831 case proc_bind_default:
4832 // Serial teams might have the proc_bind policy set to proc_bind_default.
4833 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4834 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4835 break;
4836
4837 case proc_bind_primary: {
4838 int f;
4839 int n_th = team->t.t_nproc;
4840 for (f = 1; f < n_th; f++) {
4841 kmp_info_t *th = team->t.t_threads[f];
4842 KMP_DEBUG_ASSERT(th != NULL);
4843 __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4844
4845 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4846 "partition = [%d,%d]\n",
4847 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4848 f, masters_place, first_place, last_place));
4849 }
4850 } break;
4851
4852 case proc_bind_close: {
4853 int f;
4854 int n_th = team->t.t_nproc;
4855 int n_places;
4856 if (first_place <= last_place) {
4857 n_places = last_place - first_place + 1;
4858 } else {
4859 n_places = num_masks - first_place + last_place + 1;
4860 }
4861 if (n_th <= n_places) {
4862 int place = masters_place;
4863 for (f = 1; f < n_th; f++) {
4864 kmp_info_t *th = team->t.t_threads[f];
4865 KMP_DEBUG_ASSERT(th != NULL);
4866
4867 if (place == last_place) {
4868 place = first_place;
4869 } else if (place == (num_masks - 1)) {
4870 place = 0;
4871 } else {
4872 place++;
4873 }
4874 __kmp_set_thread_place(team, th, first_place, last_place, place);
4875
4876 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4877 "partition = [%d,%d]\n",
4878 __kmp_gtid_from_thread(team->t.t_threads[f]),
4879 team->t.t_id, f, place, first_place, last_place));
4880 }
4881 } else {
4882 int S, rem, gap, s_count;
4883 S = n_th / n_places;
4884 s_count = 0;
4885 rem = n_th - (S * n_places);
4886 gap = rem > 0 ? n_places / rem : n_places;
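// Worked example (illustrative only): n_th = 10 threads over n_places = 4
// places yields S = 2, rem = 2, gap = 2; every place gets S threads and every
// gap-th place absorbs one of the rem extra threads, so starting from the
// primary's place the per-place thread counts come out as 3, 2, 3, 2.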
4887 int place = masters_place;
4888 int gap_ct = gap;
4889 for (f = 0; f < n_th; f++) {
4890 kmp_info_t *th = team->t.t_threads[f];
4891 KMP_DEBUG_ASSERT(th != NULL);
4892
4893 __kmp_set_thread_place(team, th, first_place, last_place, place);
4894 s_count++;
4895
4896 if ((s_count == S) && rem && (gap_ct == gap)) {
4897 // do nothing, add an extra thread to place on next iteration
4898 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4899 // we added an extra thread to this place; move to next place
4900 if (place == last_place) {
4901 place = first_place;
4902 } else if (place == (num_masks - 1)) {
4903 place = 0;
4904 } else {
4905 place++;
4906 }
4907 s_count = 0;
4908 gap_ct = 1;
4909 rem--;
4910 } else if (s_count == S) { // place full; don't add extra
4911 if (place == last_place) {
4912 place = first_place;
4913 } else if (place == (num_masks - 1)) {
4914 place = 0;
4915 } else {
4916 place++;
4917 }
4918 gap_ct++;
4919 s_count = 0;
4920 }
4921
4922 KA_TRACE(100,
4923 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4924 "partition = [%d,%d]\n",
4925 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4926 th->th.th_new_place, first_place, last_place));
4927 }
4928 KMP_DEBUG_ASSERT(place == masters_place);
4929 }
4930 } break;
4931
4932 case proc_bind_spread: {
4933 int f;
4934 int n_th = team->t.t_nproc;
4935 int n_places;
4936 int thidx;
4937 if (first_place <= last_place) {
4938 n_places = last_place - first_place + 1;
4939 } else {
4940 n_places = num_masks - first_place + last_place + 1;
4941 }
4942 if (n_th <= n_places) {
4943 int place = -1;
4944
4945 if (n_places != num_masks) {
4946 int S = n_places / n_th;
4947 int s_count, rem, gap, gap_ct;
4948
4949 place = masters_place;
4950 rem = n_places - n_th * S;
4951 gap = rem ? n_th / rem : 1;
4952 gap_ct = gap;
4953 thidx = n_th;
4954 if (update_master_only == 1)
4955 thidx = 1;
4956 for (f = 0; f < thidx; f++) {
4957 kmp_info_t *th = team->t.t_threads[f];
4958 KMP_DEBUG_ASSERT(th != NULL);
4959
4960 int fplace = place, nplace = place;
4961 s_count = 1;
4962 while (s_count < S) {
4963 if (place == last_place) {
4964 place = first_place;
4965 } else if (place == (num_masks - 1)) {
4966 place = 0;
4967 } else {
4968 place++;
4969 }
4970 s_count++;
4971 }
4972 if (rem && (gap_ct == gap)) {
4973 if (place == last_place) {
4974 place = first_place;
4975 } else if (place == (num_masks - 1)) {
4976 place = 0;
4977 } else {
4978 place++;
4979 }
4980 rem--;
4981 gap_ct = 0;
4982 }
4983 __kmp_set_thread_place(team, th, fplace, place, nplace);
4984 gap_ct++;
4985
4986 if (place == last_place) {
4987 place = first_place;
4988 } else if (place == (num_masks - 1)) {
4989 place = 0;
4990 } else {
4991 place++;
4992 }
4993
4994 KA_TRACE(100,
4995 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4996 "partition = [%d,%d], num_masks: %u\n",
4997 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4998 f, th->th.th_new_place, th->th.th_first_place,
4999 th->th.th_last_place, num_masks));
5000 }
5001 } else {
5002 /* Given a uniform space of available computation places, we can create
5003 T partitions of round(P/T) size and put threads into the first
5004 place of each partition. */
5005 double current = static_cast<double>(masters_place);
5006 double spacing =
5007 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
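// Illustrative example (assumed numbers, not from the original source): with
// n_places = 8, n_th = 4 and the primary on place 0, spacing = 9/4 = 2.25 and
// the loop below produces the partitions [0,1], [2,3], [4,5], [6,7], pinning
// each thread to the first place of its partition.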
5008 int first, last;
5009 kmp_info_t *th;
5010
5011 thidx = n_th + 1;
5012 if (update_master_only == 1)
5013 thidx = 1;
5014 for (f = 0; f < thidx; f++) {
5015 first = static_cast<int>(current);
5016 last = static_cast<int>(current + spacing) - 1;
5017 KMP_DEBUG_ASSERT(last >= first);
5018 if (first >= n_places) {
5019 if (masters_place) {
5020 first -= n_places;
5021 last -= n_places;
5022 if (first == (masters_place + 1)) {
5023 KMP_DEBUG_ASSERT(f == n_th);
5024 first--;
5025 }
5026 if (last == masters_place) {
5027 KMP_DEBUG_ASSERT(f == (n_th - 1));
5028 last--;
5029 }
5030 } else {
5031 KMP_DEBUG_ASSERT(f == n_th);
5032 first = 0;
5033 last = 0;
5034 }
5035 }
5036 if (last >= n_places) {
5037 last = (n_places - 1);
5038 }
5039 place = first;
5040 current += spacing;
5041 if (f < n_th) {
5042 KMP_DEBUG_ASSERT(0 <= first);
5043 KMP_DEBUG_ASSERT(n_places > first);
5044 KMP_DEBUG_ASSERT(0 <= last);
5045 KMP_DEBUG_ASSERT(n_places > last);
5046 KMP_DEBUG_ASSERT(last_place >= first_place);
5047 th = team->t.t_threads[f];
5048 KMP_DEBUG_ASSERT(th);
5049 __kmp_set_thread_place(team, th, first, last, place);
5050 KA_TRACE(100,
5051 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5052 "partition = [%d,%d], spacing = %.4f\n",
5053 __kmp_gtid_from_thread(team->t.t_threads[f]),
5054 team->t.t_id, f, th->th.th_new_place,
5055 th->th.th_first_place, th->th.th_last_place, spacing));
5056 }
5057 }
5058 }
5059 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5060 } else {
5061 int S, rem, gap, s_count;
5062 S = n_th / n_places;
5063 s_count = 0;
5064 rem = n_th - (S * n_places);
5065 gap = rem > 0 ? n_places / rem : n_places;
5066 int place = masters_place;
5067 int gap_ct = gap;
5068 thidx = n_th;
5069 if (update_master_only == 1)
5070 thidx = 1;
5071 for (f = 0; f < thidx; f++) {
5072 kmp_info_t *th = team->t.t_threads[f];
5073 KMP_DEBUG_ASSERT(th != NULL);
5074
5075 __kmp_set_thread_place(team, th, place, place, place);
5076 s_count++;
5077
5078 if ((s_count == S) && rem && (gap_ct == gap)) {
5079 // do nothing, add an extra thread to place on next iteration
5080 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5081 // we added an extra thread to this place; move on to next place
5082 if (place == last_place) {
5083 place = first_place;
5084 } else if (place == (num_masks - 1)) {
5085 place = 0;
5086 } else {
5087 place++;
5088 }
5089 s_count = 0;
5090 gap_ct = 1;
5091 rem--;
5092 } else if (s_count == S) { // place is full; don't add extra thread
5093 if (place == last_place) {
5094 place = first_place;
5095 } else if (place == (num_masks - 1)) {
5096 place = 0;
5097 } else {
5098 place++;
5099 }
5100 gap_ct++;
5101 s_count = 0;
5102 }
5103
5104 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5105 "partition = [%d,%d]\n",
5106 __kmp_gtid_from_thread(team->t.t_threads[f]),
5107 team->t.t_id, f, th->th.th_new_place,
5108 th->th.th_first_place, th->th.th_last_place));
5109 }
5110 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5111 }
5112 } break;
5113
5114 default:
5115 break;
5116 }
5117
5118 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5119}
5120
5121#endif // KMP_AFFINITY_SUPPORTED
5122
5123/* allocate a new team data structure to use. take one off of the free pool if
5124 available */
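/* Rough outline of the allocation order below (summary comment, not in the
   original source): (1) if the root is not active, reuse the hot team,
   growing or shrinking it to new_nproc as needed; (2) otherwise take a team
   with sufficient t_max_nproc from __kmp_team_pool, reaping undersized ones
   along the way; (3) as a last resort allocate and initialize a brand-new
   kmp_team_t. */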
5125kmp_team_t *
5126__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5127#if OMPT_SUPPORT
5128 ompt_data_t ompt_parallel_data,
5129#endif
5130 kmp_proc_bind_t new_proc_bind,
5131 kmp_internal_control_t *new_icvs,
5132 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5133 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5134 int f;
5135 kmp_team_t *team;
5136 int use_hot_team = !root->r.r_active;
5137 int level = 0;
5138 int do_place_partition = 1;
5139
5140 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5141 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5142 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5143 KMP_MB();
5144
5145#if KMP_NESTED_HOT_TEAMS
5146 kmp_hot_team_ptr_t *hot_teams;
5147 if (master) {
5148 team = master->th.th_team;
5149 level = team->t.t_active_level;
5150 if (master->th.th_teams_microtask) { // in teams construct?
5151 if (master->th.th_teams_size.nteams > 1 &&
5152 ( // #teams > 1
5153 team->t.t_pkfn ==
5154 (microtask_t)__kmp_teams_master || // inner fork of the teams
5155 master->th.th_teams_level <
5156 team->t.t_level)) { // or nested parallel inside the teams
5157 ++level; // not increment if #teams==1, or for outer fork of the teams;
5158 // increment otherwise
5159 }
5160 // Do not perform the place partition if inner fork of the teams
5161 // Wait until nested parallel region encountered inside teams construct
5162 if ((master->th.th_teams_size.nteams == 1 &&
5163 master->th.th_teams_level >= team->t.t_level) ||
5164 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5165 do_place_partition = 0;
5166 }
5167 hot_teams = master->th.th_hot_teams;
5168 if (level < __kmp_hot_teams_max_level && hot_teams &&
5169 hot_teams[level].hot_team) {
5170 // hot team has already been allocated for given level
5171 use_hot_team = 1;
5172 } else {
5173 use_hot_team = 0;
5174 }
5175 } else {
5176 // check we won't access uninitialized hot_teams, just in case
5177 KMP_DEBUG_ASSERT(new_nproc == 1);
5178 }
5179#endif
5180 // Optimization to use a "hot" team
5181 if (use_hot_team && new_nproc > 1) {
5182 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5183#if KMP_NESTED_HOT_TEAMS
5184 team = hot_teams[level].hot_team;
5185#else
5186 team = root->r.r_hot_team;
5187#endif
5188#if KMP_DEBUG
5189 if (__kmp_tasking_mode != tskm_immediate_exec) {
5190 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5191 "task_team[1] = %p before reinit\n",
5192 team->t.t_task_team[0], team->t.t_task_team[1]));
5193 }
5194#endif
5195
5196 if (team->t.t_nproc != new_nproc &&
5197 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5198 // Distributed barrier may need a resize
5199 int old_nthr = team->t.t_nproc;
5200 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5201 }
5202
5203 // If not doing the place partition, then reset the team's proc bind
5204 // to indicate that partitioning of all threads still needs to take place
5205 if (do_place_partition == 0)
5206 team->t.t_proc_bind = proc_bind_default;
5207 // Has the number of threads changed?
5208 /* Let's assume the most common case is that the number of threads is
5209 unchanged, and put that case first. */
5210 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5211 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5212 // This case can mean that omp_set_num_threads() was called and the hot
5213 // team size was already reduced, so we check the special flag
5214 if (team->t.t_size_changed == -1) {
5215 team->t.t_size_changed = 1;
5216 } else {
5217 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5218 }
5219
5220 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5221 kmp_r_sched_t new_sched = new_icvs->sched;
5222 // set primary thread's schedule as new run-time schedule
5223 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5224
5225 __kmp_reinitialize_team(team, new_icvs,
5226 root->r.r_uber_thread->th.th_ident);
5227
5228 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5229 team->t.t_threads[0], team));
5230 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5231
5232#if KMP_AFFINITY_SUPPORTED
5233 if ((team->t.t_size_changed == 0) &&
5234 (team->t.t_proc_bind == new_proc_bind)) {
5235 if (new_proc_bind == proc_bind_spread) {
5236 if (do_place_partition) {
5237 // add flag to update only master for spread
5238 __kmp_partition_places(team, 1);
5239 }
5240 }
5241 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5242 "proc_bind = %d, partition = [%d,%d]\n",
5243 team->t.t_id, new_proc_bind, team->t.t_first_place,
5244 team->t.t_last_place));
5245 } else {
5246 if (do_place_partition) {
5247 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5248 __kmp_partition_places(team);
5249 }
5250 }
5251#else
5252 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5253#endif /* KMP_AFFINITY_SUPPORTED */
5254 } else if (team->t.t_nproc > new_nproc) {
5255 KA_TRACE(20,
5256 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5257 new_nproc));
5258
5259 team->t.t_size_changed = 1;
5260 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5261 // Barrier size already reduced earlier in this function
5262 // Activate team threads via th_used_in_team
5263 __kmp_add_threads_to_team(team, new_nproc);
5264 }
5265#if KMP_NESTED_HOT_TEAMS
5266 if (__kmp_hot_teams_mode == 0) {
5267 // AC: saved number of threads should correspond to team's value in this
5268 // mode, can be bigger in mode 1, when hot team has threads in reserve
5269 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5270 hot_teams[level].hot_team_nth = new_nproc;
5271#endif // KMP_NESTED_HOT_TEAMS
5272 /* release the extra threads we don't need any more */
5273 for (f = new_nproc; f < team->t.t_nproc; f++) {
5274 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5275 if (__kmp_tasking_mode != tskm_immediate_exec) {
5276 // When decreasing team size, threads no longer in the team should
5277 // unref task team.
5278 team->t.t_threads[f]->th.th_task_team = NULL;
5279 }
5280 __kmp_free_thread(team->t.t_threads[f]);
5281 team->t.t_threads[f] = NULL;
5282 }
5283#if KMP_NESTED_HOT_TEAMS
5284 } // (__kmp_hot_teams_mode == 0)
5285 else {
5286 // When keeping extra threads in team, switch threads to wait on own
5287 // b_go flag
5288 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5289 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5290 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5291 for (int b = 0; b < bs_last_barrier; ++b) {
5292 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5293 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5294 }
5295 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5296 }
5297 }
5298 }
5299#endif // KMP_NESTED_HOT_TEAMS
5300 team->t.t_nproc = new_nproc;
5301 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5302 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5303 __kmp_reinitialize_team(team, new_icvs,
5304 root->r.r_uber_thread->th.th_ident);
5305
5306 // Update remaining threads
5307 for (f = 0; f < new_nproc; ++f) {
5308 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5309 }
5310
5311 // restore the current task state of the primary thread: should be the
5312 // implicit task
5313 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5314 team->t.t_threads[0], team));
5315
5316 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5317
5318#ifdef KMP_DEBUG
5319 for (f = 0; f < team->t.t_nproc; f++) {
5320 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5321 team->t.t_threads[f]->th.th_team_nproc ==
5322 team->t.t_nproc);
5323 }
5324#endif
5325
5326 if (do_place_partition) {
5327 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5328#if KMP_AFFINITY_SUPPORTED
5329 __kmp_partition_places(team);
5330#endif
5331 }
5332 } else { // team->t.t_nproc < new_nproc
5333
5334 KA_TRACE(20,
5335 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5336 new_nproc));
5337 int old_nproc = team->t.t_nproc; // save old value and use to update only
5338 team->t.t_size_changed = 1;
5339
5340#if KMP_NESTED_HOT_TEAMS
5341 int avail_threads = hot_teams[level].hot_team_nth;
5342 if (new_nproc < avail_threads)
5343 avail_threads = new_nproc;
5344 kmp_info_t **other_threads = team->t.t_threads;
5345 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5346 // Adjust barrier data of reserved threads (if any) of the team
5347 // Other data will be set in __kmp_initialize_info() below.
5348 int b;
5349 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5350 for (b = 0; b < bs_last_barrier; ++b) {
5351 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5352 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5353#if USE_DEBUGGER
5354 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5355#endif
5356 }
5357 }
5358 if (hot_teams[level].hot_team_nth >= new_nproc) {
5359 // we have all needed threads in reserve, no need to allocate any
5360 // this is only possible in mode 1; we cannot have reserved threads in mode 0
5361 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5362 team->t.t_nproc = new_nproc; // just get reserved threads involved
5363 } else {
5364 // We may have some threads in reserve, but not enough;
5365 // get reserved threads involved if any.
5366 team->t.t_nproc = hot_teams[level].hot_team_nth;
5367 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5368#endif // KMP_NESTED_HOT_TEAMS
5369 if (team->t.t_max_nproc < new_nproc) {
5370 /* reallocate larger arrays */
5371 __kmp_reallocate_team_arrays(team, new_nproc);
5372 __kmp_reinitialize_team(team, new_icvs, NULL);
5373 }
5374
5375#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5376 KMP_AFFINITY_SUPPORTED
5377 /* Temporarily set full mask for primary thread before creation of
5378 workers. The reason is that workers inherit the affinity from the
5379 primary thread, so if a lot of workers are created on the single
5380 core quickly, they don't get a chance to set their own affinity for
5381 a long time. */
5382 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5383#endif
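/* Hedged note (added for clarity): kmp_affinity_raii_t is assumed to save the
   caller's current mask when it binds to __kmp_affin_fullMask here; the saved
   mask is put back by the explicit restore() call after the worker-creation
   loop below. */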
5384
5385 /* allocate new threads for the hot team */
5386 for (f = team->t.t_nproc; f < new_nproc; f++) {
5387 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5388 KMP_DEBUG_ASSERT(new_worker);
5389 team->t.t_threads[f] = new_worker;
5390
5391 KA_TRACE(20,
5392 ("__kmp_allocate_team: team %d init T#%d arrived: "
5393 "join=%llu, plain=%llu\n",
5394 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5395 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5396 team->t.t_bar[bs_plain_barrier].b_arrived));
5397
5398 { // Initialize barrier data for new threads.
5399 int b;
5400 kmp_balign_t *balign = new_worker->th.th_bar;
5401 for (b = 0; b < bs_last_barrier; ++b) {
5402 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5403 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5404 KMP_BARRIER_PARENT_FLAG);
5405#if USE_DEBUGGER
5406 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5407#endif
5408 }
5409 }
5410 }
5411
5412#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5413 KMP_AFFINITY_SUPPORTED
5414 /* Restore initial primary thread's affinity mask */
5415 new_temp_affinity.restore();
5416#endif
5417#if KMP_NESTED_HOT_TEAMS
5418 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5419#endif // KMP_NESTED_HOT_TEAMS
5420 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5421 // Barrier size already increased earlier in this function
5422 // Activate team threads via th_used_in_team
5423 __kmp_add_threads_to_team(team, new_nproc);
5424 }
5425 /* make sure everyone is synchronized */
5426 // new threads below
5427 __kmp_initialize_team(team, new_nproc, new_icvs,
5428 root->r.r_uber_thread->th.th_ident);
5429
5430 /* reinitialize the threads */
5431 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5432 for (f = 0; f < team->t.t_nproc; ++f)
5433 __kmp_initialize_info(team->t.t_threads[f], team, f,
5434 __kmp_gtid_from_tid(f, team));
5435
5436 // set th_task_state for new threads in hot team with older thread's state
5437 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5438 for (f = old_nproc; f < team->t.t_nproc; ++f)
5439 team->t.t_threads[f]->th.th_task_state = old_state;
5440
5441#ifdef KMP_DEBUG
5442 for (f = 0; f < team->t.t_nproc; ++f) {
5443 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5444 team->t.t_threads[f]->th.th_team_nproc ==
5445 team->t.t_nproc);
5446 }
5447#endif
5448
5449 if (do_place_partition) {
5450 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5451#if KMP_AFFINITY_SUPPORTED
5452 __kmp_partition_places(team);
5453#endif
5454 }
5455 } // Check changes in number of threads
5456
5457 kmp_info_t *master = team->t.t_threads[0];
5458 if (master->th.th_teams_microtask) {
5459 for (f = 1; f < new_nproc; ++f) {
5460 // propagate teams construct specific info to workers
5461 kmp_info_t *thr = team->t.t_threads[f];
5462 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5463 thr->th.th_teams_level = master->th.th_teams_level;
5464 thr->th.th_teams_size = master->th.th_teams_size;
5465 }
5466 }
5467#if KMP_NESTED_HOT_TEAMS
5468 if (level) {
5469 // Sync barrier state for nested hot teams, not needed for outermost hot
5470 // team.
5471 for (f = 1; f < new_nproc; ++f) {
5472 kmp_info_t *thr = team->t.t_threads[f];
5473 int b;
5474 kmp_balign_t *balign = thr->th.th_bar;
5475 for (b = 0; b < bs_last_barrier; ++b) {
5476 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5477 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5478#if USE_DEBUGGER
5479 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5480#endif
5481 }
5482 }
5483 }
5484#endif // KMP_NESTED_HOT_TEAMS
5485
5486 /* reallocate space for arguments if necessary */
5487 __kmp_alloc_argv_entries(argc, team, TRUE);
5488 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5489 // The hot team re-uses the previous task team,
5490 // if untouched during the previous release->gather phase.
5491
5492 KF_TRACE(10, (" hot_team = %p\n", team));
5493
5494#if KMP_DEBUG
5495 if (__kmp_tasking_mode != tskm_immediate_exec) {
5496 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5497 "task_team[1] = %p after reinit\n",
5498 team->t.t_task_team[0], team->t.t_task_team[1]));
5499 }
5500#endif
5501
5502#if OMPT_SUPPORT
5503 __ompt_team_assign_id(team, ompt_parallel_data);
5504#endif
5505
5506 KMP_MB();
5507
5508 return team;
5509 }
5510
5511 /* next, let's try to take one from the team pool */
5512 KMP_MB();
5513 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5514 /* TODO: consider resizing undersized teams instead of reaping them, now
5515 that we have a resizing mechanism */
5516 if (team->t.t_max_nproc >= max_nproc) {
5517 /* take this team from the team pool */
5518 __kmp_team_pool = team->t.t_next_pool;
5519
5520 if (max_nproc > 1 &&
5521 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5522 if (!team->t.b) { // Allocate barrier structure
5523 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5524 }
5525 }
5526
5527 /* setup the team for fresh use */
5528 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5529
5530 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5531 "task_team[1] %p to NULL\n",
5532 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5533 team->t.t_task_team[0] = NULL;
5534 team->t.t_task_team[1] = NULL;
5535
5536 /* reallocate space for arguments if necessary */
5537 __kmp_alloc_argv_entries(argc, team, TRUE);
5538 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5539
5540 KA_TRACE(
5541 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5542 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5543 { // Initialize barrier data.
5544 int b;
5545 for (b = 0; b < bs_last_barrier; ++b) {
5546 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5547#if USE_DEBUGGER
5548 team->t.t_bar[b].b_master_arrived = 0;
5549 team->t.t_bar[b].b_team_arrived = 0;
5550#endif
5551 }
5552 }
5553
5554 team->t.t_proc_bind = new_proc_bind;
5555
5556 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5557 team->t.t_id));
5558
5559#if OMPT_SUPPORT
5560 __ompt_team_assign_id(team, ompt_parallel_data);
5561#endif
5562
5563 KMP_MB();
5564
5565 return team;
5566 }
5567
5568 /* reap team if it is too small, then loop back and check the next one */
5569 // not sure if this is wise, but it will be redone during the hot-teams
5570 // rewrite.
5571 /* TODO: Use technique to find the right size hot-team, don't reap them */
5572 team = __kmp_reap_team(team);
5573 __kmp_team_pool = team;
5574 }
5575
5576 /* nothing available in the pool, no matter, make a new team! */
5577 KMP_MB();
5578 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5579
5580 /* and set it up */
5581 team->t.t_max_nproc = max_nproc;
5582 if (max_nproc > 1 &&
5583 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5584 // Allocate barrier structure
5585 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5586 }
5587
5588 /* NOTE well, for some reason allocating one big buffer and dividing it up
5589 seems to really hurt performance a lot on the P4, so let's not use this */
5590 __kmp_allocate_team_arrays(team, max_nproc);
5591
5592 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5593 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5594
5595 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5596 "%p to NULL\n",
5597 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5598 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5599 // memory, no need to duplicate
5600 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5601 // memory, no need to duplicate
5602
5603 if (__kmp_storage_map) {
5604 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5605 }
5606
5607 /* allocate space for arguments */
5608 __kmp_alloc_argv_entries(argc, team, FALSE);
5609 team->t.t_argc = argc;
5610
5611 KA_TRACE(20,
5612 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5613 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5614 { // Initialize barrier data.
5615 int b;
5616 for (b = 0; b < bs_last_barrier; ++b) {
5617 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5618#if USE_DEBUGGER
5619 team->t.t_bar[b].b_master_arrived = 0;
5620 team->t.t_bar[b].b_team_arrived = 0;
5621#endif
5622 }
5623 }
5624
5625 team->t.t_proc_bind = new_proc_bind;
5626
5627#if OMPT_SUPPORT
5628 __ompt_team_assign_id(team, ompt_parallel_data);
5629 team->t.ompt_serialized_team_info = NULL;
5630#endif
5631
5632 KMP_MB();
5633
5634 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5635 team->t.t_id));
5636
5637 return team;
5638}
5639
5640/* TODO implement hot-teams at all levels */
5641/* TODO implement lazy thread release on demand (disband request) */
5642
5643/* free the team. return it to the team pool. release all the threads
5644 * associated with it */
5645void __kmp_free_team(kmp_root_t *root,
5646 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5647 int f;
5648 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5649 team->t.t_id));
5650
5651 /* verify state */
5652 KMP_DEBUG_ASSERT(root);
5653 KMP_DEBUG_ASSERT(team);
5654 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5655 KMP_DEBUG_ASSERT(team->t.t_threads);
5656
5657 int use_hot_team = team == root->r.r_hot_team;
5658#if KMP_NESTED_HOT_TEAMS
5659 int level;
5660 if (master) {
5661 level = team->t.t_active_level - 1;
5662 if (master->th.th_teams_microtask) { // in teams construct?
5663 if (master->th.th_teams_size.nteams > 1) {
5664 ++level; // level was not increased in teams construct for
5665 // team_of_masters
5666 }
5667 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5668 master->th.th_teams_level == team->t.t_level) {
5669 ++level; // level was not increased in teams construct for
5670 // team_of_workers before the parallel
5671 } // team->t.t_level will be increased inside parallel
5672 }
5673#if KMP_DEBUG
5674 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5675#endif
5676 if (level < __kmp_hot_teams_max_level) {
5677 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5678 use_hot_team = 1;
5679 }
5680 }
5681#endif // KMP_NESTED_HOT_TEAMS
5682
5683 /* team is done working */
5684 TCW_SYNC_PTR(team->t.t_pkfn,
5685 NULL); // Important for Debugging Support Library.
5686#if KMP_OS_WINDOWS
5687 team->t.t_copyin_counter = 0; // init counter for possible reuse
5688#endif
5689 // Do not reset pointer to parent team to NULL for hot teams.
5690
5691 /* if we are non-hot team, release our threads */
5692 if (!use_hot_team) {
5693 if (__kmp_tasking_mode != tskm_immediate_exec) {
5694 // Wait for threads to reach reapable state
5695 for (f = 1; f < team->t.t_nproc; ++f) {
5696 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5697 kmp_info_t *th = team->t.t_threads[f];
5698 volatile kmp_uint32 *state = &th->th.th_reap_state;
5699 while (*state != KMP_SAFE_TO_REAP) {
5700#if KMP_OS_WINDOWS
5701 // On Windows a thread can be killed at any time, check this
5702 DWORD ecode;
5703 if (!__kmp_is_thread_alive(th, &ecode)) {
5704 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5705 break;
5706 }
5707#endif
5708 // first check if thread is sleeping
5709 if (th->th.th_sleep_loc)
5710 __kmp_null_resume_wrapper(th);
5711 KMP_CPU_PAUSE();
5712 }
5713 }
5714
5715 // Delete task teams
5716 int tt_idx;
5717 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5718 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5719 if (task_team != NULL) {
5720 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5721 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5722 team->t.t_threads[f]->th.th_task_team = NULL;
5723 }
5724 KA_TRACE(
5725 20,
5726 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5727 __kmp_get_gtid(), task_team, team->t.t_id));
5728#if KMP_NESTED_HOT_TEAMS
5729 __kmp_free_task_team(master, task_team);
5730#endif
5731 team->t.t_task_team[tt_idx] = NULL;
5732 }
5733 }
5734 }
5735
5736 // Reset pointer to parent team only for non-hot teams.
5737 team->t.t_parent = NULL;
5738 team->t.t_level = 0;
5739 team->t.t_active_level = 0;
5740
5741 /* free the worker threads */
5742 for (f = 1; f < team->t.t_nproc; ++f) {
5743 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5744 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5745 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5746 1, 2);
5747 }
5748 __kmp_free_thread(team->t.t_threads[f]);
5749 }
5750
5751 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5752 if (team->t.b) {
5753 // wake up thread at old location
5754 team->t.b->go_release();
5755 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5756 for (f = 1; f < team->t.t_nproc; ++f) {
5757 if (team->t.b->sleep[f].sleep) {
5758 __kmp_atomic_resume_64(
5759 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5760 (kmp_atomic_flag_64<> *)NULL);
5761 }
5762 }
5763 }
5764 // Wait for threads to be removed from team
5765 for (int f = 1; f < team->t.t_nproc; ++f) {
5766 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5767 KMP_CPU_PAUSE();
5768 }
5769 }
5770 }
5771
5772 for (f = 1; f < team->t.t_nproc; ++f) {
5773 team->t.t_threads[f] = NULL;
5774 }
5775
5776 if (team->t.t_max_nproc > 1 &&
5777 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5778 distributedBarrier::deallocate(team->t.b);
5779 team->t.b = NULL;
5780 }
5781 /* put the team back in the team pool */
5782 /* TODO limit size of team pool, call reap_team if pool too large */
5783 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5784 __kmp_team_pool = (volatile kmp_team_t *)team;
5785 } else { // Check if team was created for primary threads in teams construct
5786 // See if first worker is a CG root
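// Note added for clarity: for a team that hosted the primary threads of a
// teams construct, each worker carries an extra contention-group root. The
// loop below pops that node, frees it once its cg_nthreads count drains, and
// restores the task's thread_limit from the enclosing CG root.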
5787 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5788 team->t.t_threads[1]->th.th_cg_roots);
5789 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5790 // Clean up the CG root nodes on workers so that this team can be re-used
5791 for (f = 1; f < team->t.t_nproc; ++f) {
5792 kmp_info_t *thr = team->t.t_threads[f];
5793 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5794 thr->th.th_cg_roots->cg_root == thr);
5795 // Pop current CG root off list
5796 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5797 thr->th.th_cg_roots = tmp->up;
5798 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5799 " up to node %p. cg_nthreads was %d\n",
5800 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5801 int i = tmp->cg_nthreads--;
5802 if (i == 1) {
5803 __kmp_free(tmp); // free CG if we are the last thread in it
5804 }
5805 // Restore current task's thread_limit from CG root
5806 if (thr->th.th_cg_roots)
5807 thr->th.th_current_task->td_icvs.thread_limit =
5808 thr->th.th_cg_roots->cg_thread_limit;
5809 }
5810 }
5811 }
5812
5813 KMP_MB();
5814}
5815
5816/* reap the team. destroy it, reclaim all its resources and free its memory */
5817kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5818 kmp_team_t *next_pool = team->t.t_next_pool;
5819
5820 KMP_DEBUG_ASSERT(team);
5821 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5822 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5823 KMP_DEBUG_ASSERT(team->t.t_threads);
5824 KMP_DEBUG_ASSERT(team->t.t_argv);
5825
5826 /* TODO clean the threads that are a part of this? */
5827
5828 /* free stuff */
5829 __kmp_free_team_arrays(team);
5830 if (team->t.t_argv != &team->t.t_inline_argv[0])
5831 __kmp_free((void *)team->t.t_argv);
5832 __kmp_free(team);
5833
5834 KMP_MB();
5835 return next_pool;
5836}
5837
5838// Free the thread. Don't reap it, just place it on the pool of available
5839// threads.
5840//
5841// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5842// binding for the affinity mechanism to be useful.
5843//
5844// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5845// However, we want to avoid a potential performance problem by always
5846// scanning through the list to find the correct point at which to insert
5847// the thread (potential N**2 behavior). To do this we keep track of the
5848// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5849// With single-level parallelism, threads will always be added to the tail
5850// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5851// parallelism, all bets are off and we may need to scan through the entire
5852// free list.
5853//
5854// This change also has a potentially large performance benefit, for some
5855// applications. Previously, as threads were freed from the hot team, they
5856// would be placed back on the free list in inverse order. If the hot team
5857 grew back to its original size, then the freed threads would be placed
5858// back on the hot team in reverse order. This could cause bad cache
5859// locality problems on programs where the size of the hot team regularly
5860// grew and shrunk.
5861//
5862// Now, for single-level parallelism, the OMP tid is always == gtid.
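// Illustrative example (assumption, not from the original source): if the pool
// currently holds gtids {2, 3, 5} and the thread with gtid 4 is freed, the
// scan below starts at __kmp_thread_pool_insert_pt (or at the list head when
// that hint is stale) and links the thread between 3 and 5, keeping the pool
// sorted by gtid.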
5863void __kmp_free_thread(kmp_info_t *this_th) {
5864 int gtid;
5865 kmp_info_t **scan;
5866
5867 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5868 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5869
5870 KMP_DEBUG_ASSERT(this_th);
5871
5872 // When moving thread to pool, switch thread to wait on own b_go flag, and
5873 // uninitialized (NULL team).
5874 int b;
5875 kmp_balign_t *balign = this_th->th.th_bar;
5876 for (b = 0; b < bs_last_barrier; ++b) {
5877 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5878 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5879 balign[b].bb.team = NULL;
5880 balign[b].bb.leaf_kids = 0;
5881 }
5882 this_th->th.th_task_state = 0;
5883 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5884
5885 /* put thread back on the free pool */
5886 TCW_PTR(this_th->th.th_team, NULL);
5887 TCW_PTR(this_th->th.th_root, NULL);
5888 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5889
5890 while (this_th->th.th_cg_roots) {
5891 this_th->th.th_cg_roots->cg_nthreads--;
5892 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5893 " %p of thread %p to %d\n",
5894 this_th, this_th->th.th_cg_roots,
5895 this_th->th.th_cg_roots->cg_root,
5896 this_th->th.th_cg_roots->cg_nthreads));
5897 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5898 if (tmp->cg_root == this_th) { // Thread is a cg_root
5899 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5900 KA_TRACE(
5901 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5902 this_th->th.th_cg_roots = tmp->up;
5903 __kmp_free(tmp);
5904 } else { // Worker thread
5905 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5906 __kmp_free(tmp);
5907 }
5908 this_th->th.th_cg_roots = NULL;
5909 break;
5910 }
5911 }
5912
5913 /* If the implicit task assigned to this thread can be used by other threads
5914 * -> multiple threads can share the data and try to free the task at
5915 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5916 * with higher probability when the hot team is disabled but can occur even when
5917 * the hot team is enabled */
5918 __kmp_free_implicit_task(this_th);
5919 this_th->th.th_current_task = NULL;
5920
5921 // If the __kmp_thread_pool_insert_pt is already past the new insert
5922 // point, then we need to re-scan the entire list.
5923 gtid = this_th->th.th_info.ds.ds_gtid;
5924 if (__kmp_thread_pool_insert_pt != NULL) {
5925 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5926 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5927 __kmp_thread_pool_insert_pt = NULL;
5928 }
5929 }
5930
5931 // Scan down the list to find the place to insert the thread.
5932 // scan is the address of a link in the list, possibly the address of
5933 // __kmp_thread_pool itself.
5934 //
5935 // In the absence of nested parallelism, the for loop will have 0 iterations.
5936 if (__kmp_thread_pool_insert_pt != NULL) {
5937 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5938 } else {
5939 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5940 }
5941 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5942 scan = &((*scan)->th.th_next_pool))
5943 ;
5944
5945 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5946 // to its address.
5947 TCW_PTR(this_th->th.th_next_pool, *scan);
5948 __kmp_thread_pool_insert_pt = *scan = this_th;
5949 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5950 (this_th->th.th_info.ds.ds_gtid <
5951 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5952 TCW_4(this_th->th.th_in_pool, TRUE);
5953 __kmp_suspend_initialize_thread(this_th);
5954 __kmp_lock_suspend_mx(this_th);
5955 if (this_th->th.th_active == TRUE) {
5956 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5957 this_th->th.th_active_in_pool = TRUE;
5958 }
5959#if KMP_DEBUG
5960 else {
5961 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5962 }
5963#endif
5964 __kmp_unlock_suspend_mx(this_th);
5965
5966 TCW_4(__kmp_nth, __kmp_nth - 1);
5967
5968#ifdef KMP_ADJUST_BLOCKTIME
5969 /* Adjust blocktime back to user setting or default if necessary */
5970 /* Middle initialization might never have occurred */
5971 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5972 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5973 if (__kmp_nth <= __kmp_avail_proc) {
5974 __kmp_zero_bt = FALSE;
5975 }
5976 }
5977#endif /* KMP_ADJUST_BLOCKTIME */
5978
5979 KMP_MB();
5980}
5981
5982/* ------------------------------------------------------------------------ */
5983
5984void *__kmp_launch_thread(kmp_info_t *this_thr) {
5985#if OMP_PROFILING_SUPPORT
5986 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5987 // TODO: add a configuration option for time granularity
5988 if (ProfileTraceFile)
5989 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5990#endif
5991
5992 int gtid = this_thr->th.th_info.ds.ds_gtid;
5993 /* void *stack_data;*/
5994 kmp_team_t **volatile pteam;
5995
5996 KMP_MB();
5997 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5998
5999 if (__kmp_env_consistency_check) {
6000 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6001 }
6002
6003#if OMPD_SUPPORT
6004 if (ompd_state & OMPD_ENABLE_BP)
6005 ompd_bp_thread_begin();
6006#endif
6007
6008#if OMPT_SUPPORT
6009 ompt_data_t *thread_data = nullptr;
6010 if (ompt_enabled.enabled) {
6011 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6012 *thread_data = ompt_data_none;
6013
6014 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6015 this_thr->th.ompt_thread_info.wait_id = 0;
6016 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6017 this_thr->th.ompt_thread_info.parallel_flags = 0;
6018 if (ompt_enabled.ompt_callback_thread_begin) {
6019 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6020 ompt_thread_worker, thread_data);
6021 }
6022 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6023 }
6024#endif
6025
6026 /* This is the place where threads wait for work */
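/* Sketch of the loop below (comment added for clarity): each iteration parks
   in __kmp_fork_barrier until a primary thread hands this worker a team, then
   runs (*pteam)->t.t_invoke(gtid) and rejoins via __kmp_join_barrier, until
   __kmp_global.g.g_done signals library shutdown. */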
6027 while (!TCR_4(__kmp_global.g.g_done)) {
6028 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6029 KMP_MB();
6030
6031 /* wait for work to do */
6032 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6033
6034 /* No tid yet since not part of a team */
6035 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6036
6037#if OMPT_SUPPORT
6038 if (ompt_enabled.enabled) {
6039 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6040 }
6041#endif
6042
6043 pteam = &this_thr->th.th_team;
6044
6045 /* have we been allocated? */
6046 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6047 /* we were just woken up, so run our new task */
6048 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6049 int rc;
6050 KA_TRACE(20,
6051 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6052 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6053 (*pteam)->t.t_pkfn));
6054
6055 updateHWFPControl(*pteam);
6056
6057#if OMPT_SUPPORT
6058 if (ompt_enabled.enabled) {
6059 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6060 }
6061#endif
6062
6063 rc = (*pteam)->t.t_invoke(gtid);
6064 KMP_ASSERT(rc);
6065
6066 KMP_MB();
6067 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6068 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6069 (*pteam)->t.t_pkfn));
6070 }
6071#if OMPT_SUPPORT
6072 if (ompt_enabled.enabled) {
6073 /* no frame set while outside task */
6074 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6075
6076 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6077 }
6078#endif
6079 /* join barrier after parallel region */
6080 __kmp_join_barrier(gtid);
6081 }
6082 }
6083
6084#if OMPD_SUPPORT
6085 if (ompd_state & OMPD_ENABLE_BP)
6086 ompd_bp_thread_end();
6087#endif
6088
6089#if OMPT_SUPPORT
6090 if (ompt_enabled.ompt_callback_thread_end) {
6091 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6092 }
6093#endif
6094
6095 this_thr->th.th_task_team = NULL;
6096 /* run the destructors for the threadprivate data for this thread */
6097 __kmp_common_destroy_gtid(gtid);
6098
6099 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6100 KMP_MB();
6101
6102#if OMP_PROFILING_SUPPORT
6103 llvm::timeTraceProfilerFinishThread();
6104#endif
6105 return this_thr;
6106}
6107
6108/* ------------------------------------------------------------------------ */
6109
6110void __kmp_internal_end_dest(void *specific_gtid) {
6111 // Make sure no significant bits are lost
6112 int gtid;
6113 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6114
6115 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6116 /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6117 * this is because 0 is reserved for the nothing-stored case */
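  /* Illustrative example (values hypothetical): a thread with gtid 5 keeps
     (void *)6 in its thread-specific slot, and the subtraction above recovers
     gtid 5 when this destructor runs. */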
6118
6119 __kmp_internal_end_thread(gtid);
6120}
6121
6122#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6123
6124__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6125 __kmp_internal_end_atexit();
6126}
6127
6128#endif
6129
6130/* [Windows] josh: when the atexit handler is called, there may still be more
6131 than one thread alive */
6132void __kmp_internal_end_atexit(void) {
6133 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6134 /* [Windows]
6135 josh: ideally, we want to completely shutdown the library in this atexit
6136 handler, but stat code that depends on thread specific data for gtid fails
6137 because that data becomes unavailable at some point during the shutdown, so
6138 we call __kmp_internal_end_thread instead. We should eventually remove the
6139 dependency on __kmp_get_specific_gtid in the stat code and use
6140 __kmp_internal_end_library to cleanly shutdown the library.
6141
6142 // TODO: Can some of this comment about GVS be removed?
6143 I suspect that the offending stat code is executed when the calling thread
6144 tries to clean up a dead root thread's data structures, resulting in GVS
6145 code trying to close the GVS structures for that thread, but since the stat
6146 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6147 the calling thread is cleaning up itself instead of another thread, it gets
6148 confused. This happens because allowing a thread to unregister and clean up
6149 another thread is a recent modification for addressing an issue.
6150 Based on the current design (20050722), a thread may end up
6151 trying to unregister another thread only if thread death does not trigger
6152 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6153 thread specific data destructor function to detect thread death. For
6154 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6155 is nothing. Thus, the workaround is applicable only for Windows static
6156 stat library. */
6157 __kmp_internal_end_library(-1);
6158#if KMP_OS_WINDOWS
6159 __kmp_close_console();
6160#endif
6161}
6162
6163static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6164 // It is assumed __kmp_forkjoin_lock is acquired.
6165
6166 int gtid;
6167
6168 KMP_DEBUG_ASSERT(thread != NULL);
6169
6170 gtid = thread->th.th_info.ds.ds_gtid;
6171
6172 if (!is_root) {
6173 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6174 /* Assume the threads are at the fork barrier here */
6175 KA_TRACE(
6176 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6177 gtid));
6178 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6179 while (
6180 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6181 KMP_CPU_PAUSE();
6182 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6183 } else {
6184 /* Need release fence here to prevent seg faults for tree forkjoin
6185 barrier (GEH) */
6186 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6187 thread);
6188 __kmp_release_64(&flag);
6189 }
6190 }
6191
6192 // Terminate OS thread.
6193 __kmp_reap_worker(thread);
6194
6195 // The thread was killed asynchronously. If it was actively
6196 // spinning in the thread pool, decrement the global count.
6197 //
6198 // There is a small timing hole here - if the worker thread was just waking
6199 // up after sleeping in the pool, had reset its th_active_in_pool flag but
6200 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6201 // the global counter might not get updated.
6202 //
6203 // Currently, this can only happen as the library is unloaded,
6204 // so there are no harmful side effects.
6205 if (thread->th.th_active_in_pool) {
6206 thread->th.th_active_in_pool = FALSE;
6207 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6208 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6209 }
6210 }
6211
6212 __kmp_free_implicit_task(thread);
6213
6214// Free the fast memory for tasking
6215#if USE_FAST_MEMORY
6216 __kmp_free_fast_memory(thread);
6217#endif /* USE_FAST_MEMORY */
6218
6219 __kmp_suspend_uninitialize_thread(thread);
6220
6221 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6222 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6223
6224 --__kmp_all_nth;
6225 // __kmp_nth was decremented when thread is added to the pool.
6226
6227#ifdef KMP_ADJUST_BLOCKTIME
6228 /* Adjust blocktime back to user setting or default if necessary */
6229 /* Middle initialization might never have occurred */
6230 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6231 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6232 if (__kmp_nth <= __kmp_avail_proc) {
6233 __kmp_zero_bt = FALSE;
6234 }
6235 }
6236#endif /* KMP_ADJUST_BLOCKTIME */
6237
6238 /* free the memory being used */
6239 if (__kmp_env_consistency_check) {
6240 if (thread->th.th_cons) {
6241 __kmp_free_cons_stack(thread->th.th_cons);
6242 thread->th.th_cons = NULL;
6243 }
6244 }
6245
6246 if (thread->th.th_pri_common != NULL) {
6247 __kmp_free(thread->th.th_pri_common);
6248 thread->th.th_pri_common = NULL;
6249 }
6250
6251 if (thread->th.th_task_state_memo_stack != NULL) {
6252 __kmp_free(thread->th.th_task_state_memo_stack);
6253 thread->th.th_task_state_memo_stack = NULL;
6254 }
6255
6256#if KMP_USE_BGET
6257 if (thread->th.th_local.bget_data != NULL) {
6258 __kmp_finalize_bget(thread);
6259 }
6260#endif
6261
6262#if KMP_AFFINITY_SUPPORTED
6263 if (thread->th.th_affin_mask != NULL) {
6264 KMP_CPU_FREE(thread->th.th_affin_mask);
6265 thread->th.th_affin_mask = NULL;
6266 }
6267#endif /* KMP_AFFINITY_SUPPORTED */
6268
6269#if KMP_USE_HIER_SCHED
6270 if (thread->th.th_hier_bar_data != NULL) {
6271 __kmp_free(thread->th.th_hier_bar_data);
6272 thread->th.th_hier_bar_data = NULL;
6273 }
6274#endif
6275
6276 __kmp_reap_team(thread->th.th_serial_team);
6277 thread->th.th_serial_team = NULL;
6278 __kmp_free(thread);
6279
6280 KMP_MB();
6281
6282} // __kmp_reap_thread
6283
6284static void __kmp_itthash_clean(kmp_info_t *th) {
6285#if USE_ITT_NOTIFY
6286 if (__kmp_itt_region_domains.count > 0) {
6287 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6288 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6289 while (bucket) {
6290 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6291 __kmp_thread_free(th, bucket);
6292 bucket = next;
6293 }
6294 }
6295 }
6296 if (__kmp_itt_barrier_domains.count > 0) {
6297 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6298 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6299 while (bucket) {
6300 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6301 __kmp_thread_free(th, bucket);
6302 bucket = next;
6303 }
6304 }
6305 }
6306#endif
6307}
6308
6309static void __kmp_internal_end(void) {
6310 int i;
6311
6312 /* First, unregister the library */
6313 __kmp_unregister_library();
6314
6315#if KMP_OS_WINDOWS
6316 /* In Win static library, we can't tell when a root actually dies, so we
6317 reclaim the data structures for any root threads that have died but not
6318 unregistered themselves, in order to shut down cleanly.
6319 In Win dynamic library we also can't tell when a thread dies. */
6320 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6321// dead roots
6322#endif
6323
6324 for (i = 0; i < __kmp_threads_capacity; i++)
6325 if (__kmp_root[i])
6326 if (__kmp_root[i]->r.r_active)
6327 break;
6328 KMP_MB(); /* Flush all pending memory write invalidates. */
6329 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6330
6331 if (i < __kmp_threads_capacity) {
6332#if KMP_USE_MONITOR
6333 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6334 KMP_MB(); /* Flush all pending memory write invalidates. */
6335
6336 // Need to check that monitor was initialized before reaping it. If we are
6337 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6338 // __kmp_monitor will appear to contain valid data, but it is only valid in
6339 // the parent process, not the child.
6340 // New behavior (201008): instead of keying off of the flag
6341 // __kmp_init_parallel, the monitor thread creation is keyed off
6342 // of the new flag __kmp_init_monitor.
6343 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6344 if (TCR_4(__kmp_init_monitor)) {
6345 __kmp_reap_monitor(&__kmp_monitor);
6346 TCW_4(__kmp_init_monitor, 0);
6347 }
6348 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6349 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6350#endif // KMP_USE_MONITOR
6351 } else {
6352/* TODO move this to cleanup code */
6353#ifdef KMP_DEBUG
6354 /* make sure that everything has properly ended */
6355 for (i = 0; i < __kmp_threads_capacity; i++) {
6356 if (__kmp_root[i]) {
6357 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6358 // there can be uber threads alive here
6359 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6360 }
6361 }
6362#endif
6363
6364 KMP_MB();
6365
6366 // Reap the worker threads.
6367 // This is valid for now, but be careful if threads are reaped sooner.
6368 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6369 // Get the next thread from the pool.
6370 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6371 __kmp_thread_pool = thread->th.th_next_pool;
6372 // Reap it.
6373 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6374 thread->th.th_next_pool = NULL;
6375 thread->th.th_in_pool = FALSE;
6376 __kmp_reap_thread(thread, 0);
6377 }
6378 __kmp_thread_pool_insert_pt = NULL;
6379
6380 // Reap teams.
6381 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6382 // Get the next team from the pool.
6383 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6384 __kmp_team_pool = team->t.t_next_pool;
6385 // Reap it.
6386 team->t.t_next_pool = NULL;
6387 __kmp_reap_team(team);
6388 }
6389
6390 __kmp_reap_task_teams();
6391
6392#if KMP_OS_UNIX
6393 // Threads that are not reaped should not access any resources since they
6394 // are going to be deallocated soon, so the shutdown sequence should wait
6395 // until all threads either exit the final spin-waiting loop or begin
6396 // sleeping after the given blocktime.
6397 for (i = 0; i < __kmp_threads_capacity; i++) {
6398 kmp_info_t *thr = __kmp_threads[i];
6399 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6400 KMP_CPU_PAUSE();
6401 }
6402#endif
6403
6404 for (i = 0; i < __kmp_threads_capacity; ++i) {
6405 // TBD: Add some checking...
6406 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6407 }
6408
6409 /* Make sure all threadprivate destructors get run by joining with all
6410 worker threads before resetting this flag */
6411 TCW_SYNC_4(__kmp_init_common, FALSE);
6412
6413 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6414 KMP_MB();
6415
6416#if KMP_USE_MONITOR
6417 // See note above: One of the possible fixes for CQ138434 / CQ140126
6418 //
6419 // FIXME: push both code fragments down and CSE them?
6420 // push them into __kmp_cleanup() ?
6421 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6422 if (TCR_4(__kmp_init_monitor)) {
6423 __kmp_reap_monitor(&__kmp_monitor);
6424 TCW_4(__kmp_init_monitor, 0);
6425 }
6426 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6427 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6428#endif
6429 } /* else !__kmp_global.t_active */
6430 TCW_4(__kmp_init_gtid, FALSE);
6431 KMP_MB(); /* Flush all pending memory write invalidates. */
6432
6433 __kmp_cleanup();
6434#if OMPT_SUPPORT
6435 ompt_fini();
6436#endif
6437}
6438
6439void __kmp_internal_end_library(int gtid_req) {
6440 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6441 /* this shouldn't be a race condition because __kmp_internal_end() is the
6442 only place to clear __kmp_serial_init */
6443 /* we'll check this later too, after we get the lock */
6444 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6445 // redundant, because the next check will work in any case.
6446 if (__kmp_global.g.g_abort) {
6447 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6448 /* TODO abort? */
6449 return;
6450 }
6451 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6452 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6453 return;
6454 }
6455
6456 // If hidden helper team has been initialized, we need to deinit it
6457 if (TCR_4(__kmp_init_hidden_helper) &&
6458 !TCR_4(__kmp_hidden_helper_team_done)) {
6459 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6460 // First release the main thread to let it continue its work
6461 __kmp_hidden_helper_main_thread_release();
6462 // Wait until the hidden helper team has been destroyed
6463 __kmp_hidden_helper_threads_deinitz_wait();
6464 }
6465
6466 KMP_MB(); /* Flush all pending memory write invalidates. */
6467 /* find out who we are and what we should do */
6468 {
6469 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6470 KA_TRACE(
6471 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6472 if (gtid == KMP_GTID_SHUTDOWN) {
6473 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6474 "already shutdown\n"));
6475 return;
6476 } else if (gtid == KMP_GTID_MONITOR) {
6477 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6478 "registered, or system shutdown\n"));
6479 return;
6480 } else if (gtid == KMP_GTID_DNE) {
6481 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6482 "shutdown\n"));
6483 /* we don't know who we are, but we may still shutdown the library */
6484 } else if (KMP_UBER_GTID(gtid)) {
6485 /* unregister ourselves as an uber thread. gtid is no longer valid */
6486 if (__kmp_root[gtid]->r.r_active) {
6487 __kmp_global.g.g_abort = -1;
6488 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6489 __kmp_unregister_library();
6490 KA_TRACE(10,
6491 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6492 gtid));
6493 return;
6494 } else {
6495 __kmp_itthash_clean(__kmp_threads[gtid]);
6496 KA_TRACE(
6497 10,
6498 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6499 __kmp_unregister_root_current_thread(gtid);
6500 }
6501 } else {
6502/* worker threads may call this function through the atexit handler, if they
6503 * call exit() */
6504/* For now, skip the usual subsequent processing and just dump the debug buffer.
6505 TODO: do a thorough shutdown instead */
6506#ifdef DUMP_DEBUG_ON_EXIT
6507 if (__kmp_debug_buf)
6508 __kmp_dump_debug_buffer();
6509#endif
6510 // Unregister the library here as well: with shared-memory registration on
6511 // Linux, skipping this would leave stale files behind in /dev/shm.
6512 // Clean up the shared memory file before exiting.
6513 __kmp_unregister_library();
6514 return;
6515 }
6516 }
6517 /* synchronize the termination process */
6518 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6519
6520 /* have we already finished */
6521 if (__kmp_global.g.g_abort) {
6522 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6523 /* TODO abort? */
6524 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6525 return;
6526 }
6527 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6528 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6529 return;
6530 }
6531
6532 /* We need this lock to enforce mutex between this reading of
6533 __kmp_threads_capacity and the writing by __kmp_register_root.
6534 Alternatively, we can use a counter of roots that is atomically updated by
6535 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6536 __kmp_internal_end_*. */
6537 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6538
6539 /* now we can safely conduct the actual termination */
6540 __kmp_internal_end();
6541
6542 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6543 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6544
6545 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6546
6547#ifdef DUMP_DEBUG_ON_EXIT
6548 if (__kmp_debug_buf)
6549 __kmp_dump_debug_buffer();
6550#endif
6551
6552#if KMP_OS_WINDOWS
6553 __kmp_close_console();
6554#endif
6555
6556 __kmp_fini_allocator();
6557
6558} // __kmp_internal_end_library
6559
6560void __kmp_internal_end_thread(int gtid_req) {
6561 int i;
6562
6563 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6564 /* this shouldn't be a race condition because __kmp_internal_end() is the
6565 * only place to clear __kmp_serial_init */
6566 /* we'll check this later too, after we get the lock */
6567 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6568 // redundant, because the next check will work in any case.
6569 if (__kmp_global.g.g_abort) {
6570 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6571 /* TODO abort? */
6572 return;
6573 }
6574 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6575 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6576 return;
6577 }
6578
6579 // If hidden helper team has been initialized, we need to deinit it
6580 if (TCR_4(__kmp_init_hidden_helper) &&
6581 !TCR_4(__kmp_hidden_helper_team_done)) {
6582 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6583 // First release the main thread to let it continue its work
6584 __kmp_hidden_helper_main_thread_release();
6585 // Wait until the hidden helper team has been destroyed
6586 __kmp_hidden_helper_threads_deinitz_wait();
6587 }
6588
6589 KMP_MB(); /* Flush all pending memory write invalidates. */
6590
6591 /* find out who we are and what we should do */
6592 {
6593 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6594 KA_TRACE(10,
6595 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6596 if (gtid == KMP_GTID_SHUTDOWN) {
6597 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6598 "already shutdown\n"));
6599 return;
6600 } else if (gtid == KMP_GTID_MONITOR) {
6601 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6602 "registered, or system shutdown\n"));
6603 return;
6604 } else if (gtid == KMP_GTID_DNE) {
6605 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6606 "shutdown\n"));
6607 return;
6608 /* we don't know who we are */
6609 } else if (KMP_UBER_GTID(gtid)) {
6610 /* unregister ourselves as an uber thread. gtid is no longer valid */
6611 if (__kmp_root[gtid]->r.r_active) {
6612 __kmp_global.g.g_abort = -1;
6613 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6614 KA_TRACE(10,
6615 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6616 gtid));
6617 return;
6618 } else {
6619 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6620 gtid));
6621 __kmp_unregister_root_current_thread(gtid);
6622 }
6623 } else {
6624 /* just a worker thread, let's leave */
6625 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6626
6627 if (gtid >= 0) {
6628 __kmp_threads[gtid]->th.th_task_team = NULL;
6629 }
6630
6631 KA_TRACE(10,
6632 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6633 gtid));
6634 return;
6635 }
6636 }
6637#if KMP_DYNAMIC_LIB
6638 if (__kmp_pause_status != kmp_hard_paused)
6639 // AC: let's not shut down the dynamic library at the exit of the uber
6640 // thread, because it is better to shut down later in the library destructor.
6641 {
6642 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6643 return;
6644 }
6645#endif
6646 /* synchronize the termination process */
6647 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6648
6649 /* have we already finished */
6650 if (__kmp_global.g.g_abort) {
6651 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6652 /* TODO abort? */
6653 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6654 return;
6655 }
6656 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6657 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6658 return;
6659 }
6660
6661 /* We need this lock to enforce mutex between this reading of
6662 __kmp_threads_capacity and the writing by __kmp_register_root.
6663 Alternatively, we can use a counter of roots that is atomically updated by
6664 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6665 __kmp_internal_end_*. */
6666
6667 /* should we finish the run-time? are all siblings done? */
6668 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6669
6670 for (i = 0; i < __kmp_threads_capacity; ++i) {
6671 if (KMP_UBER_GTID(i)) {
6672 KA_TRACE(
6673 10,
6674 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6675 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6676 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6677 return;
6678 }
6679 }
6680
6681 /* now we can safely conduct the actual termination */
6682
6683 __kmp_internal_end();
6684
6685 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6686 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6687
6688 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6689
6690#ifdef DUMP_DEBUG_ON_EXIT
6691 if (__kmp_debug_buf)
6692 __kmp_dump_debug_buffer();
6693#endif
6694} // __kmp_internal_end_thread
6695
6696// -----------------------------------------------------------------------------
6697// Library registration stuff.
6698
6699static long __kmp_registration_flag = 0;
6700// Random value used to indicate library initialization.
6701static char *__kmp_registration_str = NULL;
6702// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6703
6704static inline char *__kmp_reg_status_name() {
6705/* On RHEL 3u5, if linked statically, getpid() returns different values in
6706 each thread. If registration and unregistration happen in different threads
6707 (omp_misc_other_root_exit.cpp test case), the registered_lib_env environment
6708 variable cannot be found, because its name will contain a different pid. */
6709// macOS* complains about name being too long with additional getuid()
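// The resulting name looks like __KMP_REGISTERED_LIB_<pid> (plus _<uid> on most
// Unix dynamic builds), e.g. __KMP_REGISTERED_LIB_12345_1000 -- the pid and uid
// shown here are illustrative only.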
6710#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6711 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6712 (int)getuid());
6713#else
6714 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6715#endif
6716} // __kmp_reg_status_name
6717
6718#if defined(KMP_USE_SHM)
6719bool __kmp_shm_available = false;
6720bool __kmp_tmp_available = false;
6721// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6722char *temp_reg_status_file_name = nullptr;
6723#endif
6724
6725void __kmp_register_library_startup(void) {
6726
6727 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6728 int done = 0;
6729 union {
6730 double dtime;
6731 long ltime;
6732 } time;
6733#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6734 __kmp_initialize_system_tick();
6735#endif
6736 __kmp_read_system_time(&time.dtime);
6737 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6738 __kmp_registration_str =
6739 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6740 __kmp_registration_flag, KMP_LIBRARY_FILE);
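  // The encoded value is "<flag address>-<flag value>-<library file name>",
  // e.g. something like "0x7f32a40-cafe1234-libomp.so" (the address and hex
  // value here are made up for illustration).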
6741
6742 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6743 __kmp_registration_str));
6744
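  // Registration protocol, in brief: publish __kmp_registration_str under the
  // per-process name (via /dev/shm, a /tmp file, or an environment variable),
  // read it back, and if the value read is not ours then another copy of the
  // runtime registered first; that case is handled below.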
6745 while (!done) {
6746
6747 char *value = NULL; // Actual value of the environment variable.
6748
6749#if defined(KMP_USE_SHM)
6750 char *shm_name = nullptr;
6751 char *data1 = nullptr;
6752 __kmp_shm_available = __kmp_detect_shm();
6753 if (__kmp_shm_available) {
6754 int fd1 = -1;
6755 shm_name = __kmp_str_format("/%s", name);
6756 int shm_preexist = 0;
6757 fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6758 if ((fd1 == -1) && (errno == EEXIST)) {
6759 // file didn't open because it already exists.
6760 // try opening existing file
6761 fd1 = shm_open(shm_name, O_RDWR, 0600);
6762 if (fd1 == -1) { // file didn't open
6763 KMP_WARNING(FunctionError, "Can't open SHM");
6764 __kmp_shm_available = false;
6765 } else { // able to open existing file
6766 shm_preexist = 1;
6767 }
6768 }
6769 if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6770 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6771 KMP_WARNING(FunctionError, "Can't set size of SHM");
6772 __kmp_shm_available = false;
6773 }
6774 }
6775 if (__kmp_shm_available) { // SHM exists, now map it
6776 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6777 fd1, 0);
6778 if (data1 == MAP_FAILED) { // failed to map shared memory
6779 KMP_WARNING(FunctionError, "Can't map SHM");
6780 __kmp_shm_available = false;
6781 }
6782 }
6783 if (__kmp_shm_available) { // SHM mapped
6784 if (shm_preexist == 0) { // set data to SHM, set value
6785 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6786 }
6787 // Read value from either what we just wrote or existing file.
6788 value = __kmp_str_format("%s", data1); // read value from SHM
6789 munmap(data1, SHM_SIZE);
6790 }
6791 if (fd1 != -1)
6792 close(fd1);
6793 }
6794 if (!__kmp_shm_available)
6795 __kmp_tmp_available = __kmp_detect_tmp();
6796 if (!__kmp_shm_available && __kmp_tmp_available) {
6797 // SHM failed to work due to an error other than that the file already
6798 // exists. Try to create a temp file under /tmp.
6799 // If /tmp isn't accessible, fall back to using environment variable.
6800 // TODO: /tmp might not always be the temporary directory. For now we will
6801 // not consider TMPDIR.
6802 int fd1 = -1;
6803 temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6804 int tmp_preexist = 0;
6805 fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6806 if ((fd1 == -1) && (errno == EEXIST)) {
6807 // file didn't open because it already exists.
6808 // try opening existing file
6809 fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6810 if (fd1 == -1) { // file didn't open
6811 KMP_WARNING(FunctionError, "Can't open TEMP");
6812 __kmp_tmp_available = false;
6813 } else {
6814 tmp_preexist = 1;
6815 }
6816 }
6817 if (__kmp_tmp_available && tmp_preexist == 0) {
6818 // we created /tmp file now set size
6819 if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6820 KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6821 __kmp_tmp_available = false;
6822 }
6823 }
6824 if (__kmp_tmp_available) {
6825 data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6826 fd1, 0);
6827 if (data1 == MAP_FAILED) { // failed to map /tmp
6828 KMP_WARNING(FunctionError, "Can't map /tmp");
6829 __kmp_tmp_available = false;
6830 }
6831 }
6832 if (__kmp_tmp_available) {
6833 if (tmp_preexist == 0) { // set data to TMP, set value
6834 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6835 }
6836 // Read value from either what we just wrote or existing file.
6837 value = __kmp_str_format("%s", data1); // read value from the /tmp file
6838 munmap(data1, SHM_SIZE);
6839 }
6840 if (fd1 != -1)
6841 close(fd1);
6842 }
6843 if (!__kmp_shm_available && !__kmp_tmp_available) {
6844 // no /dev/shm and no /tmp -- fall back to environment variable
6845 // Set environment variable, but do not overwrite if it exists.
6846 __kmp_env_set(name, __kmp_registration_str, 0);
6847 // read value to see if it got set
6848 value = __kmp_env_get(name);
6849 }
6850#else // Windows and unix with static library
6851 // Set environment variable, but do not overwrite if it exists.
6852 __kmp_env_set(name, __kmp_registration_str, 0);
6853 // read value to see if it got set
6854 value = __kmp_env_get(name);
6855#endif
6856
6857 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6858 done = 1; // Ok, environment variable set successfully, exit the loop.
6859 } else {
6860 // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6861 // Check whether it is alive or dead.
6862 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6863 char *tail = value;
6864 char *flag_addr_str = NULL;
6865 char *flag_val_str = NULL;
6866 char const *file_name = NULL;
6867 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6868 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6869 file_name = tail;
6870 if (tail != NULL) {
6871 unsigned long *flag_addr = 0;
6872 unsigned long flag_val = 0;
6873 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6874 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6875 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6876 // First, check whether environment-encoded address is mapped into
6877 // addr space.
6878 // If so, dereference it to see if it still has the right value.
6879 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6880 neighbor = 1;
6881 } else {
6882 // If not, then we know the other copy of the library is no longer
6883 // running.
6884 neighbor = 2;
6885 }
6886 }
6887 }
6888 switch (neighbor) {
6889 case 0: // Cannot parse environment variable -- neighbor status unknown.
6890 // Assume it is the incompatible format of a future version of the
6891 // library. Assume the other library is alive.
6892 // WARN( ... ); // TODO: Issue a warning.
6893 file_name = "unknown library";
6894 KMP_FALLTHROUGH();
6895 // Attention! Falling to the next case. That's intentional.
6896 case 1: { // Neighbor is alive.
6897 // Check it is allowed.
6898 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6899 if (!__kmp_str_match_true(duplicate_ok)) {
6900 // That's not allowed. Issue fatal error.
6901 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6902 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6903 }
6904 KMP_INTERNAL_FREE(duplicate_ok);
6905 __kmp_duplicate_library_ok = 1;
6906 done = 1; // Exit the loop.
6907 } break;
6908 case 2: { // Neighbor is dead.
6909
6910#if defined(KMP_USE_SHM)
6911 if (__kmp_shm_available) { // close shared memory.
6912 shm_unlink(shm_name); // this removes file in /dev/shm
6913 } else if (__kmp_tmp_available) {
6914 unlink(temp_reg_status_file_name); // this removes the temp file
6915 } else {
6916 // Clear the variable and try to register library again.
6917 __kmp_env_unset(name);
6918 }
6919#else
6920 // Clear the variable and try to register library again.
6921 __kmp_env_unset(name);
6922#endif
6923 } break;
6924 default: {
6925 KMP_DEBUG_ASSERT(0);
6926 } break;
6927 }
6928 }
6929 KMP_INTERNAL_FREE((void *)value);
6930#if defined(KMP_USE_SHM)
6931 if (shm_name)
6932 KMP_INTERNAL_FREE((void *)shm_name);
6933#endif
6934 } // while
6935 KMP_INTERNAL_FREE((void *)name);
6936
6937} // func __kmp_register_library_startup
6938
6939void __kmp_unregister_library(void) {
6940
6941 char *name = __kmp_reg_status_name();
6942 char *value = NULL;
6943
6944#if defined(KMP_USE_SHM)
6945 char *shm_name = nullptr;
6946 int fd1;
6947 if (__kmp_shm_available) {
6948 shm_name = __kmp_str_format("/%s", name);
6949 fd1 = shm_open(shm_name, O_RDONLY, 0600);
6950 if (fd1 != -1) { // File opened successfully
6951 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6952 if (data1 != MAP_FAILED) {
6953 value = __kmp_str_format("%s", data1); // read value from SHM
6954 munmap(data1, SHM_SIZE);
6955 }
6956 close(fd1);
6957 }
6958 } else if (__kmp_tmp_available) { // try /tmp
6959 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6960 if (fd1 != -1) { // File opened successfully
6961 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6962 if (data1 != MAP_FAILED) {
6963 value = __kmp_str_format("%s", data1); // read value from /tmp
6964 munmap(data1, SHM_SIZE);
6965 }
6966 close(fd1);
6967 }
6968 } else { // fall back to environment variable
6969 value = __kmp_env_get(name);
6970 }
6971#else
6972 value = __kmp_env_get(name);
6973#endif
6974
6975 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6976 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6977 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6978// Ok, this is our variable. Delete it.
6979#if defined(KMP_USE_SHM)
6980 if (__kmp_shm_available) {
6981 shm_unlink(shm_name); // this removes file in /dev/shm
6982 } else if (__kmp_tmp_available) {
6983 unlink(temp_reg_status_file_name); // this removes the temp file
6984 } else {
6985 __kmp_env_unset(name);
6986 }
6987#else
6988 __kmp_env_unset(name);
6989#endif
6990 }
6991
6992#if defined(KMP_USE_SHM)
6993 if (shm_name)
6994 KMP_INTERNAL_FREE(shm_name);
6995 if (temp_reg_status_file_name)
6996 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6997#endif
6998
6999 KMP_INTERNAL_FREE(__kmp_registration_str);
7000 KMP_INTERNAL_FREE(value);
7001 KMP_INTERNAL_FREE(name);
7002
7003 __kmp_registration_flag = 0;
7004 __kmp_registration_str = NULL;
7005
7006} // __kmp_unregister_library
7007
7008// End of Library registration stuff.
7009// -----------------------------------------------------------------------------
7010
7011#if KMP_MIC_SUPPORTED
7012
7013static void __kmp_check_mic_type() {
7014 kmp_cpuid_t cpuid_state = {0};
7015 kmp_cpuid_t *cs_p = &cpuid_state;
7016 __kmp_x86_cpuid(1, 0, cs_p);
7017 // We don't support mic1 at the moment
7018 if ((cs_p->eax & 0xff0) == 0xB10) {
7019 __kmp_mic_type = mic2;
7020 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7021 __kmp_mic_type = mic3;
7022 } else {
7023 __kmp_mic_type = non_mic;
7024 }
7025}
7026
7027#endif /* KMP_MIC_SUPPORTED */
7028
7029#if KMP_HAVE_UMWAIT
7030static void __kmp_user_level_mwait_init() {
7031 struct kmp_cpuid buf;
7032 __kmp_x86_cpuid(7, 0, &buf);
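  // CPUID leaf 7, sub-leaf 0 reports WAITPKG (umonitor/umwait/tpause) support
  // in ECX bit 5, which is the bit extracted below.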
7033 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7034 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7035 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7036 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7037 __kmp_umwait_enabled));
7038}
7039#elif KMP_HAVE_MWAIT
7040#ifndef AT_INTELPHIUSERMWAIT
7041// Spurious, non-existent value that should always fail to return anything.
7042// Will be replaced with the correct value when we know that.
7043#define AT_INTELPHIUSERMWAIT 10000
7044#endif
7045// getauxval() function is available in RHEL7 and SLES12. If a system with an
7046// earlier OS is used to build the RTL, we'll use the following internal
7047// function when the entry is not found.
7048unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7049unsigned long getauxval(unsigned long) { return 0; }
7050
7051static void __kmp_user_level_mwait_init() {
7052 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7053 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7054 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7055 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7056 if (__kmp_mic_type == mic3) {
7057 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7058 if ((res & 0x1) || __kmp_user_level_mwait) {
7059 __kmp_mwait_enabled = TRUE;
7060 if (__kmp_user_level_mwait) {
7061 KMP_INFORM(EnvMwaitWarn);
7062 }
7063 } else {
7064 __kmp_mwait_enabled = FALSE;
7065 }
7066 }
7067 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7068 "__kmp_mwait_enabled = %d\n",
7069 __kmp_mic_type, __kmp_mwait_enabled));
7070}
7071#endif /* KMP_HAVE_UMWAIT */
7072
7073static void __kmp_do_serial_initialize(void) {
7074 int i, gtid;
7075 size_t size;
7076
7077 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7078
7079 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7080 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7081 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7082 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7083 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7084
7085#if OMPT_SUPPORT
7086 ompt_pre_init();
7087#endif
7088#if OMPD_SUPPORT
7089 __kmp_env_dump();
7090 ompd_init();
7091#endif
7092
7093 __kmp_validate_locks();
7094
7095#if ENABLE_LIBOMPTARGET
7096 /* Initialize functions from libomptarget */
7097 __kmp_init_omptarget();
7098#endif
7099
7100 /* Initialize internal memory allocator */
7101 __kmp_init_allocator();
7102
7103 /* Register the library startup via an environment variable or via mapped
7104 shared memory file and check to see whether another copy of the library is
7105 already registered. Since a forked child process is often terminated, we
7106 postpone the registration until middle initialization in the child. */
7107 if (__kmp_need_register_serial)
7108 __kmp_register_library_startup();
7109
7110 /* TODO reinitialization of library */
7111 if (TCR_4(__kmp_global.g.g_done)) {
7112 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7113 }
7114
7115 __kmp_global.g.g_abort = 0;
7116 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7117
7118/* initialize the locks */
7119#if KMP_USE_ADAPTIVE_LOCKS
7120#if KMP_DEBUG_ADAPTIVE_LOCKS
7121 __kmp_init_speculative_stats();
7122#endif
7123#endif
7124#if KMP_STATS_ENABLED
7125 __kmp_stats_init();
7126#endif
7127 __kmp_init_lock(&__kmp_global_lock);
7128 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7129 __kmp_init_lock(&__kmp_debug_lock);
7130 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7131 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7132 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7133 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7134 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7135 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7136 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7137 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7138 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7139 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7140 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7141 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7142 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7143 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7144 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7145#if KMP_USE_MONITOR
7146 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7147#endif
7148 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7149
7150 /* conduct initialization and initial setup of configuration */
7151
7152 __kmp_runtime_initialize();
7153
7154#if KMP_MIC_SUPPORTED
7155 __kmp_check_mic_type();
7156#endif
7157
7158// Some global variable initialization moved here from kmp_env_initialize()
7159#ifdef KMP_DEBUG
7160 kmp_diag = 0;
7161#endif
7162 __kmp_abort_delay = 0;
7163
7164 // From __kmp_init_dflt_team_nth()
7165 /* assume the entire machine will be used */
7166 __kmp_dflt_team_nth_ub = __kmp_xproc;
7167 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7168 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7169 }
7170 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7171 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7172 }
7173 __kmp_max_nth = __kmp_sys_max_nth;
7174 __kmp_cg_max_nth = __kmp_sys_max_nth;
7175 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7176 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7177 __kmp_teams_max_nth = __kmp_sys_max_nth;
7178 }
7179
7180 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7181 // part
7182 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7183#if KMP_USE_MONITOR
7184 __kmp_monitor_wakeups =
7185 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7186 __kmp_bt_intervals =
7187 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7188#endif
7189 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7190 __kmp_library = library_throughput;
7191 // From KMP_SCHEDULE initialization
7192 __kmp_static = kmp_sch_static_balanced;
7193// AC: do not use analytical here, because it is non-monotonous
7194//__kmp_guided = kmp_sch_guided_iterative_chunked;
7195//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7196// need to repeat assignment
7197// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7198// bit control and barrier method control parts
7199#if KMP_FAST_REDUCTION_BARRIER
7200#define kmp_reduction_barrier_gather_bb ((int)1)
7201#define kmp_reduction_barrier_release_bb ((int)1)
7202#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7203#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7204#endif // KMP_FAST_REDUCTION_BARRIER
7205 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7206 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7207 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7208 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7209 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7210#if KMP_FAST_REDUCTION_BARRIER
7211 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7212 // lin_64 ): hyper,1
7213 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7214 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7215 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7216 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7217 }
7218#endif // KMP_FAST_REDUCTION_BARRIER
7219 }
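  // The branch bits select the barrier tree fan-out (branching factor is
  // 2^branch_bits), so e.g. a value of 3 corresponds to up to 8 children
  // per node.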
7220#if KMP_FAST_REDUCTION_BARRIER
7221#undef kmp_reduction_barrier_release_pat
7222#undef kmp_reduction_barrier_gather_pat
7223#undef kmp_reduction_barrier_release_bb
7224#undef kmp_reduction_barrier_gather_bb
7225#endif // KMP_FAST_REDUCTION_BARRIER
7226#if KMP_MIC_SUPPORTED
7227 if (__kmp_mic_type == mic2) { // KNC
7228 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7229 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7230 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7231 1; // forkjoin release
7232 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7233 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7234 }
7235#if KMP_FAST_REDUCTION_BARRIER
7236 if (__kmp_mic_type == mic2) { // KNC
7237 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7238 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7239 }
7240#endif // KMP_FAST_REDUCTION_BARRIER
7241#endif // KMP_MIC_SUPPORTED
7242
7243// From KMP_CHECKS initialization
7244#ifdef KMP_DEBUG
7245 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7246#else
7247 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7248#endif
7249
7250 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7251 __kmp_foreign_tp = TRUE;
7252
7253 __kmp_global.g.g_dynamic = FALSE;
7254 __kmp_global.g.g_dynamic_mode = dynamic_default;
7255
7256 __kmp_init_nesting_mode();
7257
7258 __kmp_env_initialize(NULL);
7259
7260#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7261 __kmp_user_level_mwait_init();
7262#endif
7263// Print all messages in message catalog for testing purposes.
7264#ifdef KMP_DEBUG
7265 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7266 if (__kmp_str_match_true(val)) {
7267 kmp_str_buf_t buffer;
7268 __kmp_str_buf_init(&buffer);
7269 __kmp_i18n_dump_catalog(&buffer);
7270 __kmp_printf("%s", buffer.str);
7271 __kmp_str_buf_free(&buffer);
7272 }
7273 __kmp_env_free(&val);
7274#endif
7275
7276 __kmp_threads_capacity =
7277 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7278 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7279 __kmp_tp_capacity = __kmp_default_tp_capacity(
7280 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7281
7282 // If the library is shut down properly, both pools must be NULL. Just in
7283 // case, set them to NULL -- some memory may leak, but subsequent code will
7284 // work even if pools are not freed.
7285 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7286 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7287 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7288 __kmp_thread_pool = NULL;
7289 __kmp_thread_pool_insert_pt = NULL;
7290 __kmp_team_pool = NULL;
7291
7292 /* Allocate all of the variable sized records */
7293 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7294 * expandable */
7295 /* Since allocation is cache-aligned, just add extra padding at the end */
7296 size =
7297 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7298 CACHE_LINE;
7299 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7300 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7301 sizeof(kmp_info_t *) * __kmp_threads_capacity);
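  // A single cache-aligned allocation holds both arrays back to back:
  // [ __kmp_threads[0..capacity) | __kmp_root[0..capacity) | padding ],
  // with __kmp_root pointing just past the end of the threads array.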
7302
7303 /* init thread counts */
7304 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7305 0); // Asserts fail if the library is reinitializing and
7306 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7307 __kmp_all_nth = 0;
7308 __kmp_nth = 0;
7309
7310 /* setup the uber master thread and hierarchy */
7311 gtid = __kmp_register_root(TRUE);
7312 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7313 KMP_ASSERT(KMP_UBER_GTID(gtid));
7314 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7315
7316 KMP_MB(); /* Flush all pending memory write invalidates. */
7317
7318 __kmp_common_initialize();
7319
7320#if KMP_OS_UNIX
7321 /* invoke the child fork handler */
7322 __kmp_register_atfork();
7323#endif
7324
7325#if !KMP_DYNAMIC_LIB || \
7326 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7327 {
7328 /* Invoke the exit handler when the program finishes, only for static
7329 library and macOS* dynamic. For other dynamic libraries, we already
7330 have _fini and DllMain. */
7331 int rc = atexit(__kmp_internal_end_atexit);
7332 if (rc != 0) {
7333 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7334 __kmp_msg_null);
7335 }
7336 }
7337#endif
7338
7339#if KMP_HANDLE_SIGNALS
7340#if KMP_OS_UNIX
7341 /* NOTE: make sure that this is called before the user installs their own
7342 signal handlers so that the user handlers are called first. this way they
7343 can return false, not call our handler, avoid terminating the library, and
7344 continue execution where they left off. */
7345 __kmp_install_signals(FALSE);
7346#endif /* KMP_OS_UNIX */
7347#if KMP_OS_WINDOWS
7348 __kmp_install_signals(TRUE);
7349#endif /* KMP_OS_WINDOWS */
7350#endif
7351
7352 /* we have finished the serial initialization */
7353 __kmp_init_counter++;
7354
7355 __kmp_init_serial = TRUE;
7356
7357 if (__kmp_version) {
7358 __kmp_print_version_1();
7359 }
7360
7361 if (__kmp_settings) {
7362 __kmp_env_print();
7363 }
7364
7365 if (__kmp_display_env || __kmp_display_env_verbose) {
7366 __kmp_env_print_2();
7367 }
7368
7369#if OMPT_SUPPORT
7370 ompt_post_init();
7371#endif
7372
7373 KMP_MB();
7374
7375 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7376}
7377
7378void __kmp_serial_initialize(void) {
7379 if (__kmp_init_serial) {
7380 return;
7381 }
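  // Double-checked: another thread may have completed serial initialization
  // while we were waiting for __kmp_initz_lock, so test the flag again after
  // acquiring it.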
7382 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7383 if (__kmp_init_serial) {
7384 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7385 return;
7386 }
7387 __kmp_do_serial_initialize();
7388 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7389}
7390
7391static void __kmp_do_middle_initialize(void) {
7392 int i, j;
7393 int prev_dflt_team_nth;
7394
7395 if (!__kmp_init_serial) {
7396 __kmp_do_serial_initialize();
7397 }
7398
7399 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7400
7401 if (UNLIKELY(!__kmp_need_register_serial)) {
7402 // We are in a forked child process. The registration was skipped during
7403 // serial initialization in __kmp_atfork_child handler. Do it here.
7404 __kmp_register_library_startup();
7405 }
7406
7407 // Save the previous value for the __kmp_dflt_team_nth so that
7408 // we can avoid some reinitialization if it hasn't changed.
7409 prev_dflt_team_nth = __kmp_dflt_team_nth;
7410
7411#if KMP_AFFINITY_SUPPORTED
7412 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7413 // number of cores on the machine.
7414 __kmp_affinity_initialize(__kmp_affinity);
7415
7416#endif /* KMP_AFFINITY_SUPPORTED */
7417
7418 KMP_ASSERT(__kmp_xproc > 0);
7419 if (__kmp_avail_proc == 0) {
7420 __kmp_avail_proc = __kmp_xproc;
7421 }
7422
7423 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7424 // correct them now
7425 j = 0;
7426 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7427 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7428 __kmp_avail_proc;
7429 j++;
7430 }
7431
7432 if (__kmp_dflt_team_nth == 0) {
7433#ifdef KMP_DFLT_NTH_CORES
7434 // Default #threads = #cores
7435 __kmp_dflt_team_nth = __kmp_ncores;
7436 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7437 "__kmp_ncores (%d)\n",
7438 __kmp_dflt_team_nth));
7439#else
7440 // Default #threads = #available OS procs
7441 __kmp_dflt_team_nth = __kmp_avail_proc;
7442 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7443 "__kmp_avail_proc(%d)\n",
7444 __kmp_dflt_team_nth));
7445#endif /* KMP_DFLT_NTH_CORES */
7446 }
7447
7448 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7449 __kmp_dflt_team_nth = KMP_MIN_NTH;
7450 }
7451 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7452 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7453 }
7454
7455 if (__kmp_nesting_mode > 0)
7456 __kmp_set_nesting_mode_threads();
7457
7458 // There's no harm in continuing if the following check fails,
7459 // but it indicates an error in the previous logic.
7460 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7461
7462 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7463 // Run through the __kmp_threads array and set the num threads icv for each
7464 // root thread that is currently registered with the RTL (which has not
7465 // already explicitly set its nthreads-var with a call to
7466 // omp_set_num_threads()).
7467 for (i = 0; i < __kmp_threads_capacity; i++) {
7468 kmp_info_t *thread = __kmp_threads[i];
7469 if (thread == NULL)
7470 continue;
7471 if (thread->th.th_current_task->td_icvs.nproc != 0)
7472 continue;
7473
7474 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7475 }
7476 }
7477 KA_TRACE(
7478 20,
7479 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7480 __kmp_dflt_team_nth));
7481
7482#ifdef KMP_ADJUST_BLOCKTIME
7483 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
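  /* Rationale, as the check below suggests: with more active threads than
     available processors, letting idle threads spin for the blocktime would
     oversubscribe the machine, so __kmp_zero_bt makes them give up the
     processor immediately. */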
7484 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7485 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7486 if (__kmp_nth > __kmp_avail_proc) {
7487 __kmp_zero_bt = TRUE;
7488 }
7489 }
7490#endif /* KMP_ADJUST_BLOCKTIME */
7491
7492 /* we have finished middle initialization */
7493 TCW_SYNC_4(__kmp_init_middle, TRUE);
7494
7495 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7496}
7497
7498void __kmp_middle_initialize(void) {
7499 if (__kmp_init_middle) {
7500 return;
7501 }
7502 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7503 if (__kmp_init_middle) {
7504 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7505 return;
7506 }
7507 __kmp_do_middle_initialize();
7508 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7509}
7510
7511void __kmp_parallel_initialize(void) {
7512 int gtid = __kmp_entry_gtid(); // this might be a new root
7513
7514 /* synchronize parallel initialization (for sibling) */
7515 if (TCR_4(__kmp_init_parallel))
7516 return;
7517 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7518 if (TCR_4(__kmp_init_parallel)) {
7519 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7520 return;
7521 }
7522
7523 /* TODO reinitialization after we have already shut down */
7524 if (TCR_4(__kmp_global.g.g_done)) {
7525 KA_TRACE(
7526 10,
7527 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7528 __kmp_infinite_loop();
7529 }
7530
7531 /* jc: The lock __kmp_initz_lock is already held, so calling
7532 __kmp_serial_initialize would cause a deadlock. So we call
7533 __kmp_do_serial_initialize directly. */
7534 if (!__kmp_init_middle) {
7535 __kmp_do_middle_initialize();
7536 }
7537 __kmp_assign_root_init_mask();
7538 __kmp_resume_if_hard_paused();
7539
7540 /* begin initialization */
7541 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7542 KMP_ASSERT(KMP_UBER_GTID(gtid));
7543
7544#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7545 // Save the FP control regs.
7546 // Worker threads will set theirs to these values at thread startup.
7547 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7548 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7549 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7550#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7551
7552#if KMP_OS_UNIX
7553#if KMP_HANDLE_SIGNALS
7554 /* must be after __kmp_serial_initialize */
7555 __kmp_install_signals(TRUE);
7556#endif
7557#endif
7558
7559 __kmp_suspend_initialize();
7560
7561#if defined(USE_LOAD_BALANCE)
7562 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7563 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7564 }
7565#else
7566 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7567 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7568 }
7569#endif
7570
7571 if (__kmp_version) {
7572 __kmp_print_version_2();
7573 }
7574
7575 /* we have finished parallel initialization */
7576 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7577
7578 KMP_MB();
7579 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7580
7581 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7582}
7583
7584void __kmp_hidden_helper_initialize() {
7585 if (TCR_4(__kmp_init_hidden_helper))
7586 return;
7587
7588 // __kmp_parallel_initialize is required before we initialize hidden helper
7589 if (!TCR_4(__kmp_init_parallel))
7590 __kmp_parallel_initialize();
7591
7592 // Double check. Note that this double check should not be placed before
7593 // __kmp_parallel_initialize as it will cause a deadlock.
7594 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7595 if (TCR_4(__kmp_init_hidden_helper)) {
7596 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7597 return;
7598 }
7599
7600#if KMP_AFFINITY_SUPPORTED
7601 // Initialize hidden helper affinity settings.
7602 // The above __kmp_parallel_initialize() will initialize
7603 // regular affinity (and topology) if not already done.
7604 if (!__kmp_hh_affinity.flags.initialized)
7605 __kmp_affinity_initialize(__kmp_hh_affinity);
7606#endif
7607
7608 // Set the count of hidden helper tasks to be executed to zero
7609 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7610
7611 // Set the global variable indicating that we're initializing hidden helper
7612 // team/threads
7613 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7614
7615 // Platform independent initialization
7616 __kmp_do_initialize_hidden_helper_threads();
7617
7618 // Wait here for the finish of initialization of hidden helper teams
7619 __kmp_hidden_helper_threads_initz_wait();
7620
7621 // We have finished hidden helper initialization
7622 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7623
7624 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7625}
7626
7627/* ------------------------------------------------------------------------ */
7628
7629void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7630 kmp_team_t *team) {
7631 kmp_disp_t *dispatch;
7632
7633 KMP_MB();
7634
7635 /* none of the threads have encountered any constructs, yet. */
7636 this_thr->th.th_local.this_construct = 0;
7637#if KMP_CACHE_MANAGE
7638 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7639#endif /* KMP_CACHE_MANAGE */
7640 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7641 KMP_DEBUG_ASSERT(dispatch);
7642 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7643 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7644 // this_thr->th.th_info.ds.ds_tid ] );
7645
7646 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7647 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7648 if (__kmp_env_consistency_check)
7649 __kmp_push_parallel(gtid, team->t.t_ident);
7650
7651 KMP_MB(); /* Flush all pending memory write invalidates. */
7652}
7653
7654void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7655 kmp_team_t *team) {
7656 if (__kmp_env_consistency_check)
7657 __kmp_pop_parallel(gtid, team->t.t_ident);
7658
7659 __kmp_finish_implicit_task(this_thr);
7660}
7661
7662int __kmp_invoke_task_func(int gtid) {
7663 int rc;
7664 int tid = __kmp_tid_from_gtid(gtid);
7665 kmp_info_t *this_thr = __kmp_threads[gtid];
7666 kmp_team_t *team = this_thr->th.th_team;
7667
7668 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7669#if USE_ITT_BUILD
7670 if (__itt_stack_caller_create_ptr) {
7671 // inform ittnotify about entering user's code
7672 if (team->t.t_stack_id != NULL) {
7673 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7674 } else {
7675 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7676 __kmp_itt_stack_callee_enter(
7677 (__itt_caller)team->t.t_parent->t.t_stack_id);
7678 }
7679 }
7680#endif /* USE_ITT_BUILD */
7681#if INCLUDE_SSC_MARKS
7682 SSC_MARK_INVOKING();
7683#endif
7684
7685#if OMPT_SUPPORT
7686 void *dummy;
7687 void **exit_frame_p;
7688 ompt_data_t *my_task_data;
7689 ompt_data_t *my_parallel_data;
7690 int ompt_team_size;
7691
7692 if (ompt_enabled.enabled) {
7693 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7694 .ompt_task_info.frame.exit_frame.ptr);
7695 } else {
7696 exit_frame_p = &dummy;
7697 }
7698
7699 my_task_data =
7700 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7701 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7702 if (ompt_enabled.ompt_callback_implicit_task) {
7703 ompt_team_size = team->t.t_nproc;
7704 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7705 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7706 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7707 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7708 }
7709#endif
7710
7711#if KMP_STATS_ENABLED
7712 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7713 if (previous_state == stats_state_e::TEAMS_REGION) {
7714 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7715 } else {
7716 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7717 }
7718 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7719#endif
7720
7721 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7722 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7723#if OMPT_SUPPORT
7724 ,
7725 exit_frame_p
7726#endif
7727 );
7728#if OMPT_SUPPORT
7729 *exit_frame_p = NULL;
7730 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7731#endif
7732
7733#if KMP_STATS_ENABLED
7734 if (previous_state == stats_state_e::TEAMS_REGION) {
7735 KMP_SET_THREAD_STATE(previous_state);
7736 }
7737 KMP_POP_PARTITIONED_TIMER();
7738#endif
7739
7740#if USE_ITT_BUILD
7741 if (__itt_stack_caller_create_ptr) {
7742 // inform ittnotify about leaving user's code
7743 if (team->t.t_stack_id != NULL) {
7744 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7745 } else {
7746 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7747 __kmp_itt_stack_callee_leave(
7748 (__itt_caller)team->t.t_parent->t.t_stack_id);
7749 }
7750 }
7751#endif /* USE_ITT_BUILD */
7752 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7753
7754 return rc;
7755}
7756
7757void __kmp_teams_master(int gtid) {
7758 // This routine is called by all primary threads in teams construct
7759 kmp_info_t *thr = __kmp_threads[gtid];
7760 kmp_team_t *team = thr->th.th_team;
7761 ident_t *loc = team->t.t_ident;
7762 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7763 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7764 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7765 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7766 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7767
7768 // This thread is a new CG root. Set up the proper variables.
7769 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7770 tmp->cg_root = thr; // Make thr the CG root
7771 // Init to thread limit stored when league primary threads were forked
7772 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7773 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7774 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7775 " cg_nthreads to 1\n",
7776 thr, tmp));
7777 tmp->up = thr->th.th_cg_roots;
7778 thr->th.th_cg_roots = tmp;
7779
7780// Launch league of teams now, but do not let workers execute
7781// (they hang on the fork barrier until the next parallel region)
7782#if INCLUDE_SSC_MARKS
7783 SSC_MARK_FORKING();
7784#endif
7785 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7786 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7787 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7788#if INCLUDE_SSC_MARKS
7789 SSC_MARK_JOINING();
7790#endif
7791 // If the team size was reduced from the limit, set it to the new size
7792 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7793 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7794 // AC: last parameter "1" eliminates join barrier which won't work because
7795 // worker threads are in a fork barrier waiting for more parallel regions
7796 __kmp_join_call(loc, gtid
7797#if OMPT_SUPPORT
7798 ,
7799 fork_context_intel
7800#endif
7801 ,
7802 1);
7803}
7804
7805int __kmp_invoke_teams_master(int gtid) {
7806 kmp_info_t *this_thr = __kmp_threads[gtid];
7807 kmp_team_t *team = this_thr->th.th_team;
7808#if KMP_DEBUG
7809 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7810 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7811 (void *)__kmp_teams_master);
7812#endif
7813 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7814#if OMPT_SUPPORT
7815 int tid = __kmp_tid_from_gtid(gtid);
7816 ompt_data_t *task_data =
7817 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7818 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7819 if (ompt_enabled.ompt_callback_implicit_task) {
7820 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7821 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7822 ompt_task_initial);
7823 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7824 }
7825#endif
7826 __kmp_teams_master(gtid);
7827#if OMPT_SUPPORT
7828 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7829#endif
7830 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7831 return 1;
7832}
7833
7834/* this sets the requested number of threads for the next parallel region
7835 encountered by this team. since this should be enclosed in the forkjoin
7836 critical section it should avoid race conditions with asymmetrical nested
7837 parallelism */
7838
7839void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7840 kmp_info_t *thr = __kmp_threads[gtid];
7841
7842 if (num_threads > 0)
7843 thr->th.th_set_nproc = num_threads;
7844}
7845
7846static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7847 int num_threads) {
7848 KMP_DEBUG_ASSERT(thr);
7849 // Remember the number of threads for inner parallel regions
7850 if (!TCR_4(__kmp_init_middle))
7851 __kmp_middle_initialize(); // get internal globals calculated
7852 __kmp_assign_root_init_mask();
7853 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7854 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7855
7856 if (num_threads == 0) {
7857 if (__kmp_teams_thread_limit > 0) {
7858 num_threads = __kmp_teams_thread_limit;
7859 } else {
7860 num_threads = __kmp_avail_proc / num_teams;
7861 }
7862 // adjust num_threads w/o warning as it is not a user setting
7863 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7864 // no thread_limit clause specified - do not change thread-limit-var ICV
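 // Worked example (hypothetical numbers, for illustration only): with
 // __kmp_avail_proc = 64, num_teams = 4 and no KMP_TEAMS_THREAD_LIMIT set,
 // num_threads starts at 64 / 4 = 16 and is then clamped below by
 // nthreads-var, thread-limit-var and __kmp_teams_max_nth / num_teams.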
7865 if (num_threads > __kmp_dflt_team_nth) {
7866 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7867 }
7868 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7869 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7870 } // prevent team size from exceeding thread-limit-var
7871 if (num_teams * num_threads > __kmp_teams_max_nth) {
7872 num_threads = __kmp_teams_max_nth / num_teams;
7873 }
7874 if (num_threads == 0) {
7875 num_threads = 1;
7876 }
7877 } else {
7878 if (num_threads < 0) {
7879 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7880 __kmp_msg_null);
7881 num_threads = 1;
7882 }
7883 // This thread will be the primary thread of the league's primary threads
7884 // Store new thread limit; old limit is saved in th_cg_roots list
7885 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7886 // num_threads = min(num_threads, nthreads-var)
7887 if (num_threads > __kmp_dflt_team_nth) {
7888 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7889 }
7890 if (num_teams * num_threads > __kmp_teams_max_nth) {
7891 int new_threads = __kmp_teams_max_nth / num_teams;
7892 if (new_threads == 0) {
7893 new_threads = 1;
7894 }
7895 if (new_threads != num_threads) {
7896 if (!__kmp_reserve_warn) { // user asked for too many threads
7897 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7898 __kmp_msg(kmp_ms_warning,
7899 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7900 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7901 }
7902 }
7903 num_threads = new_threads;
7904 }
7905 }
7906 thr->th.th_teams_size.nth = num_threads;
7907}
7908
7909/* this sets the requested number of teams for the teams region and/or
7910 the number of threads for the next parallel region encountered */
7911void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7912 int num_threads) {
7913 kmp_info_t *thr = __kmp_threads[gtid];
7914 if (num_teams < 0) {
7915 // OpenMP specification requires requested values to be positive,
7916 // but people can send us any value, so we'd better check
7917 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7918 __kmp_msg_null);
7919 num_teams = 1;
7920 }
7921 if (num_teams == 0) {
7922 if (__kmp_nteams > 0) {
7923 num_teams = __kmp_nteams;
7924 } else {
7925 num_teams = 1; // default number of teams is 1.
7926 }
7927 }
7928 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7929 if (!__kmp_reserve_warn) {
7930 __kmp_reserve_warn = 1;
7931 __kmp_msg(kmp_ms_warning,
7932 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7933 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7934 }
7935 num_teams = __kmp_teams_max_nth;
7936 }
7937 // Set number of teams (number of threads in the outer "parallel" of the
7938 // teams)
7939 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7940
7941 __kmp_push_thread_limit(thr, num_teams, num_threads);
7942}
7943
7944/* This sets the requested number of teams for the teams region and/or
7945 the number of threads for the next parallel region encountered */
7946void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7947 int num_teams_ub, int num_threads) {
7948 kmp_info_t *thr = __kmp_threads[gtid];
7949 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7950 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7951 KMP_DEBUG_ASSERT(num_threads >= 0);
7952
7953 if (num_teams_lb > num_teams_ub) {
7954 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7955 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7956 }
7957
7958 int num_teams = 1; // default number of teams is 1.
7959
7960 if (num_teams_lb == 0 && num_teams_ub > 0)
7961 num_teams_lb = num_teams_ub;
7962
7963 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7964 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7965 if (num_teams > __kmp_teams_max_nth) {
7966 if (!__kmp_reserve_warn) {
7967 __kmp_reserve_warn = 1;
7968 __kmp_msg(kmp_ms_warning,
7969 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7970 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7971 }
7972 num_teams = __kmp_teams_max_nth;
7973 }
7974 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7975 num_teams = num_teams_ub;
7976 } else { // num_teams_lb <= num_teams <= num_teams_ub
7977 if (num_threads <= 0) {
7978 if (num_teams_ub > __kmp_teams_max_nth) {
7979 num_teams = num_teams_lb;
7980 } else {
7981 num_teams = num_teams_ub;
7982 }
7983 } else {
7984 num_teams = (num_threads > __kmp_teams_max_nth)
7985 ? num_teams
7986 : __kmp_teams_max_nth / num_threads;
7987 if (num_teams < num_teams_lb) {
7988 num_teams = num_teams_lb;
7989 } else if (num_teams > num_teams_ub) {
7990 num_teams = num_teams_ub;
7991 }
7992 }
7993 }
7994 // Set number of teams (number of threads in the outer "parallel" of the
7995 // teams)
7996 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7997
7998 __kmp_push_thread_limit(thr, num_teams, num_threads);
7999}
8000
8001// Set the proc_bind var to use in the following parallel region.
8002void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8003 kmp_info_t *thr = __kmp_threads[gtid];
8004 thr->th.th_set_proc_bind = proc_bind;
8005}
8006
8007/* Launch the worker threads into the microtask. */
8008
8009void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8010 kmp_info_t *this_thr = __kmp_threads[gtid];
8011
8012#ifdef KMP_DEBUG
8013 int f;
8014#endif /* KMP_DEBUG */
8015
8016 KMP_DEBUG_ASSERT(team);
8017 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8018 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8019 KMP_MB(); /* Flush all pending memory write invalidates. */
8020
8021 team->t.t_construct = 0; /* no single directives seen yet */
8022 team->t.t_ordered.dt.t_value =
8023 0; /* thread 0 enters the ordered section first */
8024
8025 /* Reset the identifiers on the dispatch buffer */
8026 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8027 if (team->t.t_max_nproc > 1) {
8028 int i;
8029 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8030 team->t.t_disp_buffer[i].buffer_index = i;
8031 team->t.t_disp_buffer[i].doacross_buf_idx = i;
8032 }
8033 } else {
8034 team->t.t_disp_buffer[0].buffer_index = 0;
8035 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8036 }
8037
8038 KMP_MB(); /* Flush all pending memory write invalidates. */
8039 KMP_ASSERT(this_thr->th.th_team == team);
8040
8041#ifdef KMP_DEBUG
8042 for (f = 0; f < team->t.t_nproc; f++) {
8043 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8044 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8045 }
8046#endif /* KMP_DEBUG */
8047
8048 /* release the worker threads so they may begin working */
8049 __kmp_fork_barrier(gtid, 0);
8050}
8051
8052void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8053 kmp_info_t *this_thr = __kmp_threads[gtid];
8054
8055 KMP_DEBUG_ASSERT(team);
8056 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8057 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8058 KMP_MB(); /* Flush all pending memory write invalidates. */
8059
8060 /* Join barrier after fork */
8061
8062#ifdef KMP_DEBUG
8063 if (__kmp_threads[gtid] &&
8064 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8065 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8066 __kmp_threads[gtid]);
8067 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8068 "team->t.t_nproc=%d\n",
8069 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8070 team->t.t_nproc);
8071 __kmp_print_structure();
8072 }
8073 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8074 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8075#endif /* KMP_DEBUG */
8076
8077 __kmp_join_barrier(gtid); /* wait for everyone */
8078#if OMPT_SUPPORT
8079 if (ompt_enabled.enabled &&
8080 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8081 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8082 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8083 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8084#if OMPT_OPTIONAL
8085 void *codeptr = NULL;
8086 if (KMP_MASTER_TID(ds_tid) &&
8087 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8088 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8089 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8090
8091 if (ompt_enabled.ompt_callback_sync_region_wait) {
8092 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8093 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8094 codeptr);
8095 }
8096 if (ompt_enabled.ompt_callback_sync_region) {
8097 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8098 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8099 codeptr);
8100 }
8101#endif
8102 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8103 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8104 ompt_scope_end, NULL, task_data, 0, ds_tid,
8105 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8106 }
8107 }
8108#endif
8109
8110 KMP_MB(); /* Flush all pending memory write invalidates. */
8111 KMP_ASSERT(this_thr->th.th_team == team);
8112}
8113
8114/* ------------------------------------------------------------------------ */
8115
8116#ifdef USE_LOAD_BALANCE
8117
8118// Return the number of worker threads actively spinning in the hot team, if
8119// we are at the outermost level of parallelism. Otherwise, return 0.
8120static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8121 int i;
8122 int retval;
8123 kmp_team_t *hot_team;
8124
8125 if (root->r.r_active) {
8126 return 0;
8127 }
8128 hot_team = root->r.r_hot_team;
8129 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8130 return hot_team->t.t_nproc - 1; // Don't count primary thread
8131 }
8132
8133 // Skip the primary thread - it is accounted for elsewhere.
8134 retval = 0;
8135 for (i = 1; i < hot_team->t.t_nproc; i++) {
8136 if (hot_team->t.t_threads[i]->th.th_active) {
8137 retval++;
8138 }
8139 }
8140 return retval;
8141}
8142
8143// Perform an automatic adjustment to the number of
8144// threads used by the next parallel region.
8145static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8146 int retval;
8147 int pool_active;
8148 int hot_team_active;
8149 int team_curr_active;
8150 int system_active;
8151
8152 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8153 set_nproc));
8154 KMP_DEBUG_ASSERT(root);
8155 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8156 ->th.th_current_task->td_icvs.dynamic == TRUE);
8157 KMP_DEBUG_ASSERT(set_nproc > 1);
8158
8159 if (set_nproc == 1) {
8160 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8161 return 1;
8162 }
8163
8164 // Threads that are active in the thread pool, active in the hot team for this
8165 // particular root (if we are at the outer par level), and the currently
8166 // executing thread (to become the primary thread) are available to add to the
8167 // new team, but are currently contributing to the system load, and must be
8168 // accounted for.
8169 pool_active = __kmp_thread_pool_active_nth;
8170 hot_team_active = __kmp_active_hot_team_nproc(root);
8171 team_curr_active = pool_active + hot_team_active + 1;
8172
8173 // Check the system load.
8174 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8175 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8176 "hot team active = %d\n",
8177 system_active, pool_active, hot_team_active));
8178
8179 if (system_active < 0) {
8180 // There was an error reading the necessary info from /proc, so use the
8181 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8182 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8183 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8184 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8185
8186 // Make this call behave like the thread limit algorithm.
8187 retval = __kmp_avail_proc - __kmp_nth +
8188 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8189 if (retval > set_nproc) {
8190 retval = set_nproc;
8191 }
8192 if (retval < KMP_MIN_NTH) {
8193 retval = KMP_MIN_NTH;
8194 }
8195
8196 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8197 retval));
8198 return retval;
8199 }
8200
8201 // There is a slight delay in the load balance algorithm in detecting new
8202 // running procs. The real system load at this instant should be at least as
8203 // large as the #active omp threads that are available to add to the team.
8204 if (system_active < team_curr_active) {
8205 system_active = team_curr_active;
8206 }
8207 retval = __kmp_avail_proc - system_active + team_curr_active;
8208 if (retval > set_nproc) {
8209 retval = set_nproc;
8210 }
8211 if (retval < KMP_MIN_NTH) {
8212 retval = KMP_MIN_NTH;
8213 }
8214
8215 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8216 return retval;
8217} // __kmp_load_balance_nproc()
8218
8219#endif /* USE_LOAD_BALANCE */
8220
8221/* ------------------------------------------------------------------------ */
8222
8223/* NOTE: this is called with the __kmp_init_lock held */
8224void __kmp_cleanup(void) {
8225 int f;
8226
8227 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8228
8229 if (TCR_4(__kmp_init_parallel)) {
8230#if KMP_HANDLE_SIGNALS
8231 __kmp_remove_signals();
8232#endif
8233 TCW_4(__kmp_init_parallel, FALSE);
8234 }
8235
8236 if (TCR_4(__kmp_init_middle)) {
8237#if KMP_AFFINITY_SUPPORTED
8238 __kmp_affinity_uninitialize();
8239#endif /* KMP_AFFINITY_SUPPORTED */
8240 __kmp_cleanup_hierarchy();
8241 TCW_4(__kmp_init_middle, FALSE);
8242 }
8243
8244 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8245
8246 if (__kmp_init_serial) {
8247 __kmp_runtime_destroy();
8248 __kmp_init_serial = FALSE;
8249 }
8250
8251 __kmp_cleanup_threadprivate_caches();
8252
8253 for (f = 0; f < __kmp_threads_capacity; f++) {
8254 if (__kmp_root[f] != NULL) {
8255 __kmp_free(__kmp_root[f]);
8256 __kmp_root[f] = NULL;
8257 }
8258 }
8259 __kmp_free(__kmp_threads);
8260 // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8261 // there is no need to free __kmp_root separately.
8262 __kmp_threads = NULL;
8263 __kmp_root = NULL;
8264 __kmp_threads_capacity = 0;
8265
8266 // Free old __kmp_threads arrays if they exist.
8267 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8268 while (ptr) {
8269 kmp_old_threads_list_t *next = ptr->next;
8270 __kmp_free(ptr->threads);
8271 __kmp_free(ptr);
8272 ptr = next;
8273 }
8274
8275#if KMP_USE_DYNAMIC_LOCK
8276 __kmp_cleanup_indirect_user_locks();
8277#else
8278 __kmp_cleanup_user_locks();
8279#endif
8280#if OMPD_SUPPORT
8281 if (ompd_state) {
8282 __kmp_free(ompd_env_block);
8283 ompd_env_block = NULL;
8284 ompd_env_block_size = 0;
8285 }
8286#endif
8287
8288#if KMP_AFFINITY_SUPPORTED
8289 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8290 __kmp_cpuinfo_file = NULL;
8291#endif /* KMP_AFFINITY_SUPPORTED */
8292
8293#if KMP_USE_ADAPTIVE_LOCKS
8294#if KMP_DEBUG_ADAPTIVE_LOCKS
8295 __kmp_print_speculative_stats();
8296#endif
8297#endif
8298 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8299 __kmp_nested_nth.nth = NULL;
8300 __kmp_nested_nth.size = 0;
8301 __kmp_nested_nth.used = 0;
8302 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8303 __kmp_nested_proc_bind.bind_types = NULL;
8304 __kmp_nested_proc_bind.size = 0;
8305 __kmp_nested_proc_bind.used = 0;
8306 if (__kmp_affinity_format) {
8307 KMP_INTERNAL_FREE(__kmp_affinity_format);
8308 __kmp_affinity_format = NULL;
8309 }
8310
8311 __kmp_i18n_catclose();
8312
8313#if KMP_USE_HIER_SCHED
8314 __kmp_hier_scheds.deallocate();
8315#endif
8316
8317#if KMP_STATS_ENABLED
8318 __kmp_stats_fini();
8319#endif
8320
8321 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8322}
8323
8324/* ------------------------------------------------------------------------ */
8325
8326int __kmp_ignore_mppbeg(void) {
8327 char *env;
8328
8329 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8330 if (__kmp_str_match_false(env))
8331 return FALSE;
8332 }
8333 // By default __kmpc_begin() is no-op.
8334 return TRUE;
8335}
8336
8337int __kmp_ignore_mppend(void) {
8338 char *env;
8339
8340 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8341 if (__kmp_str_match_false(env))
8342 return FALSE;
8343 }
8344 // By default __kmpc_end() is no-op.
8345 return TRUE;
8346}
8347
8348void __kmp_internal_begin(void) {
8349 int gtid;
8350 kmp_root_t *root;
8351
8352 /* this is a very important step as it will register new sibling threads
8353 and assign these new uber threads a new gtid */
8354 gtid = __kmp_entry_gtid();
8355 root = __kmp_threads[gtid]->th.th_root;
8356 KMP_ASSERT(KMP_UBER_GTID(gtid));
8357
8358 if (root->r.r_begin)
8359 return;
8360 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8361 if (root->r.r_begin) {
8362 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8363 return;
8364 }
8365
8366 root->r.r_begin = TRUE;
8367
8368 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8369}
8370
8371/* ------------------------------------------------------------------------ */
8372
8373void __kmp_user_set_library(enum library_type arg) {
8374 int gtid;
8375 kmp_root_t *root;
8376 kmp_info_t *thread;
8377
8378 /* first, make sure we are initialized so we can get our gtid */
8379
8380 gtid = __kmp_entry_gtid();
8381 thread = __kmp_threads[gtid];
8382
8383 root = thread->th.th_root;
8384
8385 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8386 library_serial));
8387 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8388 thread */
8389 KMP_WARNING(SetLibraryIncorrectCall);
8390 return;
8391 }
8392
8393 switch (arg) {
8394 case library_serial:
8395 thread->th.th_set_nproc = 0;
8396 set__nproc(thread, 1);
8397 break;
8398 case library_turnaround:
8399 thread->th.th_set_nproc = 0;
8400 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8401 : __kmp_dflt_team_nth_ub);
8402 break;
8403 case library_throughput:
8404 thread->th.th_set_nproc = 0;
8405 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8406 : __kmp_dflt_team_nth_ub);
8407 break;
8408 default:
8409 KMP_FATAL(UnknownLibraryType, arg);
8410 }
8411
8412 __kmp_aux_set_library(arg);
8413}
8414
8415void __kmp_aux_set_stacksize(size_t arg) {
8416 if (!__kmp_init_serial)
8417 __kmp_serial_initialize();
8418
8419#if KMP_OS_DARWIN
8420 if (arg & (0x1000 - 1)) {
8421 arg &= ~(0x1000 - 1);
8422 if (arg + 0x1000) /* check for overflow if we round up */
8423 arg += 0x1000;
8424 }
8425#endif
8426 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8427
8428 /* only change the default stacksize before the first parallel region */
8429 if (!TCR_4(__kmp_init_parallel)) {
8430 size_t value = arg; /* argument is in bytes */
8431
8432 if (value < __kmp_sys_min_stksize)
8433 value = __kmp_sys_min_stksize;
8434 else if (value > KMP_MAX_STKSIZE)
8435 value = KMP_MAX_STKSIZE;
8436
8437 __kmp_stksize = value;
8438
8439 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8440 }
8441
8442 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8443}
8444
8445/* set the behaviour of the runtime library */
8446/* TODO this can cause some odd behaviour with sibling parallelism... */
8447void __kmp_aux_set_library(enum library_type arg) {
8448 __kmp_library = arg;
8449
8450 switch (__kmp_library) {
8451 case library_serial: {
8452 KMP_INFORM(LibraryIsSerial);
8453 } break;
8454 case library_turnaround:
8455 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8456 __kmp_use_yield = 2; // only yield when oversubscribed
8457 break;
8458 case library_throughput:
8459 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8460 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8461 break;
8462 default:
8463 KMP_FATAL(UnknownLibraryType, arg);
8464 }
8465}
8466
8467/* Getting team information common for all team API */
8468// Returns NULL if not in teams construct
8469static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8470 kmp_info_t *thr = __kmp_entry_thread();
8471 teams_serialized = 0;
8472 if (thr->th.th_teams_microtask) {
8473 kmp_team_t *team = thr->th.th_team;
8474 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8475 int ii = team->t.t_level;
8476 teams_serialized = team->t.t_serialized;
8477 int level = tlevel + 1;
8478 KMP_DEBUG_ASSERT(ii >= tlevel);
8479 while (ii > level) {
8480 for (teams_serialized = team->t.t_serialized;
8481 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8482 }
8483 if (team->t.t_serialized && (!teams_serialized)) {
8484 team = team->t.t_parent;
8485 continue;
8486 }
8487 if (ii > level) {
8488 team = team->t.t_parent;
8489 ii--;
8490 }
8491 }
8492 return team;
8493 }
8494 return NULL;
8495}
8496
8497int __kmp_aux_get_team_num() {
8498 int serialized;
8499 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8500 if (team) {
8501 if (serialized > 1) {
8502 return 0; // teams region is serialized ( 1 team of 1 thread ).
8503 } else {
8504 return team->t.t_master_tid;
8505 }
8506 }
8507 return 0;
8508}
8509
8510int __kmp_aux_get_num_teams() {
8511 int serialized;
8512 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8513 if (team) {
8514 if (serialized > 1) {
8515 return 1;
8516 } else {
8517 return team->t.t_parent->t.t_nproc;
8518 }
8519 }
8520 return 1;
8521}
8522
8523/* ------------------------------------------------------------------------ */
8524
8525/*
8526 * Affinity Format Parser
8527 *
8528 * Field is in form of: %[[[0].]size]type
8529 * % and type are required (%% means print a literal '%')
8530 * type is either single char or long name surrounded by {},
8531 * e.g., N or {num_threads}
8532 * 0 => leading zeros
8533 * . => right justified when size is specified
8534 * by default output is left justified
8535 * size is the *minimum* field length
8536 * All other characters are printed as is
8537 *
8538 * Available field types:
8539 * L {thread_level} - omp_get_level()
8540 * n {thread_num} - omp_get_thread_num()
8541 * h {host} - name of host machine
8542 * P {process_id} - process id (integer)
8543 * T {thread_identifier} - native thread identifier (integer)
8544 * N {num_threads} - omp_get_num_threads()
8545 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8546 * a {thread_affinity} - comma separated list of integers or integer ranges
8547 * (values of affinity mask)
8548 *
8549 * Implementation-specific field types can be added
8550 * If a type is unknown, print "undefined"
8551 */
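/*
 * Illustrative expansion (hypothetical values, not produced by this build):
 * with the format "OMP: host=%H pid=%P tid=%0.4n/%N", a thread might print
 *   OMP: host=myhost pid=12345 tid=0002/8
 * where %H is the host name, %P the process id, %0.4n the thread number
 * right justified in a 4-character zero-padded field, and %N the team size.
 */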
8552
8553// Structure holding the short name, long name, and corresponding data type
8554// for snprintf. A table of these will represent the entire valid keyword
8555// field types.
8556typedef struct kmp_affinity_format_field_t {
8557 char short_name; // from spec e.g., L -> thread level
8558 const char *long_name; // from spec thread_level -> thread level
8559 char field_format; // data type for snprintf (typically 'd' or 's'
8560 // for integer or string)
8561} kmp_affinity_format_field_t;
8562
8563static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8564#if KMP_AFFINITY_SUPPORTED
8565 {'A', "thread_affinity", 's'},
8566#endif
8567 {'t', "team_num", 'd'},
8568 {'T', "num_teams", 'd'},
8569 {'L', "nesting_level", 'd'},
8570 {'n', "thread_num", 'd'},
8571 {'N', "num_threads", 'd'},
8572 {'a', "ancestor_tnum", 'd'},
8573 {'H', "host", 's'},
8574 {'P', "process_id", 'd'},
8575 {'i', "native_thread_id", 'd'}};
8576
8577// Return the number of characters it takes to hold field
8578static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8579 const char **ptr,
8580 kmp_str_buf_t *field_buffer) {
8581 int rc, format_index, field_value;
8582 const char *width_left, *width_right;
8583 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8584 static const int FORMAT_SIZE = 20;
8585 char format[FORMAT_SIZE] = {0};
8586 char absolute_short_name = 0;
8587
8588 KMP_DEBUG_ASSERT(gtid >= 0);
8589 KMP_DEBUG_ASSERT(th);
8590 KMP_DEBUG_ASSERT(**ptr == '%');
8591 KMP_DEBUG_ASSERT(field_buffer);
8592
8593 __kmp_str_buf_clear(field_buffer);
8594
8595 // Skip the initial %
8596 (*ptr)++;
8597
8598 // Check for %% first
8599 if (**ptr == '%') {
8600 __kmp_str_buf_cat(field_buffer, "%", 1);
8601 (*ptr)++; // skip over the second %
8602 return 1;
8603 }
8604
8605 // Parse field modifiers if they are present
8606 pad_zeros = false;
8607 if (**ptr == '0') {
8608 pad_zeros = true;
8609 (*ptr)++; // skip over 0
8610 }
8611 right_justify = false;
8612 if (**ptr == '.') {
8613 right_justify = true;
8614 (*ptr)++; // skip over .
8615 }
8616 // Parse width of field: [width_left, width_right)
8617 width_left = width_right = NULL;
8618 if (**ptr >= '0' && **ptr <= '9') {
8619 width_left = *ptr;
8620 SKIP_DIGITS(*ptr);
8621 width_right = *ptr;
8622 }
8623
8624 // Create the format for KMP_SNPRINTF based on flags parsed above
8625 format_index = 0;
8626 format[format_index++] = '%';
8627 if (!right_justify)
8628 format[format_index++] = '-';
8629 if (pad_zeros)
8630 format[format_index++] = '0';
8631 if (width_left && width_right) {
8632 int i = 0;
8633 // Only allow 8 digit number widths.
8634 // This also prevents overflowing format variable
8635 while (i < 8 && width_left < width_right) {
8636 format[format_index++] = *width_left;
8637 width_left++;
8638 i++;
8639 }
8640 }
8641
8642 // Parse a name (long or short)
8643 // Canonicalize the name into absolute_short_name
8644 found_valid_name = false;
8645 parse_long_name = (**ptr == '{');
8646 if (parse_long_name)
8647 (*ptr)++; // skip initial left brace
8648 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8649 sizeof(__kmp_affinity_format_table[0]);
8650 ++i) {
8651 char short_name = __kmp_affinity_format_table[i].short_name;
8652 const char *long_name = __kmp_affinity_format_table[i].long_name;
8653 char field_format = __kmp_affinity_format_table[i].field_format;
8654 if (parse_long_name) {
8655 size_t length = KMP_STRLEN(long_name);
8656 if (strncmp(*ptr, long_name, length) == 0) {
8657 found_valid_name = true;
8658 (*ptr) += length; // skip the long name
8659 }
8660 } else if (**ptr == short_name) {
8661 found_valid_name = true;
8662 (*ptr)++; // skip the short name
8663 }
8664 if (found_valid_name) {
8665 format[format_index++] = field_format;
8666 format[format_index++] = '\0';
8667 absolute_short_name = short_name;
8668 break;
8669 }
8670 }
8671 if (parse_long_name) {
8672 if (**ptr != '}') {
8673 absolute_short_name = 0;
8674 } else {
8675 (*ptr)++; // skip over the right brace
8676 }
8677 }
8678
8679 // Attempt to fill the buffer with the requested
8680 // value using snprintf within __kmp_str_buf_print()
8681 switch (absolute_short_name) {
8682 case 't':
8683 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8684 break;
8685 case 'T':
8686 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8687 break;
8688 case 'L':
8689 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8690 break;
8691 case 'n':
8692 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8693 break;
8694 case 'H': {
8695 static const int BUFFER_SIZE = 256;
8696 char buf[BUFFER_SIZE];
8697 __kmp_expand_host_name(buf, BUFFER_SIZE);
8698 rc = __kmp_str_buf_print(field_buffer, format, buf);
8699 } break;
8700 case 'P':
8701 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8702 break;
8703 case 'i':
8704 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8705 break;
8706 case 'N':
8707 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8708 break;
8709 case 'a':
8710 field_value =
8711 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8712 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8713 break;
8714#if KMP_AFFINITY_SUPPORTED
8715 case 'A': {
8716 kmp_str_buf_t buf;
8717 __kmp_str_buf_init(&buf);
8718 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8719 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8720 __kmp_str_buf_free(&buf);
8721 } break;
8722#endif
8723 default:
8724 // According to the spec, if an implementation does not have info for a
8725 // field type, then "undefined" is printed
8726 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8727 // Skip the field
8728 if (parse_long_name) {
8729 SKIP_TOKEN(*ptr);
8730 if (**ptr == '}')
8731 (*ptr)++;
8732 } else {
8733 (*ptr)++;
8734 }
8735 }
8736
8737 KMP_ASSERT(format_index <= FORMAT_SIZE);
8738 return rc;
8739}
8740
8741/*
8742 * Return number of characters needed to hold the affinity string
8743 * (not including null byte character)
8744 * The resultant string is printed to buffer, which the caller can then
8745 * handle afterwards
8746 */
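/* Minimal usage sketch (mirrors __kmp_aux_display_affinity below; the format
   string is only an example):
     kmp_str_buf_t buf;
     __kmp_str_buf_init(&buf);
     size_t len = __kmp_aux_capture_affinity(gtid, "%{host} %n", &buf);
     // buf.str now holds the expanded string; len is its length
     __kmp_str_buf_free(&buf);
*/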
8747size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8748 kmp_str_buf_t *buffer) {
8749 const char *parse_ptr;
8750 size_t retval;
8751 const kmp_info_t *th;
8752 kmp_str_buf_t field;
8753
8754 KMP_DEBUG_ASSERT(buffer);
8755 KMP_DEBUG_ASSERT(gtid >= 0);
8756
8757 __kmp_str_buf_init(&field);
8758 __kmp_str_buf_clear(buffer);
8759
8760 th = __kmp_threads[gtid];
8761 retval = 0;
8762
8763 // If format is NULL or zero-length string, then we use
8764 // affinity-format-var ICV
8765 parse_ptr = format;
8766 if (parse_ptr == NULL || *parse_ptr == '\0') {
8767 parse_ptr = __kmp_affinity_format;
8768 }
8769 KMP_DEBUG_ASSERT(parse_ptr);
8770
8771 while (*parse_ptr != '\0') {
8772 // Parse a field
8773 if (*parse_ptr == '%') {
8774 // Put field in the buffer
8775 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8776 __kmp_str_buf_catbuf(buffer, &field);
8777 retval += rc;
8778 } else {
8779 // Put literal character in buffer
8780 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8781 retval++;
8782 parse_ptr++;
8783 }
8784 }
8785 __kmp_str_buf_free(&field);
8786 return retval;
8787}
8788
8789// Displays the affinity string to stdout
8790void __kmp_aux_display_affinity(int gtid, const char *format) {
8791 kmp_str_buf_t buf;
8792 __kmp_str_buf_init(&buf);
8793 __kmp_aux_capture_affinity(gtid, format, &buf);
8794 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8795 __kmp_str_buf_free(&buf);
8796}
8797
8798/* ------------------------------------------------------------------------ */
8799void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8800 int blocktime = arg; /* argument is in microseconds */
8801#if KMP_USE_MONITOR
8802 int bt_intervals;
8803#endif
8804 kmp_int8 bt_set;
8805
8806 __kmp_save_internal_controls(thread);
8807
8808 /* Normalize and set blocktime for the teams */
8809 if (blocktime < KMP_MIN_BLOCKTIME)
8810 blocktime = KMP_MIN_BLOCKTIME;
8811 else if (blocktime > KMP_MAX_BLOCKTIME)
8812 blocktime = KMP_MAX_BLOCKTIME;
8813
8814 set__blocktime_team(thread->th.th_team, tid, blocktime);
8815 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8816
8817#if KMP_USE_MONITOR
8818 /* Calculate and set blocktime intervals for the teams */
8819 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8820
8821 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8822 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8823#endif
8824
8825 /* Set whether blocktime has been set to "TRUE" */
8826 bt_set = TRUE;
8827
8828 set__bt_set_team(thread->th.th_team, tid, bt_set);
8829 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8830#if KMP_USE_MONITOR
8831 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8832 "bt_intervals=%d, monitor_updates=%d\n",
8833 __kmp_gtid_from_tid(tid, thread->th.th_team),
8834 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8835 __kmp_monitor_wakeups));
8836#else
8837 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8838 __kmp_gtid_from_tid(tid, thread->th.th_team),
8839 thread->th.th_team->t.t_id, tid, blocktime));
8840#endif
8841}
8842
8843void __kmp_aux_set_defaults(char const *str, size_t len) {
8844 if (!__kmp_init_serial) {
8845 __kmp_serial_initialize();
8846 }
8847 __kmp_env_initialize(str);
8848
8849 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8850 __kmp_env_print();
8851 }
8852} // __kmp_aux_set_defaults
8853
8854/* ------------------------------------------------------------------------ */
8855/* internal fast reduction routines */
8856
8857PACKED_REDUCTION_METHOD_T
8858__kmp_determine_reduction_method(
8859 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8860 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8861 kmp_critical_name *lck) {
8862
8863 // Default reduction method: critical construct ( lck != NULL, like in current
8864 // PAROPT )
8865 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8866 // can be selected by RTL
8867 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8868 // can be selected by RTL
8869 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8870 // among generated by PAROPT.
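 // For illustration only (assumed x86_64 Linux host, non-MIC, so the
 // teamsize_cutoff below is 4): with tree and atomic reductions both
 // available, team_size == 4 selects atomic_reduce_block, while
 // team_size == 8 selects TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER.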
8871
8872 PACKED_REDUCTION_METHOD_T retval;
8873
8874 int team_size;
8875
8876 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8877
8878#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8879 (loc && \
8880 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8881#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8882
8883 retval = critical_reduce_block;
8884
8885 // another choice of getting a team size (with 1 dynamic dereference) is slower
8886 team_size = __kmp_get_team_num_threads(global_tid);
8887 if (team_size == 1) {
8888
8889 retval = empty_reduce_block;
8890
8891 } else {
8892
8893 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8894
8895#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8896 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8897 KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8898
8899#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8900 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
8901 KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8902
8903 int teamsize_cutoff = 4;
8904
8905#if KMP_MIC_SUPPORTED
8906 if (__kmp_mic_type != non_mic) {
8907 teamsize_cutoff = 8;
8908 }
8909#endif
8910 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8911 if (tree_available) {
8912 if (team_size <= teamsize_cutoff) {
8913 if (atomic_available) {
8914 retval = atomic_reduce_block;
8915 }
8916 } else {
8917 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8918 }
8919 } else if (atomic_available) {
8920 retval = atomic_reduce_block;
8921 }
8922#else
8923#error "Unknown or unsupported OS"
8924#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8925 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8926 // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8927
8928#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8929 KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
8930
8931#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8932 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
8933 KMP_OS_WASI || KMP_OS_AIX
8934
8935 // basic tuning
8936
8937 if (atomic_available) {
8938 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8939 retval = atomic_reduce_block;
8940 }
8941 } // otherwise: use critical section
8942
8943#elif KMP_OS_DARWIN
8944
8945 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8946 if (atomic_available && (num_vars <= 3)) {
8947 retval = atomic_reduce_block;
8948 } else if (tree_available) {
8949 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8950 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8951 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8952 }
8953 } // otherwise: use critical section
8954
8955#else
8956#error "Unknown or unsupported OS"
8957#endif
8958
8959#else
8960#error "Unknown or unsupported architecture"
8961#endif
8962 }
8963
8964 // KMP_FORCE_REDUCTION
8965
8966 // If the team is serialized (team_size == 1), ignore the forced reduction
8967 // method and stay with the unsynchronized method (empty_reduce_block)
8968 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8969 team_size != 1) {
8970
8971 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8972
8973 int atomic_available, tree_available;
8974
8975 switch ((forced_retval = __kmp_force_reduction_method)) {
8976 case critical_reduce_block:
8977 KMP_ASSERT(lck); // lck should be != 0
8978 break;
8979
8980 case atomic_reduce_block:
8981 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8982 if (!atomic_available) {
8983 KMP_WARNING(RedMethodNotSupported, "atomic");
8984 forced_retval = critical_reduce_block;
8985 }
8986 break;
8987
8988 case tree_reduce_block:
8989 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8990 if (!tree_available) {
8991 KMP_WARNING(RedMethodNotSupported, "tree");
8992 forced_retval = critical_reduce_block;
8993 } else {
8994#if KMP_FAST_REDUCTION_BARRIER
8995 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8996#endif
8997 }
8998 break;
8999
9000 default:
9001 KMP_ASSERT(0); // "unsupported method specified"
9002 }
9003
9004 retval = forced_retval;
9005 }
9006
9007 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9008
9009#undef FAST_REDUCTION_TREE_METHOD_GENERATED
9010#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9011
9012 return (retval);
9013}
9014// this function is for testing set/get/determine reduce method
9015kmp_int32 __kmp_get_reduce_method(void) {
9016 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9017}
9018
9019// Soft pause sets up threads to ignore blocktime and just go to sleep.
9020// Spin-wait code checks __kmp_pause_status and reacts accordingly.
9021void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9022
9023// Hard pause shuts down the runtime completely. Resume happens naturally when
9024// OpenMP is used subsequently.
9025void __kmp_hard_pause() {
9026 __kmp_pause_status = kmp_hard_paused;
9027 __kmp_internal_end_thread(-1);
9028}
9029
9030// Soft resume sets __kmp_pause_status, and wakes up all threads.
9031void __kmp_resume_if_soft_paused() {
9032 if (__kmp_pause_status == kmp_soft_paused) {
9033 __kmp_pause_status = kmp_not_paused;
9034
9035 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9036 kmp_info_t *thread = __kmp_threads[gtid];
9037 if (thread) { // Wake it if sleeping
9038 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9039 thread);
9040 if (fl.is_sleeping())
9041 fl.resume(gtid);
9042 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9043 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9044 } else { // thread holds the lock and may sleep soon
9045 do { // until either the thread sleeps, or we can get the lock
9046 if (fl.is_sleeping()) {
9047 fl.resume(gtid);
9048 break;
9049 } else if (__kmp_try_suspend_mx(thread)) {
9050 __kmp_unlock_suspend_mx(thread);
9051 break;
9052 }
9053 } while (1);
9054 }
9055 }
9056 }
9057 }
9058}
9059
9060// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9061// TODO: add warning messages
9062int __kmp_pause_resource(kmp_pause_status_t level) {
9063 if (level == kmp_not_paused) { // requesting resume
9064 if (__kmp_pause_status == kmp_not_paused) {
9065 // error message about runtime not being paused, so can't resume
9066 return 1;
9067 } else {
9068 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9069 __kmp_pause_status == kmp_hard_paused);
9070 __kmp_pause_status = kmp_not_paused;
9071 return 0;
9072 }
9073 } else if (level == kmp_soft_paused) { // requesting soft pause
9074 if (__kmp_pause_status != kmp_not_paused) {
9075 // error message about already being paused
9076 return 1;
9077 } else {
9078 __kmp_soft_pause();
9079 return 0;
9080 }
9081 } else if (level == kmp_hard_paused) { // requesting hard pause
9082 if (__kmp_pause_status != kmp_not_paused) {
9083 // error message about already being paused
9084 return 1;
9085 } else {
9086 __kmp_hard_pause();
9087 return 0;
9088 }
9089 } else {
9090 // error message about invalid level
9091 return 1;
9092 }
9093}
9094
9095void __kmp_omp_display_env(int verbose) {
9096 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9097 if (__kmp_init_serial == 0)
9098 __kmp_do_serial_initialize();
9099 __kmp_display_env_impl(!verbose, verbose);
9100 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9101}
9102
9103// The team size is changing, so distributed barrier must be modified
9104void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9105 int new_nthreads) {
9106 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9107 bp_dist_bar);
9108 kmp_info_t **other_threads = team->t.t_threads;
9109
9110 // We want all the workers to stop waiting on the barrier while we adjust the
9111 // size of the team.
9112 for (int f = 1; f < old_nthreads; ++f) {
9113 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9114 // Ignore threads that are already inactive or not present in the team
9115 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9116 // teams construct causes thread_limit to get passed in, and some of
9117 // those could be inactive; just ignore them
9118 continue;
9119 }
9120 // If thread is transitioning still to in_use state, wait for it
9121 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9122 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9123 KMP_CPU_PAUSE();
9124 }
9125 // The thread should be in_use now
9126 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9127 // Transition to unused state
9128 team->t.t_threads[f]->th.th_used_in_team.store(2);
9129 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9130 }
9131 // Release all the workers
9132 team->t.b->go_release();
9133
9134 KMP_MFENCE();
9135
9136 // Workers should see transition status 2 and move to 0; but may need to be
9137 // woken up first
9138 int count = old_nthreads - 1;
9139 while (count > 0) {
9140 count = old_nthreads - 1;
9141 for (int f = 1; f < old_nthreads; ++f) {
9142 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9143 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9144 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9145 void *, other_threads[f]->th.th_sleep_loc);
9146 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9147 }
9148 } else {
9149 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9150 count--;
9151 }
9152 }
9153 }
9154 // Now update the barrier size
9155 team->t.b->update_num_threads(new_nthreads);
9156 team->t.b->go_reset();
9157}
9158
9159void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9160 // Add the threads back to the team
9161 KMP_DEBUG_ASSERT(team);
9162 // Threads were paused and pointed at th_used_in_team temporarily during a
9163 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9164 // the thread that it should transition itself back into the team. Then, if
9165 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9166 // to wake it up.
9167 for (int f = 1; f < new_nthreads; ++f) {
9168 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9169 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9170 3);
9171 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9172 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9173 (kmp_flag_32<false, false> *)NULL);
9174 }
9175 }
9176 // The threads should be transitioning to the team; when they are done, they
9177 // should have set th_used_in_team to 1. This loop forces master to wait until
9178 // all threads have moved into the team and are waiting in the barrier.
9179 int count = new_nthreads - 1;
9180 while (count > 0) {
9181 count = new_nthreads - 1;
9182 for (int f = 1; f < new_nthreads; ++f) {
9183 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9184 count--;
9185 }
9186 }
9187 }
9188}
9189
9190// Globals and functions for hidden helper task
9191kmp_info_t **__kmp_hidden_helper_threads;
9192kmp_info_t *__kmp_hidden_helper_main_thread;
9193std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9194#if KMP_OS_LINUX
9195kmp_int32 __kmp_hidden_helper_threads_num = 8;
9196kmp_int32 __kmp_enable_hidden_helper = TRUE;
9197#else
9198kmp_int32 __kmp_hidden_helper_threads_num = 0;
9199kmp_int32 __kmp_enable_hidden_helper = FALSE;
9200#endif
9201
9202namespace {
9203std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9204
9205void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9206 // This is an explicit synchronization on all hidden helper threads in case
9207 // a regular thread pushes a hidden helper task to a hidden helper thread
9208 // that has not been awakened since being released by the main thread after
9209 // creating the team.
9210 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9211 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9212 __kmp_hidden_helper_threads_num)
9213 ;
9214
9215 // If main thread, then wait for signal
9216 if (__kmpc_master(nullptr, *gtid)) {
9217 // First, unset the initial state and release the initial thread
9218 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9219 __kmp_hidden_helper_initz_release();
9220 __kmp_hidden_helper_main_thread_wait();
9221 // Now wake up all worker threads
9222 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9223 __kmp_hidden_helper_worker_thread_signal();
9224 }
9225 }
9226}
9227} // namespace
9228
9229void __kmp_hidden_helper_threads_initz_routine() {
9230 // Create a new root for hidden helper team/threads
9231 const int gtid = __kmp_register_root(TRUE);
9232 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9233 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9234 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9235 __kmp_hidden_helper_threads_num;
9236
9237 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9238
9239 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9240
9241 // Set the initialization flag to FALSE
9242 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9243
9244 __kmp_hidden_helper_threads_deinitz_release();
9245}
9246
9247/* Nesting Mode:
9248 Set via KMP_NESTING_MODE, which takes an integer.
9249 Note: we skip duplicate topology levels, and skip levels with only
9250 one entity.
9251 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9252 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9253 in the topology, and initializes the number of threads at each of those
9254 levels to the number of entities at each level, respectively, below the
9255 entity at the parent level.
9256 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9257 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9258 the user to turn nesting on explicitly. This is an even more experimental
9259 option to this experimental feature, and may change or go away in the
9260 future.
9261*/
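/*
 * Illustrative example (hypothetical topology, not derived from this build):
 * on a machine with 2 sockets x 8 cores x 2 hardware threads and
 * KMP_NESTING_MODE=1, __kmp_set_nesting_mode_threads() below would record the
 * per-level ratios 2, 8 and 2 and seed the nested nthreads-var list with
 * them, skipping duplicate levels and levels with a single entity.
 */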
9262
9263// Allocate space to store nesting levels
9264void __kmp_init_nesting_mode() {
9265 int levels = KMP_HW_LAST;
9266 __kmp_nesting_mode_nlevels = levels;
9267 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9268 for (int i = 0; i < levels; ++i)
9269 __kmp_nesting_nth_level[i] = 0;
9270 if (__kmp_nested_nth.size < levels) {
9271 __kmp_nested_nth.nth =
9272 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9273 __kmp_nested_nth.size = levels;
9274 }
9275}
9276
9277// Set # threads for top levels of nesting; must be called after topology set
9278void __kmp_set_nesting_mode_threads() {
9279 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9280
9281 if (__kmp_nesting_mode == 1)
9282 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9283 else if (__kmp_nesting_mode > 1)
9284 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9285
9286 if (__kmp_topology) { // use topology info
9287 int loc, hw_level;
9288 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9289 loc < __kmp_nesting_mode_nlevels;
9290 loc++, hw_level++) {
9291 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9292 if (__kmp_nesting_nth_level[loc] == 1)
9293 loc--;
9294 }
9295 // Make sure all cores are used
9296 if (__kmp_nesting_mode > 1 && loc > 1) {
9297 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9298 int num_cores = __kmp_topology->get_count(core_level);
9299 int upper_levels = 1;
9300 for (int level = 0; level < loc - 1; ++level)
9301 upper_levels *= __kmp_nesting_nth_level[level];
9302 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9303 __kmp_nesting_nth_level[loc - 1] =
9304 num_cores / __kmp_nesting_nth_level[loc - 2];
9305 }
9306 __kmp_nesting_mode_nlevels = loc;
9307 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9308 } else { // no topology info available; provide a reasonable guesstimation
9309 if (__kmp_avail_proc >= 4) {
9310 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9311 __kmp_nesting_nth_level[1] = 2;
9312 __kmp_nesting_mode_nlevels = 2;
9313 } else {
9314 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9315 __kmp_nesting_mode_nlevels = 1;
9316 }
9317 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9318 }
9319 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9320 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9321 }
9322 set__nproc(thread, __kmp_nesting_nth_level[0]);
9323 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9324 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9325 if (get__max_active_levels(thread) > 1) {
9326 // if max levels was set, set nesting mode levels to same
9327 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9328 }
9329 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9330 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9331}
9332
9333// Empty symbols to export (see exports_so.txt) when feature is disabled
9334extern "C" {
9335#if !KMP_STATS_ENABLED
9336void __kmp_reset_stats() {}
9337#endif
9338#if !USE_DEBUGGER
9339int __kmp_omp_debug_struct_info = FALSE;
9340int __kmp_debugging = FALSE;
9341#endif
9342#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9343void __kmp_itt_fini_ittlib() {}
9344void __kmp_itt_init_ittlib() {}
9345#endif
9346}
9347
9348// end of file
9349
