/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
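
// Illustrative usage sketch for the Mask interface above (not part of the
// runtime; it assumes the runtime's KMPAffinity dispatch object,
// __kmp_affinity_dispatch, has already been initialized):
//
//   KMPAffinity::Mask *m = __kmp_affinity_dispatch->allocate_mask();
//   m->get_system_affinity(/*abort_on_error=*/false);
//   for (int cpu = m->begin(); cpu != m->end(); cpu = m->next(cpu))
//     ; // visit each CPU currently in this thread's affinity mask
//   __kmp_affinity_dispatch->deallocate_mask(m);
//
// For the hwloc mask, end() is -1 and hwloc_bitmap_next() returns -1 past the
// last set bit, so the same begin()/next()/end() loop works for every Mask
// implementation in this file.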
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h>, #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot change;
   they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
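// The __NR_* numbers above are consumed below by issuing the affinity
// syscalls directly on Linux, e.g. (see KMPNativeAffinity::Mask):
//   syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);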
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
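    // Worked example of the indexing above (assuming a 64-bit mask_t, so
    // BITS_PER_MASK_T == 64): CPU 70 lives in word 70 / 64 == 1 at bit
    // 70 % 64 == 6, i.e. set(70) performs mask[1] |= (ONE << 6).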
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask, not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one CPU, instead of a set of CPUs, with
      // the bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
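
// Illustrative sketch of the mask-array helpers above (hypothetical local
// usage, not runtime code; it assumes the runtime's KMPAffinity dispatch
// object, __kmp_affinity_dispatch, has been initialized):
//
//   KMPAffinity::Mask *arr = __kmp_affinity_dispatch->allocate_mask_array(4);
//   KMPAffinity::Mask *m0 = __kmp_affinity_dispatch->index_mask_array(arr, 0);
//   m0->set(0); // mask 0 now contains CPU 0 only
//   __kmp_affinity_dispatch->deallocate_mask_array(arr);
//
// index_mask_array() exists because the array element type is the derived
// Mask, so pointer arithmetic on the KMPAffinity::Mask base pointer would
// compute the wrong element addresses.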
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
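    // Example of the convention above: on a machine with two processor
    // groups, a mask whose set bits all fall in group 1 yields
    // get_proc_group() == 1, a mask with bits in both groups yields -1
    // (a thread can only be bound within a single group), and a machine
    // with one group always yields 1.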
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
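
// A minimal sketch of the containment relation above: an attribute contains
// another when it satisfies every constraint the other specifies, so a more
// specific attribute contains a less specific one (core-type enumerator names
// as defined in kmp.h for x86 hybrid builds; values are illustrative):
//
//   kmp_hw_attr_t a, b;
//   a.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   b.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   b.set_core_eff(1);
//   // b.contains(a) is true: a only constrains the core type, which matches.
//   // a.contains(b) is false: b also constrains core efficiency, a does not.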

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_*, KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. Return true if they are,
  // false otherwise.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

// Functions used after canonicalize() is called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
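  // Worked example for the ratio math above: with the topology pictured
  // earlier, ratio = { 4 packages, 6 cores/package, 2 threads/core } and
  // levels ordered package=0, core=1, thread=2, calculate_ratio(2, 0)
  // multiplies ratio[2] * ratio[1] = 2 * 6 = 12, i.e. 12 hardware threads
  // per package.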
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, /*above=*/-1, /*find_all=*/true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, /*find_all=*/false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;
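
// Illustrative queries against a canonicalized topology (a sketch; it assumes
// __kmp_topology has been built and canonicalized by one of the create_map()
// routines):
//
//   int core_level = __kmp_topology->get_level(KMP_HW_CORE);
//   int ncores = (core_level >= 0) ? __kmp_topology->get_count(core_level) : 0;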

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
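  // Example (illustrative, not a parsing contract): KMP_HW_SUBSET=2c,3t ends
  // up as calls roughly like
  //   __kmp_hw_subset->push_back(2, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   __kmp_hw_subset->push_back(3, KMP_HW_THREAD, 0, kmp_hw_attr_t{});
  // with a non-zero offset only when an '@offset' suffix is given.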
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("  num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
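  // Worked example (illustrative): with numPerLevel = {2, 4, 4, 1, 1} as in
  // the comment above, init() computes skipPerLevel[i] =
  // numPerLevel[i - 1] * skipPerLevel[i - 1], giving skipPerLevel =
  // {1, 2, 8, 32, ...}; skipPerLevel[i] is the number of leaves spanned by a
  // subtree rooted at level i (2 threads per core, 8 per package, 32 total).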

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage
       of dirty values observed when a static library is re-initialized
       multiple times (e.g. when a non-OpenMP thread repeatedly launches/joins
       a thread that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H