1/* Initialize x86 cache info.
2 Copyright (C) 2020-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
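/* Descriptor bytes returned by CPUID leaf 0x2 on Intel CPUs.  Each entry
   maps a one-byte descriptor (idx) to the associativity, line size and
   total size of the cache it describes.  rel_name holds the corresponding
   _SC_* constant relative to _SC_LEVEL1_ICACHE_SIZE (see the M macro
   below) so that it fits in an unsigned char.  The array must remain
   sorted by idx because it is searched with bsearch.  */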
19static const struct intel_02_cache_info
20{
21 unsigned char idx;
22 unsigned char assoc;
23 unsigned char linesize;
24 unsigned char rel_name;
25 unsigned int size;
26} intel_02_known [] =
27 {
28#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
29 { 0x06, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 8192 },
30 { 0x08, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 16384 },
31 { 0x09, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 32768 },
32 { 0x0a, 2, 32, M(_SC_LEVEL1_DCACHE_SIZE), 8192 },
33 { 0x0c, 4, 32, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
34 { 0x0d, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
35 { 0x0e, 6, 64, M(_SC_LEVEL1_DCACHE_SIZE), 24576 },
36 { 0x21, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
37 { 0x22, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 },
38 { 0x23, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 },
39 { 0x25, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
40 { 0x29, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
41 { 0x2c, 8, 64, M(_SC_LEVEL1_DCACHE_SIZE), 32768 },
42 { 0x30, 8, 64, M(_SC_LEVEL1_ICACHE_SIZE), 32768 },
43 { 0x39, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
44 { 0x3a, 6, 64, M(_SC_LEVEL2_CACHE_SIZE), 196608 },
45 { 0x3b, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
46 { 0x3c, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
47 { 0x3d, 6, 64, M(_SC_LEVEL2_CACHE_SIZE), 393216 },
48 { 0x3e, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
49 { 0x3f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
50 { 0x41, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
51 { 0x42, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
52 { 0x43, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
53 { 0x44, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
54 { 0x45, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 2097152 },
55 { 0x46, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
56 { 0x47, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
57 { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE), 3145728 },
58 { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE), 4194304 },
59 { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 6291456 },
60 { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
61 { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
62 { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
63 { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE), 6291456 },
64 { 0x60, 8, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
65 { 0x66, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 8192 },
66 { 0x67, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
67 { 0x68, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 32768 },
68 { 0x78, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
69 { 0x79, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
70 { 0x7a, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
71 { 0x7b, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
72 { 0x7c, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
73 { 0x7d, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 2097152 },
74 { 0x7f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
75 { 0x80, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
76 { 0x82, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
77 { 0x83, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
78 { 0x84, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
79 { 0x85, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 2097152 },
80 { 0x86, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
81 { 0x87, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
82 { 0xd0, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 },
83 { 0xd1, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 },
84 { 0xd2, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
85 { 0xd6, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 },
86 { 0xd7, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
87 { 0xd8, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
88 { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
89 { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
90 { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
91 { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
92 { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
93 { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
94 { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
95 { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
96 { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
97 };
98
99#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
100
101static int
102intel_02_known_compare (const void *p1, const void *p2)
103{
104 const struct intel_02_cache_info *i1;
105 const struct intel_02_cache_info *i2;
106
107 i1 = (const struct intel_02_cache_info *) p1;
108 i2 = (const struct intel_02_cache_info *) p2;
109
110 if (i1->idx == i2->idx)
111 return 0;
112
113 return i1->idx < i2->idx ? -1 : 1;
114}
115
116
117static long int
118__attribute__ ((noinline))
119intel_check_word (int name, unsigned int value, bool *has_level_2,
120 bool *no_level_2_or_3,
121 const struct cpu_features *cpu_features)
122{
123 if ((value & 0x80000000) != 0)
124 /* The register value is reserved. */
125 return 0;
126
127 /* Fold the name. The _SC_ constants are always in the order SIZE,
128 ASSOC, LINESIZE. */
129 int folded_rel_name = (M(name) / 3) * 3;
130
131 while (value != 0)
132 {
133 unsigned int byte = value & 0xff;
134
135 if (byte == 0x40)
136 {
137 *no_level_2_or_3 = true;
138
139 if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
140 /* No need to look further. */
141 break;
142 }
143 else if (byte == 0xff)
144 {
145 /* CPUID leaf 0x4 contains all the information. We need to
146 iterate over it. */
147 unsigned int eax;
148 unsigned int ebx;
149 unsigned int ecx;
150 unsigned int edx;
151
152 unsigned int round = 0;
153 while (1)
154 {
155 __cpuid_count (4, round, eax, ebx, ecx, edx);
156
157 enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
158 if (type == null)
159 /* That was the end. */
160 break;
161
162 unsigned int level = (eax >> 5) & 0x7;
163
164 if ((level == 1 && type == data
165 && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
166 || (level == 1 && type == inst
167 && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
168 || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
169 || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
170 || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
171 {
172 unsigned int offset = M(name) - folded_rel_name;
173
174 if (offset == 0)
175 /* Cache size. */
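		      /* Size in bytes = ways * physical line partitions
			 * line size * sets, where CPUID leaf 4 reports
			 EBX[31:22] = ways - 1, EBX[21:12] = partitions - 1,
			 EBX[11:0] = line size - 1 and ECX = sets - 1.  */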
176 return (((ebx >> 22) + 1)
177 * (((ebx >> 12) & 0x3ff) + 1)
178 * ((ebx & 0xfff) + 1)
179 * (ecx + 1));
180 if (offset == 1)
181 return (ebx >> 22) + 1;
182
183 assert (offset == 2);
184 return (ebx & 0xfff) + 1;
185 }
186
187 ++round;
188 }
189 /* There is no other cache information anywhere else. */
190 return -1;
191 }
192 else
193 {
194 if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
195 {
196 /* Intel reused this value. For family 15, model 6 it
197 specifies the 3rd level cache. Otherwise the 2nd
198 level cache. */
199 unsigned int family = cpu_features->basic.family;
200 unsigned int model = cpu_features->basic.model;
201
202 if (family == 15 && model == 6)
203 {
204 /* The level 3 cache is encoded for this model like
205 the level 2 cache is for other models. Pretend
206 the caller asked for the level 2 cache. */
207 name = (_SC_LEVEL2_CACHE_SIZE
208 + (name - _SC_LEVEL3_CACHE_SIZE));
209 folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
210 }
211 }
212
213 struct intel_02_cache_info *found;
214 struct intel_02_cache_info search;
215
216 search.idx = byte;
217 found = bsearch (&search, intel_02_known, nintel_02_known,
218 sizeof (intel_02_known[0]), intel_02_known_compare);
219 if (found != NULL)
220 {
221 if (found->rel_name == folded_rel_name)
222 {
223 unsigned int offset = M(name) - folded_rel_name;
224
225 if (offset == 0)
226 /* Cache size. */
227 return found->size;
228 if (offset == 1)
229 return found->assoc;
230
231 assert (offset == 2);
232 return found->linesize;
233 }
234
235 if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
236 *has_level_2 = true;
237 }
238 }
239
240 /* Next byte for the next round. */
241 value >>= 8;
242 }
243
244 /* Nothing found. */
245 return 0;
246}
247
248
249static long int __attribute__ ((noinline))
250handle_intel (int name, const struct cpu_features *cpu_features)
251{
252 unsigned int maxidx = cpu_features->basic.max_cpuid;
253
254 /* Return -1 for older CPUs. */
255 if (maxidx < 2)
256 return -1;
257
258 /* OK, we can use the CPUID instruction to get all info about the
259 caches. */
260 long int result = 0;
261 bool no_level_2_or_3 = false;
262 bool has_level_2 = false;
263 unsigned int eax;
264 unsigned int ebx;
265 unsigned int ecx;
266 unsigned int edx;
267 __cpuid (2, eax, ebx, ecx, edx);
268
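  /* Each of EAX, EBX, ECX and EDX returned by CPUID leaf 2 packs one-byte
     cache/TLB descriptors; a register whose bit 31 is set contains no valid
     descriptors and is skipped in intel_check_word.  */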
269 /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
270 should be ignored. If it isn't 1, use CPUID leaf 4 instead. */
271 if ((eax & 0xff) != 1)
    return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
			     cpu_features);
274 else
275 {
276 eax &= 0xffffff00;
277
278 /* Process the individual registers' value. */
      result = intel_check_word (name, eax, &has_level_2,
				 &no_level_2_or_3, cpu_features);
281 if (result != 0)
282 return result;
283
      result = intel_check_word (name, ebx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
286 if (result != 0)
287 return result;
288
      result = intel_check_word (name, ecx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
291 if (result != 0)
292 return result;
293
      result = intel_check_word (name, edx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
296 if (result != 0)
297 return result;
298 }
299
300 if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
301 && no_level_2_or_3)
302 return -1;
303
304 return 0;
305}
306
307
308static long int __attribute__ ((noinline))
309handle_amd (int name)
310{
311 unsigned int eax;
312 unsigned int ebx;
313 unsigned int ecx = 0;
314 unsigned int edx;
315 unsigned int max_cpuid = 0;
316 unsigned int fn = 0;
317
318 /* No level 4 cache (yet). */
319 if (name > _SC_LEVEL3_CACHE_LINESIZE)
320 return 0;
321
322 __cpuid (0x80000000, max_cpuid, ebx, ecx, edx);
323
324 if (max_cpuid >= 0x8000001D)
    /* Use CPUID leaf 0x8000001D to compute cache details.  */
326 {
327 unsigned int count = 0x1;
328
329 if (name >= _SC_LEVEL3_CACHE_SIZE)
330 count = 0x3;
331 else if (name >= _SC_LEVEL2_CACHE_SIZE)
332 count = 0x2;
333 else if (name >= _SC_LEVEL1_DCACHE_SIZE)
334 count = 0x0;
335
336 __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
337
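      /* Leaf 0x8000001D mirrors the Intel CPUID leaf 4 layout used above:
	 EBX[31:22] = ways - 1, EBX[11:0] = line size - 1 and
	 ECX = number of sets - 1.  */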
338 if (ecx != 0)
339 {
340 switch (name)
341 {
342 case _SC_LEVEL1_ICACHE_ASSOC:
343 case _SC_LEVEL1_DCACHE_ASSOC:
344 case _SC_LEVEL2_CACHE_ASSOC:
345 case _SC_LEVEL3_CACHE_ASSOC:
346 return ((ebx >> 22) & 0x3ff) + 1;
347 case _SC_LEVEL1_ICACHE_LINESIZE:
348 case _SC_LEVEL1_DCACHE_LINESIZE:
349 case _SC_LEVEL2_CACHE_LINESIZE:
350 case _SC_LEVEL3_CACHE_LINESIZE:
351 return (ebx & 0xfff) + 1;
352 case _SC_LEVEL1_ICACHE_SIZE:
353 case _SC_LEVEL1_DCACHE_SIZE:
354 case _SC_LEVEL2_CACHE_SIZE:
355 case _SC_LEVEL3_CACHE_SIZE:
356 return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
357 default:
358 __builtin_unreachable ();
359 }
360 return -1;
361 }
362 }
363
  /* Legacy cache computation for CPUs prior to the Bulldozer family.
     This is also a fail-safe mechanism for some hypervisors that
     accidentally report CPUID leaf 0x8000001D as all zeros.  */
367
368 fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
369
370 if (max_cpuid < fn)
371 return 0;
372
373 __cpuid (fn, eax, ebx, ecx, edx);
374
375 if (name < _SC_LEVEL1_DCACHE_SIZE)
376 {
377 name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
378 ecx = edx;
379 }
380
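  /* Legacy decode: CPUID 0x80000005 reports the L1 data cache in ECX and
     the L1 instruction cache in EDX (size in KB in bits 31:24, associativity
     in bits 23:16, line size in bits 7:0); CPUID 0x80000006 reports the L2
     cache in ECX (size in KB in bits 31:16, associativity code in bits 15:12,
     line size in bits 7:0) and the L3 cache in EDX (size in 512 KB units in
     bits 31:18).  The shift-and-mask expressions below convert the KB fields
     directly to bytes.  */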
381 switch (name)
382 {
383 case _SC_LEVEL1_DCACHE_SIZE:
384 return (ecx >> 14) & 0x3fc00;
385
386 case _SC_LEVEL1_DCACHE_ASSOC:
387 ecx >>= 16;
388 if ((ecx & 0xff) == 0xff)
389 {
390 /* Fully associative. */
391 return (ecx << 2) & 0x3fc00;
392 }
393 return ecx & 0xff;
394
395 case _SC_LEVEL1_DCACHE_LINESIZE:
396 return ecx & 0xff;
397
398 case _SC_LEVEL2_CACHE_SIZE:
399 return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
400
401 case _SC_LEVEL2_CACHE_ASSOC:
402 switch ((ecx >> 12) & 0xf)
403 {
404 case 0:
405 case 1:
406 case 2:
407 case 4:
408 return (ecx >> 12) & 0xf;
409 case 6:
410 return 8;
411 case 8:
412 return 16;
413 case 10:
414 return 32;
415 case 11:
416 return 48;
417 case 12:
418 return 64;
419 case 13:
420 return 96;
421 case 14:
422 return 128;
423 case 15:
424 return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
425 default:
426 return 0;
427 }
428
429 case _SC_LEVEL2_CACHE_LINESIZE:
430 return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
431
432 case _SC_LEVEL3_CACHE_SIZE:
433 {
434 long int total_l3_cache = 0, l3_cache_per_thread = 0;
435 unsigned int threads = 0;
436 const struct cpu_features *cpu_features;
437
438 if ((edx & 0xf000) == 0)
439 return 0;
440
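	/* EDX[31:18] is the L3 size in 512 KB units; masking those bits and
	   shifting left by one yields the total size in bytes.  */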
441 total_l3_cache = (edx & 0x3ffc0000) << 1;
442 cpu_features = __get_cpu_features ();
443
444 /* Figure out the number of logical threads that share L3. */
445 if (max_cpuid >= 0x80000008)
446 {
447 /* Get width of APIC ID. */
448 __cpuid (0x80000008, eax, ebx, ecx, edx);
449 threads = (ecx & 0xff) + 1;
450 }
451
452 if (threads == 0)
453 {
454 /* If APIC ID width is not available, use logical
455 processor count. */
456 __cpuid (0x00000001, eax, ebx, ecx, edx);
457 if ((edx & (1 << 28)) != 0)
458 threads = (ebx >> 16) & 0xff;
459 }
460
461 /* Cap usage of highest cache level to the number of
462 supported threads. */
463 if (threads > 0)
464 l3_cache_per_thread = total_l3_cache/threads;
465
466 /* Get shared cache per ccx for Zen architectures. */
467 if (cpu_features->basic.family >= 0x17)
468 {
469 long int l3_cache_per_ccx = 0;
	  /* Get the number of threads that share the L3 cache within a CCX.  */
471 __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
472 unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
473 l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx;
474 return l3_cache_per_ccx;
475 }
476 else
477 {
478 return l3_cache_per_thread;
479 }
480 }
481
482 case _SC_LEVEL3_CACHE_ASSOC:
483 switch ((edx >> 12) & 0xf)
484 {
485 case 0:
486 case 1:
487 case 2:
488 case 4:
489 return (edx >> 12) & 0xf;
490 case 6:
491 return 8;
492 case 8:
493 return 16;
494 case 10:
495 return 32;
496 case 11:
497 return 48;
498 case 12:
499 return 64;
500 case 13:
501 return 96;
502 case 14:
503 return 128;
504 case 15:
505 return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
506 default:
507 return 0;
508 }
509
510 case _SC_LEVEL3_CACHE_LINESIZE:
511 return (edx & 0xf000) == 0 ? 0 : edx & 0xff;
512
513 default:
514 __builtin_unreachable ();
515 }
516 return -1;
517}
518
519
520static long int __attribute__ ((noinline))
521handle_zhaoxin (int name)
522{
523 unsigned int eax;
524 unsigned int ebx;
525 unsigned int ecx;
526 unsigned int edx;
527
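  /* As in intel_check_word: the _SC_* constants come in (SIZE, ASSOC,
     LINESIZE) triples, so rounding down to a multiple of 3 folds the
     request onto the SIZE member of its cache level.  */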
528 int folded_rel_name = (M(name) / 3) * 3;
529
530 unsigned int round = 0;
531 while (1)
532 {
533 __cpuid_count (4, round, eax, ebx, ecx, edx);
534
535 enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
536 if (type == null)
537 break;
538
539 unsigned int level = (eax >> 5) & 0x7;
540
541 if ((level == 1 && type == data
542 && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
543 || (level == 1 && type == inst
544 && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
545 || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
546 || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
547 {
548 unsigned int offset = M(name) - folded_rel_name;
549
550 if (offset == 0)
551 /* Cache size. */
552 return (((ebx >> 22) + 1)
553 * (((ebx >> 12) & 0x3ff) + 1)
554 * ((ebx & 0xfff) + 1)
555 * (ecx + 1));
556 if (offset == 1)
557 return (ebx >> 22) + 1;
558
559 assert (offset == 2);
560 return (ebx & 0xfff) + 1;
561 }
562
563 ++round;
564 }
565
566 /* Nothing found. */
567 return 0;
568}
569
570static void
get_common_cache_info (long int *shared_ptr, long int *shared_per_thread_ptr,
		       unsigned int *threads_ptr, long int core)
573{
574 unsigned int eax;
575 unsigned int ebx;
576 unsigned int ecx;
577 unsigned int edx;
578
579 /* Number of logical processors sharing L2 cache. */
580 int threads_l2;
581
582 /* Number of logical processors sharing L3 cache. */
583 int threads_l3;
584
585 const struct cpu_features *cpu_features = __get_cpu_features ();
586 int max_cpuid = cpu_features->basic.max_cpuid;
587 unsigned int family = cpu_features->basic.family;
588 unsigned int model = cpu_features->basic.model;
589 long int shared = *shared_ptr;
590 long int shared_per_thread = *shared_per_thread_ptr;
591 unsigned int threads = *threads_ptr;
592 bool inclusive_cache = true;
593 bool support_count_mask = true;
594
595 /* Try L3 first. */
596 unsigned int level = 3;
597
598 if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
599 support_count_mask = false;
600
601 if (shared <= 0)
602 {
603 /* Try L2 otherwise. */
604 level = 2;
605 shared = core;
606 shared_per_thread = core;
607 threads_l2 = 0;
608 threads_l3 = -1;
609 }
610 else
611 {
612 threads_l2 = 0;
613 threads_l3 = 0;
614 }
615
616 /* A value of 0 for the HTT bit indicates there is only a single
617 logical processor. */
618 if (HAS_CPU_FEATURE (HTT))
619 {
620 /* Figure out the number of logical threads that share the
621 highest cache level. */
622 if (max_cpuid >= 4)
623 {
624 int i = 0;
625
626 /* Query until cache level 2 and 3 are enumerated. */
627 int check = 0x1 | (threads_l3 == 0) << 1;
628 do
629 {
630 __cpuid_count (4, i++, eax, ebx, ecx, edx);
631
632 /* There seems to be a bug in at least some Pentium Ds
633 which sometimes fail to iterate all cache parameters.
634 Do not loop indefinitely here, stop in this case and
635 assume there is no such information. */
636 if (cpu_features->basic.kind == arch_kind_intel
637 && (eax & 0x1f) == 0 )
638 goto intel_bug_no_cache_info;
639
640 switch ((eax >> 5) & 0x7)
641 {
642 default:
643 break;
644 case 2:
645 if ((check & 0x1))
646 {
647 /* Get maximum number of logical processors
648 sharing L2 cache. */
649 threads_l2 = (eax >> 14) & 0x3ff;
650 check &= ~0x1;
651 }
652 break;
653 case 3:
654 if ((check & (0x1 << 1)))
655 {
656 /* Get maximum number of logical processors
657 sharing L3 cache. */
658 threads_l3 = (eax >> 14) & 0x3ff;
659
660 /* Check if L2 and L3 caches are inclusive. */
661 inclusive_cache = (edx & 0x2) != 0;
662 check &= ~(0x1 << 1);
663 }
664 break;
665 }
666 }
667 while (check);
668
669 /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
670 numbers of addressable IDs for logical processors sharing
671 the cache, instead of the maximum number of threads
672 sharing the cache. */
673 if (max_cpuid >= 11 && support_count_mask)
674 {
675 /* Find the number of logical processors shipped in
676 one core and apply count mask. */
677 i = 0;
678
679 /* Count SMT only if there is L3 cache. Always count
680 core if there is no L3 cache. */
681 int count = ((threads_l2 > 0 && level == 3)
682 | ((threads_l3 > 0
683 || (threads_l2 > 0 && level == 2)) << 1));
684
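	      /* CPUID leaf 11 (0xB) enumerates processor topology levels:
		 EBX[15:0] is the number of logical processors at the level
		 and ECX[15:8] the level type (0x1 = SMT, 0x2 = core).  */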
685 while (count)
686 {
687 __cpuid_count (11, i++, eax, ebx, ecx, edx);
688
689 int shipped = ebx & 0xff;
690 int type = ecx & 0xff00;
691 if (shipped == 0 || type == 0)
692 break;
693 else if (type == 0x100)
694 {
695 /* Count SMT. */
696 if ((count & 0x1))
697 {
698 int count_mask;
699
700 /* Compute count mask. */
701 asm ("bsr %1, %0"
702 : "=r" (count_mask) : "g" (threads_l2));
703 count_mask = ~(-1 << (count_mask + 1));
704 threads_l2 = (shipped - 1) & count_mask;
705 count &= ~0x1;
706 }
707 }
708 else if (type == 0x200)
709 {
710 /* Count core. */
711 if ((count & (0x1 << 1)))
712 {
713 int count_mask;
714 int threads_core
715 = (level == 2 ? threads_l2 : threads_l3);
716
717 /* Compute count mask. */
718 asm ("bsr %1, %0"
719 : "=r" (count_mask) : "g" (threads_core));
720 count_mask = ~(-1 << (count_mask + 1));
721 threads_core = (shipped - 1) & count_mask;
722 if (level == 2)
723 threads_l2 = threads_core;
724 else
725 threads_l3 = threads_core;
726 count &= ~(0x1 << 1);
727 }
728 }
729 }
730 }
731 if (threads_l2 > 0)
732 threads_l2 += 1;
733 if (threads_l3 > 0)
734 threads_l3 += 1;
735 if (level == 2)
736 {
737 if (threads_l2)
738 {
739 threads = threads_l2;
740 if (cpu_features->basic.kind == arch_kind_intel
741 && threads > 2
742 && family == 6)
743 switch (model)
744 {
745 case 0x37:
746 case 0x4a:
747 case 0x4d:
748 case 0x5a:
749 case 0x5d:
750 /* Silvermont has L2 cache shared by 2 cores. */
751 threads = 2;
752 break;
753 default:
754 break;
755 }
756 }
757 }
758 else if (threads_l3)
759 threads = threads_l3;
760 }
761 else
762 {
763 intel_bug_no_cache_info:
764 /* Assume that all logical threads share the highest cache
765 level. */
766 threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
767 & 0xff);
768 }
769 /* Get per-thread size of highest level cache. */
770 if (shared_per_thread > 0 && threads > 0)
771 shared_per_thread /= threads;
772 }
773
774 /* Account for non-inclusive L2 and L3 caches. */
775 if (!inclusive_cache)
776 {
777 long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
778 shared_per_thread += core_per_thread;
779 shared += core;
780 }
781
782 *shared_ptr = shared;
783 *shared_per_thread_ptr = shared_per_thread;
784 *threads_ptr = threads;
785}
786
787static void
788dl_init_cacheinfo (struct cpu_features *cpu_features)
789{
790 /* Find out what brand of processor. */
791 long int data = -1;
792 long int shared = -1;
793 long int shared_per_thread = -1;
794 unsigned int threads = 0;
795 unsigned long int level1_icache_size = -1;
796 unsigned long int level1_icache_linesize = -1;
797 unsigned long int level1_dcache_size = -1;
798 unsigned long int level1_dcache_assoc = -1;
799 unsigned long int level1_dcache_linesize = -1;
800 unsigned long int level2_cache_size = -1;
801 unsigned long int level2_cache_assoc = -1;
802 unsigned long int level2_cache_linesize = -1;
803 unsigned long int level3_cache_size = -1;
804 unsigned long int level3_cache_assoc = -1;
805 unsigned long int level3_cache_linesize = -1;
806 unsigned long int level4_cache_size = -1;
807
808 if (cpu_features->basic.kind == arch_kind_intel)
809 {
810 data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
811 shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
812 shared_per_thread = shared;
813
814 level1_icache_size
815 = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
816 level1_icache_linesize
817 = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
818 level1_dcache_size = data;
819 level1_dcache_assoc
820 = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
821 level1_dcache_linesize
822 = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
823 level2_cache_size
824 = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
825 level2_cache_assoc
826 = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
827 level2_cache_linesize
828 = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
829 level3_cache_size = shared;
830 level3_cache_assoc
831 = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
832 level3_cache_linesize
833 = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
834 level4_cache_size
835 = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
836
      get_common_cache_info (&shared, &shared_per_thread, &threads,
			     level2_cache_size);
839 }
840 else if (cpu_features->basic.kind == arch_kind_zhaoxin)
841 {
842 data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
843 shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
844 shared_per_thread = shared;
845
846 level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
847 level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
848 level1_dcache_size = data;
849 level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
850 level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
851 level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
852 level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
853 level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
854 level3_cache_size = shared;
855 level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
856 level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
857
      get_common_cache_info (&shared, &shared_per_thread, &threads,
			     level2_cache_size);
860 }
861 else if (cpu_features->basic.kind == arch_kind_amd)
862 {
863 data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
864 shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
865
866 level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
867 level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
868 level1_dcache_size = data;
869 level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
870 level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
872 level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
873 level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
874 level3_cache_size = shared;
875 level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
876 level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
877 level4_cache_size = handle_amd (_SC_LEVEL4_CACHE_SIZE);
878
879 if (shared <= 0)
880 {
881 /* No shared L3 cache. All we have is the L2 cache. */
882 shared = level2_cache_size;
883 }
884 else if (cpu_features->basic.family < 0x17)
885 {
886 /* Account for exclusive L2 and L3 caches. */
887 shared += level2_cache_size;
888 }
889
890 shared_per_thread = shared;
891 }
892
893 cpu_features->level1_icache_size = level1_icache_size;
894 cpu_features->level1_icache_linesize = level1_icache_linesize;
895 cpu_features->level1_dcache_size = level1_dcache_size;
896 cpu_features->level1_dcache_assoc = level1_dcache_assoc;
897 cpu_features->level1_dcache_linesize = level1_dcache_linesize;
898 cpu_features->level2_cache_size = level2_cache_size;
899 cpu_features->level2_cache_assoc = level2_cache_assoc;
900 cpu_features->level2_cache_linesize = level2_cache_linesize;
901 cpu_features->level3_cache_size = level3_cache_size;
902 cpu_features->level3_cache_assoc = level3_cache_assoc;
903 cpu_features->level3_cache_linesize = level3_cache_linesize;
904 cpu_features->level4_cache_size = level4_cache_size;
905
906 unsigned long int cachesize_non_temporal_divisor
907 = cpu_features->cachesize_non_temporal_divisor;
908 if (cachesize_non_temporal_divisor <= 0)
909 cachesize_non_temporal_divisor = 4;
910
  /* The default non_temporal threshold is between 1/8 and 1/2 of the size of
     the chip's cache, depending on `cachesize_non_temporal_divisor`, which is
     microarch specific (the default divisor is 4, i.e. 1/4 of the cache).
     For most Intel processors with an initial release date between 2017 and
     2023, a thread's typical share of the cache is 18-64 MB.  Using a
     reasonable fraction of L3 is meant to estimate the point where
     non-temporal stores begin out-competing REP MOVSB, as well as the point
     where most of the lines in the copy would have been forced back to main
     memory anyway.  Note that concerns about the copy evicting the entire L3
     cache are mostly alleviated by the fact that modern hardware detects
     streaming patterns and provides proper LRU hints, so the maximum
     thrashing is capped at 1/associativity of the cache.  */
923 unsigned long int non_temporal_threshold
924 = shared / cachesize_non_temporal_divisor;
925
  /* If the computed non_temporal_threshold is below 3/4 of the per-thread L3
     size, we most likely have incorrect/incomplete cache info; in that case,
     default to 3/4 of the per-thread L3 size to avoid regressions.  */
929 unsigned long int non_temporal_threshold_lowbound
930 = shared_per_thread * 3 / 4;
931 if (non_temporal_threshold < non_temporal_threshold_lowbound)
932 non_temporal_threshold = non_temporal_threshold_lowbound;
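  /* Illustrative example (hypothetical figures): on a part with a 32 MB
     shared L3, 16 threads sharing it and the default divisor of 4, the
     starting threshold is 32 MB / 4 = 8 MB, while the lower bound is
     3/4 * (32 MB / 16) = 1.5 MB, so the 8 MB value is kept.  */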
933
  /* Without ERMS, fall back to the per-thread L3 bound.  Normal cacheable
     stores run a higher risk of actually thrashing the cache because they
     carry no hardware LRU hint, and their performance in highly parallel
     situations is noticeably worse.  */
938 if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
939 non_temporal_threshold = non_temporal_threshold_lowbound;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
     if that operation cannot overflow.  The minimum of 0x4040 (16448) is
     needed because the L(large_memset_4x) loops require 64 bytes for cache
     alignment plus enough space for at least one iteration of the 4x
     PAGE_SIZE unrolled loop.  Both values are reflected in the manual.  */
946 unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
947 unsigned long int minimum_non_temporal_threshold = 0x4040;
948
  /* If `non_temporal_threshold` is less than `minimum_non_temporal_threshold`,
     it most likely means we failed to detect the cache info.  We don't want
     to default to `minimum_non_temporal_threshold` because such a small value,
     while correct, has bad performance.  Instead we default to 64 MB as a
     reasonable bound.  64 MB is likely conservative in that most/all systems
     would choose a lower value, so it should never force non-temporal stores
     when they otherwise wouldn't be used.  */
956 if (non_temporal_threshold < minimum_non_temporal_threshold)
957 non_temporal_threshold = 64 * 1024 * 1024;
958 else if (non_temporal_threshold > maximum_non_temporal_threshold)
959 non_temporal_threshold = maximum_non_temporal_threshold;
960
961 /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
962 unsigned int minimum_rep_movsb_threshold;
963 /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
964 VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
965 threshold is 2048 * (VEC_SIZE / 16). */
966 unsigned int rep_movsb_threshold;
967 if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
968 && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
969 {
970 rep_movsb_threshold = 4096 * (64 / 16);
971 minimum_rep_movsb_threshold = 64 * 8;
972 }
973 else if (CPU_FEATURE_PREFERRED_P (cpu_features,
974 AVX_Fast_Unaligned_Load))
975 {
976 rep_movsb_threshold = 4096 * (32 / 16);
977 minimum_rep_movsb_threshold = 32 * 8;
978 }
979 else
980 {
981 rep_movsb_threshold = 2048 * (16 / 16);
982 minimum_rep_movsb_threshold = 16 * 8;
983 }
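  /* With the constants above this works out to a default threshold of
     16384 bytes (minimum 512) on the AVX512 path, 8192 bytes (minimum 256)
     with AVX_Fast_Unaligned_Load, and 2048 bytes (minimum 128) on the
     SSE (16-byte vector) path.  */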
984 /* NB: The default REP MOVSB threshold is 2112 on processors with fast
985 short REP MOVSB (FSRM). */
986 if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
987 rep_movsb_threshold = 2112;
988
989 /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
990 cases slower than the vectorized path (and for some alignments,
991 it is really slow, check BZ #30994). */
992 if (cpu_features->basic.kind == arch_kind_amd)
993 rep_movsb_threshold = non_temporal_threshold;
994
995 /* The default threshold to use Enhanced REP STOSB. */
996 unsigned long int rep_stosb_threshold = 2048;
997
998 long int tunable_size;
999
1000 tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
1001 /* NB: Ignore the default value 0. */
1002 if (tunable_size != 0)
1003 data = tunable_size;
1004
1005 tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
1006 /* NB: Ignore the default value 0. */
1007 if (tunable_size != 0)
1008 shared = tunable_size;
1009
1010 tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
1011 if (tunable_size > minimum_non_temporal_threshold
1012 && tunable_size <= maximum_non_temporal_threshold)
1013 non_temporal_threshold = tunable_size;
1014
1015 tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
1016 if (tunable_size > minimum_rep_movsb_threshold)
1017 rep_movsb_threshold = tunable_size;
1018
1019 /* NB: The default value of the x86_rep_stosb_threshold tunable is the
1020 same as the default value of __x86_rep_stosb_threshold and the
1021 minimum value is fixed. */
1022 rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
1023 long int, NULL);
1024 if (cpu_features->basic.kind == arch_kind_amd
1025 && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
1026 /* For AMD Zen3+ architecture, the performance of the vectorized loop is
1027 slightly better than ERMS. */
1028 rep_stosb_threshold = SIZE_MAX;
1029
1030 TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
1031 TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
1032 TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
1033 minimum_non_temporal_threshold,
1034 maximum_non_temporal_threshold);
1035 TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
1036 minimum_rep_movsb_threshold, SIZE_MAX);
1037 TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
1038 SIZE_MAX);
1039
1040 unsigned long int rep_movsb_stop_threshold;
1041 /* Setting the upper bound of ERMS to the computed value of
1042 non-temporal threshold for all architectures. */
1043 rep_movsb_stop_threshold = non_temporal_threshold;
1044
1045 cpu_features->data_cache_size = data;
1046 cpu_features->shared_cache_size = shared;
1047 cpu_features->non_temporal_threshold = non_temporal_threshold;
1048 cpu_features->rep_movsb_threshold = rep_movsb_threshold;
1049 cpu_features->rep_stosb_threshold = rep_stosb_threshold;
1050 cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
1051}
1052

/* Source: glibc/sysdeps/x86/dl-cacheinfo.h.  */