1/* Initialize x86 cache info.
2 Copyright (C) 2020-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
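/* Descriptor bytes returned by CPUID leaf 0x2 on Intel CPUs.  Each entry
   maps a one-byte descriptor (idx) to the associativity, line size and
   total size of the cache it describes.  rel_name holds the corresponding
   _SC_* constant relative to _SC_LEVEL1_ICACHE_SIZE (see the M macro
   below) so that it fits in an unsigned char.  The array must remain
   sorted by idx because it is searched with bsearch.  */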
19static const struct intel_02_cache_info
20{
21 unsigned char idx;
22 unsigned char assoc;
23 unsigned char linesize;
24 unsigned char rel_name;
25 unsigned int size;
26} intel_02_known [] =
27 {
28#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
29 { 0x06, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 8192 },
30 { 0x08, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 16384 },
31 { 0x09, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 32768 },
32 { 0x0a, 2, 32, M(_SC_LEVEL1_DCACHE_SIZE), 8192 },
33 { 0x0c, 4, 32, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
34 { 0x0d, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
35 { 0x0e, 6, 64, M(_SC_LEVEL1_DCACHE_SIZE), 24576 },
36 { 0x21, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
37 { 0x22, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 },
38 { 0x23, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 },
39 { 0x25, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
40 { 0x29, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
41 { 0x2c, 8, 64, M(_SC_LEVEL1_DCACHE_SIZE), 32768 },
42 { 0x30, 8, 64, M(_SC_LEVEL1_ICACHE_SIZE), 32768 },
43 { 0x39, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
44 { 0x3a, 6, 64, M(_SC_LEVEL2_CACHE_SIZE), 196608 },
45 { 0x3b, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
46 { 0x3c, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
47 { 0x3d, 6, 64, M(_SC_LEVEL2_CACHE_SIZE), 393216 },
48 { 0x3e, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
49 { 0x3f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
50 { 0x41, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
51 { 0x42, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
52 { 0x43, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
53 { 0x44, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
54 { 0x45, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 2097152 },
55 { 0x46, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
56 { 0x47, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
57 { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE), 3145728 },
58 { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE), 4194304 },
59 { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 6291456 },
60 { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
61 { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
62 { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
63 { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE), 6291456 },
64 { 0x60, 8, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
65 { 0x66, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 8192 },
66 { 0x67, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
67 { 0x68, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 32768 },
68 { 0x78, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
69 { 0x79, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 },
70 { 0x7a, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
71 { 0x7b, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
72 { 0x7c, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
73 { 0x7d, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 2097152 },
74 { 0x7f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
75 { 0x80, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
76 { 0x82, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
77 { 0x83, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
78 { 0x84, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
79 { 0x85, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 2097152 },
80 { 0x86, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
81 { 0x87, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
82 { 0xd0, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 },
83 { 0xd1, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 },
84 { 0xd2, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
85 { 0xd6, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 },
86 { 0xd7, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
87 { 0xd8, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
88 { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
89 { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
90 { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
91 { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 },
92 { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 },
93 { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 },
94 { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
95 { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
96 { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
97 };
98
99#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
100
101static int
102intel_02_known_compare (const void *p1, const void *p2)
103{
104 const struct intel_02_cache_info *i1;
105 const struct intel_02_cache_info *i2;
106
107 i1 = (const struct intel_02_cache_info *) p1;
108 i2 = (const struct intel_02_cache_info *) p2;
109
110 if (i1->idx == i2->idx)
111 return 0;
112
113 return i1->idx < i2->idx ? -1 : 1;
114}
115
116
117static long int
118__attribute__ ((noinline))
119intel_check_word (int name, unsigned int value, bool *has_level_2,
120 bool *no_level_2_or_3,
121 const struct cpu_features *cpu_features)
122{
123 if ((value & 0x80000000) != 0)
124 /* The register value is reserved. */
125 return 0;
126
127 /* Fold the name. The _SC_ constants are always in the order SIZE,
128 ASSOC, LINESIZE. */
129 int folded_rel_name = (M(name) / 3) * 3;
130
131 while (value != 0)
132 {
133 unsigned int byte = value & 0xff;
134
135 if (byte == 0x40)
136 {
137 *no_level_2_or_3 = true;
138
139 if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
140 /* No need to look further. */
141 break;
142 }
143 else if (byte == 0xff)
144 {
145 /* CPUID leaf 0x4 contains all the information. We need to
146 iterate over it. */
147 unsigned int eax;
148 unsigned int ebx;
149 unsigned int ecx;
150 unsigned int edx;
151
152 unsigned int round = 0;
153 while (1)
154 {
155 __cpuid_count (4, round, eax, ebx, ecx, edx);
156
157 enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
158 if (type == null)
159 /* That was the end. */
160 break;
161
162 unsigned int level = (eax >> 5) & 0x7;
163
164 if ((level == 1 && type == data
165 && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
166 || (level == 1 && type == inst
167 && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
168 || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
169 || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
170 || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
171 {
172 unsigned int offset = M(name) - folded_rel_name;
173
174 if (offset == 0)
175 /* Cache size. */
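		      /* Size in bytes = ways * physical line partitions
			 * line size * sets, where CPUID leaf 4 reports
			 EBX[31:22] = ways - 1, EBX[21:12] = partitions - 1,
			 EBX[11:0] = line size - 1 and ECX = sets - 1.  */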
176 return (((ebx >> 22) + 1)
177 * (((ebx >> 12) & 0x3ff) + 1)
178 * ((ebx & 0xfff) + 1)
179 * (ecx + 1));
180 if (offset == 1)
181 return (ebx >> 22) + 1;
182
183 assert (offset == 2);
184 return (ebx & 0xfff) + 1;
185 }
186
187 ++round;
188 }
189 /* There is no other cache information anywhere else. */
190 return -1;
191 }
192 else
193 {
194 if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
195 {
196 /* Intel reused this value. For family 15, model 6 it
197 specifies the 3rd level cache. Otherwise the 2nd
198 level cache. */
199 unsigned int family = cpu_features->basic.family;
200 unsigned int model = cpu_features->basic.model;
201
202 if (family == 15 && model == 6)
203 {
204 /* The level 3 cache is encoded for this model like
205 the level 2 cache is for other models. Pretend
206 the caller asked for the level 2 cache. */
207 name = (_SC_LEVEL2_CACHE_SIZE
208 + (name - _SC_LEVEL3_CACHE_SIZE));
209 folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
210 }
211 }
212
213 struct intel_02_cache_info *found;
214 struct intel_02_cache_info search;
215
216 search.idx = byte;
217 found = bsearch (&search, intel_02_known, nintel_02_known,
218 sizeof (intel_02_known[0]), intel_02_known_compare);
219 if (found != NULL)
220 {
221 if (found->rel_name == folded_rel_name)
222 {
223 unsigned int offset = M(name) - folded_rel_name;
224
225 if (offset == 0)
226 /* Cache size. */
227 return found->size;
228 if (offset == 1)
229 return found->assoc;
230
231 assert (offset == 2);
232 return found->linesize;
233 }
234
235 if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
236 *has_level_2 = true;
237 }
238 }
239
240 /* Next byte for the next round. */
241 value >>= 8;
242 }
243
244 /* Nothing found. */
245 return 0;
246}
247
248
249static long int __attribute__ ((noinline))
250handle_intel (int name, const struct cpu_features *cpu_features)
251{
252 unsigned int maxidx = cpu_features->basic.max_cpuid;
253
254 /* Return -1 for older CPUs. */
255 if (maxidx < 2)
256 return -1;
257
258 /* OK, we can use the CPUID instruction to get all info about the
259 caches. */
260 long int result = 0;
261 bool no_level_2_or_3 = false;
262 bool has_level_2 = false;
263 unsigned int eax;
264 unsigned int ebx;
265 unsigned int ecx;
266 unsigned int edx;
267 __cpuid (2, eax, ebx, ecx, edx);
268
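  /* Each of EAX, EBX, ECX and EDX returned by CPUID leaf 2 packs one-byte
     cache/TLB descriptors; a register whose bit 31 is set contains no valid
     descriptors and is skipped in intel_check_word.  */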
269 /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
270 should be ignored. If it isn't 1, use CPUID leaf 4 instead. */
271 if ((eax & 0xff) != 1)
    return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
			     cpu_features);
274 else
275 {
276 eax &= 0xffffff00;
277
278 /* Process the individual registers' value. */
      result = intel_check_word (name, eax, &has_level_2,
				 &no_level_2_or_3, cpu_features);
281 if (result != 0)
282 return result;
283
      result = intel_check_word (name, ebx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
286 if (result != 0)
287 return result;
288
      result = intel_check_word (name, ecx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
291 if (result != 0)
292 return result;
293
      result = intel_check_word (name, edx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
296 if (result != 0)
297 return result;
298 }
299
300 if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
301 && no_level_2_or_3)
302 return -1;
303
304 return 0;
305}
306
307
308static long int __attribute__ ((noinline))
309handle_amd (int name)
310{
311 unsigned int eax;
312 unsigned int ebx;
313 unsigned int ecx = 0;
314 unsigned int edx;
315 unsigned int max_cpuid = 0;
316 unsigned int fn = 0;
317
318 /* No level 4 cache (yet). */
319 if (name > _SC_LEVEL3_CACHE_LINESIZE)
320 return 0;
321
322 __cpuid (0x80000000, max_cpuid, ebx, ecx, edx);
323
324 if (max_cpuid >= 0x8000001D)
    /* Use CPUID leaf 0x8000001D to compute cache details.  */
326 {
327 unsigned int count = 0x1;
328
329 if (name >= _SC_LEVEL3_CACHE_SIZE)
330 count = 0x3;
331 else if (name >= _SC_LEVEL2_CACHE_SIZE)
332 count = 0x2;
333 else if (name >= _SC_LEVEL1_DCACHE_SIZE)
334 count = 0x0;
335
336 __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
337
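      /* Leaf 0x8000001D mirrors the Intel CPUID leaf 4 layout used above:
	 EBX[31:22] = ways - 1, EBX[11:0] = line size - 1 and
	 ECX = number of sets - 1.  */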
338 if (ecx != 0)
339 {
340 switch (name)
341 {
342 case _SC_LEVEL1_ICACHE_ASSOC:
343 case _SC_LEVEL1_DCACHE_ASSOC:
344 case _SC_LEVEL2_CACHE_ASSOC:
345 case _SC_LEVEL3_CACHE_ASSOC:
346 return ((ebx >> 22) & 0x3ff) + 1;
347 case _SC_LEVEL1_ICACHE_LINESIZE:
348 case _SC_LEVEL1_DCACHE_LINESIZE:
349 case _SC_LEVEL2_CACHE_LINESIZE:
350 case _SC_LEVEL3_CACHE_LINESIZE:
351 return (ebx & 0xfff) + 1;
352 case _SC_LEVEL1_ICACHE_SIZE:
353 case _SC_LEVEL1_DCACHE_SIZE:
354 case _SC_LEVEL2_CACHE_SIZE:
355 case _SC_LEVEL3_CACHE_SIZE:
356 return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
357 default:
358 __builtin_unreachable ();
359 }
360 return -1;
361 }
362 }
363
  /* Legacy cache computation for CPUs prior to the Bulldozer family.
     This is also a fail-safe mechanism for some hypervisors that
     accidentally report CPUID leaf 0x8000001D as all zeros.  */
367
368 fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
369
370 if (max_cpuid < fn)
371 return 0;
372
373 __cpuid (fn, eax, ebx, ecx, edx);
374
375 if (name < _SC_LEVEL1_DCACHE_SIZE)
376 {
377 name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
378 ecx = edx;
379 }
380
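  /* Legacy decode: CPUID 0x80000005 reports the L1 data cache in ECX and
     the L1 instruction cache in EDX (size in KB in bits 31:24, associativity
     in bits 23:16, line size in bits 7:0); CPUID 0x80000006 reports the L2
     cache in ECX (size in KB in bits 31:16, associativity code in bits 15:12,
     line size in bits 7:0) and the L3 cache in EDX (size in 512 KB units in
     bits 31:18).  The shift-and-mask expressions below convert the KB fields
     directly to bytes.  */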
381 switch (name)
382 {
383 case _SC_LEVEL1_DCACHE_SIZE:
384 return (ecx >> 14) & 0x3fc00;
385
386 case _SC_LEVEL1_DCACHE_ASSOC:
387 ecx >>= 16;
388 if ((ecx & 0xff) == 0xff)
389 {
390 /* Fully associative. */
391 return (ecx << 2) & 0x3fc00;
392 }
393 return ecx & 0xff;
394
395 case _SC_LEVEL1_DCACHE_LINESIZE:
396 return ecx & 0xff;
397
398 case _SC_LEVEL2_CACHE_SIZE:
399 return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
400
401 case _SC_LEVEL2_CACHE_ASSOC:
402 switch ((ecx >> 12) & 0xf)
403 {
404 case 0:
405 case 1:
406 case 2:
407 case 4:
408 return (ecx >> 12) & 0xf;
409 case 6:
410 return 8;
411 case 8:
412 return 16;
413 case 10:
414 return 32;
415 case 11:
416 return 48;
417 case 12:
418 return 64;
419 case 13:
420 return 96;
421 case 14:
422 return 128;
423 case 15:
424 return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
425 default:
426 return 0;
427 }
428
429 case _SC_LEVEL2_CACHE_LINESIZE:
430 return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
431
432 case _SC_LEVEL3_CACHE_SIZE:
433 {
434 long int total_l3_cache = 0, l3_cache_per_thread = 0;
435 unsigned int threads = 0;
436 const struct cpu_features *cpu_features;
437
438 if ((edx & 0xf000) == 0)
439 return 0;
440
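	/* EDX[31:18] is the L3 size in 512 KB units; masking those bits and
	   shifting left by one yields the total size in bytes.  */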
441 total_l3_cache = (edx & 0x3ffc0000) << 1;
442 cpu_features = __get_cpu_features ();
443
444 /* Figure out the number of logical threads that share L3. */
445 if (max_cpuid >= 0x80000008)
446 {
447 /* Get width of APIC ID. */
448 __cpuid (0x80000008, eax, ebx, ecx, edx);
449 threads = (ecx & 0xff) + 1;
450 }
451
452 if (threads == 0)
453 {
454 /* If APIC ID width is not available, use logical
455 processor count. */
456 __cpuid (0x00000001, eax, ebx, ecx, edx);
457 if ((edx & (1 << 28)) != 0)
458 threads = (ebx >> 16) & 0xff;
459 }
460
461 /* Cap usage of highest cache level to the number of
462 supported threads. */
463 if (threads > 0)
464 l3_cache_per_thread = total_l3_cache/threads;
465
466 /* Get shared cache per ccx for Zen architectures. */
467 if (cpu_features->basic.family >= 0x17)
468 {
469 long int l3_cache_per_ccx = 0;
	  /* Get the number of threads that share the L3 cache within a CCX.  */
471 __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
472 unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
473 l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx;
474 return l3_cache_per_ccx;
475 }
476 else
477 {
478 return l3_cache_per_thread;
479 }
480 }
481
482 case _SC_LEVEL3_CACHE_ASSOC:
483 switch ((edx >> 12) & 0xf)
484 {
485 case 0:
486 case 1:
487 case 2:
488 case 4:
489 return (edx >> 12) & 0xf;
490 case 6:
491 return 8;
492 case 8:
493 return 16;
494 case 10:
495 return 32;
496 case 11:
497 return 48;
498 case 12:
499 return 64;
500 case 13:
501 return 96;
502 case 14:
503 return 128;
504 case 15:
505 return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
506 default:
507 return 0;
508 }
509
510 case _SC_LEVEL3_CACHE_LINESIZE:
511 return (edx & 0xf000) == 0 ? 0 : edx & 0xff;
512
513 default:
514 __builtin_unreachable ();
515 }
516 return -1;
517}
518
519
520static long int __attribute__ ((noinline))
521handle_zhaoxin (int name)
522{
523 unsigned int eax;
524 unsigned int ebx;
525 unsigned int ecx;
526 unsigned int edx;
527
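  /* As in intel_check_word: the _SC_* constants come in (SIZE, ASSOC,
     LINESIZE) triples, so rounding down to a multiple of 3 folds the
     request onto the SIZE member of its cache level.  */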
528 int folded_rel_name = (M(name) / 3) * 3;
529
530 unsigned int round = 0;
531 while (1)
532 {
533 __cpuid_count (4, round, eax, ebx, ecx, edx);
534
535 enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
536 if (type == null)
537 break;
538
539 unsigned int level = (eax >> 5) & 0x7;
540
541 if ((level == 1 && type == data
542 && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
543 || (level == 1 && type == inst
544 && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
545 || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
546 || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
547 {
548 unsigned int offset = M(name) - folded_rel_name;
549
550 if (offset == 0)
551 /* Cache size. */
552 return (((ebx >> 22) + 1)
553 * (((ebx >> 12) & 0x3ff) + 1)
554 * ((ebx & 0xfff) + 1)
555 * (ecx + 1));
556 if (offset == 1)
557 return (ebx >> 22) + 1;
558
559 assert (offset == 2);
560 return (ebx & 0xfff) + 1;
561 }
562
563 ++round;
564 }
565
566 /* Nothing found. */
567 return 0;
568}
569
570static void
get_common_cache_info (long int *shared_ptr, long int *shared_per_thread_ptr,
		       unsigned int *threads_ptr, long int core)
573{
574 unsigned int eax;
575 unsigned int ebx;
576 unsigned int ecx;
577 unsigned int edx;
578
579 /* Number of logical processors sharing L2 cache. */
580 int threads_l2;
581
582 /* Number of logical processors sharing L3 cache. */
583 int threads_l3;
584
585 const struct cpu_features *cpu_features = __get_cpu_features ();
586 int max_cpuid = cpu_features->basic.max_cpuid;
587 unsigned int family = cpu_features->basic.family;
588 unsigned int model = cpu_features->basic.model;
589 long int shared = *shared_ptr;
590 long int shared_per_thread = *shared_per_thread_ptr;
591 unsigned int threads = *threads_ptr;
592 bool inclusive_cache = true;
593 bool support_count_mask = true;
594
595 /* Try L3 first. */
596 unsigned int level = 3;
597
598 if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
599 support_count_mask = false;
600
601 if (shared <= 0)
602 {
603 /* Try L2 otherwise. */
604 level = 2;
605 shared = core;
606 shared_per_thread = core;
607 threads_l2 = 0;
608 threads_l3 = -1;
609 }
610 else
611 {
612 threads_l2 = 0;
613 threads_l3 = 0;
614 }
615
616 /* A value of 0 for the HTT bit indicates there is only a single
617 logical processor. */
618 if (HAS_CPU_FEATURE (HTT))
619 {
620 /* Figure out the number of logical threads that share the
621 highest cache level. */
622 if (max_cpuid >= 4)
623 {
624 int i = 0;
625
626 /* Query until cache level 2 and 3 are enumerated. */
627 int check = 0x1 | (threads_l3 == 0) << 1;
628 do
629 {
630 __cpuid_count (4, i++, eax, ebx, ecx, edx);
631
632 /* There seems to be a bug in at least some Pentium Ds
633 which sometimes fail to iterate all cache parameters.
634 Do not loop indefinitely here, stop in this case and
635 assume there is no such information. */
636 if (cpu_features->basic.kind == arch_kind_intel
637 && (eax & 0x1f) == 0 )
638 goto intel_bug_no_cache_info;
639
640 switch ((eax >> 5) & 0x7)
641 {
642 default:
643 break;
644 case 2:
645 if ((check & 0x1))
646 {
647 /* Get maximum number of logical processors
648 sharing L2 cache. */
649 threads_l2 = (eax >> 14) & 0x3ff;
650 check &= ~0x1;
651 }
652 break;
653 case 3:
654 if ((check & (0x1 << 1)))
655 {
656 /* Get maximum number of logical processors
657 sharing L3 cache. */
658 threads_l3 = (eax >> 14) & 0x3ff;
659
660 /* Check if L2 and L3 caches are inclusive. */
661 inclusive_cache = (edx & 0x2) != 0;
662 check &= ~(0x1 << 1);
663 }
664 break;
665 }
666 }
667 while (check);
668
669 /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
670 numbers of addressable IDs for logical processors sharing
671 the cache, instead of the maximum number of threads
672 sharing the cache. */
673 if (max_cpuid >= 11 && support_count_mask)
674 {
675 /* Find the number of logical processors shipped in
676 one core and apply count mask. */
677 i = 0;
678
679 /* Count SMT only if there is L3 cache. Always count
680 core if there is no L3 cache. */
681 int count = ((threads_l2 > 0 && level == 3)
682 | ((threads_l3 > 0
683 || (threads_l2 > 0 && level == 2)) << 1));
684
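	      /* CPUID leaf 11 (0xB) enumerates processor topology levels:
		 EBX[15:0] is the number of logical processors at the level
		 and ECX[15:8] the level type (0x1 = SMT, 0x2 = core).  */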
685 while (count)
686 {
687 __cpuid_count (11, i++, eax, ebx, ecx, edx);
688
689 int shipped = ebx & 0xff;
690 int type = ecx & 0xff00;
691 if (shipped == 0 || type == 0)
692 break;
693 else if (type == 0x100)
694 {
695 /* Count SMT. */
696 if ((count & 0x1))
697 {
698 int count_mask;
699
700 /* Compute count mask. */
701 asm ("bsr %1, %0"
702 : "=r" (count_mask) : "g" (threads_l2));
703 count_mask = ~(-1 << (count_mask + 1));
704 threads_l2 = (shipped - 1) & count_mask;
705 count &= ~0x1;
706 }
707 }
708 else if (type == 0x200)
709 {
710 /* Count core. */
711 if ((count & (0x1 << 1)))
712 {
713 int count_mask;
714 int threads_core
715 = (level == 2 ? threads_l2 : threads_l3);
716
717 /* Compute count mask. */
718 asm ("bsr %1, %0"
719 : "=r" (count_mask) : "g" (threads_core));
720 count_mask = ~(-1 << (count_mask + 1));
721 threads_core = (shipped - 1) & count_mask;
722 if (level == 2)
723 threads_l2 = threads_core;
724 else
725 threads_l3 = threads_core;
726 count &= ~(0x1 << 1);
727 }
728 }
729 }
730 }
731 if (threads_l2 > 0)
732 threads_l2 += 1;
733 if (threads_l3 > 0)
734 threads_l3 += 1;
735 if (level == 2)
736 {
737 if (threads_l2)
738 {
739 threads = threads_l2;
740 if (cpu_features->basic.kind == arch_kind_intel
741 && threads > 2
742 && family == 6)
743 switch (model)
744 {
745 case 0x37:
746 case 0x4a:
747 case 0x4d:
748 case 0x5a:
749 case 0x5d:
750 /* Silvermont has L2 cache shared by 2 cores. */
751 threads = 2;
752 break;
753 default:
754 break;
755 }
756 }
757 }
758 else if (threads_l3)
759 threads = threads_l3;
760 }
761 else
762 {
763 intel_bug_no_cache_info:
764 /* Assume that all logical threads share the highest cache
765 level. */
766 threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
767 & 0xff);
768 }
769 /* Get per-thread size of highest level cache. */
770 if (shared_per_thread > 0 && threads > 0)
771 shared_per_thread /= threads;
772 }
773
774 /* Account for non-inclusive L2 and L3 caches. */
775 if (!inclusive_cache)
776 {
777 long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
778 shared_per_thread += core_per_thread;
779 shared += core;
780 }
781
782 *shared_ptr = shared;
783 *shared_per_thread_ptr = shared_per_thread;
784 *threads_ptr = threads;
785}
786
787static void
788dl_init_cacheinfo (struct cpu_features *cpu_features)
789{
790 /* Find out what brand of processor. */
791 long int data = -1;
792 long int shared = -1;
793 long int shared_per_thread = -1;
794 unsigned int threads = 0;
795 unsigned long int level1_icache_size = -1;
796 unsigned long int level1_icache_linesize = -1;
797 unsigned long int level1_dcache_size = -1;
798 unsigned long int level1_dcache_assoc = -1;
799 unsigned long int level1_dcache_linesize = -1;
800 unsigned long int level2_cache_size = -1;
801 unsigned long int level2_cache_assoc = -1;
802 unsigned long int level2_cache_linesize = -1;
803 unsigned long int level3_cache_size = -1;
804 unsigned long int level3_cache_assoc = -1;
805 unsigned long int level3_cache_linesize = -1;
806 unsigned long int level4_cache_size = -1;
807
808 if (cpu_features->basic.kind == arch_kind_intel)
809 {
810 data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
811 shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
812 shared_per_thread = shared;
813
814 level1_icache_size
815 = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
816 level1_icache_linesize
817 = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
818 level1_dcache_size = data;
819 level1_dcache_assoc
820 = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
821 level1_dcache_linesize
822 = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
823 level2_cache_size
824 = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
825 level2_cache_assoc
826 = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
827 level2_cache_linesize
828 = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
829 level3_cache_size = shared;
830 level3_cache_assoc
831 = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
832 level3_cache_linesize
833 = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
834 level4_cache_size
835 = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
836
      get_common_cache_info (&shared, &shared_per_thread, &threads,
			     level2_cache_size);
839 }
840 else if (cpu_features->basic.kind == arch_kind_zhaoxin)
841 {
842 data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
843 shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
844 shared_per_thread = shared;
845
846 level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
847 level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
848 level1_dcache_size = data;
849 level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
850 level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
851 level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
852 level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
853 level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
854 level3_cache_size = shared;
855 level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
856 level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
857
      get_common_cache_info (&shared, &shared_per_thread, &threads,
			     level2_cache_size);
860 }
861 else if (cpu_features->basic.kind == arch_kind_amd)
862 {
863 data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
864 shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
865
866 level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
867 level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
868 level1_dcache_size = data;
869 level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
870 level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
872 level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
873 level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
874 level3_cache_size = shared;
875 level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
876 level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
877 level4_cache_size = handle_amd (_SC_LEVEL4_CACHE_SIZE);
878
879 if (shared <= 0)
880 {
881 /* No shared L3 cache. All we have is the L2 cache. */
882 shared = level2_cache_size;
883 }
884 else if (cpu_features->basic.family < 0x17)
885 {
886 /* Account for exclusive L2 and L3 caches. */
887 shared += level2_cache_size;
888 }
889
890 shared_per_thread = shared;
891 }
892
893 cpu_features->level1_icache_size = level1_icache_size;
894 cpu_features->level1_icache_linesize = level1_icache_linesize;
895 cpu_features->level1_dcache_size = level1_dcache_size;
896 cpu_features->level1_dcache_assoc = level1_dcache_assoc;
897 cpu_features->level1_dcache_linesize = level1_dcache_linesize;
898 cpu_features->level2_cache_size = level2_cache_size;
899 cpu_features->level2_cache_assoc = level2_cache_assoc;
900 cpu_features->level2_cache_linesize = level2_cache_linesize;
901 cpu_features->level3_cache_size = level3_cache_size;
902 cpu_features->level3_cache_assoc = level3_cache_assoc;
903 cpu_features->level3_cache_linesize = level3_cache_linesize;
904 cpu_features->level4_cache_size = level4_cache_size;
905
906 unsigned long int cachesize_non_temporal_divisor
907 = cpu_features->cachesize_non_temporal_divisor;
908 if (cachesize_non_temporal_divisor <= 0)
909 cachesize_non_temporal_divisor = 4;
910
  /* The default non_temporal threshold is between 1/8 and 1/2 of the size of
     the chip's cache, depending on `cachesize_non_temporal_divisor`, which is
     microarch specific (the default divisor is 4, i.e. 1/4 of the cache).
     For most Intel processors with an initial release date between 2017 and
     2023, a thread's typical share of the cache is 18-64 MB.  Using a
     reasonable fraction of L3 is meant to estimate the point where
     non-temporal stores begin out-competing REP MOVSB, as well as the point
     where most of the lines in the copy would have been forced back to main
     memory anyway.  Note that concerns about the copy evicting the entire L3
     cache are mostly alleviated by the fact that modern hardware detects
     streaming patterns and provides proper LRU hints, so the maximum
     thrashing is capped at 1/associativity of the cache.  */
923 unsigned long int non_temporal_threshold
924 = shared / cachesize_non_temporal_divisor;
925
  /* If the computed non_temporal_threshold is below 3/4 of the per-thread L3
     size, we most likely have incorrect/incomplete cache info; in that case,
     default to 3/4 of the per-thread L3 size to avoid regressions.  */
929 unsigned long int non_temporal_threshold_lowbound
930 = shared_per_thread * 3 / 4;
931 if (non_temporal_threshold < non_temporal_threshold_lowbound)
932 non_temporal_threshold = non_temporal_threshold_lowbound;
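  /* Illustrative example (hypothetical figures): on a part with a 32 MB
     shared L3, 16 threads sharing it and the default divisor of 4, the
     starting threshold is 32 MB / 4 = 8 MB, while the lower bound is
     3/4 * (32 MB / 16) = 1.5 MB, so the 8 MB value is kept.  */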
933
  /* Without ERMS, fall back to the per-thread L3 bound.  Normal cacheable
     stores run a higher risk of actually thrashing the cache because they
     carry no hardware LRU hint, and their performance in highly parallel
     situations is noticeably worse.  */
938 if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
939 non_temporal_threshold = non_temporal_threshold_lowbound;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
     if that operation cannot overflow.  The minimum of 0x4040 (16448) is
     needed because the L(large_memset_4x) loops require 64 bytes for cache
     alignment plus enough space for at least one iteration of the 4x
     PAGE_SIZE unrolled loop.  Both values are reflected in the manual.  */
946 unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
947 unsigned long int minimum_non_temporal_threshold = 0x4040;
948
  /* If `non_temporal_threshold` is less than `minimum_non_temporal_threshold`,
     it most likely means we failed to detect the cache info.  We don't want
     to default to `minimum_non_temporal_threshold` because such a small value,
     while correct, has bad performance.  Instead we default to 64 MB as a
     reasonable bound.  64 MB is likely conservative in that most/all systems
     would choose a lower value, so it should never force non-temporal stores
     when they otherwise wouldn't be used.  */
956 if (non_temporal_threshold < minimum_non_temporal_threshold)
957 non_temporal_threshold = 64 * 1024 * 1024;
958 else if (non_temporal_threshold > maximum_non_temporal_threshold)
959 non_temporal_threshold = maximum_non_temporal_threshold;
960
961 /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
962 unsigned int minimum_rep_movsb_threshold;
963 /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
964 VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
965 threshold is 2048 * (VEC_SIZE / 16). */
966 unsigned int rep_movsb_threshold;
967 if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
968 && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
969 {
970 rep_movsb_threshold = 4096 * (64 / 16);
971 minimum_rep_movsb_threshold = 64 * 8;
972 }
973 else if (CPU_FEATURE_PREFERRED_P (cpu_features,
974 AVX_Fast_Unaligned_Load))
975 {
976 rep_movsb_threshold = 4096 * (32 / 16);
977 minimum_rep_movsb_threshold = 32 * 8;
978 }
979 else
980 {
981 rep_movsb_threshold = 2048 * (16 / 16);
982 minimum_rep_movsb_threshold = 16 * 8;
983 }
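  /* With the constants above this works out to a default threshold of
     16384 bytes (minimum 512) on the AVX512 path, 8192 bytes (minimum 256)
     with AVX_Fast_Unaligned_Load, and 2048 bytes (minimum 128) on the
     SSE (16-byte vector) path.  */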
984 /* NB: The default REP MOVSB threshold is 2112 on processors with fast
985 short REP MOVSB (FSRM). */
986 if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
987 rep_movsb_threshold = 2112;
988
989 /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
990 cases slower than the vectorized path (and for some alignments,
991 it is really slow, check BZ #30994). */
992 if (cpu_features->basic.kind == arch_kind_amd)
993 rep_movsb_threshold = non_temporal_threshold;
994
995 /* The default threshold to use Enhanced REP STOSB. */
996 unsigned long int rep_stosb_threshold = 2048;
997
998 long int tunable_size;
999
1000 tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
1001 /* NB: Ignore the default value 0. */
1002 if (tunable_size != 0)
1003 data = tunable_size;
1004
1005 tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
1006 /* NB: Ignore the default value 0. */
1007 if (tunable_size != 0)
1008 shared = tunable_size;
1009
1010 tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
1011 if (tunable_size > minimum_non_temporal_threshold
1012 && tunable_size <= maximum_non_temporal_threshold)
1013 non_temporal_threshold = tunable_size;
1014
1015 tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
1016 if (tunable_size > minimum_rep_movsb_threshold)
1017 rep_movsb_threshold = tunable_size;
1018
1019 /* NB: The default value of the x86_rep_stosb_threshold tunable is the
1020 same as the default value of __x86_rep_stosb_threshold and the
1021 minimum value is fixed. */
1022 rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
1023 long int, NULL);
1024 if (cpu_features->basic.kind == arch_kind_amd
1025 && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
1026 /* For AMD Zen3+ architecture, the performance of the vectorized loop is
1027 slightly better than ERMS. */
1028 rep_stosb_threshold = SIZE_MAX;
1029
1030 TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
1031 TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
1032 TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
1033 minimum_non_temporal_threshold,
1034 maximum_non_temporal_threshold);
1035 TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
1036 minimum_rep_movsb_threshold, SIZE_MAX);
1037 TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
1038 SIZE_MAX);
1039
1040 unsigned long int rep_movsb_stop_threshold;
1041 /* Setting the upper bound of ERMS to the computed value of
1042 non-temporal threshold for all architectures. */
1043 rep_movsb_stop_threshold = non_temporal_threshold;
1044
1045 cpu_features->data_cache_size = data;
1046 cpu_features->shared_cache_size = shared;
1047 cpu_features->non_temporal_threshold = non_temporal_threshold;
1048 cpu_features->rep_movsb_threshold = rep_movsb_threshold;
1049 cpu_features->rep_stosb_threshold = rep_stosb_threshold;
1050 cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
1051}
1052

/* Source: glibc/sysdeps/x86/dl-cacheinfo.h.  */