smpboot.c source code [linux/arch/x86/kernel/smpboot.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* x86 SMP booting functions
4	*
5	* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
6	* (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
7	* Copyright 2001 Andi Kleen, SuSE Labs.
8	*
9	* Much of the core SMP work is based on previous work by Thomas Radke, to
10	* whom a great many thanks are extended.
11	*
12	* Thanks to Intel for making available several different Pentium,
13	* Pentium Pro and Pentium-II/Xeon MP machines.
14	* Original development of Linux SMP code supported by Caldera.
15	*
16	* Fixes
17	* Felix Koop : NR_CPUS used properly
18	* Jose Renau : Handle single CPU case.
19	* Alan Cox : By repeated request 8) - Total BogoMIPS report.
20	* Greg Wright : Fix for kernel stacks panic.
21	* Erich Boleyn : MP v1.4 and additional changes.
22	* Matthias Sattler : Changes for 2.1 kernel map.
23	* Michel Lespinasse : Changes for 2.1 kernel map.
24	* Michael Chastain : Change trampoline.S to gnu as.
25	* Alan Cox : Dumb bug: 'B' step PPro's are fine
26	* Ingo Molnar : Added APIC timers, based on code
27	* from Jose Renau
28	* Ingo Molnar : various cleanups and rewrites
29	* Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
30	* Maciej W. Rozycki : Bits for genuine 82489DX APICs
31	* Andi Kleen : Changed for SMP boot into long mode.
32	* Martin J. Bligh : Added support for multi-quad systems
33	* Dave Jones : Report invalid combinations of Athlon CPUs.
34	* Rusty Russell : Hacked into shape for new "hotplug" boot process.
35	* Andi Kleen : Converted to new state machine.
36	* Ashok Raj : CPU hotplug support
37	* Glauber Costa : i386 and x86_64 integration
38	*/
39
40	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
41
42	#include <linux/init.h>
43	#include <linux/smp.h>
44	#include <linux/export.h>
45	#include <linux/sched.h>
46	#include <linux/sched/topology.h>
47	#include <linux/sched/hotplug.h>
48	#include <linux/sched/task_stack.h>
49	#include <linux/percpu.h>
50	#include <linux/memblock.h>
51	#include <linux/err.h>
52	#include <linux/nmi.h>
53	#include <linux/tboot.h>
54	#include <linux/gfp.h>
55	#include <linux/cpuidle.h>
56	#include <linux/kexec.h>
57	#include <linux/numa.h>
58	#include <linux/pgtable.h>
59	#include <linux/overflow.h>
60	#include <linux/stackprotector.h>
61	#include <linux/cpuhotplug.h>
62	#include <linux/mc146818rtc.h>
63
64	#include <asm/acpi.h>
65	#include <asm/cacheinfo.h>
66	#include <asm/desc.h>
67	#include <asm/nmi.h>
68	#include <asm/irq.h>
69	#include <asm/realmode.h>
70	#include <asm/cpu.h>
71	#include <asm/numa.h>
72	#include <asm/tlbflush.h>
73	#include <asm/mtrr.h>
74	#include <asm/mwait.h>
75	#include <asm/apic.h>
76	#include <asm/io_apic.h>
77	#include <asm/fpu/api.h>
78	#include <asm/setup.h>
79	#include <asm/uv/uv.h>
80	#include <asm/microcode.h>
81	#include <asm/i8259.h>
82	#include <asm/misc.h>
83	#include <asm/qspinlock.h>
84	#include <asm/intel-family.h>
85	#include <asm/cpu_device_id.h>
86	#include <asm/spec-ctrl.h>
87	#include <asm/hw_irq.h>
88	#include <asm/stackprotector.h>
89	#include <asm/sev.h>
90	#include <asm/spec-ctrl.h>
91
92	/ representing HT siblings of each logical CPU /
93	DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
94	EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
95
96	/ representing HT and core siblings of each logical CPU /
97	DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
98	EXPORT_PER_CPU_SYMBOL(cpu_core_map);
99
100	/ representing HT, core, and die siblings of each logical CPU /
101	DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
102	EXPORT_PER_CPU_SYMBOL(cpu_die_map);
103
104	/ CPUs which are the primary SMT threads /
105	struct cpumask __cpu_primary_thread_mask __read_mostly;
106
107	/ Representing CPUs for which sibling maps can be computed /
108	static cpumask_var_t cpu_sibling_setup_mask;
109
110	struct mwait_cpu_dead {
111	unsigned int control;
112	unsigned int status;
113	};
114
115	#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
116	#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
117
118	/*
119	* Cache line aligned data for mwait_play_dead(). Separate on purpose so
120	* that it's unlikely to be touched by other CPUs.
121	*/
122	static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
123
124	/ Maximum number of SMT threads on any online core /
125	int __read_mostly __max_smt_threads = `1`;
126
/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;

/*
 * Report whether a topology update is pending and consume the request.
 *
 * Returns the value x86_topology_update had on entry (non-zero when an
 * update was requested) and clears the flag, so a second call without an
 * intervening request returns 0.
 */
int arch_update_cpu_topology(void)
{
	int retval = x86_topology_update;

	x86_topology_update = false;
	return retval;
}
137
138	static unsigned int smpboot_warm_reset_vector_count;
139
140	static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
141	{
142	unsigned long flags;
143
144	spin_lock_irqsave(&rtc_lock, flags);
145	if (!smpboot_warm_reset_vector_count++) {
146	CMOS_WRITE(`0xa`, `0xf`);
147	((volatile* unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> `4`;
148	((volatile* unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & `0xf`;
149	}
150	spin_unlock_irqrestore(lock: &rtc_lock, flags);
151	}
152
153	static inline void smpboot_restore_warm_reset_vector(void)
154	{
155	unsigned long flags;
156
157	/*
158	* Paranoid: Set warm reset code and vector here back
159	* to default values.
160	*/
161	spin_lock_irqsave(&rtc_lock, flags);
162	if (!--smpboot_warm_reset_vector_count) {
163	CMOS_WRITE(`0`, `0xf`);
164	((volatile* u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = `0`;
165	}
166	spin_unlock_irqrestore(lock: &rtc_lock, flags);
167
168	}
169
170	/ Run the next set of setup steps for the upcoming CPU /
171	static void ap_starting(void)
172	{
173	int cpuid = smp_processor_id();
174
175	/ Mop up eventual mwait_play_dead() wreckage /
176	this_cpu_write(mwait_cpu_dead.status, `0`);
177	this_cpu_write(mwait_cpu_dead.control, `0`);
178
179	/*
180	* If woken up by an INIT in an 82489DX configuration the alive
181	* synchronization guarantees that the CPU does not reach this
182	* point before an INIT_deassert IPI reaches the local APIC, so it
183	* is now safe to touch the local APIC.
184	*
185	* Set up this CPU, first the APIC, which is probably redundant on
186	* most boards.
187	*/
188	apic_ap_setup();
189
190	/ Save the processor parameters. /
191	smp_store_cpu_info(id: cpuid);
192
193	/*
194	* The topology information must be up to date before
195	* notify_cpu_starting().
196	*/
197	set_cpu_sibling_map(cpuid);
198
199	ap_init_aperfmperf();
200
201	pr_debug("Stack at about %p\n", &cpuid);
202
203	wmb();
204
205	/*
206	* This runs the AP through all the cpuhp states to its target
207	* state CPUHP_ONLINE.
208	*/
209	notify_cpu_starting(cpu: cpuid);
210	}
211
212	static void ap_calibrate_delay(void)
213	{
214	/*
215	* Calibrate the delay loop and update loops_per_jiffy in cpu_data.
216	* smp_store_cpu_info() stored a value that is close but not as
217	* accurate as the value just calculated.
218	*
219	* As this is invoked after the TSC synchronization check,
220	* calibrate_delay_is_known() will skip the calibration routine
221	* when TSC is synchronized across sockets.
222	*/
223	calibrate_delay();
224	cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
225	}
226
227	/*
228	* Activate a secondary processor.
229	*/
230	static void notrace start_secondary(void *unused)
231	{
232	/*
233	* Don't put anything except direct CPU state initialization
234	* before cpu_init(), SMP booting is too fragile that we want to
235	* limit the things done here to the most necessary things.
236	*/
237	cr4_init();
238
239	/*
240	* 32-bit specific. 64-bit reaches this code with the correct page
241	* table established. Yet another historical divergence.
242	*/
243	if (IS_ENABLED(CONFIG_X86_32)) {
244	/ switch away from the initial page table /
245	load_cr3(swapper_pg_dir);
246	__flush_tlb_all();
247	}
248
249	cpu_init_exception_handling();
250
251	/*
252	* Load the microcode before reaching the AP alive synchronization
253	* point below so it is not part of the full per CPU serialized
254	* bringup part when "parallel" bringup is enabled.
255	*
256	* That's even safe when hyperthreading is enabled in the CPU as
257	* the core code starts the primary threads first and leaves the
258	* secondary threads waiting for SIPI. Loading microcode on
259	* physical cores concurrently is a safe operation.
260	*
261	* This covers both the Intel specific issue that concurrent
262	* microcode loading on SMT siblings must be prohibited and the
263	* vendor independent issue`that microcode loading which changes
264	* CPUID, MSRs etc. must be strictly serialized to maintain
265	* software state correctness.
266	*/
267	load_ucode_ap();
268
269	/*
270	* Synchronization point with the hotplug core. Sets this CPUs
271	* synchronization state to ALIVE and spin-waits for the control CPU to
272	* release this CPU for further bringup.
273	*/
274	cpuhp_ap_sync_alive();
275
276	cpu_init();
277	fpu__init_cpu();
278	rcutree_report_cpu_starting(raw_smp_processor_id());
279	x86_cpuinit.early_percpu_clock_init();
280
281	ap_starting();
282
283	/ Check TSC synchronization with the control CPU. /
284	check_tsc_sync_target();
285
286	/*
287	* Calibrate the delay loop after the TSC synchronization check.
288	* This allows to skip the calibration when TSC is synchronized
289	* across sockets.
290	*/
291	ap_calibrate_delay();
292
293	speculative_store_bypass_ht_init();
294
295	/*
296	* Lock vector_lock, set CPU online and bring the vector
297	* allocator online. Online must be set with vector_lock held
298	* to prevent a concurrent irq setup/teardown from seeing a
299	* half valid vector space.
300	*/
301	lock_vector_lock();
302	set_cpu_online(smp_processor_id(), online: true);
303	lapic_online();
304	unlock_vector_lock();
305	x86_platform.nmi_init();
306
307	/ enable local interrupts /
308	local_irq_enable();
309
310	x86_cpuinit.setup_percpu_clockev();
311
312	wmb();
313	cpu_startup_entry(state: CPUHP_AP_ONLINE_IDLE);
314	}
315
316	/*
317	* The bootstrap kernel entry code has set these up. Save them for
318	* a given CPU
319	*/
320	void smp_store_cpu_info(int id)
321	{
322	struct cpuinfo_x86 *c = &cpu_data(id);
323
324	/ Copy boot_cpu_data only on the first bringup /
325	if (!c->initialized)
326	*c = boot_cpu_data;
327	c->cpu_index = id;
328	/*
329	* During boot time, CPU0 has this setup already. Save the info when
330	* bringing up an AP.
331	*/
332	identify_secondary_cpu(c);
333	c->initialized = true;
334	}
335
336	static bool
337	topology_same_node(struct cpuinfo_x86 c, struct* cpuinfo_x86 *o)
338	{
339	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
340
341	return (cpu_to_node(cpu: cpu1) == cpu_to_node(cpu: cpu2));
342	}
343
344	static bool
345	topology_sane(struct cpuinfo_x86 c, struct* cpuinfo_x86 o, const* char *name)
346	{
347	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
348
349	return !WARN_ONCE(!topology_same_node(c, o),
350	"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
351	"[node: %d != %d]. Ignoring dependency.\n",
352	cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
353	}
354
/*
 * Make CPUs @c1 and @c2 members of each other's mask, where @mfunc maps a
 * CPU number to the cpumask to update. Each argument is evaluated twice.
 */
#define link_mask(mfunc, c1, c2)					\
do {									\
	cpumask_set_cpu((c1), mfunc(c2));				\
	cpumask_set_cpu((c2), mfunc(c1));				\
} while (0)
360
361	static bool match_smt(struct cpuinfo_x86 c, struct* cpuinfo_x86 *o)
362	{
363	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
364	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
365
366	if (c->topo.pkg_id == o->topo.pkg_id &&
367	c->topo.die_id == o->topo.die_id &&
368	c->topo.amd_node_id == o->topo.amd_node_id &&
369	per_cpu_llc_id(cpu: cpu1) == per_cpu_llc_id(cpu: cpu2)) {
370	if (c->topo.core_id == o->topo.core_id)
371	return topology_sane(c, o, name: "smt");
372
373	if ((c->topo.cu_id != `0xff`) &&
374	(o->topo.cu_id != `0xff`) &&
375	(c->topo.cu_id == o->topo.cu_id))
376	return topology_sane(c, o, name: "smt");
377	}
378
379	} else if (c->topo.pkg_id == o->topo.pkg_id &&
380	c->topo.die_id == o->topo.die_id &&
381	c->topo.core_id == o->topo.core_id) {
382	return topology_sane(c, o, name: "smt");
383	}
384
385	return false;
386	}
387
388	static bool match_die(struct cpuinfo_x86 c, struct* cpuinfo_x86 *o)
389	{
390	if (c->topo.pkg_id != o->topo.pkg_id \|\| c->topo.die_id != o->topo.die_id)
391	return false;
392
393	if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > `1`)
394	return c->topo.amd_node_id == o->topo.amd_node_id;
395
396	return true;
397	}
398
399	static bool match_l2c(struct cpuinfo_x86 c, struct* cpuinfo_x86 *o)
400	{
401	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
402
403	/ If the arch didn't set up l2c_id, fall back to SMT /
404	if (per_cpu_l2c_id(cpu: cpu1) == BAD_APICID)
405	return match_smt(c, o);
406
407	/ Do not match if L2 cache id does not match: /
408	if (per_cpu_l2c_id(cpu: cpu1) != per_cpu_l2c_id(cpu: cpu2))
409	return false;
410
411	return topology_sane(c, o, name: "l2c");
412	}
413
414	/*
415	* Unlike the other levels, we do not enforce keeping a
416	* multicore group inside a NUMA node. If this happens, we will
417	* discard the MC level of the topology later.
418	*/
419	static bool match_pkg(struct cpuinfo_x86 c, struct* cpuinfo_x86 *o)
420	{
421	if (c->topo.pkg_id == o->topo.pkg_id)
422	return true;
423	return false;
424	}
425
426	/*
427	* Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
428	*
429	* Any Intel CPU that has multiple nodes per package and does not
430	* match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
431	*
432	* When in SNC mode, these CPUs enumerate an LLC that is shared
433	* by multiple NUMA nodes. The LLC is shared for off-package data
434	* access but private to the NUMA node (half of the package) for
435	* on-package access. CPUID (the source of the information about
436	* the LLC) can only enumerate the cache as shared or unshared,
437	* but not this particular configuration.
438	*/
439
440	static const struct x86_cpu_id intel_cod_cpu[] = {
441	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, `0`), / COD /
442	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, `0`), / COD /
443	X86_MATCH_INTEL_FAM6_MODEL(ANY, `1`), / SNC /
444	{}
445	};
446
447	static bool match_llc(struct cpuinfo_x86 c, struct* cpuinfo_x86 *o)
448	{
449	const struct x86_cpu_id *id = x86_match_cpu(match: intel_cod_cpu);
450	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
451	bool intel_snc = id && id->driver_data;
452
453	/ Do not match if we do not have a valid APICID for cpu: /
454	if (per_cpu_llc_id(cpu: cpu1) == BAD_APICID)
455	return false;
456
457	/ Do not match if LLC id does not match: /
458	if (per_cpu_llc_id(cpu: cpu1) != per_cpu_llc_id(cpu: cpu2))
459	return false;
460
461	/*
462	* Allow the SNC topology without warning. Return of false
463	* means 'c' does not share the LLC of 'o'. This will be
464	* reflected to userspace.
465	*/
466	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
467	return false;
468
469	return topology_sane(c, o, name: "llc");
470	}
471
472
473	static inline int x86_sched_itmt_flags(void)
474	{
475	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : `0`;
476	}
477
#ifdef CONFIG_SCHED_MC
/* Flags for the MC level: generic core flags plus the ITMT flags. */
static int x86_core_flags(void)
{
	return cpu_core_flags() | x86_sched_itmt_flags();
}
#endif
#ifdef CONFIG_SCHED_SMT
/* Flags for the SMT level: exactly what cpu_smt_flags() reports. */
static int x86_smt_flags(void)
{
	int flags = cpu_smt_flags();

	return flags;
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
/* Flags for the CLS level: generic cluster flags plus the ITMT flags. */
static int x86_cluster_flags(void)
{
	return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif
496
497	static int x86_die_flags(void)
498	{
499	if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
500	return x86_sched_itmt_flags();
501
502	return `0`;
503	}
504
505	/*
506	* Set if a package/die has multiple NUMA nodes inside.
507	* AMD Magny-Cours, Intel Cluster-on-Die, and Intel
508	* Sub-NUMA Clustering have this.
509	*/
510	static bool x86_has_numa_in_package;
511
512	static struct sched_domain_topology_level x86_topology[`6`];
513
514	static void __init build_sched_topology(void)
515	{
516	int i = `0`;
517
518	#ifdef CONFIG_SCHED_SMT
519	x86_topology[i++] = (struct sched_domain_topology_level){
520	cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
521	};
522	#endif
523	#ifdef CONFIG_SCHED_CLUSTER
524	x86_topology[i++] = (struct sched_domain_topology_level){
525	cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
526	};
527	#endif
528	#ifdef CONFIG_SCHED_MC
529	x86_topology[i++] = (struct sched_domain_topology_level){
530	cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
531	};
532	#endif
533	/*
534	* When there is NUMA topology inside the package skip the PKG domain
535	* since the NUMA domains will auto-magically create the right spanning
536	* domains based on the SLIT.
537	*/
538	if (!x86_has_numa_in_package) {
539	x86_topology[i++] = (struct sched_domain_topology_level){
540	cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
541	};
542	}
543
544	/*
545	* There must be one trailing NULL entry left.
546	*/
547	BUG_ON(i >= ARRAY_SIZE(x86_topology)-`1`);
548
549	set_sched_topology(x86_topology);
550	}
551
552	void set_cpu_sibling_map(int cpu)
553	{
554	bool has_smt = __max_threads_per_core > `1`;
555	bool has_mp = has_smt \|\| topology_num_cores_per_package() > `1`;
556	struct cpuinfo_x86 *c = &cpu_data(cpu);
557	struct cpuinfo_x86 *o;
558	int i, threads;
559
560	cpumask_set_cpu(cpu, dstp: cpu_sibling_setup_mask);
561
562	if (!has_mp) {
563	cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
564	cpumask_set_cpu(cpu, dstp: cpu_llc_shared_mask(cpu));
565	cpumask_set_cpu(cpu, dstp: cpu_l2c_shared_mask(cpu));
566	cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
567	cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
568	c->booted_cores = `1`;
569	return;
570	}
571
572	for_each_cpu(i, cpu_sibling_setup_mask) {
573	o = &cpu_data(i);
574
575	if (match_pkg(c, o) && !topology_same_node(c, o))
576	x86_has_numa_in_package = true;
577
578	if ((i == cpu) \|\| (has_smt && match_smt(c, o)))
579	link_mask(topology_sibling_cpumask, cpu, i);
580
581	if ((i == cpu) \|\| (has_mp && match_llc(c, o)))
582	link_mask(cpu_llc_shared_mask, cpu, i);
583
584	if ((i == cpu) \|\| (has_mp && match_l2c(c, o)))
585	link_mask(cpu_l2c_shared_mask, cpu, i);
586
587	if ((i == cpu) \|\| (has_mp && match_die(c, o)))
588	link_mask(topology_die_cpumask, cpu, i);
589	}
590
591	threads = cpumask_weight(topology_sibling_cpumask(cpu));
592	if (threads > __max_smt_threads)
593	__max_smt_threads = threads;
594
595	for_each_cpu(i, topology_sibling_cpumask(cpu))
596	cpu_data(i).smt_active = threads > `1`;
597
598	/*
599	* This needs a separate iteration over the cpus because we rely on all
600	* topology_sibling_cpumask links to be set-up.
601	*/
602	for_each_cpu(i, cpu_sibling_setup_mask) {
603	o = &cpu_data(i);
604
605	if ((i == cpu) \|\| (has_mp && match_pkg(c, o))) {
606	link_mask(topology_core_cpumask, cpu, i);
607
608	/*
609	* Does this new cpu bringup a new core?
610	*/
611	if (threads == `1`) {
612	/*
613	* for each core in package, increment
614	* the booted_cores for this new cpu
615	*/
616	if (cpumask_first(
617	topology_sibling_cpumask(i)) == i)
618	c->booted_cores++;
619	/*
620	* increment the core count for all
621	* the other cpus in this package
622	*/
623	if (i != cpu)
624	cpu_data(i).booted_cores++;
625	} else if (i != cpu && !c->booted_cores)
626	c->booted_cores = cpu_data(i).booted_cores;
627	}
628	}
629	}
630
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
	return cpu_llc_shared_mask(cpu);
}
636
/* Cluster-level mask for @cpu: the CPUs recorded as sharing its L2 cache. */
const struct cpumask *cpu_clustergroup_mask(int cpu)
{
	return cpu_l2c_shared_mask(cpu);
}
EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
642
643	static void impress_friends(void)
644	{
645	int cpu;
646	unsigned long bogosum = `0`;
647	/*
648	* Allow the user to impress friends.
649	*/
650	pr_debug("Before bogomips\n");
651	for_each_online_cpu(cpu)
652	bogosum += cpu_data(cpu).loops_per_jiffy;
653
654	pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
655	num_online_cpus(),
656	bogosum/(`500000`/HZ),
657	(bogosum/(`5000`/HZ))%`100`);
658
659	pr_debug("Before bogocount - setting activated=1\n");
660	}
661
662	/*
663	* The Multiprocessor Specification 1.4 (1997) example code suggests
664	* that there should be a 10ms delay between the BSP asserting INIT
665	* and de-asserting INIT, when starting a remote processor.
666	* But that slows boot and resume on modern processors, which include
667	* many cores and don't require that delay.
668	*
669	* Cmdline "init_cpu_udelay=" is available to over-ride this delay.
670	* Modern processor families are quirked to remove the delay entirely.
671	*/
672	#define UDELAY_10MS_DEFAULT 10000
673
674	static unsigned int init_udelay = UINT_MAX;
675
676	static int __init cpu_init_udelay(char *str)
677	{
678	get_option(str: &str, pint: &init_udelay);
679
680	return `0`;
681	}
682	early_param("cpu_init_udelay", cpu_init_udelay);
683
684	static void __init smp_quirk_init_udelay(void)
685	{
686	/ if cmdline changed it from default, leave it alone /
687	if (init_udelay != UINT_MAX)
688	return;
689
690	/ if modern processor, use no delay /
691	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == `6`)) \|\|
692	((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= `0x18`)) \|\|
693	((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= `0xF`))) {
694	init_udelay = `0`;
695	return;
696	}
697	/ else, use legacy delay /
698	init_udelay = UDELAY_10MS_DEFAULT;
699	}
700
701	/*
702	* Wake up AP by INIT, INIT, STARTUP sequence.
703	*/
704	static void send_init_sequence(u32 phys_apicid)
705	{
706	int maxlvt = lapic_get_maxlvt();
707
708	/ Be paranoid about clearing APIC errors. /
709	if (APIC_INTEGRATED(boot_cpu_apic_version)) {
710	/ Due to the Pentium erratum 3AP. /
711	if (maxlvt > `3`)
712	apic_write(APIC_ESR, val: `0`);
713	apic_read(APIC_ESR);
714	}
715
716	/ Assert INIT on the target CPU /
717	apic_icr_write(APIC_INT_LEVELTRIG \| APIC_INT_ASSERT \| APIC_DM_INIT, high: phys_apicid);
718	safe_apic_wait_icr_idle();
719
720	udelay(init_udelay);
721
722	/ Deassert INIT on the target CPU /
723	apic_icr_write(APIC_INT_LEVELTRIG \| APIC_DM_INIT, high: phys_apicid);
724	safe_apic_wait_icr_idle();
725	}
726
727	/*
728	* Wake up AP by INIT, INIT, STARTUP sequence.
729	*/
730	static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip)
731	{
732	unsigned long send_status = `0`, accept_status = `0`;
733	int num_starts, j, maxlvt;
734
735	preempt_disable();
736	maxlvt = lapic_get_maxlvt();
737	send_init_sequence(phys_apicid);
738
739	mb();
740
741	/*
742	* Should we send STARTUP IPIs ?
743	*
744	* Determine this based on the APIC version.
745	* If we don't have an integrated APIC, don't send the STARTUP IPIs.
746	*/
747	if (APIC_INTEGRATED(boot_cpu_apic_version))
748	num_starts = `2`;
749	else
750	num_starts = `0`;
751
752	/*
753	* Run STARTUP IPI loop.
754	*/
755	pr_debug("#startup loops: %d\n", num_starts);
756
757	for (j = `1`; j <= num_starts; j++) {
758	pr_debug("Sending STARTUP #%d\n", j);
759	if (maxlvt > `3`) / Due to the Pentium erratum 3AP. /
760	apic_write(APIC_ESR, val: `0`);
761	apic_read(APIC_ESR);
762	pr_debug("After apic_write\n");
763
764	/*
765	* STARTUP IPI
766	*/
767
768	/ Target chip /
769	/ Boot on the stack /
770	/ Kick the second /
771	apic_icr_write(APIC_DM_STARTUP \| (start_eip >> `12`),
772	high: phys_apicid);
773
774	/*
775	* Give the other CPU some time to accept the IPI.
776	*/
777	if (init_udelay == `0`)
778	udelay(`10`);
779	else
780	udelay(`300`);
781
782	pr_debug("Startup point 1\n");
783
784	pr_debug("Waiting for send to finish...\n");
785	send_status = safe_apic_wait_icr_idle();
786
787	/*
788	* Give the other CPU some time to accept the IPI.
789	*/
790	if (init_udelay == `0`)
791	udelay(`10`);
792	else
793	udelay(`200`);
794
795	if (maxlvt > `3`) / Due to the Pentium erratum 3AP. /
796	apic_write(APIC_ESR, val: `0`);
797	accept_status = (apic_read(APIC_ESR) & `0xEF`);
798	if (send_status \|\| accept_status)
799	break;
800	}
801	pr_debug("After Startup\n");
802
803	if (send_status)
804	pr_err("APIC never delivered???\n");
805	if (accept_status)
806	pr_err("APIC delivery error (%lx)\n", accept_status);
807
808	preempt_enable();
809	return (send_status \| accept_status);
810	}
811
812	/ reduce the number of lines printed when booting a large cpu count system /
813	static void announce_cpu(int cpu, int apicid)
814	{
815	static int width, node_width, first = `1`;
816	static int current_node = NUMA_NO_NODE;
817	int node = early_cpu_to_node(cpu);
818
819	if (!width)
820	width = num_digits(num_possible_cpus()) + `1`; / + '#' sign /
821
822	if (!node_width)
823	node_width = num_digits(num_possible_nodes()) + `1`; / + '#' /
824
825	if (system_state < SYSTEM_RUNNING) {
826	if (first)
827	pr_info("x86: Booting SMP configuration:\n");
828
829	if (node != current_node) {
830	if (current_node > (-`1`))
831	pr_cont("\n");
832	current_node = node;
833
834	printk(KERN_INFO ".... node %*s#%d, CPUs: ",
835	node_width - num_digits(node), " ", node);
836	}
837
838	/ Add padding for the BSP /
839	if (first)
840	pr_cont("%*s", width + `1`, " ");
841	first = `0`;
842
843	pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
844	} else
845	pr_info("Booting Node %d Processor %d APIC 0x%x\n",
846	node, cpu, apicid);
847	}
848
849	int common_cpu_up(unsigned int cpu, struct task_struct *idle)
850	{
851	int ret;
852
853	/ Just in case we booted with a single CPU. /
854	alternatives_enable_smp();
855
856	per_cpu(pcpu_hot.current_task, cpu) = idle;
857	cpu_init_stack_canary(cpu, idle);
858
859	/ Initialize the interrupt stack(s) /
860	ret = irq_init_percpu_irqstack(cpu);
861	if (ret)
862	return ret;
863
864	#ifdef CONFIG_X86_32
865	/ Stack for startup_32 can be just as for start_secondary onwards /
866	per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
867	#endif
868	return `0`;
869	}
870
871	/*
872	* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
873	* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
874	* Returns zero if startup was successfully sent, else error code from
875	* ->wakeup_secondary_cpu.
876	*/
877	static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
878	{
879	unsigned long start_ip = real_mode_header->trampoline_start;
880	int ret;
881
882	#ifdef CONFIG_X86_64
883	/ If 64-bit wakeup method exists, use the 64-bit mode trampoline IP /
884	if (apic->wakeup_secondary_cpu_64)
885	start_ip = real_mode_header->trampoline_start64;
886	#endif
887	idle->thread.sp = (unsigned long)task_pt_regs(idle);
888	initial_code = (unsigned long)start_secondary;
889
890	if (IS_ENABLED(CONFIG_X86_32)) {
891	early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
892	initial_stack = idle->thread.sp;
893	} else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
894	smpboot_control = cpu;
895	}
896
897	/ Enable the espfix hack for this CPU /
898	init_espfix_ap(cpu);
899
900	/ So we see what's up /
901	announce_cpu(cpu, apicid);
902
903	/*
904	* This grunge runs the startup process for
905	* the targeted processor.
906	*/
907	if (x86_platform.legacy.warm_reset) {
908
909	pr_debug("Setting warm reset code and vector.\n");
910
911	smpboot_setup_warm_reset_vector(start_eip: start_ip);
912	/*
913	* Be paranoid about clearing APIC errors.
914	*/
915	if (APIC_INTEGRATED(boot_cpu_apic_version)) {
916	apic_write(APIC_ESR, val: `0`);
917	apic_read(APIC_ESR);
918	}
919	}
920
921	smp_mb();
922
923	/*
924	* Wake up a CPU in difference cases:
925	* - Use a method from the APIC driver if one defined, with wakeup
926	* straight to 64-bit mode preferred over wakeup to RM.
927	* Otherwise,
928	* - Use an INIT boot APIC message
929	*/
930	if (apic->wakeup_secondary_cpu_64)
931	ret = apic->wakeup_secondary_cpu_64(apicid, start_ip);
932	else if (apic->wakeup_secondary_cpu)
933	ret = apic->wakeup_secondary_cpu(apicid, start_ip);
934	else
935	ret = wakeup_secondary_cpu_via_init(phys_apicid: apicid, start_eip: start_ip);
936
937	/ If the wakeup mechanism failed, cleanup the warm reset vector /
938	if (ret)
939	arch_cpuhp_cleanup_kick_cpu(cpu);
940	return ret;
941	}
942
943	int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
944	{
945	u32 apicid = apic->cpu_present_to_apicid(cpu);
946	int err;
947
948	lockdep_assert_irqs_enabled();
949
950	pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
951
952	if (apicid == BAD_APICID \|\| !apic_id_valid(apic_id: apicid)) {
953	pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
954	return -EINVAL;
955	}
956
957	if (!test_bit(apicid, phys_cpu_present_map)) {
958	pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
959	return -EINVAL;
960	}
961
962	/*
963	* Save current MTRR state in case it was changed since early boot
964	* (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
965	*/
966	mtrr_save_state();
967
968	/ the FPU context is blank, nobody can own it /
969	per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
970
971	err = common_cpu_up(cpu, idle: tidle);
972	if (err)
973	return err;
974
975	err = do_boot_cpu(apicid, cpu, idle: tidle);
976	if (err)
977	pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
978
979	return err;
980	}
981
982	int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle)
983	{
984	return smp_ops.kick_ap_alive(cpu, tidle);
985	}
986
987	void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu)
988	{
989	/ Cleanup possible dangling ends... /
990	if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset)
991	smpboot_restore_warm_reset_vector();
992	}
993
994	void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu)
995	{
996	if (smp_ops.cleanup_dead_cpu)
997	smp_ops.cleanup_dead_cpu(cpu);
998
999	if (system_state == SYSTEM_RUNNING)
1000	pr_info("CPU %u is now offline\n", cpu);
1001	}
1002
1003	void arch_cpuhp_sync_state_poll(void)
1004	{
1005	if (smp_ops.poll_sync_state)
1006	smp_ops.poll_sync_state();
1007	}
1008
1009	/**
1010	* arch_disable_smp_support() - Disables SMP support for x86 at boottime
1011	*/
1012	void __init arch_disable_smp_support(void)
1013	{
1014	disable_ioapic_support();
1015	}
1016
1017	/*
1018	* Fall back to non SMP mode after errors.
1019	*
1020	* RED-PEN audit/test this more. I bet there is more state messed up here.
1021	*/
1022	static __init void disable_smp(void)
1023	{
1024	pr_info("SMP disabled\n");
1025
1026	disable_ioapic_support();
1027	topology_reset_possible_cpus_up();
1028
1029	cpumask_set_cpu(cpu: `0`, topology_sibling_cpumask(`0`));
1030	cpumask_set_cpu(cpu: `0`, topology_core_cpumask(`0`));
1031	cpumask_set_cpu(cpu: `0`, topology_die_cpumask(`0`));
1032	}
1033
1034	void __init smp_prepare_cpus_common(void)
1035	{
1036	unsigned int i;
1037
1038	/ Mark all except the boot CPU as hotpluggable /
1039	for_each_possible_cpu(i) {
1040	if (i)
1041	per_cpu(cpu_info.cpu_index, i) = nr_cpu_ids;
1042	}
1043
1044	for_each_possible_cpu(i) {
1045	zalloc_cpumask_var(mask: &per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1046	zalloc_cpumask_var(mask: &per_cpu(cpu_core_map, i), GFP_KERNEL);
1047	zalloc_cpumask_var(mask: &per_cpu(cpu_die_map, i), GFP_KERNEL);
1048	zalloc_cpumask_var(mask: &per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1049	zalloc_cpumask_var(mask: &per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
1050	}
1051
1052	set_cpu_sibling_map(`0`);
1053	}
1054
1055	void __init smp_prepare_boot_cpu(void)
1056	{
1057	smp_ops.smp_prepare_boot_cpu();
1058	}
1059
#ifdef CONFIG_X86_64
/*
 * Establish whether parallel bringup can be supported.
 *
 * Returns false when the platform has cleared x86_cpuinit.parallel_bringup.
 * Otherwise sets smpboot_control to STARTUP_READ_APICID and returns true.
 */
bool __init arch_cpuhp_init_parallel_bringup(void)
{
	if (!x86_cpuinit.parallel_bringup) {
		pr_info("Parallel CPU startup disabled by the platform\n");
		return false;
	}

	smpboot_control = STARTUP_READ_APICID;
	pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control);
	return true;
}
#endif
1074
1075	/*
1076	* Prepare for SMP bootup.
1077	* @max_cpus: configured maximum number of CPUs, It is a legacy parameter
1078	* for common interface support.
1079	*/
1080	void __init native_smp_prepare_cpus(unsigned int max_cpus)
1081	{
1082	smp_prepare_cpus_common();
1083
1084	switch (apic_intr_mode) {
1085	case APIC_PIC:
1086	case APIC_VIRTUAL_WIRE_NO_CONFIG:
1087	disable_smp();
1088	return;
1089	case APIC_SYMMETRIC_IO_NO_ROUTING:
1090	disable_smp();
1091	/ Setup local timer /
1092	x86_init.timers.setup_percpu_clockev();
1093	return;
1094	case APIC_VIRTUAL_WIRE:
1095	case APIC_SYMMETRIC_IO:
1096	break;
1097	}
1098
1099	/ Setup local timer /
1100	x86_init.timers.setup_percpu_clockev();
1101
1102	pr_info("CPU0: ");
1103	print_cpu_info(&cpu_data(`0`));
1104
1105	uv_system_init();
1106
1107	smp_quirk_init_udelay();
1108
1109	speculative_store_bypass_ht_init();
1110
1111	snp_set_wakeup_secondary_cpu();
1112	}
1113
1114	void arch_thaw_secondary_cpus_begin(void)
1115	{
1116	set_cache_aps_delayed_init(true);
1117	}
1118
/*
 * End of secondary CPU thaw: run cache_aps_init(), counterpart of the
 * set_cache_aps_delayed_init(true) issued by arch_thaw_secondary_cpus_begin().
 */
void arch_thaw_secondary_cpus_end(void)
{
	cache_aps_init();
}
1123
1124	/*
1125	* Early setup to make printk work.
1126	*/
1127	void __init native_smp_prepare_boot_cpu(void)
1128	{
1129	int me = smp_processor_id();
1130
1131	/ SMP handles this from setup_per_cpu_areas() /
1132	if (!IS_ENABLED(CONFIG_SMP))
1133	switch_gdt_and_percpu_base(me);
1134
1135	native_pv_lock_init();
1136	}
1137
1138	void __init native_smp_cpus_done(unsigned int max_cpus)
1139	{
1140	pr_debug("Boot done\n");
1141
1142	build_sched_topology();
1143	nmi_selftest();
1144	impress_friends();
1145	cache_aps_init();
1146	}
1147
1148	/ correctly size the local cpu masks /
1149	void __init setup_cpu_local_masks(void)
1150	{
1151	alloc_bootmem_cpumask_var(mask: &cpu_sibling_setup_mask);
1152	}
1153
1154	#ifdef CONFIG_HOTPLUG_CPU
1155
1156	/ Recompute SMT state for all CPUs on offline /
1157	static void recompute_smt_state(void)
1158	{
1159	int max_threads, cpu;
1160
1161	max_threads = `0`;
1162	for_each_online_cpu (cpu) {
1163	int threads = cpumask_weight(topology_sibling_cpumask(cpu));
1164
1165	if (threads > max_threads)
1166	max_threads = threads;
1167	}
1168	__max_smt_threads = max_threads;
1169	}
1170
1171	static void remove_siblinginfo(int cpu)
1172	{
1173	int sibling;
1174	struct cpuinfo_x86 *c = &cpu_data(cpu);
1175
1176	for_each_cpu(sibling, topology_core_cpumask(cpu)) {
1177	cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
1178	//*
1179	* last thread sibling in this cpu core going down
1180	*/
1181	if (cpumask_weight(topology_sibling_cpumask(cpu)) == `1`)
1182	cpu_data(sibling).booted_cores--;
1183	}
1184
1185	for_each_cpu(sibling, topology_die_cpumask(cpu))
1186	cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
1187
1188	for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
1189	cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
1190	if (cpumask_weight(topology_sibling_cpumask(sibling)) == `1`)
1191	cpu_data(sibling).smt_active = false;
1192	}
1193
1194	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
1195	cpumask_clear_cpu(cpu, dstp: cpu_llc_shared_mask(cpu: sibling));
1196	for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
1197	cpumask_clear_cpu(cpu, dstp: cpu_l2c_shared_mask(cpu: sibling));
1198	cpumask_clear(dstp: cpu_llc_shared_mask(cpu));
1199	cpumask_clear(dstp: cpu_l2c_shared_mask(cpu));
1200	cpumask_clear(topology_sibling_cpumask(cpu));
1201	cpumask_clear(topology_core_cpumask(cpu));
1202	cpumask_clear(topology_die_cpumask(cpu));
1203	c->topo.core_id = `0`;
1204	c->booted_cores = `0`;
1205	cpumask_clear_cpu(cpu, dstp: cpu_sibling_setup_mask);
1206	recompute_smt_state();
1207	}
1208
1209	static void remove_cpu_from_maps(int cpu)
1210	{
1211	set_cpu_online(cpu, online: false);
1212	numa_remove_cpu(cpu);
1213	}
1214
/*
 * Common part of taking the current CPU offline: drop its sibling info,
 * remove it from the online/NUMA maps under the vector lock, then call
 * fixup_irqs() and lapic_offline().
 */
void cpu_disable_common(void)
{
	int cpu = smp_processor_id();

	remove_siblinginfo(cpu);

	/* It's now safe to remove this processor from the online map */
	lock_vector_lock();
	remove_cpu_from_maps(cpu);
	unlock_vector_lock();
	fixup_irqs();
	lapic_offline();
}
1228
/*
 * Take the current CPU offline.
 *
 * Return: the non-zero result of lapic_can_unplug_cpu() when that check
 * fails (nothing else is done in that case), 0 otherwise.
 */
int native_cpu_disable(void)
{
	int ret;

	ret = lapic_can_unplug_cpu();
	if (ret)
		return ret;

	cpu_disable_common();

	/*
	 * Disable the local APIC. Otherwise IPI broadcasts will reach
	 * it. It still responds normally to INIT, NMI, SMI, and SIPI
	 * messages.
	 *
	 * Disabling the APIC must happen after cpu_disable_common()
	 * which invokes fixup_irqs().
	 *
	 * Disabling the APIC preserves already set bits in IRR, but
	 * an interrupt arriving after disabling the local APIC does not
	 * set the corresponding IRR bit.
	 *
	 * fixup_irqs() scans IRR for set bits so it can raise a not
	 * yet handled interrupt on the new destination CPU via an IPI
	 * but obviously it can't do so for IRR bits which are not set.
	 * IOW, interrupts arriving after disabling the local APIC will
	 * be lost.
	 */
	apic_soft_disable();

	return 0;
}
1261
/*
 * Shared first stage of playing dead: leave the idle task context, report
 * this AP as dead to the hotplug core, and disable local interrupts.
 */
void play_dead_common(void)
{
	idle_task_exit();

	cpuhp_ap_report_dead();

	/* Interrupts are switched off only after the dead report is made. */
	local_irq_disable();
}
1270
1271	/*
1272	* We need to flush the caches before going to sleep, lest we have
1273	* dirty data in our caches when we come back up.
1274	*/
1275	static inline void mwait_play_dead(void)
1276	{
1277	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
1278	unsigned int eax, ebx, ecx, edx;
1279	unsigned int highest_cstate = `0`;
1280	unsigned int highest_subcstate = `0`;
1281	int i;
1282
1283	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD \|\|
1284	boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
1285	return;
1286	if (!this_cpu_has(X86_FEATURE_MWAIT))
1287	return;
1288	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
1289	return;
1290	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1291	return;
1292
1293	eax = CPUID_MWAIT_LEAF;
1294	ecx = `0`;
1295	native_cpuid(eax: &eax, ebx: &ebx, ecx: &ecx, edx: &edx);
1296
1297	/*
1298	* eax will be 0 if EDX enumeration is not valid.
1299	* Initialized below to cstate, sub_cstate value when EDX is valid.
1300	*/
1301	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1302	eax = `0`;
1303	} else {
1304	edx >>= MWAIT_SUBSTATE_SIZE;
1305	for (i = `0`; i < `7` && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1306	if (edx & MWAIT_SUBSTATE_MASK) {
1307	highest_cstate = i;
1308	highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1309	}
1310	}
1311	eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) \|
1312	(highest_subcstate - `1`);
1313	}
1314
1315	/ Set up state for the kexec() hack below /
1316	md->status = CPUDEAD_MWAIT_WAIT;
1317	md->control = CPUDEAD_MWAIT_WAIT;
1318
1319	wbinvd();
1320
1321	while (`1`) {
1322	/*
1323	* The CLFLUSH is a workaround for erratum AAI65 for
1324	* the Xeon 7400 series. It's not clear it is actually
1325	* needed, but it should be harmless in either case.
1326	* The WBINVD is insufficient due to the spurious-wakeup
1327	* case where we return around the loop.
1328	*/
1329	mb();
1330	clflush(p: md);
1331	mb();
1332	__monitor(eax: md, ecx: `0`, edx: `0`);
1333	mb();
1334	__mwait(eax, ecx: `0`);
1335
1336	if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
1337	/*
1338	* Kexec is about to happen. Don't go back into mwait() as
1339	* the kexec kernel might overwrite text and data including
1340	* page tables and stack. So mwait() would resume when the
1341	* monitor cache line is written to and then the CPU goes
1342	* south due to overwritten text, page tables and stack.
1343	*
1344	* Note: This does _NOT_ protect against a stray MCE, NMI,
1345	* SMI. They will resume execution at the instruction
1346	* following the HLT instruction and run into the problem
1347	* which this is trying to prevent.
1348	*/
1349	WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
1350	while(`1`)
1351	native_halt();
1352	}
1353	}
1354	}
1355
1356	/*
1357	* Kick all "offline" CPUs out of mwait on kexec(). See comment in
1358	* mwait_play_dead().
1359	*/
1360	void smp_kick_mwait_play_dead(void)
1361	{
1362	u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
1363	struct mwait_cpu_dead *md;
1364	unsigned int cpu, i;
1365
1366	for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
1367	md = per_cpu_ptr(&mwait_cpu_dead, cpu);
1368
1369	/ Does it sit in mwait_play_dead() ? /
1370	if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
1371	continue;
1372
1373	/ Wait up to 5ms /
1374	for (i = `0`; READ_ONCE(md->status) != newstate && i < `1000`; i++) {
1375	/ Bring it out of mwait /
1376	WRITE_ONCE(md->control, newstate);
1377	udelay(`5`);
1378	}
1379
1380	if (READ_ONCE(md->status) != newstate)
1381	pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
1382	}
1383	}
1384
1385	void __noreturn hlt_play_dead(void)
1386	{
1387	if (__this_cpu_read(cpu_info.x86) >= `4`)
1388	wbinvd();
1389
1390	while (`1`)
1391	native_halt();
1392	}
1393
1394	/*
1395	* native_play_dead() is essentially a __noreturn function, but it can't
1396	* be marked as such as the compiler may complain about it.
1397	*/
1398	void native_play_dead(void)
1399	{
1400	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
1401	__update_spec_ctrl(val: `0`);
1402
1403	play_dead_common();
1404	tboot_shutdown(shutdown_type: TB_SHUTDOWN_WFS);
1405
1406	mwait_play_dead();
1407	if (cpuidle_play_dead())
1408	hlt_play_dead();
1409	}
1410
1411	#else /* ... !CONFIG_HOTPLUG_CPU */
1412	int native_cpu_disable(void)
1413	{
1414	return -ENOSYS;
1415	}
1416
/*
 * Stub for builds without CONFIG_HOTPLUG_CPU: reaching this function is
 * treated as a kernel bug.
 */
void native_play_dead(void)
{
	BUG();
}
1421
1422	#endif
1423

source code of linux/arch/x86/kernel/smpboot.c