1 /*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
5 * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2 or
16 * later.
17 *
18 * Fixes
19 * Felix Koop : NR_CPUS used properly
20 * Jose Renau : Handle single CPU case.
21 * Alan Cox : By repeated request 8) - Total BogoMIPS report.
22 * Greg Wright : Fix for kernel stacks panic.
23 * Erich Boleyn : MP v1.4 and additional changes.
24 * Matthias Sattler : Changes for 2.1 kernel map.
25 * Michel Lespinasse : Changes for 2.1 kernel map.
26 * Michael Chastain : Change trampoline.S to gnu as.
27 * Alan Cox : Dumb bug: 'B' step PPro's are fine
28 * Ingo Molnar : Added APIC timers, based on code
29 * from Jose Renau
30 * Ingo Molnar : various cleanups and rewrites
31 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
32 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
33 * Andi Kleen : Changed for SMP boot into long mode.
34 * Martin J. Bligh : Added support for multi-quad systems
35 * Dave Jones : Report invalid combinations of Athlon CPUs.
36 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
37 * Andi Kleen : Converted to new state machine.
38 * Ashok Raj : CPU hotplug support
39 * Glauber Costa : i386 and x86_64 integration
40 */
41
42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
43
44#include <linux/init.h>
45#include <linux/smp.h>
46#include <linux/export.h>
47#include <linux/sched.h>
48#include <linux/sched/topology.h>
49#include <linux/sched/hotplug.h>
50#include <linux/sched/task_stack.h>
51#include <linux/percpu.h>
52#include <linux/memblock.h>
53#include <linux/err.h>
54#include <linux/nmi.h>
55#include <linux/tboot.h>
56#include <linux/stackprotector.h>
57#include <linux/gfp.h>
58#include <linux/cpuidle.h>
59#include <linux/numa.h>
60
61#include <asm/acpi.h>
62#include <asm/desc.h>
63#include <asm/nmi.h>
64#include <asm/irq.h>
65#include <asm/realmode.h>
66#include <asm/cpu.h>
67#include <asm/numa.h>
68#include <asm/pgtable.h>
69#include <asm/tlbflush.h>
70#include <asm/mtrr.h>
71#include <asm/mwait.h>
72#include <asm/apic.h>
73#include <asm/io_apic.h>
74#include <asm/fpu/internal.h>
75#include <asm/setup.h>
76#include <asm/uv/uv.h>
77#include <linux/mc146818rtc.h>
78#include <asm/i8259.h>
79#include <asm/misc.h>
80#include <asm/qspinlock.h>
81#include <asm/intel-family.h>
82#include <asm/cpu_device_id.h>
83#include <asm/spec-ctrl.h>
84#include <asm/hw_irq.h>
85
86/* representing HT siblings of each logical CPU */
87DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
88EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
89
90/* representing HT and core siblings of each logical CPU */
91DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
92EXPORT_PER_CPU_SYMBOL(cpu_core_map);
93
94DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
95
96/* Per CPU bogomips and other parameters */
97DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
98EXPORT_PER_CPU_SYMBOL(cpu_info);
99
100/* Logical package management. We might want to allocate that dynamically */
101unsigned int __max_logical_packages __read_mostly;
102EXPORT_SYMBOL(__max_logical_packages);
103static unsigned int logical_packages __read_mostly;
104
105/* Maximum number of SMT threads on any online core */
106int __read_mostly __max_smt_threads = 1;
107
108/* Flag to indicate if a complete sched domain rebuild is required */
109bool x86_topology_update;
110
111int arch_update_cpu_topology(void)
112{
113 int retval = x86_topology_update;
114
115 x86_topology_update = false;
116 return retval;
117}
118
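/*
 * Note on the warm-reset path below (summary comment, not from the original
 * file): writing 0xA to CMOS register 0x0F selects the "jump via the 40:67
 * vector without EOI" shutdown code, so after an INIT the BIOS jumps through
 * the real-mode warm-reset pointer (TRAMPOLINE_PHYS_LOW/HIGH) instead of
 * running POST.  The segment:offset stored below is therefore roughly:
 *
 *	segment = start_eip >> 4;	// written to TRAMPOLINE_PHYS_HIGH
 *	offset  = start_eip & 0xf;	// written to TRAMPOLINE_PHYS_LOW
 */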
119static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
120{
121 unsigned long flags;
122
123 spin_lock_irqsave(&rtc_lock, flags);
124 CMOS_WRITE(0xa, 0xf);
125 spin_unlock_irqrestore(&rtc_lock, flags);
126 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
127 start_eip >> 4;
128 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
129 start_eip & 0xf;
130}
131
132static inline void smpboot_restore_warm_reset_vector(void)
133{
134 unsigned long flags;
135
136 /*
137 * Paranoid: Set warm reset code and vector here back
138 * to default values.
139 */
140 spin_lock_irqsave(&rtc_lock, flags);
141 CMOS_WRITE(0, 0xf);
142 spin_unlock_irqrestore(&rtc_lock, flags);
143
144 *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
145}
146
147/*
148 * Report back to the Boot Processor during boot time or to the caller processor
149 * during CPU online.
150 */
151static void smp_callin(void)
152{
153 int cpuid;
154
155 /*
	 * If woken up by an INIT in an 82489DX configuration,
157 * cpu_callout_mask guarantees we don't get here before
158 * an INIT_deassert IPI reaches our local APIC, so it is
159 * now safe to touch our local APIC.
160 */
161 cpuid = smp_processor_id();
162
163 /*
	 * The boot CPU has finished the init stage and is spinning
	 * on cpu_callin_mask until we finish. We are free to set up this
	 * CPU, first the APIC. (This is probably redundant on most
	 * boards.)
168 */
169 apic_ap_setup();
170
171 /*
172 * Save our processor parameters. Note: this information
173 * is needed for clock calibration.
174 */
175 smp_store_cpu_info(cpuid);
176
177 /*
178 * The topology information must be up to date before
179 * calibrate_delay() and notify_cpu_starting().
180 */
181 set_cpu_sibling_map(raw_smp_processor_id());
182
183 /*
184 * Get our bogomips.
185 * Update loops_per_jiffy in cpu_data. Previous call to
186 * smp_store_cpu_info() stored a value that is close but not as
187 * accurate as the value just calculated.
188 */
189 calibrate_delay();
190 cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
191 pr_debug("Stack at about %p\n", &cpuid);
192
193 wmb();
194
195 notify_cpu_starting(cpuid);
196
197 /*
198 * Allow the master to continue.
199 */
200 cpumask_set_cpu(cpuid, cpu_callin_mask);
201}
202
203static int cpu0_logical_apicid;
204static int enable_start_cpu0;
205/*
206 * Activate a secondary processor.
207 */
208static void notrace start_secondary(void *unused)
209{
210 /*
211 * Don't put *anything* except direct CPU state initialization
	 * before cpu_init(). SMP booting is so fragile that we want to
213 * limit the things done here to the most necessary things.
214 */
215 if (boot_cpu_has(X86_FEATURE_PCID))
216 __write_cr4(__read_cr4() | X86_CR4_PCIDE);
217
218#ifdef CONFIG_X86_32
219 /* switch away from the initial page table */
220 load_cr3(swapper_pg_dir);
221 /*
222 * Initialize the CR4 shadow before doing anything that could
223 * try to read it.
224 */
225 cr4_init_shadow();
226 __flush_tlb_all();
227#endif
228 load_current_idt();
229 cpu_init();
230 x86_cpuinit.early_percpu_clock_init();
231 preempt_disable();
232 smp_callin();
233
234 enable_start_cpu0 = 0;
235
	/* Otherwise gcc will move smp_processor_id() up before cpu_init() */
237 barrier();
238 /*
239 * Check TSC synchronization with the boot CPU:
240 */
241 check_tsc_sync_target();
242
243 speculative_store_bypass_ht_init();
244
245 /*
246 * Lock vector_lock, set CPU online and bring the vector
247 * allocator online. Online must be set with vector_lock held
248 * to prevent a concurrent irq setup/teardown from seeing a
249 * half valid vector space.
250 */
251 lock_vector_lock();
252 set_cpu_online(smp_processor_id(), true);
253 lapic_online();
254 unlock_vector_lock();
255 cpu_set_state_online(smp_processor_id());
256 x86_platform.nmi_init();
257
258 /* enable local interrupts */
259 local_irq_enable();
260
261 /* to prevent fake stack check failure in clock setup */
262 boot_init_stack_canary();
263
264 x86_cpuinit.setup_percpu_clockev();
265
266 wmb();
267 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
268}
269
270/**
271 * topology_is_primary_thread - Check whether CPU is the primary SMT thread
272 * @cpu: CPU to check
273 */
274bool topology_is_primary_thread(unsigned int cpu)
275{
276 return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
277}
278
279/**
280 * topology_smt_supported - Check whether SMT is supported by the CPUs
281 */
282bool topology_smt_supported(void)
283{
284 return smp_num_siblings > 1;
285}
286
287/**
 * topology_phys_to_logical_pkg - Map a physical package id to a logical one
 * @phys_pkg: The physical package id to map
 *
 * Returns logical package id or -1 if not found
291 */
292int topology_phys_to_logical_pkg(unsigned int phys_pkg)
293{
294 int cpu;
295
296 for_each_possible_cpu(cpu) {
297 struct cpuinfo_x86 *c = &cpu_data(cpu);
298
299 if (c->initialized && c->phys_proc_id == phys_pkg)
300 return c->logical_proc_id;
301 }
302 return -1;
303}
304EXPORT_SYMBOL(topology_phys_to_logical_pkg);
305
306/**
307 * topology_update_package_map - Update the physical to logical package map
308 * @pkg: The physical package id as retrieved via CPUID
309 * @cpu: The cpu for which this is updated
310 */
311int topology_update_package_map(unsigned int pkg, unsigned int cpu)
312{
313 int new;
314
315 /* Already available somewhere? */
316 new = topology_phys_to_logical_pkg(pkg);
317 if (new >= 0)
318 goto found;
319
320 new = logical_packages++;
321 if (new != pkg) {
322 pr_info("CPU %u Converting physical %u to logical package %u\n",
323 cpu, pkg, new);
324 }
325found:
326 cpu_data(cpu).logical_proc_id = new;
327 return 0;
328}
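/*
 * Worked example (illustrative only): on a two-socket system whose firmware
 * enumerates the packages with physical ids 0 and 3, the first CPU of each
 * package to call topology_update_package_map() gets logical ids 0 and 1
 * respectively, and the "Converting physical ... to logical package ..."
 * message above is printed for physical id 3.
 */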
329
330void __init smp_store_boot_cpu_info(void)
331{
332 int id = 0; /* CPU 0 */
333 struct cpuinfo_x86 *c = &cpu_data(id);
334
335 *c = boot_cpu_data;
336 c->cpu_index = id;
337 topology_update_package_map(c->phys_proc_id, id);
338 c->initialized = true;
339}
340
341/*
342 * The bootstrap kernel entry code has set these up. Save them for
343 * a given CPU
344 */
345void smp_store_cpu_info(int id)
346{
347 struct cpuinfo_x86 *c = &cpu_data(id);
348
349 /* Copy boot_cpu_data only on the first bringup */
350 if (!c->initialized)
351 *c = boot_cpu_data;
352 c->cpu_index = id;
353 /*
354 * During boot time, CPU0 has this setup already. Save the info when
	 * bringing up an AP or the offlined CPU0.
356 */
357 identify_secondary_cpu(c);
358 c->initialized = true;
359}
360
361static bool
362topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
363{
364 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
365
366 return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
367}
368
369static bool
370topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
371{
372 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
373
374 return !WARN_ONCE(!topology_same_node(c, o),
375 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
376 "[node: %d != %d]. Ignoring dependency.\n",
377 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
378}
379
380#define link_mask(mfunc, c1, c2) \
381do { \
382 cpumask_set_cpu((c1), mfunc(c2)); \
383 cpumask_set_cpu((c2), mfunc(c1)); \
384} while (0)
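/*
 * For example, link_mask(topology_sibling_cpumask, cpu, i) below marks
 * 'cpu' and 'i' as SMT siblings of each other in both per-CPU masks.
 */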
385
386static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
387{
388 if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
389 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
390
391 if (c->phys_proc_id == o->phys_proc_id &&
392 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
393 if (c->cpu_core_id == o->cpu_core_id)
394 return topology_sane(c, o, "smt");
395
396 if ((c->cu_id != 0xff) &&
397 (o->cu_id != 0xff) &&
398 (c->cu_id == o->cu_id))
399 return topology_sane(c, o, "smt");
400 }
401
402 } else if (c->phys_proc_id == o->phys_proc_id &&
403 c->cpu_core_id == o->cpu_core_id) {
404 return topology_sane(c, o, "smt");
405 }
406
407 return false;
408}
409
410/*
411 * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
412 *
413 * These are Intel CPUs that enumerate an LLC that is shared by
414 * multiple NUMA nodes. The LLC on these systems is shared for
415 * off-package data access but private to the NUMA node (half
416 * of the package) for on-package access.
417 *
418 * CPUID (the source of the information about the LLC) can only
419 * enumerate the cache as being shared *or* unshared, but not
420 * this particular configuration. The CPU in this case enumerates
421 * the cache to be shared across the entire package (spanning both
422 * NUMA nodes).
423 */
424
425static const struct x86_cpu_id snc_cpu[] = {
426 { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
427 {}
428};
429
430static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
431{
432 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
433
434 /* Do not match if we do not have a valid APICID for cpu: */
435 if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
436 return false;
437
438 /* Do not match if LLC id does not match: */
439 if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
440 return false;
441
442 /*
443 * Allow the SNC topology without warning. Return of false
444 * means 'c' does not share the LLC of 'o'. This will be
445 * reflected to userspace.
446 */
447 if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
448 return false;
449
450 return topology_sane(c, o, "llc");
451}
452
453/*
454 * Unlike the other levels, we do not enforce keeping a
455 * multicore group inside a NUMA node. If this happens, we will
456 * discard the MC level of the topology later.
457 */
458static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
459{
460 if (c->phys_proc_id == o->phys_proc_id)
461 return true;
462 return false;
463}
464
465#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
466static inline int x86_sched_itmt_flags(void)
467{
468 return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
469}
470
471#ifdef CONFIG_SCHED_MC
472static int x86_core_flags(void)
473{
474 return cpu_core_flags() | x86_sched_itmt_flags();
475}
476#endif
477#ifdef CONFIG_SCHED_SMT
478static int x86_smt_flags(void)
479{
480 return cpu_smt_flags() | x86_sched_itmt_flags();
481}
482#endif
483#endif
484
485static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
486#ifdef CONFIG_SCHED_SMT
487 { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
488#endif
489#ifdef CONFIG_SCHED_MC
490 { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
491#endif
492 { NULL, },
493};
494
495static struct sched_domain_topology_level x86_topology[] = {
496#ifdef CONFIG_SCHED_SMT
497 { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
498#endif
499#ifdef CONFIG_SCHED_MC
500 { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
501#endif
502 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
503 { NULL, },
504};
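/*
 * The resulting scheduler hierarchy is SMT, then MC, then DIE, with NUMA
 * levels added generically on top.  The x86_numa_in_package_topology variant
 * above deliberately omits the DIE level so that, when a package contains
 * several NUMA nodes, the NUMA levels form the top of the hierarchy instead
 * (see native_smp_cpus_done()).
 */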
505
506/*
507 * Set if a package/die has multiple NUMA nodes inside.
508 * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
509 * Sub-NUMA Clustering have this.
510 */
511static bool x86_has_numa_in_package;
512
513void set_cpu_sibling_map(int cpu)
514{
515 bool has_smt = smp_num_siblings > 1;
516 bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
517 struct cpuinfo_x86 *c = &cpu_data(cpu);
518 struct cpuinfo_x86 *o;
519 int i, threads;
520
521 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
522
523 if (!has_mp) {
524 cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
525 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
526 cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
527 c->booted_cores = 1;
528 return;
529 }
530
531 for_each_cpu(i, cpu_sibling_setup_mask) {
532 o = &cpu_data(i);
533
534 if ((i == cpu) || (has_smt && match_smt(c, o)))
535 link_mask(topology_sibling_cpumask, cpu, i);
536
537 if ((i == cpu) || (has_mp && match_llc(c, o)))
538 link_mask(cpu_llc_shared_mask, cpu, i);
539
540 }
541
542 /*
543 * This needs a separate iteration over the cpus because we rely on all
	 * topology_sibling_cpumask links to be set up.
545 */
546 for_each_cpu(i, cpu_sibling_setup_mask) {
547 o = &cpu_data(i);
548
549 if ((i == cpu) || (has_mp && match_die(c, o))) {
550 link_mask(topology_core_cpumask, cpu, i);
551
552 /*
			 * Does this new cpu bring up a new core?
554 */
555 if (cpumask_weight(
556 topology_sibling_cpumask(cpu)) == 1) {
557 /*
558 * for each core in package, increment
559 * the booted_cores for this new cpu
560 */
561 if (cpumask_first(
562 topology_sibling_cpumask(i)) == i)
563 c->booted_cores++;
564 /*
565 * increment the core count for all
566 * the other cpus in this package
567 */
568 if (i != cpu)
569 cpu_data(i).booted_cores++;
570 } else if (i != cpu && !c->booted_cores)
571 c->booted_cores = cpu_data(i).booted_cores;
572 }
573 if (match_die(c, o) && !topology_same_node(c, o))
574 x86_has_numa_in_package = true;
575 }
576
577 threads = cpumask_weight(topology_sibling_cpumask(cpu));
578 if (threads > __max_smt_threads)
579 __max_smt_threads = threads;
580}
581
582/* maps the cpu to the sched domain representing multi-core */
583const struct cpumask *cpu_coregroup_mask(int cpu)
584{
585 return cpu_llc_shared_mask(cpu);
586}
587
588static void impress_friends(void)
589{
590 int cpu;
591 unsigned long bogosum = 0;
592 /*
593 * Allow the user to impress friends.
594 */
595 pr_debug("Before bogomips\n");
596 for_each_possible_cpu(cpu)
597 if (cpumask_test_cpu(cpu, cpu_callout_mask))
598 bogosum += cpu_data(cpu).loops_per_jiffy;
599 pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
600 num_online_cpus(),
601 bogosum/(500000/HZ),
602 (bogosum/(5000/HZ))%100);
603
604 pr_debug("Before bogocount - setting activated=1\n");
605}
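/*
 * The arithmetic above follows from BogoMIPS = loops_per_jiffy * HZ / 500000.
 * With illustrative numbers, HZ = 250 and a summed loops_per_jiffy of
 * 4,000,000 give bogosum / (500000 / HZ) = 4,000,000 / 2000 = 2000 BogoMIPS,
 * and the second term supplies the two fractional digits ("2000.00").
 */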
606
607void __inquire_remote_apic(int apicid)
608{
609 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
610 const char * const names[] = { "ID", "VERSION", "SPIV" };
611 int timeout;
612 u32 status;
613
614 pr_info("Inquiring remote APIC 0x%x...\n", apicid);
615
616 for (i = 0; i < ARRAY_SIZE(regs); i++) {
617 pr_info("... APIC 0x%x %s: ", apicid, names[i]);
618
619 /*
620 * Wait for idle.
621 */
622 status = safe_apic_wait_icr_idle();
623 if (status)
624 pr_cont("a previous APIC delivery may have failed\n");
625
626 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
627
628 timeout = 0;
629 do {
630 udelay(100);
631 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
632 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
633
634 switch (status) {
635 case APIC_ICR_RR_VALID:
636 status = apic_read(APIC_RRR);
637 pr_cont("%08x\n", status);
638 break;
639 default:
640 pr_cont("failed\n");
641 }
642 }
643}
644
645/*
646 * The Multiprocessor Specification 1.4 (1997) example code suggests
647 * that there should be a 10ms delay between the BSP asserting INIT
648 * and de-asserting INIT, when starting a remote processor.
649 * But that slows boot and resume on modern processors, which include
650 * many cores and don't require that delay.
651 *
 * Cmdline "cpu_init_udelay=" is available to override this delay.
653 * Modern processor families are quirked to remove the delay entirely.
654 */
655#define UDELAY_10MS_DEFAULT 10000
656
657static unsigned int init_udelay = UINT_MAX;
658
659static int __init cpu_init_udelay(char *str)
660{
661 get_option(&str, &init_udelay);
662
663 return 0;
664}
665early_param("cpu_init_udelay", cpu_init_udelay);
666
667static void __init smp_quirk_init_udelay(void)
668{
669 /* if cmdline changed it from default, leave it alone */
670 if (init_udelay != UINT_MAX)
671 return;
672
673 /* if modern processor, use no delay */
674 if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
675 ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
676 ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
677 init_udelay = 0;
678 return;
679 }
680 /* else, use legacy delay */
681 init_udelay = UDELAY_10MS_DEFAULT;
682}
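/*
 * Usage example (boot command line): "cpu_init_udelay=10000" restores the
 * MP-spec 10ms INIT deassert delay on a machine that would otherwise be
 * quirked to zero by smp_quirk_init_udelay() above.
 */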
683
684/*
685 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
686 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
 * won't, so remember to clear down the APIC etc. later.
688 */
689int
690wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
691{
692 unsigned long send_status, accept_status = 0;
693 int maxlvt;
694
695 /* Target chip */
696 /* Boot on the stack */
697 /* Kick the second */
698 apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
699
700 pr_debug("Waiting for send to finish...\n");
701 send_status = safe_apic_wait_icr_idle();
702
703 /*
704 * Give the other CPU some time to accept the IPI.
705 */
706 udelay(200);
707 if (APIC_INTEGRATED(boot_cpu_apic_version)) {
708 maxlvt = lapic_get_maxlvt();
709 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
710 apic_write(APIC_ESR, 0);
711 accept_status = (apic_read(APIC_ESR) & 0xEF);
712 }
713 pr_debug("NMI sent\n");
714
715 if (send_status)
716 pr_err("APIC never delivered???\n");
717 if (accept_status)
718 pr_err("APIC delivery error (%lx)\n", accept_status);
719
720 return (send_status | accept_status);
721}
722
723static int
724wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
725{
726 unsigned long send_status = 0, accept_status = 0;
727 int maxlvt, num_starts, j;
728
729 maxlvt = lapic_get_maxlvt();
730
731 /*
732 * Be paranoid about clearing APIC errors.
733 */
734 if (APIC_INTEGRATED(boot_cpu_apic_version)) {
735 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
736 apic_write(APIC_ESR, 0);
737 apic_read(APIC_ESR);
738 }
739
740 pr_debug("Asserting INIT\n");
741
742 /*
743 * Turn INIT on target chip
744 */
745 /*
746 * Send IPI
747 */
748 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
749 phys_apicid);
750
751 pr_debug("Waiting for send to finish...\n");
752 send_status = safe_apic_wait_icr_idle();
753
754 udelay(init_udelay);
755
756 pr_debug("Deasserting INIT\n");
757
758 /* Target chip */
759 /* Send IPI */
760 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
761
762 pr_debug("Waiting for send to finish...\n");
763 send_status = safe_apic_wait_icr_idle();
764
765 mb();
766
767 /*
768 * Should we send STARTUP IPIs ?
769 *
770 * Determine this based on the APIC version.
771 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
772 */
773 if (APIC_INTEGRATED(boot_cpu_apic_version))
774 num_starts = 2;
775 else
776 num_starts = 0;
777
778 /*
779 * Run STARTUP IPI loop.
780 */
781 pr_debug("#startup loops: %d\n", num_starts);
782
783 for (j = 1; j <= num_starts; j++) {
784 pr_debug("Sending STARTUP #%d\n", j);
785 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
786 apic_write(APIC_ESR, 0);
787 apic_read(APIC_ESR);
788 pr_debug("After apic_write\n");
789
790 /*
791 * STARTUP IPI
792 */
793
794 /* Target chip */
795 /* Boot on the stack */
796 /* Kick the second */
797 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
798 phys_apicid);
799
800 /*
801 * Give the other CPU some time to accept the IPI.
802 */
803 if (init_udelay == 0)
804 udelay(10);
805 else
806 udelay(300);
807
808 pr_debug("Startup point 1\n");
809
810 pr_debug("Waiting for send to finish...\n");
811 send_status = safe_apic_wait_icr_idle();
812
813 /*
814 * Give the other CPU some time to accept the IPI.
815 */
816 if (init_udelay == 0)
817 udelay(10);
818 else
819 udelay(200);
820
821 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
822 apic_write(APIC_ESR, 0);
823 accept_status = (apic_read(APIC_ESR) & 0xEF);
824 if (send_status || accept_status)
825 break;
826 }
827 pr_debug("After Startup\n");
828
829 if (send_status)
830 pr_err("APIC never delivered???\n");
831 if (accept_status)
832 pr_err("APIC delivery error (%lx)\n", accept_status);
833
834 return (send_status | accept_status);
835}
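/*
 * Note on the STARTUP vector above: the SIPI carries the 4KB page number of
 * the real-mode trampoline, hence the "start_eip >> 12".  For instance, if
 * the trampoline had been placed at physical 0x96000 (hypothetical address),
 * the AP would receive vector 0x96 and begin executing at 0x9600:0000.
 */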
836
837/* reduce the number of lines printed when booting a large cpu count system */
838static void announce_cpu(int cpu, int apicid)
839{
840 static int current_node = NUMA_NO_NODE;
841 int node = early_cpu_to_node(cpu);
842 static int width, node_width;
843
844 if (!width)
845 width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
846
847 if (!node_width)
848 node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
849
850 if (cpu == 1)
851 printk(KERN_INFO "x86: Booting SMP configuration:\n");
852
853 if (system_state < SYSTEM_RUNNING) {
854 if (node != current_node) {
855 if (current_node > (-1))
856 pr_cont("\n");
857 current_node = node;
858
859 printk(KERN_INFO ".... node %*s#%d, CPUs: ",
860 node_width - num_digits(node), " ", node);
861 }
862
863 /* Add padding for the BSP */
864 if (cpu == 1)
865 pr_cont("%*s", width + 1, " ");
866
867 pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
868
869 } else
870 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
871 node, cpu, apicid);
872}
873
874static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
875{
876 int cpu;
877
878 cpu = smp_processor_id();
879 if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
880 return NMI_HANDLED;
881
882 return NMI_DONE;
883}
884
885/*
886 * Wake up AP by INIT, INIT, STARTUP sequence.
887 *
 * Instead of waiting for STARTUP after INITs, the BSP would execute the BIOS
 * boot-strap code, which is not the desired behavior when waking up the BSP.
 * To avoid the boot-strap code, wake up CPU0 by NMI instead.
 *
 * This works to wake up a soft offlined CPU0 only. If CPU0 is hard offlined
 * (i.e. physically hot removed and then hot added), NMI won't wake it up.
 * We'll change this code in the future to wake up hard offlined CPU0 if a
 * real platform and request are available.
896 */
897static int
898wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
899 int *cpu0_nmi_registered)
900{
901 int id;
902 int boot_error;
903
904 preempt_disable();
905
906 /*
907 * Wake up AP by INIT, INIT, STARTUP sequence.
908 */
909 if (cpu) {
910 boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
911 goto out;
912 }
913
914 /*
915 * Wake up BSP by nmi.
916 *
917 * Register a NMI handler to help wake up CPU0.
918 */
919 boot_error = register_nmi_handler(NMI_LOCAL,
920 wakeup_cpu0_nmi, 0, "wake_cpu0");
921
922 if (!boot_error) {
923 enable_start_cpu0 = 1;
924 *cpu0_nmi_registered = 1;
925 if (apic->dest_logical == APIC_DEST_LOGICAL)
926 id = cpu0_logical_apicid;
927 else
928 id = apicid;
929 boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
930 }
931
932out:
933 preempt_enable();
934
935 return boot_error;
936}
937
938void common_cpu_up(unsigned int cpu, struct task_struct *idle)
939{
940 /* Just in case we booted with a single CPU. */
941 alternatives_enable_smp();
942
943 per_cpu(current_task, cpu) = idle;
944
945#ifdef CONFIG_X86_32
946 /* Stack for startup_32 can be just as for start_secondary onwards */
947 irq_ctx_init(cpu);
948 per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
949#else
950 initial_gs = per_cpu_offset(cpu);
951#endif
952}
953
954/*
 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 * systems (i.e. clustered apic addressing mode) this is a LOGICAL apic ID.
957 * Returns zero if CPU booted OK, else error code from
958 * ->wakeup_secondary_cpu.
959 */
960static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
961 int *cpu0_nmi_registered)
962{
963 volatile u32 *trampoline_status =
964 (volatile u32 *) __va(real_mode_header->trampoline_status);
965 /* start_ip had better be page-aligned! */
966 unsigned long start_ip = real_mode_header->trampoline_start;
967
968 unsigned long boot_error = 0;
969 unsigned long timeout;
970
971 idle->thread.sp = (unsigned long)task_pt_regs(idle);
972 early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
973 initial_code = (unsigned long)start_secondary;
974 initial_stack = idle->thread.sp;
975
976 /* Enable the espfix hack for this CPU */
977 init_espfix_ap(cpu);
978
979 /* So we see what's up */
980 announce_cpu(cpu, apicid);
981
982 /*
983 * This grunge runs the startup process for
984 * the targeted processor.
985 */
986
987 if (x86_platform.legacy.warm_reset) {
988
989 pr_debug("Setting warm reset code and vector.\n");
990
991 smpboot_setup_warm_reset_vector(start_ip);
992 /*
993 * Be paranoid about clearing APIC errors.
994 */
995 if (APIC_INTEGRATED(boot_cpu_apic_version)) {
996 apic_write(APIC_ESR, 0);
997 apic_read(APIC_ESR);
998 }
999 }
1000
1001 /*
1002 * AP might wait on cpu_callout_mask in cpu_init() with
	 * cpu_initialized_mask set if a previous attempt to online
	 * it timed out. Clear cpu_initialized_mask so that after
	 * INIT/SIPI it can start with a clean state.
1006 */
1007 cpumask_clear_cpu(cpu, cpu_initialized_mask);
1008 smp_mb();
1009
1010 /*
	 * Wake up a CPU in one of two ways:
	 * - Use the method in the APIC driver if it's defined;
	 *   otherwise,
	 * - Use an INIT boot APIC message for APs or NMI for the BSP.
1015 */
1016 if (apic->wakeup_secondary_cpu)
1017 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
1018 else
1019 boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
1020 cpu0_nmi_registered);
1021
1022 if (!boot_error) {
1023 /*
1024 * Wait 10s total for first sign of life from AP
1025 */
1026 boot_error = -1;
1027 timeout = jiffies + 10*HZ;
1028 while (time_before(jiffies, timeout)) {
1029 if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
1030 /*
1031 * Tell AP to proceed with initialization
1032 */
1033 cpumask_set_cpu(cpu, cpu_callout_mask);
1034 boot_error = 0;
1035 break;
1036 }
1037 schedule();
1038 }
1039 }
1040
1041 if (!boot_error) {
1042 /*
1043 * Wait till AP completes initial initialization
1044 */
1045 while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
1046 /*
1047 * Allow other tasks to run while we wait for the
1048 * AP to come online. This also gives a chance
			 * for the MTRR work (triggered by the AP coming online)
1050 * to be completed in the stop machine context.
1051 */
1052 schedule();
1053 }
1054 }
1055
1056 /* mark "stuck" area as not stuck */
1057 *trampoline_status = 0;
1058
1059 if (x86_platform.legacy.warm_reset) {
1060 /*
1061 * Cleanup possible dangling ends...
1062 */
1063 smpboot_restore_warm_reset_vector();
1064 }
1065
1066 return boot_error;
1067}
1068
1069int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
1070{
1071 int apicid = apic->cpu_present_to_apicid(cpu);
1072 int cpu0_nmi_registered = 0;
1073 unsigned long flags;
1074 int err, ret = 0;
1075
1076 lockdep_assert_irqs_enabled();
1077
1078 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
1079
1080 if (apicid == BAD_APICID ||
1081 !physid_isset(apicid, phys_cpu_present_map) ||
1082 !apic->apic_id_valid(apicid)) {
1083 pr_err("%s: bad cpu %d\n", __func__, cpu);
1084 return -EINVAL;
1085 }
1086
1087 /*
1088 * Already booted CPU?
1089 */
1090 if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
1091 pr_debug("do_boot_cpu %d Already started\n", cpu);
1092 return -ENOSYS;
1093 }
1094
1095 /*
1096 * Save current MTRR state in case it was changed since early boot
1097 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
1098 */
1099 mtrr_save_state();
1100
1101 /* x86 CPUs take themselves offline, so delayed offline is OK. */
1102 err = cpu_check_up_prepare(cpu);
1103 if (err && err != -EBUSY)
1104 return err;
1105
1106 /* the FPU context is blank, nobody can own it */
1107 per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
1108
1109 common_cpu_up(cpu, tidle);
1110
1111 err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
1112 if (err) {
1113 pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
1114 ret = -EIO;
1115 goto unreg_nmi;
1116 }
1117
1118 /*
1119 * Check TSC synchronization with the AP (keep irqs disabled
1120 * while doing so):
1121 */
1122 local_irq_save(flags);
1123 check_tsc_sync_source(cpu);
1124 local_irq_restore(flags);
1125
1126 while (!cpu_online(cpu)) {
1127 cpu_relax();
1128 touch_nmi_watchdog();
1129 }
1130
1131unreg_nmi:
1132 /*
	 * Clean up the NMI handler. Do this after the callin and callout sync
	 * to avoid the impact of a possibly long unregister time.
1135 */
1136 if (cpu0_nmi_registered)
1137 unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
1138
1139 return ret;
1140}
1141
1142/**
1143 * arch_disable_smp_support() - disables SMP support for x86 at runtime
1144 */
1145void arch_disable_smp_support(void)
1146{
1147 disable_ioapic_support();
1148}
1149
1150/*
1151 * Fall back to non SMP mode after errors.
1152 *
1153 * RED-PEN audit/test this more. I bet there is more state messed up here.
1154 */
1155static __init void disable_smp(void)
1156{
1157 pr_info("SMP disabled\n");
1158
1159 disable_ioapic_support();
1160
1161 init_cpu_present(cpumask_of(0));
1162 init_cpu_possible(cpumask_of(0));
1163
1164 if (smp_found_config)
1165 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1166 else
1167 physid_set_mask_of_physid(0, &phys_cpu_present_map);
1168 cpumask_set_cpu(0, topology_sibling_cpumask(0));
1169 cpumask_set_cpu(0, topology_core_cpumask(0));
1170}
1171
1172/*
1173 * Various sanity checks.
1174 */
1175static void __init smp_sanity_check(void)
1176{
1177 preempt_disable();
1178
1179#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
1180 if (def_to_bigsmp && nr_cpu_ids > 8) {
1181 unsigned int cpu;
1182 unsigned nr;
1183
1184 pr_warn("More than 8 CPUs detected - skipping them\n"
1185 "Use CONFIG_X86_BIGSMP\n");
1186
1187 nr = 0;
1188 for_each_present_cpu(cpu) {
1189 if (nr >= 8)
1190 set_cpu_present(cpu, false);
1191 nr++;
1192 }
1193
1194 nr = 0;
1195 for_each_possible_cpu(cpu) {
1196 if (nr >= 8)
1197 set_cpu_possible(cpu, false);
1198 nr++;
1199 }
1200
1201 nr_cpu_ids = 8;
1202 }
1203#endif
1204
1205 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1206 pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
1207 hard_smp_processor_id());
1208
1209 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1210 }
1211
1212 /*
1213 * Should not be necessary because the MP table should list the boot
1214 * CPU too, but we do it for the sake of robustness anyway.
1215 */
1216 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
1217 pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
1218 boot_cpu_physical_apicid);
1219 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1220 }
1221 preempt_enable();
1222}
1223
1224static void __init smp_cpu_index_default(void)
1225{
1226 int i;
1227 struct cpuinfo_x86 *c;
1228
1229 for_each_possible_cpu(i) {
1230 c = &cpu_data(i);
1231 /* mark all to hotplug */
1232 c->cpu_index = nr_cpu_ids;
1233 }
1234}
1235
1236static void __init smp_get_logical_apicid(void)
1237{
1238 if (x2apic_mode)
1239 cpu0_logical_apicid = apic_read(APIC_LDR);
1240 else
1241 cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
1242}
1243
1244/*
1245 * Prepare for SMP bootup.
 * @max_cpus: configured maximum number of CPUs. It is a legacy parameter
1247 * for common interface support.
1248 */
1249void __init native_smp_prepare_cpus(unsigned int max_cpus)
1250{
1251 unsigned int i;
1252
1253 smp_cpu_index_default();
1254
1255 /*
1256 * Setup boot CPU information
1257 */
1258 smp_store_boot_cpu_info(); /* Final full version of the data */
1259 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1260 mb();
1261
1262 for_each_possible_cpu(i) {
1263 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1264 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1265 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1266 }
1267
1268 /*
	 * Set 'default' x86 topology. This matches default_topology() in that
	 * it has NUMA nodes as a topology level. See also
	 * native_smp_cpus_done().
	 *
	 * Must be done before set_cpu_sibling_map() is run.
1274 */
1275 set_sched_topology(x86_topology);
1276
1277 set_cpu_sibling_map(0);
1278
1279 smp_sanity_check();
1280
1281 switch (apic_intr_mode) {
1282 case APIC_PIC:
1283 case APIC_VIRTUAL_WIRE_NO_CONFIG:
1284 disable_smp();
1285 return;
1286 case APIC_SYMMETRIC_IO_NO_ROUTING:
1287 disable_smp();
1288 /* Setup local timer */
1289 x86_init.timers.setup_percpu_clockev();
1290 return;
1291 case APIC_VIRTUAL_WIRE:
1292 case APIC_SYMMETRIC_IO:
1293 break;
1294 }
1295
1296 /* Setup local timer */
1297 x86_init.timers.setup_percpu_clockev();
1298
1299 smp_get_logical_apicid();
1300
1301 pr_info("CPU0: ");
1302 print_cpu_info(&cpu_data(0));
1303
1304 native_pv_lock_init();
1305
1306 uv_system_init();
1307
1308 set_mtrr_aps_delayed_init();
1309
1310 smp_quirk_init_udelay();
1311
1312 speculative_store_bypass_ht_init();
1313}
1314
1315void arch_enable_nonboot_cpus_begin(void)
1316{
1317 set_mtrr_aps_delayed_init();
1318}
1319
1320void arch_enable_nonboot_cpus_end(void)
1321{
1322 mtrr_aps_init();
1323}
1324
1325/*
1326 * Early setup to make printk work.
1327 */
1328void __init native_smp_prepare_boot_cpu(void)
1329{
1330 int me = smp_processor_id();
1331 switch_to_new_gdt(me);
1332 /* already set me in cpu_online_mask in boot_cpu_init() */
1333 cpumask_set_cpu(me, cpu_callout_mask);
1334 cpu_set_state_online(me);
1335}
1336
1337void __init calculate_max_logical_packages(void)
1338{
1339 int ncpus;
1340
1341 /*
	 * Today neither Intel nor AMD support heterogeneous systems so
1343 * extrapolate the boot cpu's data to all packages.
1344 */
1345 ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
1346 __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
1347 pr_info("Max logical packages: %u\n", __max_logical_packages);
1348}
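/*
 * Illustrative numbers: with total_cpus = 16, 4 booted cores on the boot
 * CPU's package and 2 SMT threads per core, ncpus = 8 and
 * __max_logical_packages = DIV_ROUND_UP(16, 8) = 2.
 */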
1349
1350void __init native_smp_cpus_done(unsigned int max_cpus)
1351{
1352 pr_debug("Boot done\n");
1353
1354 calculate_max_logical_packages();
1355
1356 if (x86_has_numa_in_package)
1357 set_sched_topology(x86_numa_in_package_topology);
1358
1359 nmi_selftest();
1360 impress_friends();
1361 mtrr_aps_init();
1362}
1363
1364static int __initdata setup_possible_cpus = -1;
1365static int __init _setup_possible_cpus(char *str)
1366{
1367 get_option(&str, &setup_possible_cpus);
1368 return 0;
1369}
1370early_param("possible_cpus", _setup_possible_cpus);
1371
1372
1373/*
 * cpu_possible_mask should be static: it cannot change as CPUs
 * are onlined or offlined. The reason is that per-cpu data structures
 * are allocated by some modules at init time, and they don't expect to
 * do this dynamically on CPU arrival/departure.
 * cpu_present_mask on the other hand can change dynamically.
 * If CPU hotplug is not compiled in, we resort to the current
 * behaviour, which is cpu_possible == cpu_present.
1381 * - Ashok Raj
1382 *
1383 * Three ways to find out the number of additional hotplug CPUs:
1384 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
 * - The user can override it with possible_cpus=NUM
1386 * - Otherwise don't reserve additional CPUs.
1387 * We do this because additional CPUs waste a lot of memory.
1388 * -AK
1389 */
1390__init void prefill_possible_map(void)
1391{
1392 int i, possible;
1393
1394 /* No boot processor was found in mptable or ACPI MADT */
1395 if (!num_processors) {
1396 if (boot_cpu_has(X86_FEATURE_APIC)) {
1397 int apicid = boot_cpu_physical_apicid;
1398 int cpu = hard_smp_processor_id();
1399
1400 pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
1401
1402 /* Make sure boot cpu is enumerated */
1403 if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
1404 apic->apic_id_valid(apicid))
1405 generic_processor_info(apicid, boot_cpu_apic_version);
1406 }
1407
1408 if (!num_processors)
1409 num_processors = 1;
1410 }
1411
1412 i = setup_max_cpus ?: 1;
1413 if (setup_possible_cpus == -1) {
1414 possible = num_processors;
1415#ifdef CONFIG_HOTPLUG_CPU
1416 if (setup_max_cpus)
1417 possible += disabled_cpus;
1418#else
1419 if (possible > i)
1420 possible = i;
1421#endif
1422 } else
1423 possible = setup_possible_cpus;
1424
1425 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1426
1427 /* nr_cpu_ids could be reduced via nr_cpus= */
1428 if (possible > nr_cpu_ids) {
1429 pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
1430 possible, nr_cpu_ids);
1431 possible = nr_cpu_ids;
1432 }
1433
1434#ifdef CONFIG_HOTPLUG_CPU
1435 if (!setup_max_cpus)
1436#endif
1437 if (possible > i) {
1438 pr_warn("%d Processors exceeds max_cpus limit of %u\n",
1439 possible, setup_max_cpus);
1440 possible = i;
1441 }
1442
1443 nr_cpu_ids = possible;
1444
1445 pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
1446 possible, max_t(int, possible - num_processors, 0));
1447
1448 reset_cpu_possible_mask();
1449
1450 for (i = 0; i < possible; i++)
1451 set_cpu_possible(i, true);
1452}
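/*
 * Example (hypothetical firmware tables): with 4 enabled and 4 disabled CPUs
 * reported, num_processors = 4 and disabled_cpus = 4, so with
 * CONFIG_HOTPLUG_CPU and the default command line this function allows 8
 * possible CPUs, i.e. "Allowing 8 CPUs, 4 hotplug CPUs".
 */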
1453
1454#ifdef CONFIG_HOTPLUG_CPU
1455
1456/* Recompute SMT state for all CPUs on offline */
1457static void recompute_smt_state(void)
1458{
1459 int max_threads, cpu;
1460
1461 max_threads = 0;
1462 for_each_online_cpu (cpu) {
1463 int threads = cpumask_weight(topology_sibling_cpumask(cpu));
1464
1465 if (threads > max_threads)
1466 max_threads = threads;
1467 }
1468 __max_smt_threads = max_threads;
1469}
1470
1471static void remove_siblinginfo(int cpu)
1472{
1473 int sibling;
1474 struct cpuinfo_x86 *c = &cpu_data(cpu);
1475
1476 for_each_cpu(sibling, topology_core_cpumask(cpu)) {
1477 cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
		/*
1479 * last thread sibling in this cpu core going down
1480 */
1481 if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
1482 cpu_data(sibling).booted_cores--;
1483 }
1484
1485 for_each_cpu(sibling, topology_sibling_cpumask(cpu))
1486 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
1487 for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
1488 cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
1489 cpumask_clear(cpu_llc_shared_mask(cpu));
1490 cpumask_clear(topology_sibling_cpumask(cpu));
1491 cpumask_clear(topology_core_cpumask(cpu));
1492 c->cpu_core_id = 0;
1493 c->booted_cores = 0;
1494 cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
1495 recompute_smt_state();
1496}
1497
1498static void remove_cpu_from_maps(int cpu)
1499{
1500 set_cpu_online(cpu, false);
1501 cpumask_clear_cpu(cpu, cpu_callout_mask);
1502 cpumask_clear_cpu(cpu, cpu_callin_mask);
1503 /* was set by cpu_init() */
1504 cpumask_clear_cpu(cpu, cpu_initialized_mask);
1505 numa_remove_cpu(cpu);
1506}
1507
1508void cpu_disable_common(void)
1509{
1510 int cpu = smp_processor_id();
1511
1512 remove_siblinginfo(cpu);
1513
1514 /* It's now safe to remove this processor from the online map */
1515 lock_vector_lock();
1516 remove_cpu_from_maps(cpu);
1517 unlock_vector_lock();
1518 fixup_irqs();
1519 lapic_offline();
1520}
1521
1522int native_cpu_disable(void)
1523{
1524 int ret;
1525
1526 ret = lapic_can_unplug_cpu();
1527 if (ret)
1528 return ret;
1529
1530 clear_local_APIC();
1531 cpu_disable_common();
1532
1533 return 0;
1534}
1535
1536int common_cpu_die(unsigned int cpu)
1537{
1538 int ret = 0;
1539
1540 /* We don't do anything here: idle task is faking death itself. */
1541
1542 /* They ack this in play_dead() by setting CPU_DEAD */
1543 if (cpu_wait_death(cpu, 5)) {
1544 if (system_state == SYSTEM_RUNNING)
1545 pr_info("CPU %u is now offline\n", cpu);
1546 } else {
1547 pr_err("CPU %u didn't die...\n", cpu);
1548 ret = -1;
1549 }
1550
1551 return ret;
1552}
1553
1554void native_cpu_die(unsigned int cpu)
1555{
1556 common_cpu_die(cpu);
1557}
1558
1559void play_dead_common(void)
1560{
1561 idle_task_exit();
1562
1563 /* Ack it */
1564 (void)cpu_report_death();
1565
1566 /*
1567 * With physical CPU hotplug, we should halt the cpu
1568 */
1569 local_irq_disable();
1570}
1571
1572static bool wakeup_cpu0(void)
1573{
1574 if (smp_processor_id() == 0 && enable_start_cpu0)
1575 return true;
1576
1577 return false;
1578}
1579
1580/*
1581 * We need to flush the caches before going to sleep, lest we have
1582 * dirty data in our caches when we come back up.
1583 */
1584static inline void mwait_play_dead(void)
1585{
1586 unsigned int eax, ebx, ecx, edx;
1587 unsigned int highest_cstate = 0;
1588 unsigned int highest_subcstate = 0;
1589 void *mwait_ptr;
1590 int i;
1591
1592 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
1593 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
1594 return;
1595 if (!this_cpu_has(X86_FEATURE_MWAIT))
1596 return;
1597 if (!this_cpu_has(X86_FEATURE_CLFLUSH))
1598 return;
1599 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1600 return;
1601
1602 eax = CPUID_MWAIT_LEAF;
1603 ecx = 0;
1604 native_cpuid(&eax, &ebx, &ecx, &edx);
1605
1606 /*
1607 * eax will be 0 if EDX enumeration is not valid.
1608 * Initialized below to cstate, sub_cstate value when EDX is valid.
1609 */
1610 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1611 eax = 0;
1612 } else {
1613 edx >>= MWAIT_SUBSTATE_SIZE;
1614 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1615 if (edx & MWAIT_SUBSTATE_MASK) {
1616 highest_cstate = i;
1617 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1618 }
1619 }
1620 eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1621 (highest_subcstate - 1);
1622 }
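	/*
	 * Worked example (illustrative): if the deepest C-state enumerated by
	 * CPUID.5 EDX ends up as highest_cstate = 2 with highest_subcstate = 2,
	 * the MWAIT hint above becomes
	 * (2 << MWAIT_SUBSTATE_SIZE) | (2 - 1) = 0x21.
	 */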
1623
1624 /*
1625 * This should be a memory location in a cache line which is
1626 * unlikely to be touched by other processors. The actual
1627 * content is immaterial as it is not actually modified in any way.
1628 */
1629 mwait_ptr = &current_thread_info()->flags;
1630
1631 wbinvd();
1632
1633 while (1) {
1634 /*
1635 * The CLFLUSH is a workaround for erratum AAI65 for
1636 * the Xeon 7400 series. It's not clear it is actually
1637 * needed, but it should be harmless in either case.
1638 * The WBINVD is insufficient due to the spurious-wakeup
1639 * case where we return around the loop.
1640 */
1641 mb();
1642 clflush(mwait_ptr);
1643 mb();
1644 __monitor(mwait_ptr, 0, 0);
1645 mb();
1646 __mwait(eax, 0);
1647 /*
1648 * If NMI wants to wake up CPU0, start CPU0.
1649 */
1650 if (wakeup_cpu0())
1651 start_cpu0();
1652 }
1653}
1654
1655void hlt_play_dead(void)
1656{
1657 if (__this_cpu_read(cpu_info.x86) >= 4)
1658 wbinvd();
1659
1660 while (1) {
1661 native_halt();
1662 /*
1663 * If NMI wants to wake up CPU0, start CPU0.
1664 */
1665 if (wakeup_cpu0())
1666 start_cpu0();
1667 }
1668}
1669
1670void native_play_dead(void)
1671{
1672 play_dead_common();
1673 tboot_shutdown(TB_SHUTDOWN_WFS);
1674
1675 mwait_play_dead(); /* Only returns on failure */
1676 if (cpuidle_play_dead())
1677 hlt_play_dead();
1678}
1679
1680#else /* ... !CONFIG_HOTPLUG_CPU */
1681int native_cpu_disable(void)
1682{
1683 return -ENOSYS;
1684}
1685
1686void native_cpu_die(unsigned int cpu)
1687{
1688 /* We said "no" in __cpu_disable */
1689 BUG();
1690}
1691
1692void native_play_dead(void)
1693{
1694 BUG();
1695}
1696
1697#endif
1698