1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * PPC64 code to handle Linux booting another kernel. |
4 | * |
5 | * Copyright (C) 2004-2005, IBM Corp. |
6 | * |
7 | * Created by: Milton D Miller II |
8 | */ |
9 | |
10 | |
11 | #include <linux/kexec.h> |
12 | #include <linux/smp.h> |
13 | #include <linux/thread_info.h> |
14 | #include <linux/init_task.h> |
15 | #include <linux/errno.h> |
16 | #include <linux/kernel.h> |
17 | #include <linux/cpu.h> |
18 | #include <linux/hardirq.h> |
19 | #include <linux/of.h> |
20 | |
21 | #include <asm/page.h> |
22 | #include <asm/current.h> |
23 | #include <asm/machdep.h> |
24 | #include <asm/cacheflush.h> |
25 | #include <asm/firmware.h> |
26 | #include <asm/paca.h> |
27 | #include <asm/mmu.h> |
28 | #include <asm/sections.h> /* _end */ |
29 | #include <asm/smp.h> |
30 | #include <asm/hw_breakpoint.h> |
31 | #include <asm/svm.h> |
32 | #include <asm/ultravisor.h> |
33 | |
34 | int machine_kexec_prepare(struct kimage *image) |
35 | { |
36 | int i; |
37 | unsigned long begin, end; /* limits of segment */ |
38 | unsigned long low, high; /* limits of blocked memory range */ |
39 | struct device_node *node; |
40 | const unsigned long *basep; |
41 | const unsigned int *sizep; |
42 | |
43 | /* |
44 | * Since we use the kernel fault handlers and paging code to |
45 | * handle the virtual mode, we must make sure no destination |
46 | * overlaps kernel static data or bss. |
47 | */ |
48 | for (i = 0; i < image->nr_segments; i++) |
49 | if (image->segment[i].mem < __pa(_end)) |
50 | return -ETXTBSY; |
51 | |
52 | /* We also should not overwrite the tce tables */ |
53 | for_each_node_by_type(node, "pci" ) { |
54 | basep = of_get_property(node, name: "linux,tce-base" , NULL); |
55 | sizep = of_get_property(node, name: "linux,tce-size" , NULL); |
56 | if (basep == NULL || sizep == NULL) |
57 | continue; |
58 | |
59 | low = *basep; |
60 | high = low + (*sizep); |
61 | |
62 | for (i = 0; i < image->nr_segments; i++) { |
63 | begin = image->segment[i].mem; |
64 | end = begin + image->segment[i].memsz; |
65 | |
66 | if ((begin < high) && (end > low)) { |
67 | of_node_put(node); |
68 | return -ETXTBSY; |
69 | } |
70 | } |
71 | } |
72 | |
73 | return 0; |
74 | } |
75 | |
76 | /* Called during kexec sequence with MMU off */ |
77 | static notrace void copy_segments(unsigned long ind) |
78 | { |
79 | unsigned long entry; |
80 | unsigned long *ptr; |
81 | void *dest; |
82 | void *addr; |
83 | |
84 | /* |
85 | * We rely on kexec_load to create a lists that properly |
86 | * initializes these pointers before they are used. |
87 | * We will still crash if the list is wrong, but at least |
88 | * the compiler will be quiet. |
89 | */ |
90 | ptr = NULL; |
91 | dest = NULL; |
92 | |
93 | for (entry = ind; !(entry & IND_DONE); entry = *ptr++) { |
94 | addr = __va(entry & PAGE_MASK); |
95 | |
96 | switch (entry & IND_FLAGS) { |
97 | case IND_DESTINATION: |
98 | dest = addr; |
99 | break; |
100 | case IND_INDIRECTION: |
101 | ptr = addr; |
102 | break; |
103 | case IND_SOURCE: |
104 | copy_page(to: dest, from: addr); |
105 | dest += PAGE_SIZE; |
106 | } |
107 | } |
108 | } |
109 | |
110 | /* Called during kexec sequence with MMU off */ |
111 | notrace void kexec_copy_flush(struct kimage *image) |
112 | { |
113 | long i, nr_segments = image->nr_segments; |
114 | struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; |
115 | |
116 | /* save the ranges on the stack to efficiently flush the icache */ |
117 | memcpy(ranges, image->segment, sizeof(ranges)); |
118 | |
119 | /* |
120 | * After this call we may not use anything allocated in dynamic |
121 | * memory, including *image. |
122 | * |
123 | * Only globals and the stack are allowed. |
124 | */ |
125 | copy_segments(ind: image->head); |
126 | |
127 | /* |
128 | * we need to clear the icache for all dest pages sometime, |
129 | * including ones that were in place on the original copy |
130 | */ |
131 | for (i = 0; i < nr_segments; i++) |
132 | flush_icache_range(start: (unsigned long)__va(ranges[i].mem), |
133 | end: (unsigned long)__va(ranges[i].mem + ranges[i].memsz)); |
134 | } |
135 | |
136 | #ifdef CONFIG_SMP |
137 | |
/*
 * Set to 1 by kexec_prepare_cpus() once every other online CPU has reached
 * KEXEC_STATE_IRQS_OFF; kexec_smp_down() spins until it becomes non-zero.
 */
static int kexec_all_irq_disabled;
139 | |
140 | static void kexec_smp_down(void *arg) |
141 | { |
142 | local_irq_disable(); |
143 | hard_irq_disable(); |
144 | |
145 | mb(); /* make sure our irqs are disabled before we say they are */ |
146 | get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; |
147 | while(kexec_all_irq_disabled == 0) |
148 | cpu_relax(); |
149 | mb(); /* make sure all irqs are disabled before this */ |
150 | hw_breakpoint_disable(); |
151 | /* |
152 | * Now every CPU has IRQs off, we can clear out any pending |
153 | * IPIs and be sure that no more will come in after this. |
154 | */ |
155 | if (ppc_md.kexec_cpu_down) |
156 | ppc_md.kexec_cpu_down(0, 1); |
157 | |
158 | reset_sprs(); |
159 | |
160 | kexec_smp_wait(); |
161 | /* NOTREACHED */ |
162 | } |
163 | |
164 | static void kexec_prepare_cpus_wait(int wait_state) |
165 | { |
166 | int my_cpu, i, notified=-1; |
167 | |
168 | hw_breakpoint_disable(); |
169 | my_cpu = get_cpu(); |
170 | /* Make sure each CPU has at least made it to the state we need. |
171 | * |
172 | * FIXME: There is a (slim) chance of a problem if not all of the CPUs |
173 | * are correctly onlined. If somehow we start a CPU on boot with RTAS |
174 | * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in |
175 | * time, the boot CPU will timeout. If it does eventually execute |
176 | * stuff, the secondary will start up (paca_ptrs[]->cpu_start was |
177 | * written) and get into a peculiar state. |
178 | * If the platform supports smp_ops->take_timebase(), the secondary CPU |
179 | * will probably be spinning in there. If not (i.e. pseries), the |
180 | * secondary will continue on and try to online itself/idle/etc. If it |
181 | * survives that, we need to find these |
182 | * possible-but-not-online-but-should-be CPUs and chaperone them into |
183 | * kexec_smp_wait(). |
184 | */ |
185 | for_each_online_cpu(i) { |
186 | if (i == my_cpu) |
187 | continue; |
188 | |
189 | while (paca_ptrs[i]->kexec_state < wait_state) { |
190 | barrier(); |
191 | if (i != notified) { |
192 | printk(KERN_INFO "kexec: waiting for cpu %d " |
193 | "(physical %d) to enter %i state\n" , |
194 | i, paca_ptrs[i]->hw_cpu_id, wait_state); |
195 | notified = i; |
196 | } |
197 | } |
198 | } |
199 | mb(); |
200 | } |
201 | |
202 | /* |
203 | * We need to make sure each present CPU is online. The next kernel will scan |
204 | * the device tree and assume primary threads are online and query secondary |
205 | * threads via RTAS to online them if required. If we don't online primary |
206 | * threads, they will be stuck. However, we also online secondary threads as we |
207 | * may be using 'cede offline'. In this case RTAS doesn't see the secondary |
208 | * threads as offline -- and again, these CPUs will be stuck. |
209 | * |
210 | * So, we online all CPUs that should be running, including secondary threads. |
211 | */ |
212 | static void wake_offline_cpus(void) |
213 | { |
214 | int cpu = 0; |
215 | |
216 | for_each_present_cpu(cpu) { |
217 | if (!cpu_online(cpu)) { |
218 | printk(KERN_INFO "kexec: Waking offline cpu %d.\n" , |
219 | cpu); |
220 | WARN_ON(add_cpu(cpu)); |
221 | } |
222 | } |
223 | } |
224 | |
225 | static void kexec_prepare_cpus(void) |
226 | { |
227 | wake_offline_cpus(); |
228 | smp_call_function(func: kexec_smp_down, NULL, /* wait */0); |
229 | local_irq_disable(); |
230 | hard_irq_disable(); |
231 | |
232 | mb(); /* make sure IRQs are disabled before we say they are */ |
233 | get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; |
234 | |
235 | kexec_prepare_cpus_wait(wait_state: KEXEC_STATE_IRQS_OFF); |
236 | /* we are sure every CPU has IRQs off at this point */ |
237 | kexec_all_irq_disabled = 1; |
238 | |
239 | /* |
240 | * Before removing MMU mappings make sure all CPUs have entered real |
241 | * mode: |
242 | */ |
243 | kexec_prepare_cpus_wait(wait_state: KEXEC_STATE_REAL_MODE); |
244 | |
245 | /* after we tell the others to go down */ |
246 | if (ppc_md.kexec_cpu_down) |
247 | ppc_md.kexec_cpu_down(0, 0); |
248 | |
249 | put_cpu(); |
250 | } |
251 | |
252 | #else /* ! SMP */ |
253 | |
254 | static void kexec_prepare_cpus(void) |
255 | { |
256 | /* |
257 | * move the secondarys to us so that we can copy |
258 | * the new kernel 0-0x100 safely |
259 | * |
260 | * do this if kexec in setup.c ? |
261 | * |
262 | * We need to release the cpus if we are ever going from an |
263 | * UP to an SMP kernel. |
264 | */ |
265 | smp_release_cpus(); |
266 | if (ppc_md.kexec_cpu_down) |
267 | ppc_md.kexec_cpu_down(0, 0); |
268 | local_irq_disable(); |
269 | hard_irq_disable(); |
270 | } |
271 | |
272 | #endif /* SMP */ |
273 | |
274 | /* |
275 | * kexec thread structure and stack. |
276 | * |
277 | * We need to make sure that this is 16384-byte aligned due to the |
278 | * way process stacks are handled. It also must be statically allocated |
279 | * or allocated as part of the kimage, because everything else may be |
280 | * overwritten when we copy the kexec image. We piggyback on the |
281 | * "init_task" linker section here to statically allocate a stack. |
282 | * |
283 | * We could use a smaller stack if we don't care about anything using |
284 | * current, but that audit has not been performed. |
285 | */ |
286 | static union thread_union kexec_stack = { }; |
287 | |
288 | /* |
289 | * For similar reasons to the stack above, the kexecing CPU needs to be on a |
290 | * static PACA; we switch to kexec_paca. |
291 | */ |
292 | static struct paca_struct kexec_paca; |
293 | |
294 | /* Our assembly helper, in misc_64.S */ |
295 | extern void kexec_sequence(void *newstack, unsigned long start, |
296 | void *image, void *control, |
297 | void (*clear_all)(void), |
298 | bool copy_with_mmu_off) __noreturn; |
299 | |
300 | /* too late to fail here */ |
301 | void default_machine_kexec(struct kimage *image) |
302 | { |
303 | bool copy_with_mmu_off; |
304 | |
305 | /* prepare control code if any */ |
306 | |
307 | /* |
308 | * If the kexec boot is the normal one, need to shutdown other cpus |
309 | * into our wait loop and quiesce interrupts. |
310 | * Otherwise, in the case of crashed mode (crashing_cpu >= 0), |
311 | * stopping other CPUs and collecting their pt_regs is done before |
312 | * using debugger IPI. |
313 | */ |
314 | |
315 | if (!kdump_in_progress()) |
316 | kexec_prepare_cpus(); |
317 | |
318 | printk("kexec: Starting switchover sequence.\n" ); |
319 | |
320 | /* switch to a staticly allocated stack. Based on irq stack code. |
321 | * We setup preempt_count to avoid using VMX in memcpy. |
322 | * XXX: the task struct will likely be invalid once we do the copy! |
323 | */ |
324 | current_thread_info()->flags = 0; |
325 | current_thread_info()->preempt_count = HARDIRQ_OFFSET; |
326 | |
327 | /* We need a static PACA, too; copy this CPU's PACA over and switch to |
328 | * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using |
329 | * non-static data. |
330 | */ |
331 | memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct)); |
332 | kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL; |
333 | #ifdef CONFIG_PPC_PSERIES |
334 | kexec_paca.lppaca_ptr = NULL; |
335 | #endif |
336 | |
337 | if (is_secure_guest() && !(image->preserve_context || |
338 | image->type == KEXEC_TYPE_CRASH)) { |
339 | uv_unshare_all_pages(); |
340 | printk("kexec: Unshared all shared pages.\n" ); |
341 | } |
342 | |
343 | paca_ptrs[kexec_paca.paca_index] = &kexec_paca; |
344 | |
345 | setup_paca(&kexec_paca); |
346 | |
347 | /* |
348 | * The lppaca should be unregistered at this point so the HV won't |
349 | * touch it. In the case of a crash, none of the lppacas are |
350 | * unregistered so there is not much we can do about it here. |
351 | */ |
352 | |
353 | /* |
354 | * On Book3S, the copy must happen with the MMU off if we are either |
355 | * using Radix page tables or we are not in an LPAR since we can |
356 | * overwrite the page tables while copying. |
357 | * |
358 | * In an LPAR, we keep the MMU on otherwise we can't access beyond |
359 | * the RMA. On BookE there is no real MMU off mode, so we have to |
360 | * keep it enabled as well (but then we have bolted TLB entries). |
361 | */ |
362 | #ifdef CONFIG_PPC_BOOK3E_64 |
363 | copy_with_mmu_off = false; |
364 | #else |
365 | copy_with_mmu_off = radix_enabled() || |
366 | !(firmware_has_feature(FW_FEATURE_LPAR) || |
367 | firmware_has_feature(FW_FEATURE_PS3_LV1)); |
368 | #endif |
369 | |
370 | /* Some things are best done in assembly. Finding globals with |
371 | * a toc is easier in C, so pass in what we can. |
372 | */ |
373 | kexec_sequence(newstack: &kexec_stack, start: image->start, image, |
374 | page_address(image->control_code_page), |
375 | clear_all: mmu_cleanup_all, copy_with_mmu_off); |
376 | /* NOTREACHED */ |
377 | } |
378 | |
379 | #ifdef CONFIG_PPC_64S_HASH_MMU |
380 | /* Values we need to export to the second kernel via the device tree. */ |
381 | static __be64 htab_base; |
382 | static __be64 htab_size; |
383 | |
384 | static struct property htab_base_prop = { |
385 | .name = "linux,htab-base" , |
386 | .length = sizeof(unsigned long), |
387 | .value = &htab_base, |
388 | }; |
389 | |
390 | static struct property htab_size_prop = { |
391 | .name = "linux,htab-size" , |
392 | .length = sizeof(unsigned long), |
393 | .value = &htab_size, |
394 | }; |
395 | |
396 | static int __init export_htab_values(void) |
397 | { |
398 | struct device_node *node; |
399 | |
400 | /* On machines with no htab htab_address is NULL */ |
401 | if (!htab_address) |
402 | return -ENODEV; |
403 | |
404 | node = of_find_node_by_path("/chosen" ); |
405 | if (!node) |
406 | return -ENODEV; |
407 | |
408 | /* remove any stale properties so ours can be found */ |
409 | of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL)); |
410 | of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL)); |
411 | |
412 | htab_base = cpu_to_be64(__pa(htab_address)); |
413 | of_add_property(node, &htab_base_prop); |
414 | htab_size = cpu_to_be64(htab_size_bytes); |
415 | of_add_property(node, &htab_size_prop); |
416 | |
417 | of_node_put(node); |
418 | return 0; |
419 | } |
420 | late_initcall(export_htab_values); |
421 | #endif /* CONFIG_PPC_64S_HASH_MMU */ |
422 | |