1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Architecture specific (i386/x86_64) functions for kexec based crash dumps. |
4 | * |
5 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) |
6 | * |
7 | * Copyright (C) IBM Corporation, 2004. All rights reserved. |
8 | * Copyright (C) Red Hat Inc., 2014. All rights reserved. |
9 | * Authors: |
10 | * Vivek Goyal <vgoyal@redhat.com> |
11 | * |
12 | */ |
13 | |
14 | #define pr_fmt(fmt) "kexec: " fmt |
15 | |
16 | #include <linux/types.h> |
17 | #include <linux/kernel.h> |
18 | #include <linux/smp.h> |
19 | #include <linux/reboot.h> |
20 | #include <linux/kexec.h> |
21 | #include <linux/delay.h> |
22 | #include <linux/elf.h> |
23 | #include <linux/elfcore.h> |
24 | #include <linux/export.h> |
25 | #include <linux/slab.h> |
26 | #include <linux/vmalloc.h> |
27 | #include <linux/memblock.h> |
28 | |
29 | #include <asm/bootparam.h> |
30 | #include <asm/processor.h> |
31 | #include <asm/hardirq.h> |
32 | #include <asm/nmi.h> |
33 | #include <asm/hw_irq.h> |
34 | #include <asm/apic.h> |
35 | #include <asm/e820/types.h> |
36 | #include <asm/io_apic.h> |
37 | #include <asm/hpet.h> |
38 | #include <linux/kdebug.h> |
39 | #include <asm/cpu.h> |
40 | #include <asm/reboot.h> |
41 | #include <asm/intel_pt.h> |
42 | #include <asm/crash.h> |
43 | #include <asm/cmdline.h> |
44 | #include <asm/sev.h> |
45 | |
/* Context handed to the resource walkers that build the second kernel's memory map */
struct crash_memmap_data {
	/* Boot parameters whose e820 table receives the new entries */
	struct boot_params *params;
	/* E820 type stamped on every entry added during the current walk */
	unsigned int type;
};
52 | |
53 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) |
54 | |
/*
 * Handler that kdump_nmi_shootdown_cpus() passes to nmi_shootdown_cpus().
 *
 * @cpu:  id of the CPU the handler runs for
 * @regs: register state forwarded to crash_save_cpu()
 */
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
	crash_save_cpu(regs, cpu);

	/* Intel PT must stop logging */
	cpu_emergency_stop_pt();

	kdump_sev_callback();

	disable_local_APIC();
}
68 | |
69 | void kdump_nmi_shootdown_cpus(void) |
70 | { |
71 | nmi_shootdown_cpus(callback: kdump_nmi_callback); |
72 | |
73 | disable_local_APIC(); |
74 | } |
75 | |
76 | /* Override the weak function in kernel/panic.c */ |
77 | void crash_smp_send_stop(void) |
78 | { |
79 | static int cpus_stopped; |
80 | |
81 | if (cpus_stopped) |
82 | return; |
83 | |
84 | if (smp_ops.crash_stop_other_cpus) |
85 | smp_ops.crash_stop_other_cpus(); |
86 | else |
87 | smp_send_stop(); |
88 | |
89 | cpus_stopped = 1; |
90 | } |
91 | |
92 | #else |
/* Variant built when SMP or the local APIC is not configured: nothing to stop */
void crash_smp_send_stop(void)
{
	/* There are no cpus to shootdown */
}
97 | #endif |
98 | |
/*
 * Bring the machine into a state from which a kexec'd crash kernel can run.
 *
 * @regs: register state recorded for the current CPU via crash_save_cpu()
 */
void native_machine_crash_shutdown(struct pt_regs *regs)
{
	/*
	 * We only get here after a panic or in some other critical
	 * state, so do the bare minimum a kexec'd kernel needs in
	 * order to run successfully.
	 *
	 * In practice that means shooting down the other cpus in an
	 * SMP system.
	 */

	/* The kernel is broken, so keep interrupts off */
	local_irq_disable();

	crash_smp_send_stop();

	cpu_emergency_disable_virtualization();

	/* Intel PT must stop logging */
	cpu_emergency_stop_pt();

#ifdef CONFIG_X86_IO_APIC
	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
	ioapic_zap_locks();
	clear_IO_APIC();
#endif
	lapic_shutdown();
	restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
	hpet_disable();
#endif
	/* Finally record the register state of the CPU running this code */
	crash_save_cpu(regs, safe_smp_processor_id());
}
133 | |
134 | #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) |
/*
 * Resource-walk callback that counts invocations.
 *
 * @res: resource being visited (unused)
 * @arg: points to the unsigned int counter to bump
 *
 * Always returns 0.
 */
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
	unsigned int *count = arg;

	*count += 1;
	return 0;
}
142 | |
143 | /* Gather all the required information to prepare elf headers for ram regions */ |
144 | static struct crash_mem *fill_up_crash_elf_data(void) |
145 | { |
146 | unsigned int nr_ranges = 0; |
147 | struct crash_mem *cmem; |
148 | |
149 | walk_system_ram_res(start: 0, end: -1, arg: &nr_ranges, func: get_nr_ram_ranges_callback); |
150 | if (!nr_ranges) |
151 | return NULL; |
152 | |
153 | /* |
154 | * Exclusion of crash region and/or crashk_low_res may cause |
155 | * another range split. So add extra two slots here. |
156 | */ |
157 | nr_ranges += 2; |
158 | cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); |
159 | if (!cmem) |
160 | return NULL; |
161 | |
162 | cmem->max_nr_ranges = nr_ranges; |
163 | cmem->nr_ranges = 0; |
164 | |
165 | return cmem; |
166 | } |
167 | |
168 | /* |
169 | * Look for any unwanted ranges between mstart, mend and remove them. This |
170 | * might lead to split and split ranges are put in cmem->ranges[] array |
171 | */ |
172 | static int (struct crash_mem *cmem) |
173 | { |
174 | int ret = 0; |
175 | |
176 | /* Exclude the low 1M because it is always reserved */ |
177 | ret = crash_exclude_mem_range(mem: cmem, mstart: 0, SZ_1M - 1); |
178 | if (ret) |
179 | return ret; |
180 | |
181 | /* Exclude crashkernel region */ |
182 | ret = crash_exclude_mem_range(mem: cmem, mstart: crashk_res.start, mend: crashk_res.end); |
183 | if (ret) |
184 | return ret; |
185 | |
186 | if (crashk_low_res.end) |
187 | ret = crash_exclude_mem_range(mem: cmem, mstart: crashk_low_res.start, |
188 | mend: crashk_low_res.end); |
189 | |
190 | return ret; |
191 | } |
192 | |
193 | static int (struct resource *res, void *arg) |
194 | { |
195 | struct crash_mem *cmem = arg; |
196 | |
197 | cmem->ranges[cmem->nr_ranges].start = res->start; |
198 | cmem->ranges[cmem->nr_ranges].end = res->end; |
199 | cmem->nr_ranges++; |
200 | |
201 | return 0; |
202 | } |
203 | |
204 | /* Prepare elf headers. Return addr and size */ |
205 | static int (void **addr, unsigned long *sz, |
206 | unsigned long *nr_mem_ranges) |
207 | { |
208 | struct crash_mem *cmem; |
209 | int ret; |
210 | |
211 | cmem = fill_up_crash_elf_data(); |
212 | if (!cmem) |
213 | return -ENOMEM; |
214 | |
215 | ret = walk_system_ram_res(start: 0, end: -1, arg: cmem, func: prepare_elf64_ram_headers_callback); |
216 | if (ret) |
217 | goto out; |
218 | |
219 | /* Exclude unwanted mem ranges */ |
220 | ret = elf_header_exclude_ranges(cmem); |
221 | if (ret) |
222 | goto out; |
223 | |
224 | /* Return the computed number of memory ranges, for hotplug usage */ |
225 | *nr_mem_ranges = cmem->nr_ranges; |
226 | |
227 | /* By default prepare 64bit headers */ |
228 | ret = crash_prepare_elf64_headers(mem: cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); |
229 | |
230 | out: |
231 | vfree(addr: cmem); |
232 | return ret; |
233 | } |
234 | #endif |
235 | |
236 | #ifdef CONFIG_KEXEC_FILE |
237 | static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) |
238 | { |
239 | unsigned int nr_e820_entries; |
240 | |
241 | nr_e820_entries = params->e820_entries; |
242 | if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) |
243 | return 1; |
244 | |
245 | memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); |
246 | params->e820_entries++; |
247 | return 0; |
248 | } |
249 | |
250 | static int memmap_entry_callback(struct resource *res, void *arg) |
251 | { |
252 | struct crash_memmap_data *cmd = arg; |
253 | struct boot_params *params = cmd->params; |
254 | struct e820_entry ei; |
255 | |
256 | ei.addr = res->start; |
257 | ei.size = resource_size(res); |
258 | ei.type = cmd->type; |
259 | add_e820_entry(params, entry: &ei); |
260 | |
261 | return 0; |
262 | } |
263 | |
264 | static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, |
265 | unsigned long long mstart, |
266 | unsigned long long mend) |
267 | { |
268 | unsigned long start, end; |
269 | |
270 | cmem->ranges[0].start = mstart; |
271 | cmem->ranges[0].end = mend; |
272 | cmem->nr_ranges = 1; |
273 | |
274 | /* Exclude elf header region */ |
275 | start = image->elf_load_addr; |
276 | end = start + image->elf_headers_sz - 1; |
277 | return crash_exclude_mem_range(mem: cmem, mstart: start, mend: end); |
278 | } |
279 | |
280 | /* Prepare memory map for crash dump kernel */ |
281 | int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) |
282 | { |
283 | int i, ret = 0; |
284 | unsigned long flags; |
285 | struct e820_entry ei; |
286 | struct crash_memmap_data cmd; |
287 | struct crash_mem *cmem; |
288 | |
289 | cmem = vzalloc(struct_size(cmem, ranges, 1)); |
290 | if (!cmem) |
291 | return -ENOMEM; |
292 | |
293 | memset(&cmd, 0, sizeof(struct crash_memmap_data)); |
294 | cmd.params = params; |
295 | |
296 | /* Add the low 1M */ |
297 | cmd.type = E820_TYPE_RAM; |
298 | flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; |
299 | walk_iomem_res_desc(desc: IORES_DESC_NONE, flags, start: 0, end: (1<<20)-1, arg: &cmd, |
300 | func: memmap_entry_callback); |
301 | |
302 | /* Add ACPI tables */ |
303 | cmd.type = E820_TYPE_ACPI; |
304 | flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
305 | walk_iomem_res_desc(desc: IORES_DESC_ACPI_TABLES, flags, start: 0, end: -1, arg: &cmd, |
306 | func: memmap_entry_callback); |
307 | |
308 | /* Add ACPI Non-volatile Storage */ |
309 | cmd.type = E820_TYPE_NVS; |
310 | walk_iomem_res_desc(desc: IORES_DESC_ACPI_NV_STORAGE, flags, start: 0, end: -1, arg: &cmd, |
311 | func: memmap_entry_callback); |
312 | |
313 | /* Add e820 reserved ranges */ |
314 | cmd.type = E820_TYPE_RESERVED; |
315 | flags = IORESOURCE_MEM; |
316 | walk_iomem_res_desc(desc: IORES_DESC_RESERVED, flags, start: 0, end: -1, arg: &cmd, |
317 | func: memmap_entry_callback); |
318 | |
319 | /* Add crashk_low_res region */ |
320 | if (crashk_low_res.end) { |
321 | ei.addr = crashk_low_res.start; |
322 | ei.size = resource_size(res: &crashk_low_res); |
323 | ei.type = E820_TYPE_RAM; |
324 | add_e820_entry(params, entry: &ei); |
325 | } |
326 | |
327 | /* Exclude some ranges from crashk_res and add rest to memmap */ |
328 | ret = memmap_exclude_ranges(image, cmem, mstart: crashk_res.start, mend: crashk_res.end); |
329 | if (ret) |
330 | goto out; |
331 | |
332 | for (i = 0; i < cmem->nr_ranges; i++) { |
333 | ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; |
334 | |
335 | /* If entry is less than a page, skip it */ |
336 | if (ei.size < PAGE_SIZE) |
337 | continue; |
338 | ei.addr = cmem->ranges[i].start; |
339 | ei.type = E820_TYPE_RAM; |
340 | add_e820_entry(params, entry: &ei); |
341 | } |
342 | |
343 | out: |
344 | vfree(addr: cmem); |
345 | return ret; |
346 | } |
347 | |
348 | int crash_load_segments(struct kimage *image) |
349 | { |
350 | int ret; |
351 | unsigned long pnum = 0; |
352 | struct kexec_buf kbuf = { .image = image, .buf_min = 0, |
353 | .buf_max = ULONG_MAX, .top_down = false }; |
354 | |
355 | /* Prepare elf headers and add a segment */ |
356 | ret = prepare_elf_headers(addr: &kbuf.buffer, sz: &kbuf.bufsz, nr_mem_ranges: &pnum); |
357 | if (ret) |
358 | return ret; |
359 | |
360 | image->elf_headers = kbuf.buffer; |
361 | image->elf_headers_sz = kbuf.bufsz; |
362 | kbuf.memsz = kbuf.bufsz; |
363 | |
364 | #ifdef CONFIG_CRASH_HOTPLUG |
365 | /* |
366 | * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map, |
367 | * maximum CPUs and maximum memory ranges. |
368 | */ |
369 | if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) |
370 | pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES; |
371 | else |
372 | pnum += 2 + CONFIG_NR_CPUS_DEFAULT; |
373 | |
374 | if (pnum < (unsigned long)PN_XNUM) { |
375 | kbuf.memsz = pnum * sizeof(Elf64_Phdr); |
376 | kbuf.memsz += sizeof(Elf64_Ehdr); |
377 | |
378 | image->elfcorehdr_index = image->nr_segments; |
379 | |
380 | /* Mark as usable to crash kernel, else crash kernel fails on boot */ |
381 | image->elf_headers_sz = kbuf.memsz; |
382 | } else { |
383 | pr_err("number of Phdrs %lu exceeds max\n" , pnum); |
384 | } |
385 | #endif |
386 | |
387 | kbuf.buf_align = ELF_CORE_HEADER_ALIGN; |
388 | kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; |
389 | ret = kexec_add_buffer(kbuf: &kbuf); |
390 | if (ret) |
391 | return ret; |
392 | image->elf_load_addr = kbuf.mem; |
393 | kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n" , |
394 | image->elf_load_addr, kbuf.bufsz, kbuf.memsz); |
395 | |
396 | return ret; |
397 | } |
398 | #endif /* CONFIG_KEXEC_FILE */ |
399 | |
400 | #ifdef CONFIG_CRASH_HOTPLUG |
401 | |
402 | #undef pr_fmt |
403 | #define pr_fmt(fmt) "crash hp: " fmt |
404 | |
405 | /* These functions provide the value for the sysfs crash_hotplug nodes */ |
406 | #ifdef CONFIG_HOTPLUG_CPU |
/* CPU hotplug flavour: simply forwards the result of crash_check_update_elfcorehdr() */
int arch_crash_hotplug_cpu_support(void)
{
	return crash_check_update_elfcorehdr();
}
411 | #endif |
412 | |
413 | #ifdef CONFIG_MEMORY_HOTPLUG |
/* Memory hotplug flavour: simply forwards the result of crash_check_update_elfcorehdr() */
int arch_crash_hotplug_memory_support(void)
{
	return crash_check_update_elfcorehdr();
}
418 | #endif |
419 | |
420 | unsigned int arch_crash_get_elfcorehdr_size(void) |
421 | { |
422 | unsigned int sz; |
423 | |
424 | /* kernel_map, VMCOREINFO and maximum CPUs */ |
425 | sz = 2 + CONFIG_NR_CPUS_DEFAULT; |
426 | if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) |
427 | sz += CONFIG_CRASH_MAX_MEMORY_RANGES; |
428 | sz *= sizeof(Elf64_Phdr); |
429 | return sz; |
430 | } |
431 | |
432 | /** |
433 | * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes |
434 | * @image: a pointer to kexec_crash_image |
435 | * |
436 | * Prepare the new elfcorehdr and replace the existing elfcorehdr. |
437 | */ |
438 | void arch_crash_handle_hotplug_event(struct kimage *image) |
439 | { |
440 | void *elfbuf = NULL, *old_elfcorehdr; |
441 | unsigned long nr_mem_ranges; |
442 | unsigned long mem, memsz; |
443 | unsigned long elfsz = 0; |
444 | |
445 | /* |
446 | * As crash_prepare_elf64_headers() has already described all |
447 | * possible CPUs, there is no need to update the elfcorehdr |
448 | * for additional CPU changes. |
449 | */ |
450 | if ((image->file_mode || image->elfcorehdr_updated) && |
451 | ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) || |
452 | (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU))) |
453 | return; |
454 | |
455 | /* |
456 | * Create the new elfcorehdr reflecting the changes to CPU and/or |
457 | * memory resources. |
458 | */ |
459 | if (prepare_elf_headers(addr: &elfbuf, sz: &elfsz, nr_mem_ranges: &nr_mem_ranges)) { |
460 | pr_err("unable to create new elfcorehdr" ); |
461 | goto out; |
462 | } |
463 | |
464 | /* |
465 | * Obtain address and size of the elfcorehdr segment, and |
466 | * check it against the new elfcorehdr buffer. |
467 | */ |
468 | mem = image->segment[image->elfcorehdr_index].mem; |
469 | memsz = image->segment[image->elfcorehdr_index].memsz; |
470 | if (elfsz > memsz) { |
471 | pr_err("update elfcorehdr elfsz %lu > memsz %lu" , |
472 | elfsz, memsz); |
473 | goto out; |
474 | } |
475 | |
476 | /* |
477 | * Copy new elfcorehdr over the old elfcorehdr at destination. |
478 | */ |
479 | old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT)); |
480 | if (!old_elfcorehdr) { |
481 | pr_err("mapping elfcorehdr segment failed\n" ); |
482 | goto out; |
483 | } |
484 | |
485 | /* |
486 | * Temporarily invalidate the crash image while the |
487 | * elfcorehdr is updated. |
488 | */ |
489 | xchg(&kexec_crash_image, NULL); |
490 | memcpy_flushcache(dst: old_elfcorehdr, src: elfbuf, cnt: elfsz); |
491 | xchg(&kexec_crash_image, image); |
492 | kunmap_local(old_elfcorehdr); |
493 | pr_debug("updated elfcorehdr\n" ); |
494 | |
495 | out: |
496 | vfree(addr: elfbuf); |
497 | } |
498 | #endif |
499 | |