1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Architecture specific (i386/x86_64) functions for kexec based crash dumps. |
4 | * |
5 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) |
6 | * |
7 | * Copyright (C) IBM Corporation, 2004. All rights reserved. |
8 | * Copyright (C) Red Hat Inc., 2014. All rights reserved. |
9 | * Authors: |
10 | * Vivek Goyal <vgoyal@redhat.com> |
11 | * |
12 | */ |
13 | |
14 | #define pr_fmt(fmt) "kexec: " fmt |
15 | |
16 | #include <linux/types.h> |
17 | #include <linux/kernel.h> |
18 | #include <linux/smp.h> |
19 | #include <linux/reboot.h> |
20 | #include <linux/kexec.h> |
21 | #include <linux/delay.h> |
22 | #include <linux/elf.h> |
23 | #include <linux/elfcore.h> |
24 | #include <linux/export.h> |
25 | #include <linux/slab.h> |
26 | #include <linux/vmalloc.h> |
27 | #include <linux/memblock.h> |
28 | |
29 | #include <asm/processor.h> |
30 | #include <asm/hardirq.h> |
31 | #include <asm/nmi.h> |
32 | #include <asm/hw_irq.h> |
33 | #include <asm/apic.h> |
34 | #include <asm/e820/types.h> |
35 | #include <asm/io_apic.h> |
36 | #include <asm/hpet.h> |
37 | #include <linux/kdebug.h> |
38 | #include <asm/cpu.h> |
39 | #include <asm/reboot.h> |
40 | #include <asm/intel_pt.h> |
41 | #include <asm/crash.h> |
42 | #include <asm/cmdline.h> |
43 | |
/*
 * Context handed to the resource-walk callbacks while the memory map
 * entries for the second kernel are being prepared.
 */
struct crash_memmap_data {
	/* Boot parameters whose e820 table receives the new entries */
	struct boot_params *params;
	/* Memory type stamped on every entry added during the current walk */
	unsigned int type;
};
50 | |
51 | #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) |
52 | |
/*
 * Callback handed to nmi_shootdown_cpus(): save the register state for
 * @cpu, then stop Intel PT and disable the local APIC.
 */
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
	crash_save_cpu(regs, cpu);

	/* Intel PT must stop logging from here on */
	cpu_emergency_stop_pt();

	disable_local_APIC();
}
64 | |
65 | void kdump_nmi_shootdown_cpus(void) |
66 | { |
67 | nmi_shootdown_cpus(callback: kdump_nmi_callback); |
68 | |
69 | disable_local_APIC(); |
70 | } |
71 | |
72 | /* Override the weak function in kernel/panic.c */ |
73 | void crash_smp_send_stop(void) |
74 | { |
75 | static int cpus_stopped; |
76 | |
77 | if (cpus_stopped) |
78 | return; |
79 | |
80 | if (smp_ops.crash_stop_other_cpus) |
81 | smp_ops.crash_stop_other_cpus(); |
82 | else |
83 | smp_send_stop(); |
84 | |
85 | cpus_stopped = 1; |
86 | } |
87 | |
88 | #else |
/*
 * Variant built when CONFIG_SMP and CONFIG_X86_LOCAL_APIC are not both
 * enabled: intentionally empty.
 */
void crash_smp_send_stop(void)
{
	/* There are no cpus to shootdown */
}
93 | #endif |
94 | |
/*
 * Crash-time shutdown.
 *
 * This function is only called after the system has panicked or is
 * otherwise in a critical state, so only the minimum amount of code needed
 * to allow a kexec'd kernel to run successfully happens here. In practice
 * this means shooting down the other cpus in an SMP system.
 *
 * The statement order below is deliberate; do not reorder.
 */
void native_machine_crash_shutdown(struct pt_regs *regs)
{
	/* The kernel is broken so disable interrupts */
	local_irq_disable();

	crash_smp_send_stop();

	cpu_emergency_disable_virtualization();

	/* Intel PT must stop logging from here on */
	cpu_emergency_stop_pt();

#ifdef CONFIG_X86_IO_APIC
	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
	ioapic_zap_locks();
	clear_IO_APIC();
#endif
	lapic_shutdown();
	restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
	hpet_disable();
#endif
	/* Finally record the register state passed in for this cpu */
	crash_save_cpu(regs, safe_smp_processor_id());
}
129 | |
130 | #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) |
/*
 * Resource-walk callback that only counts its invocations: @arg points to
 * an unsigned int which is bumped by one per call. @res is not inspected.
 * Always returns 0.
 */
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
	unsigned int *count = arg;

	*count += 1;

	return 0;
}
138 | |
139 | /* Gather all the required information to prepare elf headers for ram regions */ |
140 | static struct crash_mem *fill_up_crash_elf_data(void) |
141 | { |
142 | unsigned int nr_ranges = 0; |
143 | struct crash_mem *cmem; |
144 | |
145 | walk_system_ram_res(start: 0, end: -1, arg: &nr_ranges, func: get_nr_ram_ranges_callback); |
146 | if (!nr_ranges) |
147 | return NULL; |
148 | |
149 | /* |
150 | * Exclusion of crash region and/or crashk_low_res may cause |
151 | * another range split. So add extra two slots here. |
152 | */ |
153 | nr_ranges += 2; |
154 | cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); |
155 | if (!cmem) |
156 | return NULL; |
157 | |
158 | cmem->max_nr_ranges = nr_ranges; |
159 | cmem->nr_ranges = 0; |
160 | |
161 | return cmem; |
162 | } |
163 | |
164 | /* |
165 | * Look for any unwanted ranges between mstart, mend and remove them. This |
166 | * might lead to split and split ranges are put in cmem->ranges[] array |
167 | */ |
168 | static int (struct crash_mem *cmem) |
169 | { |
170 | int ret = 0; |
171 | |
172 | /* Exclude the low 1M because it is always reserved */ |
173 | ret = crash_exclude_mem_range(mem: cmem, mstart: 0, mend: (1<<20)-1); |
174 | if (ret) |
175 | return ret; |
176 | |
177 | /* Exclude crashkernel region */ |
178 | ret = crash_exclude_mem_range(mem: cmem, mstart: crashk_res.start, mend: crashk_res.end); |
179 | if (ret) |
180 | return ret; |
181 | |
182 | if (crashk_low_res.end) |
183 | ret = crash_exclude_mem_range(mem: cmem, mstart: crashk_low_res.start, |
184 | mend: crashk_low_res.end); |
185 | |
186 | return ret; |
187 | } |
188 | |
189 | static int (struct resource *res, void *arg) |
190 | { |
191 | struct crash_mem *cmem = arg; |
192 | |
193 | cmem->ranges[cmem->nr_ranges].start = res->start; |
194 | cmem->ranges[cmem->nr_ranges].end = res->end; |
195 | cmem->nr_ranges++; |
196 | |
197 | return 0; |
198 | } |
199 | |
200 | /* Prepare elf headers. Return addr and size */ |
201 | static int (struct kimage *image, void **addr, |
202 | unsigned long *sz, unsigned long *nr_mem_ranges) |
203 | { |
204 | struct crash_mem *cmem; |
205 | int ret; |
206 | |
207 | cmem = fill_up_crash_elf_data(); |
208 | if (!cmem) |
209 | return -ENOMEM; |
210 | |
211 | ret = walk_system_ram_res(start: 0, end: -1, arg: cmem, func: prepare_elf64_ram_headers_callback); |
212 | if (ret) |
213 | goto out; |
214 | |
215 | /* Exclude unwanted mem ranges */ |
216 | ret = elf_header_exclude_ranges(cmem); |
217 | if (ret) |
218 | goto out; |
219 | |
220 | /* Return the computed number of memory ranges, for hotplug usage */ |
221 | *nr_mem_ranges = cmem->nr_ranges; |
222 | |
223 | /* By default prepare 64bit headers */ |
224 | ret = crash_prepare_elf64_headers(mem: cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); |
225 | |
226 | out: |
227 | vfree(addr: cmem); |
228 | return ret; |
229 | } |
230 | #endif |
231 | |
232 | #ifdef CONFIG_KEXEC_FILE |
233 | static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) |
234 | { |
235 | unsigned int nr_e820_entries; |
236 | |
237 | nr_e820_entries = params->e820_entries; |
238 | if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) |
239 | return 1; |
240 | |
241 | memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); |
242 | params->e820_entries++; |
243 | return 0; |
244 | } |
245 | |
246 | static int memmap_entry_callback(struct resource *res, void *arg) |
247 | { |
248 | struct crash_memmap_data *cmd = arg; |
249 | struct boot_params *params = cmd->params; |
250 | struct e820_entry ei; |
251 | |
252 | ei.addr = res->start; |
253 | ei.size = resource_size(res); |
254 | ei.type = cmd->type; |
255 | add_e820_entry(params, entry: &ei); |
256 | |
257 | return 0; |
258 | } |
259 | |
260 | static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, |
261 | unsigned long long mstart, |
262 | unsigned long long mend) |
263 | { |
264 | unsigned long start, end; |
265 | |
266 | cmem->ranges[0].start = mstart; |
267 | cmem->ranges[0].end = mend; |
268 | cmem->nr_ranges = 1; |
269 | |
270 | /* Exclude elf header region */ |
271 | start = image->elf_load_addr; |
272 | end = start + image->elf_headers_sz - 1; |
273 | return crash_exclude_mem_range(mem: cmem, mstart: start, mend: end); |
274 | } |
275 | |
276 | /* Prepare memory map for crash dump kernel */ |
277 | int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) |
278 | { |
279 | int i, ret = 0; |
280 | unsigned long flags; |
281 | struct e820_entry ei; |
282 | struct crash_memmap_data cmd; |
283 | struct crash_mem *cmem; |
284 | |
285 | cmem = vzalloc(struct_size(cmem, ranges, 1)); |
286 | if (!cmem) |
287 | return -ENOMEM; |
288 | |
289 | memset(&cmd, 0, sizeof(struct crash_memmap_data)); |
290 | cmd.params = params; |
291 | |
292 | /* Add the low 1M */ |
293 | cmd.type = E820_TYPE_RAM; |
294 | flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; |
295 | walk_iomem_res_desc(desc: IORES_DESC_NONE, flags, start: 0, end: (1<<20)-1, arg: &cmd, |
296 | func: memmap_entry_callback); |
297 | |
298 | /* Add ACPI tables */ |
299 | cmd.type = E820_TYPE_ACPI; |
300 | flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
301 | walk_iomem_res_desc(desc: IORES_DESC_ACPI_TABLES, flags, start: 0, end: -1, arg: &cmd, |
302 | func: memmap_entry_callback); |
303 | |
304 | /* Add ACPI Non-volatile Storage */ |
305 | cmd.type = E820_TYPE_NVS; |
306 | walk_iomem_res_desc(desc: IORES_DESC_ACPI_NV_STORAGE, flags, start: 0, end: -1, arg: &cmd, |
307 | func: memmap_entry_callback); |
308 | |
309 | /* Add e820 reserved ranges */ |
310 | cmd.type = E820_TYPE_RESERVED; |
311 | flags = IORESOURCE_MEM; |
312 | walk_iomem_res_desc(desc: IORES_DESC_RESERVED, flags, start: 0, end: -1, arg: &cmd, |
313 | func: memmap_entry_callback); |
314 | |
315 | /* Add crashk_low_res region */ |
316 | if (crashk_low_res.end) { |
317 | ei.addr = crashk_low_res.start; |
318 | ei.size = resource_size(res: &crashk_low_res); |
319 | ei.type = E820_TYPE_RAM; |
320 | add_e820_entry(params, entry: &ei); |
321 | } |
322 | |
323 | /* Exclude some ranges from crashk_res and add rest to memmap */ |
324 | ret = memmap_exclude_ranges(image, cmem, mstart: crashk_res.start, mend: crashk_res.end); |
325 | if (ret) |
326 | goto out; |
327 | |
328 | for (i = 0; i < cmem->nr_ranges; i++) { |
329 | ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; |
330 | |
331 | /* If entry is less than a page, skip it */ |
332 | if (ei.size < PAGE_SIZE) |
333 | continue; |
334 | ei.addr = cmem->ranges[i].start; |
335 | ei.type = E820_TYPE_RAM; |
336 | add_e820_entry(params, entry: &ei); |
337 | } |
338 | |
339 | out: |
340 | vfree(addr: cmem); |
341 | return ret; |
342 | } |
343 | |
344 | int crash_load_segments(struct kimage *image) |
345 | { |
346 | int ret; |
347 | unsigned long pnum = 0; |
348 | struct kexec_buf kbuf = { .image = image, .buf_min = 0, |
349 | .buf_max = ULONG_MAX, .top_down = false }; |
350 | |
351 | /* Prepare elf headers and add a segment */ |
352 | ret = prepare_elf_headers(image, addr: &kbuf.buffer, sz: &kbuf.bufsz, nr_mem_ranges: &pnum); |
353 | if (ret) |
354 | return ret; |
355 | |
356 | image->elf_headers = kbuf.buffer; |
357 | image->elf_headers_sz = kbuf.bufsz; |
358 | kbuf.memsz = kbuf.bufsz; |
359 | |
360 | #ifdef CONFIG_CRASH_HOTPLUG |
361 | /* |
362 | * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map, |
363 | * maximum CPUs and maximum memory ranges. |
364 | */ |
365 | if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) |
366 | pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES; |
367 | else |
368 | pnum += 2 + CONFIG_NR_CPUS_DEFAULT; |
369 | |
370 | if (pnum < (unsigned long)PN_XNUM) { |
371 | kbuf.memsz = pnum * sizeof(Elf64_Phdr); |
372 | kbuf.memsz += sizeof(Elf64_Ehdr); |
373 | |
374 | image->elfcorehdr_index = image->nr_segments; |
375 | |
376 | /* Mark as usable to crash kernel, else crash kernel fails on boot */ |
377 | image->elf_headers_sz = kbuf.memsz; |
378 | } else { |
379 | pr_err("number of Phdrs %lu exceeds max\n" , pnum); |
380 | } |
381 | #endif |
382 | |
383 | kbuf.buf_align = ELF_CORE_HEADER_ALIGN; |
384 | kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; |
385 | ret = kexec_add_buffer(kbuf: &kbuf); |
386 | if (ret) |
387 | return ret; |
388 | image->elf_load_addr = kbuf.mem; |
389 | pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n" , |
390 | image->elf_load_addr, kbuf.bufsz, kbuf.memsz); |
391 | |
392 | return ret; |
393 | } |
394 | #endif /* CONFIG_KEXEC_FILE */ |
395 | |
396 | #ifdef CONFIG_CRASH_HOTPLUG |
397 | |
398 | #undef pr_fmt |
399 | #define pr_fmt(fmt) "crash hp: " fmt |
400 | |
401 | /* These functions provide the value for the sysfs crash_hotplug nodes */ |
402 | #ifdef CONFIG_HOTPLUG_CPU |
/*
 * Value for the CPU crash_hotplug sysfs node: simply the result of
 * crash_check_update_elfcorehdr().
 */
int arch_crash_hotplug_cpu_support(void)
{
	int supported = crash_check_update_elfcorehdr();

	return supported;
}
407 | #endif |
408 | |
409 | #ifdef CONFIG_MEMORY_HOTPLUG |
/*
 * Value for the memory crash_hotplug sysfs node: simply the result of
 * crash_check_update_elfcorehdr().
 */
int arch_crash_hotplug_memory_support(void)
{
	int supported = crash_check_update_elfcorehdr();

	return supported;
}
414 | #endif |
415 | |
416 | unsigned int arch_crash_get_elfcorehdr_size(void) |
417 | { |
418 | unsigned int sz; |
419 | |
420 | /* kernel_map, VMCOREINFO and maximum CPUs */ |
421 | sz = 2 + CONFIG_NR_CPUS_DEFAULT; |
422 | if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) |
423 | sz += CONFIG_CRASH_MAX_MEMORY_RANGES; |
424 | sz *= sizeof(Elf64_Phdr); |
425 | return sz; |
426 | } |
427 | |
428 | /** |
429 | * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes |
430 | * @image: a pointer to kexec_crash_image |
431 | * |
432 | * Prepare the new elfcorehdr and replace the existing elfcorehdr. |
433 | */ |
434 | void arch_crash_handle_hotplug_event(struct kimage *image) |
435 | { |
436 | void *elfbuf = NULL, *old_elfcorehdr; |
437 | unsigned long nr_mem_ranges; |
438 | unsigned long mem, memsz; |
439 | unsigned long elfsz = 0; |
440 | |
441 | /* |
442 | * As crash_prepare_elf64_headers() has already described all |
443 | * possible CPUs, there is no need to update the elfcorehdr |
444 | * for additional CPU changes. |
445 | */ |
446 | if ((image->file_mode || image->elfcorehdr_updated) && |
447 | ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) || |
448 | (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU))) |
449 | return; |
450 | |
451 | /* |
452 | * Create the new elfcorehdr reflecting the changes to CPU and/or |
453 | * memory resources. |
454 | */ |
455 | if (prepare_elf_headers(image, addr: &elfbuf, sz: &elfsz, nr_mem_ranges: &nr_mem_ranges)) { |
456 | pr_err("unable to create new elfcorehdr" ); |
457 | goto out; |
458 | } |
459 | |
460 | /* |
461 | * Obtain address and size of the elfcorehdr segment, and |
462 | * check it against the new elfcorehdr buffer. |
463 | */ |
464 | mem = image->segment[image->elfcorehdr_index].mem; |
465 | memsz = image->segment[image->elfcorehdr_index].memsz; |
466 | if (elfsz > memsz) { |
467 | pr_err("update elfcorehdr elfsz %lu > memsz %lu" , |
468 | elfsz, memsz); |
469 | goto out; |
470 | } |
471 | |
472 | /* |
473 | * Copy new elfcorehdr over the old elfcorehdr at destination. |
474 | */ |
475 | old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT)); |
476 | if (!old_elfcorehdr) { |
477 | pr_err("mapping elfcorehdr segment failed\n" ); |
478 | goto out; |
479 | } |
480 | |
481 | /* |
482 | * Temporarily invalidate the crash image while the |
483 | * elfcorehdr is updated. |
484 | */ |
485 | xchg(&kexec_crash_image, NULL); |
486 | memcpy_flushcache(dst: old_elfcorehdr, src: elfbuf, cnt: elfsz); |
487 | xchg(&kexec_crash_image, image); |
488 | kunmap_local(old_elfcorehdr); |
489 | pr_debug("updated elfcorehdr\n" ); |
490 | |
491 | out: |
492 | vfree(addr: elfbuf); |
493 | } |
494 | #endif |
495 | |