1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * handle transition of Linux booting another kernel |
4 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
5 | */ |
6 | |
7 | #define pr_fmt(fmt) "kexec: " fmt |
8 | |
9 | #include <linux/mm.h> |
10 | #include <linux/kexec.h> |
11 | #include <linux/string.h> |
12 | #include <linux/gfp.h> |
13 | #include <linux/reboot.h> |
14 | #include <linux/numa.h> |
15 | #include <linux/ftrace.h> |
16 | #include <linux/io.h> |
17 | #include <linux/suspend.h> |
18 | #include <linux/vmalloc.h> |
19 | #include <linux/efi.h> |
20 | #include <linux/cc_platform.h> |
21 | |
22 | #include <asm/init.h> |
23 | #include <asm/tlbflush.h> |
24 | #include <asm/mmu_context.h> |
25 | #include <asm/io_apic.h> |
26 | #include <asm/debugreg.h> |
27 | #include <asm/kexec-bzimage64.h> |
28 | #include <asm/setup.h> |
29 | #include <asm/set_memory.h> |
30 | #include <asm/cpu.h> |
31 | |
32 | #ifdef CONFIG_ACPI |
33 | /* |
34 | * Used while adding mapping for ACPI tables. |
35 | * Can be reused when other iomem regions need be mapped |
36 | */ |
37 | struct init_pgtable_data { |
38 | struct x86_mapping_info *info; |
39 | pgd_t *level4p; |
40 | }; |
41 | |
42 | static int mem_region_callback(struct resource *res, void *arg) |
43 | { |
44 | struct init_pgtable_data *data = arg; |
45 | |
46 | return kernel_ident_mapping_init(info: data->info, pgd_page: data->level4p, |
47 | pstart: res->start, pend: res->end + 1); |
48 | } |
49 | |
50 | static int |
51 | map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) |
52 | { |
53 | struct init_pgtable_data data; |
54 | unsigned long flags; |
55 | int ret; |
56 | |
57 | data.info = info; |
58 | data.level4p = level4p; |
59 | flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
60 | |
61 | ret = walk_iomem_res_desc(desc: IORES_DESC_ACPI_TABLES, flags, start: 0, end: -1, |
62 | arg: &data, func: mem_region_callback); |
63 | if (ret && ret != -EINVAL) |
64 | return ret; |
65 | |
66 | /* ACPI tables could be located in ACPI Non-volatile Storage region */ |
67 | ret = walk_iomem_res_desc(desc: IORES_DESC_ACPI_NV_STORAGE, flags, start: 0, end: -1, |
68 | arg: &data, func: mem_region_callback); |
69 | if (ret && ret != -EINVAL) |
70 | return ret; |
71 | |
72 | return 0; |
73 | } |
74 | #else |
75 | static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } |
76 | #endif |
77 | |
78 | #ifdef CONFIG_KEXEC_FILE |
79 | const struct kexec_file_ops * const kexec_file_loaders[] = { |
80 | &kexec_bzImage64_ops, |
81 | NULL |
82 | }; |
83 | #endif |
84 | |
85 | static int |
86 | map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) |
87 | { |
88 | #ifdef CONFIG_EFI |
89 | unsigned long mstart, mend; |
90 | |
91 | if (!efi_enabled(EFI_BOOT)) |
92 | return 0; |
93 | |
94 | mstart = (boot_params.efi_info.efi_systab | |
95 | ((u64)boot_params.efi_info.efi_systab_hi<<32)); |
96 | |
97 | if (efi_enabled(EFI_64BIT)) |
98 | mend = mstart + sizeof(efi_system_table_64_t); |
99 | else |
100 | mend = mstart + sizeof(efi_system_table_32_t); |
101 | |
102 | if (!mstart) |
103 | return 0; |
104 | |
105 | return kernel_ident_mapping_init(info, pgd_page: level4p, pstart: mstart, pend: mend); |
106 | #endif |
107 | return 0; |
108 | } |
109 | |
110 | static void free_transition_pgtable(struct kimage *image) |
111 | { |
112 | free_page((unsigned long)image->arch.p4d); |
113 | image->arch.p4d = NULL; |
114 | free_page((unsigned long)image->arch.pud); |
115 | image->arch.pud = NULL; |
116 | free_page((unsigned long)image->arch.pmd); |
117 | image->arch.pmd = NULL; |
118 | free_page((unsigned long)image->arch.pte); |
119 | image->arch.pte = NULL; |
120 | } |
121 | |
122 | static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) |
123 | { |
124 | pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; |
125 | unsigned long vaddr, paddr; |
126 | int result = -ENOMEM; |
127 | p4d_t *p4d; |
128 | pud_t *pud; |
129 | pmd_t *pmd; |
130 | pte_t *pte; |
131 | |
132 | vaddr = (unsigned long)relocate_kernel; |
133 | paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); |
134 | pgd += pgd_index(vaddr); |
135 | if (!pgd_present(pgd: *pgd)) { |
136 | p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); |
137 | if (!p4d) |
138 | goto err; |
139 | image->arch.p4d = p4d; |
140 | set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); |
141 | } |
142 | p4d = p4d_offset(pgd, address: vaddr); |
143 | if (!p4d_present(p4d: *p4d)) { |
144 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL); |
145 | if (!pud) |
146 | goto err; |
147 | image->arch.pud = pud; |
148 | set_p4d(p4dp: p4d, p4d: __p4d(__pa(pud) | _KERNPG_TABLE)); |
149 | } |
150 | pud = pud_offset(p4d, address: vaddr); |
151 | if (!pud_present(pud: *pud)) { |
152 | pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
153 | if (!pmd) |
154 | goto err; |
155 | image->arch.pmd = pmd; |
156 | set_pud(pudp: pud, pud: __pud(__pa(pmd) | _KERNPG_TABLE)); |
157 | } |
158 | pmd = pmd_offset(pud, address: vaddr); |
159 | if (!pmd_present(pmd: *pmd)) { |
160 | pte = (pte_t *)get_zeroed_page(GFP_KERNEL); |
161 | if (!pte) |
162 | goto err; |
163 | image->arch.pte = pte; |
164 | set_pmd(pmdp: pmd, pmd: __pmd(__pa(pte) | _KERNPG_TABLE)); |
165 | } |
166 | pte = pte_offset_kernel(pmd, address: vaddr); |
167 | |
168 | if (cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT)) |
169 | prot = PAGE_KERNEL_EXEC; |
170 | |
171 | set_pte(ptep: pte, pte: pfn_pte(page_nr: paddr >> PAGE_SHIFT, pgprot: prot)); |
172 | return 0; |
173 | err: |
174 | return result; |
175 | } |
176 | |
177 | static void *alloc_pgt_page(void *data) |
178 | { |
179 | struct kimage *image = (struct kimage *)data; |
180 | struct page *page; |
181 | void *p = NULL; |
182 | |
183 | page = kimage_alloc_control_pages(image, order: 0); |
184 | if (page) { |
185 | p = page_address(page); |
186 | clear_page(page: p); |
187 | } |
188 | |
189 | return p; |
190 | } |
191 | |
192 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) |
193 | { |
194 | struct x86_mapping_info info = { |
195 | .alloc_pgt_page = alloc_pgt_page, |
196 | .context = image, |
197 | .page_flag = __PAGE_KERNEL_LARGE_EXEC, |
198 | .kernpg_flag = _KERNPG_TABLE_NOENC, |
199 | }; |
200 | unsigned long mstart, mend; |
201 | pgd_t *level4p; |
202 | int result; |
203 | int i; |
204 | |
205 | level4p = (pgd_t *)__va(start_pgtable); |
206 | clear_page(page: level4p); |
207 | |
208 | if (cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT)) { |
209 | info.page_flag |= _PAGE_ENC; |
210 | info.kernpg_flag |= _PAGE_ENC; |
211 | } |
212 | |
213 | if (direct_gbpages) |
214 | info.direct_gbpages = true; |
215 | |
216 | for (i = 0; i < nr_pfn_mapped; i++) { |
217 | mstart = pfn_mapped[i].start << PAGE_SHIFT; |
218 | mend = pfn_mapped[i].end << PAGE_SHIFT; |
219 | |
220 | result = kernel_ident_mapping_init(info: &info, |
221 | pgd_page: level4p, pstart: mstart, pend: mend); |
222 | if (result) |
223 | return result; |
224 | } |
225 | |
226 | /* |
227 | * segments's mem ranges could be outside 0 ~ max_pfn, |
228 | * for example when jump back to original kernel from kexeced kernel. |
229 | * or first kernel is booted with user mem map, and second kernel |
230 | * could be loaded out of that range. |
231 | */ |
232 | for (i = 0; i < image->nr_segments; i++) { |
233 | mstart = image->segment[i].mem; |
234 | mend = mstart + image->segment[i].memsz; |
235 | |
236 | result = kernel_ident_mapping_init(info: &info, |
237 | pgd_page: level4p, pstart: mstart, pend: mend); |
238 | |
239 | if (result) |
240 | return result; |
241 | } |
242 | |
243 | /* |
244 | * Prepare EFI systab and ACPI tables for kexec kernel since they are |
245 | * not covered by pfn_mapped. |
246 | */ |
247 | result = map_efi_systab(info: &info, level4p); |
248 | if (result) |
249 | return result; |
250 | |
251 | result = map_acpi_tables(info: &info, level4p); |
252 | if (result) |
253 | return result; |
254 | |
255 | return init_transition_pgtable(image, pgd: level4p); |
256 | } |
257 | |
258 | static void load_segments(void) |
259 | { |
260 | __asm__ __volatile__ ( |
261 | "\tmovl %0,%%ds\n" |
262 | "\tmovl %0,%%es\n" |
263 | "\tmovl %0,%%ss\n" |
264 | "\tmovl %0,%%fs\n" |
265 | "\tmovl %0,%%gs\n" |
266 | : : "a" (__KERNEL_DS) : "memory" |
267 | ); |
268 | } |
269 | |
270 | int machine_kexec_prepare(struct kimage *image) |
271 | { |
272 | unsigned long start_pgtable; |
273 | int result; |
274 | |
275 | /* Calculate the offsets */ |
276 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; |
277 | |
278 | /* Setup the identity mapped 64bit page table */ |
279 | result = init_pgtable(image, start_pgtable); |
280 | if (result) |
281 | return result; |
282 | |
283 | return 0; |
284 | } |
285 | |
/*
 * Architecture hook that undoes machine_kexec_prepare(): releases the
 * transition page-table pages recorded in @image.
 */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}
290 | |
291 | /* |
292 | * Do not allocate memory (or fail in any way) in machine_kexec(). |
293 | * We are past the point of no return, committed to rebooting now. |
294 | */ |
295 | void machine_kexec(struct kimage *image) |
296 | { |
297 | unsigned long page_list[PAGES_NR]; |
298 | void *control_page; |
299 | int save_ftrace_enabled; |
300 | |
301 | #ifdef CONFIG_KEXEC_JUMP |
302 | if (image->preserve_context) |
303 | save_processor_state(); |
304 | #endif |
305 | |
306 | save_ftrace_enabled = __ftrace_enabled_save(); |
307 | |
308 | /* Interrupts aren't acceptable while we reboot */ |
309 | local_irq_disable(); |
310 | hw_breakpoint_disable(); |
311 | cet_disable(); |
312 | |
313 | if (image->preserve_context) { |
314 | #ifdef CONFIG_X86_IO_APIC |
315 | /* |
316 | * We need to put APICs in legacy mode so that we can |
317 | * get timer interrupts in second kernel. kexec/kdump |
318 | * paths already have calls to restore_boot_irq_mode() |
319 | * in one form or other. kexec jump path also need one. |
320 | */ |
321 | clear_IO_APIC(); |
322 | restore_boot_irq_mode(); |
323 | #endif |
324 | } |
325 | |
326 | control_page = page_address(image->control_code_page) + PAGE_SIZE; |
327 | __memcpy(to: control_page, from: relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
328 | |
329 | page_list[PA_CONTROL_PAGE] = virt_to_phys(address: control_page); |
330 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
331 | page_list[PA_TABLE_PAGE] = |
332 | (unsigned long)__pa(page_address(image->control_code_page)); |
333 | |
334 | if (image->type == KEXEC_TYPE_DEFAULT) |
335 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) |
336 | << PAGE_SHIFT); |
337 | |
338 | /* |
339 | * The segment registers are funny things, they have both a |
340 | * visible and an invisible part. Whenever the visible part is |
341 | * set to a specific selector, the invisible part is loaded |
342 | * with from a table in memory. At no other time is the |
343 | * descriptor table in memory accessed. |
344 | * |
345 | * I take advantage of this here by force loading the |
346 | * segments, before I zap the gdt with an invalid value. |
347 | */ |
348 | load_segments(); |
349 | /* |
350 | * The gdt & idt are now invalid. |
351 | * If you want to load them you must set up your own idt & gdt. |
352 | */ |
353 | native_idt_invalidate(); |
354 | native_gdt_invalidate(); |
355 | |
356 | /* now call it */ |
357 | image->start = relocate_kernel(indirection_page: (unsigned long)image->head, |
358 | page_list: (unsigned long)page_list, |
359 | start_address: image->start, |
360 | preserve_context: image->preserve_context, |
361 | host_mem_enc_active: cc_platform_has(attr: CC_ATTR_HOST_MEM_ENCRYPT)); |
362 | |
363 | #ifdef CONFIG_KEXEC_JUMP |
364 | if (image->preserve_context) |
365 | restore_processor_state(); |
366 | #endif |
367 | |
368 | __ftrace_enabled_restore(enabled: save_ftrace_enabled); |
369 | } |
370 | |
371 | /* arch-dependent functionality related to kexec file-based syscall */ |
372 | |
373 | #ifdef CONFIG_KEXEC_FILE |
374 | /* |
375 | * Apply purgatory relocations. |
376 | * |
377 | * @pi: Purgatory to be relocated. |
378 | * @section: Section relocations applying to. |
379 | * @relsec: Section containing RELAs. |
380 | * @symtabsec: Corresponding symtab. |
381 | * |
382 | * TODO: Some of the code belongs to generic code. Move that in kexec.c. |
383 | */ |
384 | int arch_kexec_apply_relocations_add(struct purgatory_info *pi, |
385 | Elf_Shdr *section, const Elf_Shdr *relsec, |
386 | const Elf_Shdr *symtabsec) |
387 | { |
388 | unsigned int i; |
389 | Elf64_Rela *rel; |
390 | Elf64_Sym *sym; |
391 | void *location; |
392 | unsigned long address, sec_base, value; |
393 | const char *strtab, *name, *shstrtab; |
394 | const Elf_Shdr *sechdrs; |
395 | |
396 | /* String & section header string table */ |
397 | sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; |
398 | strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; |
399 | shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; |
400 | |
401 | rel = (void *)pi->ehdr + relsec->sh_offset; |
402 | |
403 | pr_debug("Applying relocate section %s to %u\n" , |
404 | shstrtab + relsec->sh_name, relsec->sh_info); |
405 | |
406 | for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { |
407 | |
408 | /* |
409 | * rel[i].r_offset contains byte offset from beginning |
410 | * of section to the storage unit affected. |
411 | * |
412 | * This is location to update. This is temporary buffer |
413 | * where section is currently loaded. This will finally be |
414 | * loaded to a different address later, pointed to by |
415 | * ->sh_addr. kexec takes care of moving it |
416 | * (kexec_load_segment()). |
417 | */ |
418 | location = pi->purgatory_buf; |
419 | location += section->sh_offset; |
420 | location += rel[i].r_offset; |
421 | |
422 | /* Final address of the location */ |
423 | address = section->sh_addr + rel[i].r_offset; |
424 | |
425 | /* |
426 | * rel[i].r_info contains information about symbol table index |
427 | * w.r.t which relocation must be made and type of relocation |
428 | * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get |
429 | * these respectively. |
430 | */ |
431 | sym = (void *)pi->ehdr + symtabsec->sh_offset; |
432 | sym += ELF64_R_SYM(rel[i].r_info); |
433 | |
434 | if (sym->st_name) |
435 | name = strtab + sym->st_name; |
436 | else |
437 | name = shstrtab + sechdrs[sym->st_shndx].sh_name; |
438 | |
439 | pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n" , |
440 | name, sym->st_info, sym->st_shndx, sym->st_value, |
441 | sym->st_size); |
442 | |
443 | if (sym->st_shndx == SHN_UNDEF) { |
444 | pr_err("Undefined symbol: %s\n" , name); |
445 | return -ENOEXEC; |
446 | } |
447 | |
448 | if (sym->st_shndx == SHN_COMMON) { |
449 | pr_err("symbol '%s' in common section\n" , name); |
450 | return -ENOEXEC; |
451 | } |
452 | |
453 | if (sym->st_shndx == SHN_ABS) |
454 | sec_base = 0; |
455 | else if (sym->st_shndx >= pi->ehdr->e_shnum) { |
456 | pr_err("Invalid section %d for symbol %s\n" , |
457 | sym->st_shndx, name); |
458 | return -ENOEXEC; |
459 | } else |
460 | sec_base = pi->sechdrs[sym->st_shndx].sh_addr; |
461 | |
462 | value = sym->st_value; |
463 | value += sec_base; |
464 | value += rel[i].r_addend; |
465 | |
466 | switch (ELF64_R_TYPE(rel[i].r_info)) { |
467 | case R_X86_64_NONE: |
468 | break; |
469 | case R_X86_64_64: |
470 | *(u64 *)location = value; |
471 | break; |
472 | case R_X86_64_32: |
473 | *(u32 *)location = value; |
474 | if (value != *(u32 *)location) |
475 | goto overflow; |
476 | break; |
477 | case R_X86_64_32S: |
478 | *(s32 *)location = value; |
479 | if ((s64)value != *(s32 *)location) |
480 | goto overflow; |
481 | break; |
482 | case R_X86_64_PC32: |
483 | case R_X86_64_PLT32: |
484 | value -= (u64)address; |
485 | *(u32 *)location = value; |
486 | break; |
487 | default: |
488 | pr_err("Unknown rela relocation: %llu\n" , |
489 | ELF64_R_TYPE(rel[i].r_info)); |
490 | return -ENOEXEC; |
491 | } |
492 | } |
493 | return 0; |
494 | |
495 | overflow: |
496 | pr_err("Overflow in relocation type %d value 0x%lx\n" , |
497 | (int)ELF64_R_TYPE(rel[i].r_info), value); |
498 | return -ENOEXEC; |
499 | } |
500 | |
501 | int arch_kimage_file_post_load_cleanup(struct kimage *image) |
502 | { |
503 | vfree(addr: image->elf_headers); |
504 | image->elf_headers = NULL; |
505 | image->elf_headers_sz = 0; |
506 | |
507 | return kexec_image_post_load_cleanup_default(image); |
508 | } |
509 | #endif /* CONFIG_KEXEC_FILE */ |
510 | |
511 | #ifdef CONFIG_CRASH_DUMP |
512 | |
513 | static int |
514 | kexec_mark_range(unsigned long start, unsigned long end, bool protect) |
515 | { |
516 | struct page *page; |
517 | unsigned int nr_pages; |
518 | |
519 | /* |
520 | * For physical range: [start, end]. We must skip the unassigned |
521 | * crashk resource with zero-valued "end" member. |
522 | */ |
523 | if (!end || start > end) |
524 | return 0; |
525 | |
526 | page = pfn_to_page(start >> PAGE_SHIFT); |
527 | nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; |
528 | if (protect) |
529 | return set_pages_ro(page, numpages: nr_pages); |
530 | else |
531 | return set_pages_rw(page, numpages: nr_pages); |
532 | } |
533 | |
534 | static void kexec_mark_crashkres(bool protect) |
535 | { |
536 | unsigned long control; |
537 | |
538 | kexec_mark_range(start: crashk_low_res.start, end: crashk_low_res.end, protect); |
539 | |
540 | /* Don't touch the control code page used in crash_kexec().*/ |
541 | control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); |
542 | /* Control code page is located in the 2nd page. */ |
543 | kexec_mark_range(start: crashk_res.start, end: control + PAGE_SIZE - 1, protect); |
544 | control += KEXEC_CONTROL_PAGE_SIZE; |
545 | kexec_mark_range(start: control, end: crashk_res.end, protect); |
546 | } |
547 | |
548 | void arch_kexec_protect_crashkres(void) |
549 | { |
550 | kexec_mark_crashkres(protect: true); |
551 | } |
552 | |
553 | void arch_kexec_unprotect_crashkres(void) |
554 | { |
555 | kexec_mark_crashkres(protect: false); |
556 | } |
557 | #endif |
558 | |
559 | /* |
560 | * During a traditional boot under SME, SME will encrypt the kernel, |
561 | * so the SME kexec kernel also needs to be un-encrypted in order to |
562 | * replicate a normal SME boot. |
563 | * |
564 | * During a traditional boot under SEV, the kernel has already been |
565 | * loaded encrypted, so the SEV kexec kernel needs to be encrypted in |
566 | * order to replicate a normal SEV boot. |
567 | */ |
568 | int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) |
569 | { |
570 | if (!cc_platform_has(attr: CC_ATTR_HOST_MEM_ENCRYPT)) |
571 | return 0; |
572 | |
573 | /* |
574 | * If host memory encryption is active we need to be sure that kexec |
575 | * pages are not encrypted because when we boot to the new kernel the |
576 | * pages won't be accessed encrypted (initially). |
577 | */ |
578 | return set_memory_decrypted(addr: (unsigned long)vaddr, numpages: pages); |
579 | } |
580 | |
581 | void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) |
582 | { |
583 | if (!cc_platform_has(attr: CC_ATTR_HOST_MEM_ENCRYPT)) |
584 | return; |
585 | |
586 | /* |
587 | * If host memory encryption is active we need to reset the pages back |
588 | * to being an encrypted mapping before freeing them. |
589 | */ |
590 | set_memory_encrypted(addr: (unsigned long)vaddr, numpages: pages); |
591 | } |
592 | |