1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * handle transition of Linux booting another kernel |
4 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
5 | */ |
6 | |
7 | #define pr_fmt(fmt) "kexec: " fmt |
8 | |
9 | #include <linux/mm.h> |
10 | #include <linux/kexec.h> |
11 | #include <linux/string.h> |
12 | #include <linux/gfp.h> |
13 | #include <linux/reboot.h> |
14 | #include <linux/numa.h> |
15 | #include <linux/ftrace.h> |
16 | #include <linux/io.h> |
17 | #include <linux/suspend.h> |
18 | #include <linux/vmalloc.h> |
19 | #include <linux/efi.h> |
20 | #include <linux/cc_platform.h> |
21 | |
22 | #include <asm/init.h> |
23 | #include <asm/tlbflush.h> |
24 | #include <asm/mmu_context.h> |
25 | #include <asm/io_apic.h> |
26 | #include <asm/debugreg.h> |
27 | #include <asm/kexec-bzimage64.h> |
28 | #include <asm/setup.h> |
29 | #include <asm/set_memory.h> |
30 | #include <asm/cpu.h> |
31 | |
32 | #ifdef CONFIG_ACPI |
33 | /* |
34 | * Used while adding mapping for ACPI tables. |
35 | * Can be reused when other iomem regions need be mapped |
36 | */ |
37 | struct init_pgtable_data { |
38 | struct x86_mapping_info *info; |
39 | pgd_t *level4p; |
40 | }; |
41 | |
42 | static int mem_region_callback(struct resource *res, void *arg) |
43 | { |
44 | struct init_pgtable_data *data = arg; |
45 | unsigned long mstart, mend; |
46 | |
47 | mstart = res->start; |
48 | mend = mstart + resource_size(res) - 1; |
49 | |
50 | return kernel_ident_mapping_init(info: data->info, pgd_page: data->level4p, pstart: mstart, pend: mend); |
51 | } |
52 | |
53 | static int |
54 | map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) |
55 | { |
56 | struct init_pgtable_data data; |
57 | unsigned long flags; |
58 | int ret; |
59 | |
60 | data.info = info; |
61 | data.level4p = level4p; |
62 | flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
63 | |
64 | ret = walk_iomem_res_desc(desc: IORES_DESC_ACPI_TABLES, flags, start: 0, end: -1, |
65 | arg: &data, func: mem_region_callback); |
66 | if (ret && ret != -EINVAL) |
67 | return ret; |
68 | |
69 | /* ACPI tables could be located in ACPI Non-volatile Storage region */ |
70 | ret = walk_iomem_res_desc(desc: IORES_DESC_ACPI_NV_STORAGE, flags, start: 0, end: -1, |
71 | arg: &data, func: mem_region_callback); |
72 | if (ret && ret != -EINVAL) |
73 | return ret; |
74 | |
75 | return 0; |
76 | } |
77 | #else |
78 | static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } |
79 | #endif |
80 | |
81 | #ifdef CONFIG_KEXEC_FILE |
82 | const struct kexec_file_ops * const kexec_file_loaders[] = { |
83 | &kexec_bzImage64_ops, |
84 | NULL |
85 | }; |
86 | #endif |
87 | |
88 | static int |
89 | map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) |
90 | { |
91 | #ifdef CONFIG_EFI |
92 | unsigned long mstart, mend; |
93 | |
94 | if (!efi_enabled(EFI_BOOT)) |
95 | return 0; |
96 | |
97 | mstart = (boot_params.efi_info.efi_systab | |
98 | ((u64)boot_params.efi_info.efi_systab_hi<<32)); |
99 | |
100 | if (efi_enabled(EFI_64BIT)) |
101 | mend = mstart + sizeof(efi_system_table_64_t); |
102 | else |
103 | mend = mstart + sizeof(efi_system_table_32_t); |
104 | |
105 | if (!mstart) |
106 | return 0; |
107 | |
108 | return kernel_ident_mapping_init(info, pgd_page: level4p, pstart: mstart, pend: mend); |
109 | #endif |
110 | return 0; |
111 | } |
112 | |
113 | static void free_transition_pgtable(struct kimage *image) |
114 | { |
115 | free_page((unsigned long)image->arch.p4d); |
116 | image->arch.p4d = NULL; |
117 | free_page((unsigned long)image->arch.pud); |
118 | image->arch.pud = NULL; |
119 | free_page((unsigned long)image->arch.pmd); |
120 | image->arch.pmd = NULL; |
121 | free_page((unsigned long)image->arch.pte); |
122 | image->arch.pte = NULL; |
123 | } |
124 | |
125 | static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) |
126 | { |
127 | pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; |
128 | unsigned long vaddr, paddr; |
129 | int result = -ENOMEM; |
130 | p4d_t *p4d; |
131 | pud_t *pud; |
132 | pmd_t *pmd; |
133 | pte_t *pte; |
134 | |
135 | vaddr = (unsigned long)relocate_kernel; |
136 | paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); |
137 | pgd += pgd_index(vaddr); |
138 | if (!pgd_present(pgd: *pgd)) { |
139 | p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); |
140 | if (!p4d) |
141 | goto err; |
142 | image->arch.p4d = p4d; |
143 | set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); |
144 | } |
145 | p4d = p4d_offset(pgd, address: vaddr); |
146 | if (!p4d_present(p4d: *p4d)) { |
147 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL); |
148 | if (!pud) |
149 | goto err; |
150 | image->arch.pud = pud; |
151 | set_p4d(p4dp: p4d, p4d: __p4d(__pa(pud) | _KERNPG_TABLE)); |
152 | } |
153 | pud = pud_offset(p4d, address: vaddr); |
154 | if (!pud_present(pud: *pud)) { |
155 | pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
156 | if (!pmd) |
157 | goto err; |
158 | image->arch.pmd = pmd; |
159 | set_pud(pudp: pud, pud: __pud(__pa(pmd) | _KERNPG_TABLE)); |
160 | } |
161 | pmd = pmd_offset(pud, address: vaddr); |
162 | if (!pmd_present(pmd: *pmd)) { |
163 | pte = (pte_t *)get_zeroed_page(GFP_KERNEL); |
164 | if (!pte) |
165 | goto err; |
166 | image->arch.pte = pte; |
167 | set_pmd(pmdp: pmd, pmd: __pmd(__pa(pte) | _KERNPG_TABLE)); |
168 | } |
169 | pte = pte_offset_kernel(pmd, address: vaddr); |
170 | |
171 | if (cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT)) |
172 | prot = PAGE_KERNEL_EXEC; |
173 | |
174 | set_pte(ptep: pte, pte: pfn_pte(page_nr: paddr >> PAGE_SHIFT, pgprot: prot)); |
175 | return 0; |
176 | err: |
177 | return result; |
178 | } |
179 | |
180 | static void *alloc_pgt_page(void *data) |
181 | { |
182 | struct kimage *image = (struct kimage *)data; |
183 | struct page *page; |
184 | void *p = NULL; |
185 | |
186 | page = kimage_alloc_control_pages(image, order: 0); |
187 | if (page) { |
188 | p = page_address(page); |
189 | clear_page(page: p); |
190 | } |
191 | |
192 | return p; |
193 | } |
194 | |
195 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) |
196 | { |
197 | struct x86_mapping_info info = { |
198 | .alloc_pgt_page = alloc_pgt_page, |
199 | .context = image, |
200 | .page_flag = __PAGE_KERNEL_LARGE_EXEC, |
201 | .kernpg_flag = _KERNPG_TABLE_NOENC, |
202 | }; |
203 | unsigned long mstart, mend; |
204 | pgd_t *level4p; |
205 | int result; |
206 | int i; |
207 | |
208 | level4p = (pgd_t *)__va(start_pgtable); |
209 | clear_page(page: level4p); |
210 | |
211 | if (cc_platform_has(attr: CC_ATTR_GUEST_MEM_ENCRYPT)) { |
212 | info.page_flag |= _PAGE_ENC; |
213 | info.kernpg_flag |= _PAGE_ENC; |
214 | } |
215 | |
216 | if (direct_gbpages) |
217 | info.direct_gbpages = true; |
218 | |
219 | for (i = 0; i < nr_pfn_mapped; i++) { |
220 | mstart = pfn_mapped[i].start << PAGE_SHIFT; |
221 | mend = pfn_mapped[i].end << PAGE_SHIFT; |
222 | |
223 | result = kernel_ident_mapping_init(info: &info, |
224 | pgd_page: level4p, pstart: mstart, pend: mend); |
225 | if (result) |
226 | return result; |
227 | } |
228 | |
229 | /* |
230 | * segments's mem ranges could be outside 0 ~ max_pfn, |
231 | * for example when jump back to original kernel from kexeced kernel. |
232 | * or first kernel is booted with user mem map, and second kernel |
233 | * could be loaded out of that range. |
234 | */ |
235 | for (i = 0; i < image->nr_segments; i++) { |
236 | mstart = image->segment[i].mem; |
237 | mend = mstart + image->segment[i].memsz; |
238 | |
239 | result = kernel_ident_mapping_init(info: &info, |
240 | pgd_page: level4p, pstart: mstart, pend: mend); |
241 | |
242 | if (result) |
243 | return result; |
244 | } |
245 | |
246 | /* |
247 | * Prepare EFI systab and ACPI tables for kexec kernel since they are |
248 | * not covered by pfn_mapped. |
249 | */ |
250 | result = map_efi_systab(info: &info, level4p); |
251 | if (result) |
252 | return result; |
253 | |
254 | result = map_acpi_tables(info: &info, level4p); |
255 | if (result) |
256 | return result; |
257 | |
258 | return init_transition_pgtable(image, pgd: level4p); |
259 | } |
260 | |
261 | static void load_segments(void) |
262 | { |
263 | __asm__ __volatile__ ( |
264 | "\tmovl %0,%%ds\n" |
265 | "\tmovl %0,%%es\n" |
266 | "\tmovl %0,%%ss\n" |
267 | "\tmovl %0,%%fs\n" |
268 | "\tmovl %0,%%gs\n" |
269 | : : "a" (__KERNEL_DS) : "memory" |
270 | ); |
271 | } |
272 | |
273 | int machine_kexec_prepare(struct kimage *image) |
274 | { |
275 | unsigned long start_pgtable; |
276 | int result; |
277 | |
278 | /* Calculate the offsets */ |
279 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; |
280 | |
281 | /* Setup the identity mapped 64bit page table */ |
282 | result = init_pgtable(image, start_pgtable); |
283 | if (result) |
284 | return result; |
285 | |
286 | return 0; |
287 | } |
288 | |
/* Undo machine_kexec_prepare(): free the transition page table pages. */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}
293 | |
294 | /* |
295 | * Do not allocate memory (or fail in any way) in machine_kexec(). |
296 | * We are past the point of no return, committed to rebooting now. |
297 | */ |
298 | void machine_kexec(struct kimage *image) |
299 | { |
300 | unsigned long page_list[PAGES_NR]; |
301 | void *control_page; |
302 | int save_ftrace_enabled; |
303 | |
304 | #ifdef CONFIG_KEXEC_JUMP |
305 | if (image->preserve_context) |
306 | save_processor_state(); |
307 | #endif |
308 | |
309 | save_ftrace_enabled = __ftrace_enabled_save(); |
310 | |
311 | /* Interrupts aren't acceptable while we reboot */ |
312 | local_irq_disable(); |
313 | hw_breakpoint_disable(); |
314 | cet_disable(); |
315 | |
316 | if (image->preserve_context) { |
317 | #ifdef CONFIG_X86_IO_APIC |
318 | /* |
319 | * We need to put APICs in legacy mode so that we can |
320 | * get timer interrupts in second kernel. kexec/kdump |
321 | * paths already have calls to restore_boot_irq_mode() |
322 | * in one form or other. kexec jump path also need one. |
323 | */ |
324 | clear_IO_APIC(); |
325 | restore_boot_irq_mode(); |
326 | #endif |
327 | } |
328 | |
329 | control_page = page_address(image->control_code_page) + PAGE_SIZE; |
330 | __memcpy(to: control_page, from: relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
331 | |
332 | page_list[PA_CONTROL_PAGE] = virt_to_phys(address: control_page); |
333 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
334 | page_list[PA_TABLE_PAGE] = |
335 | (unsigned long)__pa(page_address(image->control_code_page)); |
336 | |
337 | if (image->type == KEXEC_TYPE_DEFAULT) |
338 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) |
339 | << PAGE_SHIFT); |
340 | |
341 | /* |
342 | * The segment registers are funny things, they have both a |
343 | * visible and an invisible part. Whenever the visible part is |
344 | * set to a specific selector, the invisible part is loaded |
345 | * with from a table in memory. At no other time is the |
346 | * descriptor table in memory accessed. |
347 | * |
348 | * I take advantage of this here by force loading the |
349 | * segments, before I zap the gdt with an invalid value. |
350 | */ |
351 | load_segments(); |
352 | /* |
353 | * The gdt & idt are now invalid. |
354 | * If you want to load them you must set up your own idt & gdt. |
355 | */ |
356 | native_idt_invalidate(); |
357 | native_gdt_invalidate(); |
358 | |
359 | /* now call it */ |
360 | image->start = relocate_kernel(indirection_page: (unsigned long)image->head, |
361 | page_list: (unsigned long)page_list, |
362 | start_address: image->start, |
363 | preserve_context: image->preserve_context, |
364 | host_mem_enc_active: cc_platform_has(attr: CC_ATTR_HOST_MEM_ENCRYPT)); |
365 | |
366 | #ifdef CONFIG_KEXEC_JUMP |
367 | if (image->preserve_context) |
368 | restore_processor_state(); |
369 | #endif |
370 | |
371 | __ftrace_enabled_restore(enabled: save_ftrace_enabled); |
372 | } |
373 | |
374 | /* arch-dependent functionality related to kexec file-based syscall */ |
375 | |
376 | #ifdef CONFIG_KEXEC_FILE |
377 | /* |
378 | * Apply purgatory relocations. |
379 | * |
380 | * @pi: Purgatory to be relocated. |
381 | * @section: Section relocations applying to. |
382 | * @relsec: Section containing RELAs. |
383 | * @symtabsec: Corresponding symtab. |
384 | * |
385 | * TODO: Some of the code belongs to generic code. Move that in kexec.c. |
386 | */ |
387 | int arch_kexec_apply_relocations_add(struct purgatory_info *pi, |
388 | Elf_Shdr *section, const Elf_Shdr *relsec, |
389 | const Elf_Shdr *symtabsec) |
390 | { |
391 | unsigned int i; |
392 | Elf64_Rela *rel; |
393 | Elf64_Sym *sym; |
394 | void *location; |
395 | unsigned long address, sec_base, value; |
396 | const char *strtab, *name, *shstrtab; |
397 | const Elf_Shdr *sechdrs; |
398 | |
399 | /* String & section header string table */ |
400 | sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; |
401 | strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; |
402 | shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; |
403 | |
404 | rel = (void *)pi->ehdr + relsec->sh_offset; |
405 | |
406 | pr_debug("Applying relocate section %s to %u\n" , |
407 | shstrtab + relsec->sh_name, relsec->sh_info); |
408 | |
409 | for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { |
410 | |
411 | /* |
412 | * rel[i].r_offset contains byte offset from beginning |
413 | * of section to the storage unit affected. |
414 | * |
415 | * This is location to update. This is temporary buffer |
416 | * where section is currently loaded. This will finally be |
417 | * loaded to a different address later, pointed to by |
418 | * ->sh_addr. kexec takes care of moving it |
419 | * (kexec_load_segment()). |
420 | */ |
421 | location = pi->purgatory_buf; |
422 | location += section->sh_offset; |
423 | location += rel[i].r_offset; |
424 | |
425 | /* Final address of the location */ |
426 | address = section->sh_addr + rel[i].r_offset; |
427 | |
428 | /* |
429 | * rel[i].r_info contains information about symbol table index |
430 | * w.r.t which relocation must be made and type of relocation |
431 | * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get |
432 | * these respectively. |
433 | */ |
434 | sym = (void *)pi->ehdr + symtabsec->sh_offset; |
435 | sym += ELF64_R_SYM(rel[i].r_info); |
436 | |
437 | if (sym->st_name) |
438 | name = strtab + sym->st_name; |
439 | else |
440 | name = shstrtab + sechdrs[sym->st_shndx].sh_name; |
441 | |
442 | pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n" , |
443 | name, sym->st_info, sym->st_shndx, sym->st_value, |
444 | sym->st_size); |
445 | |
446 | if (sym->st_shndx == SHN_UNDEF) { |
447 | pr_err("Undefined symbol: %s\n" , name); |
448 | return -ENOEXEC; |
449 | } |
450 | |
451 | if (sym->st_shndx == SHN_COMMON) { |
452 | pr_err("symbol '%s' in common section\n" , name); |
453 | return -ENOEXEC; |
454 | } |
455 | |
456 | if (sym->st_shndx == SHN_ABS) |
457 | sec_base = 0; |
458 | else if (sym->st_shndx >= pi->ehdr->e_shnum) { |
459 | pr_err("Invalid section %d for symbol %s\n" , |
460 | sym->st_shndx, name); |
461 | return -ENOEXEC; |
462 | } else |
463 | sec_base = pi->sechdrs[sym->st_shndx].sh_addr; |
464 | |
465 | value = sym->st_value; |
466 | value += sec_base; |
467 | value += rel[i].r_addend; |
468 | |
469 | switch (ELF64_R_TYPE(rel[i].r_info)) { |
470 | case R_X86_64_NONE: |
471 | break; |
472 | case R_X86_64_64: |
473 | *(u64 *)location = value; |
474 | break; |
475 | case R_X86_64_32: |
476 | *(u32 *)location = value; |
477 | if (value != *(u32 *)location) |
478 | goto overflow; |
479 | break; |
480 | case R_X86_64_32S: |
481 | *(s32 *)location = value; |
482 | if ((s64)value != *(s32 *)location) |
483 | goto overflow; |
484 | break; |
485 | case R_X86_64_PC32: |
486 | case R_X86_64_PLT32: |
487 | value -= (u64)address; |
488 | *(u32 *)location = value; |
489 | break; |
490 | default: |
491 | pr_err("Unknown rela relocation: %llu\n" , |
492 | ELF64_R_TYPE(rel[i].r_info)); |
493 | return -ENOEXEC; |
494 | } |
495 | } |
496 | return 0; |
497 | |
498 | overflow: |
499 | pr_err("Overflow in relocation type %d value 0x%lx\n" , |
500 | (int)ELF64_R_TYPE(rel[i].r_info), value); |
501 | return -ENOEXEC; |
502 | } |
503 | |
504 | int arch_kimage_file_post_load_cleanup(struct kimage *image) |
505 | { |
506 | vfree(addr: image->elf_headers); |
507 | image->elf_headers = NULL; |
508 | image->elf_headers_sz = 0; |
509 | |
510 | return kexec_image_post_load_cleanup_default(image); |
511 | } |
512 | #endif /* CONFIG_KEXEC_FILE */ |
513 | |
514 | static int |
515 | kexec_mark_range(unsigned long start, unsigned long end, bool protect) |
516 | { |
517 | struct page *page; |
518 | unsigned int nr_pages; |
519 | |
520 | /* |
521 | * For physical range: [start, end]. We must skip the unassigned |
522 | * crashk resource with zero-valued "end" member. |
523 | */ |
524 | if (!end || start > end) |
525 | return 0; |
526 | |
527 | page = pfn_to_page(start >> PAGE_SHIFT); |
528 | nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; |
529 | if (protect) |
530 | return set_pages_ro(page, numpages: nr_pages); |
531 | else |
532 | return set_pages_rw(page, numpages: nr_pages); |
533 | } |
534 | |
535 | static void kexec_mark_crashkres(bool protect) |
536 | { |
537 | unsigned long control; |
538 | |
539 | kexec_mark_range(start: crashk_low_res.start, end: crashk_low_res.end, protect); |
540 | |
541 | /* Don't touch the control code page used in crash_kexec().*/ |
542 | control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); |
543 | /* Control code page is located in the 2nd page. */ |
544 | kexec_mark_range(start: crashk_res.start, end: control + PAGE_SIZE - 1, protect); |
545 | control += KEXEC_CONTROL_PAGE_SIZE; |
546 | kexec_mark_range(start: control, end: crashk_res.end, protect); |
547 | } |
548 | |
549 | void arch_kexec_protect_crashkres(void) |
550 | { |
551 | kexec_mark_crashkres(protect: true); |
552 | } |
553 | |
554 | void arch_kexec_unprotect_crashkres(void) |
555 | { |
556 | kexec_mark_crashkres(protect: false); |
557 | } |
558 | |
559 | /* |
560 | * During a traditional boot under SME, SME will encrypt the kernel, |
561 | * so the SME kexec kernel also needs to be un-encrypted in order to |
562 | * replicate a normal SME boot. |
563 | * |
564 | * During a traditional boot under SEV, the kernel has already been |
565 | * loaded encrypted, so the SEV kexec kernel needs to be encrypted in |
566 | * order to replicate a normal SEV boot. |
567 | */ |
568 | int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) |
569 | { |
570 | if (!cc_platform_has(attr: CC_ATTR_HOST_MEM_ENCRYPT)) |
571 | return 0; |
572 | |
573 | /* |
574 | * If host memory encryption is active we need to be sure that kexec |
575 | * pages are not encrypted because when we boot to the new kernel the |
576 | * pages won't be accessed encrypted (initially). |
577 | */ |
578 | return set_memory_decrypted(addr: (unsigned long)vaddr, numpages: pages); |
579 | } |
580 | |
581 | void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) |
582 | { |
583 | if (!cc_platform_has(attr: CC_ATTR_HOST_MEM_ENCRYPT)) |
584 | return; |
585 | |
586 | /* |
587 | * If host memory encryption is active we need to reset the pages back |
588 | * to being an encrypted mapping before freeing them. |
589 | */ |
590 | set_memory_encrypted(addr: (unsigned long)vaddr, numpages: pages); |
591 | } |
592 | |