1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * handle transition of Linux booting another kernel |
4 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
5 | */ |
6 | |
7 | #include <linux/mm.h> |
8 | #include <linux/kexec.h> |
9 | #include <linux/delay.h> |
10 | #include <linux/numa.h> |
11 | #include <linux/ftrace.h> |
12 | #include <linux/suspend.h> |
13 | #include <linux/gfp.h> |
14 | #include <linux/io.h> |
15 | |
16 | #include <asm/pgalloc.h> |
17 | #include <asm/tlbflush.h> |
18 | #include <asm/mmu_context.h> |
19 | #include <asm/apic.h> |
20 | #include <asm/io_apic.h> |
21 | #include <asm/cpufeature.h> |
22 | #include <asm/desc.h> |
23 | #include <asm/set_memory.h> |
24 | #include <asm/debugreg.h> |
25 | |
/*
 * Force-load the segment registers with the kernel selectors.
 *
 * A far jump to the very next instruction reloads %cs with __KERNEL_CS;
 * %ds, %es and %ss are then loaded with __KERNEL_DS by way of %eax.
 */
static void load_segments(void)
{
/* Two-level expansion so the selector macro is expanded before '#'. */
#define KEXEC_SEL_STR_(sel)	#sel
#define KEXEC_SEL_STR(sel)	KEXEC_SEL_STR_(sel)

	__asm__ __volatile__ (
		"\tljmp $" KEXEC_SEL_STR(__KERNEL_CS) ",$1f\n"
		"\t1:\n"
		"\tmovl $" KEXEC_SEL_STR(__KERNEL_DS) ",%%eax\n"
		"\tmovl %%eax,%%ds\n"
		"\tmovl %%eax,%%es\n"
		"\tmovl %%eax,%%ss\n"
		: /* no outputs */
		: /* no inputs */
		: "eax", "memory");

#undef KEXEC_SEL_STR
#undef KEXEC_SEL_STR_
}
42 | |
43 | static void machine_kexec_free_page_tables(struct kimage *image) |
44 | { |
45 | free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); |
46 | image->arch.pgd = NULL; |
47 | #ifdef CONFIG_X86_PAE |
48 | free_page((unsigned long)image->arch.pmd0); |
49 | image->arch.pmd0 = NULL; |
50 | free_page((unsigned long)image->arch.pmd1); |
51 | image->arch.pmd1 = NULL; |
52 | #endif |
53 | free_page((unsigned long)image->arch.pte0); |
54 | image->arch.pte0 = NULL; |
55 | free_page((unsigned long)image->arch.pte1); |
56 | image->arch.pte1 = NULL; |
57 | } |
58 | |
59 | static int machine_kexec_alloc_page_tables(struct kimage *image) |
60 | { |
61 | image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, |
62 | PGD_ALLOCATION_ORDER); |
63 | #ifdef CONFIG_X86_PAE |
64 | image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
65 | image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
66 | #endif |
67 | image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); |
68 | image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); |
69 | if (!image->arch.pgd || |
70 | #ifdef CONFIG_X86_PAE |
71 | !image->arch.pmd0 || !image->arch.pmd1 || |
72 | #endif |
73 | !image->arch.pte0 || !image->arch.pte1) { |
74 | return -ENOMEM; |
75 | } |
76 | return 0; |
77 | } |
78 | |
79 | static void machine_kexec_page_table_set_one( |
80 | pgd_t *pgd, pmd_t *pmd, pte_t *pte, |
81 | unsigned long vaddr, unsigned long paddr) |
82 | { |
83 | p4d_t *p4d; |
84 | pud_t *pud; |
85 | |
86 | pgd += pgd_index(vaddr); |
87 | #ifdef CONFIG_X86_PAE |
88 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) |
89 | set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); |
90 | #endif |
91 | p4d = p4d_offset(pgd, address: vaddr); |
92 | pud = pud_offset(p4d, address: vaddr); |
93 | pmd = pmd_offset(pud, address: vaddr); |
94 | if (!(pmd_val(pmd: *pmd) & _PAGE_PRESENT)) |
95 | set_pmd(pmdp: pmd, pmd: __pmd(__pa(pte) | _PAGE_TABLE)); |
96 | pte = pte_offset_kernel(pmd, address: vaddr); |
97 | set_pte(ptep: pte, pte: pfn_pte(page_nr: paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); |
98 | } |
99 | |
100 | static void machine_kexec_prepare_page_tables(struct kimage *image) |
101 | { |
102 | void *control_page; |
103 | pmd_t *pmd = NULL; |
104 | |
105 | control_page = page_address(image->control_code_page); |
106 | #ifdef CONFIG_X86_PAE |
107 | pmd = image->arch.pmd0; |
108 | #endif |
109 | machine_kexec_page_table_set_one( |
110 | image->arch.pgd: pgd, pmd, image->arch.pte: pte0, |
111 | vaddr: (unsigned long)control_page, __pa(control_page)); |
112 | #ifdef CONFIG_X86_PAE |
113 | pmd = image->arch.pmd1; |
114 | #endif |
115 | machine_kexec_page_table_set_one( |
116 | image->arch.pgd: pgd, pmd, image->arch.pte: pte1, |
117 | __pa(control_page), __pa(control_page)); |
118 | } |
119 | |
120 | /* |
121 | * A architecture hook called to validate the |
122 | * proposed image and prepare the control pages |
123 | * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE |
124 | * have been allocated, but the segments have yet |
125 | * been copied into the kernel. |
126 | * |
127 | * Do what every setup is needed on image and the |
128 | * reboot code buffer to allow us to avoid allocations |
129 | * later. |
130 | * |
131 | * - Make control page executable. |
132 | * - Allocate page tables |
133 | * - Setup page tables |
134 | */ |
135 | int machine_kexec_prepare(struct kimage *image) |
136 | { |
137 | int error; |
138 | |
139 | set_memory_x(addr: (unsigned long)page_address(image->control_code_page), numpages: 1); |
140 | error = machine_kexec_alloc_page_tables(image); |
141 | if (error) |
142 | return error; |
143 | machine_kexec_prepare_page_tables(image); |
144 | return 0; |
145 | } |
146 | |
147 | /* |
148 | * Undo anything leftover by machine_kexec_prepare |
149 | * when an image is freed. |
150 | */ |
151 | void machine_kexec_cleanup(struct kimage *image) |
152 | { |
153 | set_memory_nx(addr: (unsigned long)page_address(image->control_code_page), numpages: 1); |
154 | machine_kexec_free_page_tables(image); |
155 | } |
156 | |
157 | /* |
158 | * Do not allocate memory (or fail in any way) in machine_kexec(). |
159 | * We are past the point of no return, committed to rebooting now. |
160 | */ |
161 | void machine_kexec(struct kimage *image) |
162 | { |
163 | unsigned long page_list[PAGES_NR]; |
164 | void *control_page; |
165 | int save_ftrace_enabled; |
166 | asmlinkage unsigned long |
167 | (*relocate_kernel_ptr)(unsigned long indirection_page, |
168 | unsigned long control_page, |
169 | unsigned long start_address, |
170 | unsigned int has_pae, |
171 | unsigned int preserve_context); |
172 | |
173 | #ifdef CONFIG_KEXEC_JUMP |
174 | if (image->preserve_context) |
175 | save_processor_state(); |
176 | #endif |
177 | |
178 | save_ftrace_enabled = __ftrace_enabled_save(); |
179 | |
180 | /* Interrupts aren't acceptable while we reboot */ |
181 | local_irq_disable(); |
182 | hw_breakpoint_disable(); |
183 | |
184 | if (image->preserve_context) { |
185 | #ifdef CONFIG_X86_IO_APIC |
186 | /* |
187 | * We need to put APICs in legacy mode so that we can |
188 | * get timer interrupts in second kernel. kexec/kdump |
189 | * paths already have calls to restore_boot_irq_mode() |
190 | * in one form or other. kexec jump path also need one. |
191 | */ |
192 | clear_IO_APIC(); |
193 | restore_boot_irq_mode(); |
194 | #endif |
195 | } |
196 | |
197 | control_page = page_address(image->control_code_page); |
198 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
199 | |
200 | relocate_kernel_ptr = control_page; |
201 | page_list[PA_CONTROL_PAGE] = __pa(control_page); |
202 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
203 | page_list[PA_PGD] = __pa(image->arch.pgd); |
204 | |
205 | if (image->type == KEXEC_TYPE_DEFAULT) |
206 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) |
207 | << PAGE_SHIFT); |
208 | |
209 | /* |
210 | * The segment registers are funny things, they have both a |
211 | * visible and an invisible part. Whenever the visible part is |
212 | * set to a specific selector, the invisible part is loaded |
213 | * with from a table in memory. At no other time is the |
214 | * descriptor table in memory accessed. |
215 | * |
216 | * I take advantage of this here by force loading the |
217 | * segments, before I zap the gdt with an invalid value. |
218 | */ |
219 | load_segments(); |
220 | /* |
221 | * The gdt & idt are now invalid. |
222 | * If you want to load them you must set up your own idt & gdt. |
223 | */ |
224 | native_idt_invalidate(); |
225 | native_gdt_invalidate(); |
226 | |
227 | /* now call it */ |
228 | image->start = relocate_kernel_ptr((unsigned long)image->head, |
229 | (unsigned long)page_list, |
230 | image->start, |
231 | boot_cpu_has(X86_FEATURE_PAE), |
232 | image->preserve_context); |
233 | |
234 | #ifdef CONFIG_KEXEC_JUMP |
235 | if (image->preserve_context) |
236 | restore_processor_state(); |
237 | #endif |
238 | |
239 | __ftrace_enabled_restore(enabled: save_ftrace_enabled); |
240 | } |
241 | |