// SPDX-License-Identifier: GPL-2.0
/*
 * This code is used on x86_64 to create page table identity mappings on
 * demand by building up a new set of page tables (or appending to the
 * existing ones), and then switching over to them when ready.
 *
 * Copyright (C) 2015-2016 Yinghai Lu
 * Copyright (C) 2016 Kees Cook
 */

/* No PAGE_TABLE_ISOLATION support needed: */
#undef CONFIG_PAGE_TABLE_ISOLATION

#include "error.h"
#include "misc.h"

/* These actually do the work of building the kernel identity maps. */
#include <linux/pgtable.h>
#include <asm/cmpxchg.h>
#include <asm/trap_pf.h>
#include <asm/trapnr.h>
#include <asm/init.h>
/* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"

#define _SETUP
#include <asm/setup.h>	/* For COMMAND_LINE_SIZE */
#undef _SETUP

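/* Returns the address of the kernel command line (defined in cmdline.c). */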
extern unsigned long get_cmd_line_ptr(void);

/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

/* Used to track our page table allocation area. */
struct alloc_pgt_data {
	unsigned char *pgt_buf;
	unsigned long pgt_buf_size;
	unsigned long pgt_buf_offset;
};

/*
 * Allocates space for a page table entry, using struct alloc_pgt_data
 * above. Besides the local callers, this is used as the allocation
 * callback in mapping_info below.
 */
static void *alloc_pgt_page(void *context)
{
	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
	unsigned char *entry;

	/* Validate there is space available for a new page. */
	if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
		debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
		return NULL;
	}

	/* Consumed more tables than expected? */
	if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) {
		debug_putstr("pgt_buf running low in " __FILE__ "\n");
		debug_putstr("Need to raise BOOT_PGT_SIZE?\n");
		debug_putaddr(pages->pgt_buf_offset);
		debug_putaddr(pages->pgt_buf_size);
	}

	entry = pages->pgt_buf + pages->pgt_buf_offset;
	pages->pgt_buf_offset += PAGE_SIZE;

	return entry;
}

/* Used to track our allocated page tables. */
static struct alloc_pgt_data pgt_data;

/* The top level page table entry pointer. */
static unsigned long top_level_pgt;

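/*
 * Mask of the valid physical address bits. The SME encryption mask is
 * excluded from it in initialize_identity_maps().
 */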
phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;

/*
 * Mapping information structure passed to kernel_ident_mapping_init().
 * Due to relocation, pointers must be assigned at run time not build time.
 */
static struct x86_mapping_info mapping_info;

/*
 * Adds the specified range to the identity mappings.
 */
void kernel_add_identity_map(unsigned long start, unsigned long end)
{
	int ret;

	/* Align boundary to 2M. */
	start = round_down(start, PMD_SIZE);
	end = round_up(end, PMD_SIZE);
	if (start >= end)
		return;

	/* Build the mapping. */
	ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
					start, end);
	if (ret)
		error("Error: kernel_ident_mapping_init() failed\n");
}

/* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void *rmode)
{
	unsigned long cmdline;
	struct setup_data *sd;

	/* Exclude the encryption mask from __PHYSICAL_MASK */
	physical_mask &= ~sme_me_mask;

	/* Init mapping_info with run-time function/buffer pointers. */
	mapping_info.alloc_pgt_page = alloc_pgt_page;
	mapping_info.context = &pgt_data;
	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
	mapping_info.kernpg_flag = _KERNPG_TABLE;

	/*
	 * It should be impossible for this not to already be true,
	 * but since calling this a second time would rewind the other
	 * counters, let's just make sure this is reset too.
	 */
	pgt_data.pgt_buf_offset = 0;

	/*
	 * If we came here via startup_32(), cr3 will be _pgtable already
	 * and we must append to the existing area instead of entirely
	 * overwriting it.
	 *
	 * With 5-level paging, we use '_pgtable' to allocate the p4d page
	 * table; the top-level page table is allocated separately.
	 *
	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
	 * cases. On 4-level paging it's equal to 'top_level_pgt'.
	 */
	top_level_pgt = read_cr3_pa();
	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
	} else {
		pgt_data.pgt_buf = _pgtable;
		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
	}

	/*
	 * New page-table is set up - map the kernel image, boot_params and the
	 * command line. The uncompressed kernel requires boot_params and the
	 * command line to be mapped in the identity mapping. Map them
	 * explicitly here in case the compressed kernel does not touch them,
	 * or does not touch all the pages covering them.
	 */
	kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
	boot_params_ptr = rmode;
	kernel_add_identity_map((unsigned long)boot_params_ptr,
				(unsigned long)(boot_params_ptr + 1));
	cmdline = get_cmd_line_ptr();
	kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);

	/*
	 * Also map the setup_data entries passed via boot_params in case they
	 * need to be accessed by uncompressed kernel via the identity mapping.
	 */
	sd = (struct setup_data *)boot_params_ptr->hdr.setup_data;
	while (sd) {
		unsigned long sd_addr = (unsigned long)sd;

		kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len);
		sd = (struct setup_data *)sd->next;
	}

	sev_prep_identity_maps(top_level_pgt);

	/* Load the new page-table. */
	write_cr3(top_level_pgt);

	/*
	 * Now that the required page table mappings are established and a
	 * GHCB can be used, check for SNP guest/HV feature compatibility.
	 */
	snp_check_features();
}

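/*
 * Splits the large 2M PMD mapping that covers __address into 4K PTE
 * mappings of the same physical range and returns a pointer to the PTE
 * covering __address.
 */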
static pte_t *split_large_pmd(struct x86_mapping_info *info,
			      pmd_t *pmdp, unsigned long __address)
{
	unsigned long page_flags;
	unsigned long address;
	pte_t *pte;
	pmd_t pmd;
	int i;

	pte = (pte_t *)info->alloc_pgt_page(info->context);
	if (!pte)
		return NULL;

	address = __address & PMD_MASK;
	/* No large page - clear PSE flag */
	page_flags = info->page_flag & ~_PAGE_PSE;

	/* Populate the PTEs */
	for (i = 0; i < PTRS_PER_PMD; i++) {
		set_pte(&pte[i], __pte(address | page_flags));
		address += PAGE_SIZE;
	}

	/*
	 * Ideally we need to clear the large PMD first and do a TLB
	 * flush before we write the new PMD. But the 2M range of the
	 * PMD might contain the code we execute and/or the stack
	 * we are on, so we can't do that. But that should be safe here
	 * because we are going from large to small mappings and we are
	 * also the only user of the page-table, so there is no chance
	 * of a TLB multihit.
	 */
	pmd = __pmd((unsigned long)pte | info->kernpg_flag);
	set_pmd(pmdp, pmd);
	/* Flush TLB to establish the new PMD */
	write_cr3(top_level_pgt);

	return pte + pte_index(__address);
}

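/* Flushes one page from the caches, one cache line at a time. */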
static void clflush_page(unsigned long address)
{
	unsigned int flush_size;
	char *cl, *start, *end;

	/*
	 * Hardcode cl-size to 64 - CPUID can't be used here because that might
	 * cause another #VC exception and the GHCB is not ready to use yet.
	 */
	flush_size = 64;
	start = (char *)(address & PAGE_MASK);
	end = start + PAGE_SIZE;

	/*
	 * First make sure there are no pending writes on the cache-lines to
	 * flush.
	 */
	asm volatile("mfence" : : : "memory");

	for (cl = start; cl != end; cl += flush_size)
		clflush(cl);
}

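/*
 * Sets and/or clears the given pteval_t flags on the 4K PTE that maps
 * 'address', splitting a large PMD mapping first when necessary.
 */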
static int set_clr_page_flags(struct x86_mapping_info *info,
			      unsigned long address,
			      pteval_t set, pteval_t clr)
{
	pgd_t *pgdp = (pgd_t *)top_level_pgt;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep, pte;

	/*
	 * First make sure there is a PMD mapping for 'address'.
	 * It should already exist, but keep things generic.
	 *
	 * To map the page just read from it and fault it in if there is no
	 * mapping yet. kernel_add_identity_map() can't be called here because
	 * that would unconditionally map the address on PMD level, destroying
	 * any PTE-level mappings that might already exist. Use assembly here
	 * so the access won't be optimized away.
	 */
	asm volatile("mov %[address], %%r9"
		     :: [address] "g" (*(unsigned long *)address)
		     : "r9", "memory");

	/*
	 * The page is mapped at least with PMD size - so skip checks and walk
	 * directly to the PMD.
	 */
	p4dp = p4d_offset(pgdp, address);
	pudp = pud_offset(p4dp, address);
	pmdp = pmd_offset(pudp, address);

	if (pmd_large(*pmdp))
		ptep = split_large_pmd(info, pmdp, address);
	else
		ptep = pte_offset_kernel(pmdp, address);

	if (!ptep)
		return -ENOMEM;

	/*
	 * Changing encryption attributes of a page requires flushing it from
	 * the caches.
	 */
	if ((set | clr) & _PAGE_ENC) {
		clflush_page(address);

		/*
		 * If the encryption attribute is being cleared, change the page state
		 * to shared in the RMP table.
		 */
		if (clr)
			snp_set_page_shared(__pa(address & PAGE_MASK));
	}

	/* Update PTE */
	pte = *ptep;
	pte = pte_set_flags(pte, set);
	pte = pte_clear_flags(pte, clr);
	set_pte(ptep, pte);

	/*
	 * If the encryption attribute is being set, then change the page state to
	 * private in the RMP entry. The page state change must be done after the PTE
	 * is updated.
	 */
	if (set & _PAGE_ENC)
		snp_set_page_private(__pa(address & PAGE_MASK));

	/* Flush TLB after changing encryption attribute */
	write_cr3(top_level_pgt);

	return 0;
}

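/*
 * Helpers to change the encryption and presence attributes of a single
 * 4K page in the identity mapping.
 */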
int set_page_decrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
}

int set_page_encrypted(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
}

int set_page_non_present(unsigned long address)
{
	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
}

static void do_pf_error(const char *msg, unsigned long error_code,
			unsigned long address, unsigned long ip)
{
	error_putstr(msg);

	error_putstr("\nError Code: ");
	error_puthex(error_code);
	error_putstr("\nCR2: 0x");
	error_puthex(address);
	error_putstr("\nRIP relative to _head: 0x");
	error_puthex(ip - (unsigned long)_head);
	error_putstr("\n");

	error("Stopping.\n");
}

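/*
 * Page-fault handler for the boot stage: unexpected error codes and GHCB
 * faults are fatal; any other fault is handled by identity-mapping the
 * 2M region around the faulting address on demand.
 */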
void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = native_read_cr2();
	unsigned long end;
	bool ghcb_fault;

	ghcb_fault = sev_es_check_ghcb_fault(address);

	address &= PMD_MASK;
	end = address + PMD_SIZE;

	/*
	 * Check for unexpected error codes. Unexpected are:
	 *	- Faults on present pages
	 *	- User faults
	 *	- Reserved bits set
	 */
	if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
		do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
	else if (ghcb_fault)
		do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);

	/*
	 * Error code is sane - now identity map the 2M region around
	 * the faulting address.
	 */
	kernel_add_identity_map(address, end);
}