1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Debug helper to dump the current kernel pagetables of the system |
4 | * so that we can see what the various memory ranges are set to. |
5 | * |
6 | * (C) Copyright 2008 Intel Corporation |
7 | * |
8 | * Author: Arjan van de Ven <arjan@linux.intel.com> |
9 | */ |
10 | |
11 | #include <linux/debugfs.h> |
12 | #include <linux/kasan.h> |
13 | #include <linux/mm.h> |
14 | #include <linux/init.h> |
15 | #include <linux/sched.h> |
16 | #include <linux/seq_file.h> |
17 | #include <linux/highmem.h> |
18 | #include <linux/pci.h> |
19 | #include <linux/ptdump.h> |
20 | |
21 | #include <asm/e820/types.h> |
22 | |
23 | /* |
24 | * The dumper groups pagetable entries of the same type into one, and for |
25 | * that it needs to keep some state when walking, and flush this state |
26 | * when a "break" in the continuity is found. |
27 | */ |
28 | struct pg_state { |
29 | struct ptdump_state ptdump; |
30 | int level; |
31 | pgprotval_t current_prot; |
32 | pgprotval_t effective_prot; |
33 | pgprotval_t prot_levels[5]; |
34 | unsigned long start_address; |
35 | const struct addr_marker *marker; |
36 | unsigned long lines; |
37 | bool to_dmesg; |
38 | bool check_wx; |
39 | unsigned long wx_pages; |
40 | struct seq_file *seq; |
41 | }; |
42 | |
/*
 * One named region of the address space.  The field order is relied upon
 * by positional initializers, so keep it stable.
 */
struct addr_marker {
	/* First address that belongs to the region */
	unsigned long start_address;
	/* Label printed when the walk enters the region */
	const char *name;
	/* Maximum number of lines printed for the region, 0 for no limit */
	unsigned long max_lines;
};
48 | |
/* Address space marker hints */
50 | |
51 | #ifdef CONFIG_X86_64 |
52 | |
/*
 * Indices into address_markers[] for the 64-bit layout.  The walker steps
 * through the markers one by one as addresses grow, so the enumerators
 * are listed in the order their regions appear in the address space.
 */
enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	/* Terminator; must stay last */
	END_OF_SPACE_NR,
};
79 | |
80 | static struct addr_marker address_markers[] = { |
81 | [USER_SPACE_NR] = { 0, "User Space" }, |
82 | [KERNEL_SPACE_NR] = { .start_address: (1UL << 63), .name: "Kernel Space" }, |
83 | [LOW_KERNEL_NR] = { .start_address: 0UL, .name: "Low Kernel Mapping" }, |
84 | [VMALLOC_START_NR] = { .start_address: 0UL, .name: "vmalloc() Area" }, |
85 | [VMEMMAP_START_NR] = { .start_address: 0UL, .name: "Vmemmap" }, |
86 | #ifdef CONFIG_KASAN |
87 | /* |
88 | * These fields get initialized with the (dynamic) |
89 | * KASAN_SHADOW_{START,END} values in pt_dump_init(). |
90 | */ |
91 | [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, |
92 | [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, |
93 | #endif |
94 | #ifdef CONFIG_MODIFY_LDT_SYSCALL |
95 | [LDT_NR] = { .start_address: 0UL, .name: "LDT remap" }, |
96 | #endif |
97 | [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,.name: "CPU entry Area" }, |
98 | #ifdef CONFIG_X86_ESPFIX64 |
99 | [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, .name: "ESPfix Area" , .max_lines: 16 }, |
100 | #endif |
101 | #ifdef CONFIG_EFI |
102 | [EFI_END_NR] = { EFI_VA_END, .name: "EFI Runtime Services" }, |
103 | #endif |
104 | [HIGH_KERNEL_NR] = { __START_KERNEL_map, .name: "High Kernel Mapping" }, |
105 | [MODULES_VADDR_NR] = { MODULES_VADDR, .name: "Modules" }, |
106 | [MODULES_END_NR] = { MODULES_END, .name: "End Modules" }, |
107 | [FIXADDR_START_NR] = { FIXADDR_START, .name: "Fixmap Area" }, |
108 | [END_OF_SPACE_NR] = { .start_address: -1, NULL } |
109 | }; |
110 | |
111 | #define INIT_PGD ((pgd_t *) &init_top_pgt) |
112 | |
113 | #else /* CONFIG_X86_64 */ |
114 | |
/*
 * Indices into address_markers[] for the 32-bit layout.  The walker steps
 * through the markers one by one as addresses grow, so the enumerators
 * are listed in the order their regions appear in the address space.
 */
enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	/* Terminator; must stay last */
	END_OF_SPACE_NR,
};
130 | |
131 | static struct addr_marker address_markers[] = { |
132 | [USER_SPACE_NR] = { 0, "User Space" }, |
133 | [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, |
134 | [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, |
135 | [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, |
136 | #ifdef CONFIG_HIGHMEM |
137 | [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, |
138 | #endif |
139 | #ifdef CONFIG_MODIFY_LDT_SYSCALL |
140 | [LDT_NR] = { 0UL, "LDT remap" }, |
141 | #endif |
142 | [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, |
143 | [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, |
144 | [END_OF_SPACE_NR] = { -1, NULL } |
145 | }; |
146 | |
147 | #define INIT_PGD (swapper_pg_dir) |
148 | |
149 | #endif /* !CONFIG_X86_64 */ |
150 | |
/*
 * Address-offset multipliers per page-table level.  The PTE multiplier is
 * one page; each higher level multiplies the one below it by the number
 * of entries in that lower-level table.
 */
#define PTE_LEVEL_MULT	(PAGE_SIZE)
#define PMD_LEVEL_MULT	(PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT	(PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT	(PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT	(PTRS_PER_P4D * P4D_LEVEL_MULT)
157 | |
/*
 * Print the start of an output line.  With @to_dmesg set the text goes to
 * the kernel log at KERN_INFO; otherwise it goes to seq_file @m, and is
 * dropped when @m is NULL.
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, ...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_INFO fmt, ##__VA_ARGS__);		\
	} else if (m) {						\
		seq_printf(m, fmt, ##__VA_ARGS__);		\
	}							\
})
166 | |
/*
 * Continue the current output line.  With @to_dmesg set the text goes to
 * the kernel log as a KERN_CONT continuation; otherwise it goes to
 * seq_file @m, and is dropped when @m is NULL.
 */
#define pt_dump_cont_printf(m, to_dmesg, fmt, ...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_CONT fmt, ##__VA_ARGS__);		\
	} else if (m) {						\
		seq_printf(m, fmt, ##__VA_ARGS__);		\
	}							\
})
175 | |
176 | /* |
177 | * Print a readable form of a pgprot_t to the seq_file |
178 | */ |
179 | static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg) |
180 | { |
181 | static const char * const level_name[] = |
182 | { "pgd" , "p4d" , "pud" , "pmd" , "pte" }; |
183 | |
184 | if (!(pr & _PAGE_PRESENT)) { |
185 | /* Not present */ |
186 | pt_dump_cont_printf(m, dmsg, " " ); |
187 | } else { |
188 | if (pr & _PAGE_USER) |
189 | pt_dump_cont_printf(m, dmsg, "USR " ); |
190 | else |
191 | pt_dump_cont_printf(m, dmsg, " " ); |
192 | if (pr & _PAGE_RW) |
193 | pt_dump_cont_printf(m, dmsg, "RW " ); |
194 | else |
195 | pt_dump_cont_printf(m, dmsg, "ro " ); |
196 | if (pr & _PAGE_PWT) |
197 | pt_dump_cont_printf(m, dmsg, "PWT " ); |
198 | else |
199 | pt_dump_cont_printf(m, dmsg, " " ); |
200 | if (pr & _PAGE_PCD) |
201 | pt_dump_cont_printf(m, dmsg, "PCD " ); |
202 | else |
203 | pt_dump_cont_printf(m, dmsg, " " ); |
204 | |
205 | /* Bit 7 has a different meaning on level 3 vs 4 */ |
206 | if (level <= 3 && pr & _PAGE_PSE) |
207 | pt_dump_cont_printf(m, dmsg, "PSE " ); |
208 | else |
209 | pt_dump_cont_printf(m, dmsg, " " ); |
210 | if ((level == 4 && pr & _PAGE_PAT) || |
211 | ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) |
212 | pt_dump_cont_printf(m, dmsg, "PAT " ); |
213 | else |
214 | pt_dump_cont_printf(m, dmsg, " " ); |
215 | if (pr & _PAGE_GLOBAL) |
216 | pt_dump_cont_printf(m, dmsg, "GLB " ); |
217 | else |
218 | pt_dump_cont_printf(m, dmsg, " " ); |
219 | if (pr & _PAGE_NX) |
220 | pt_dump_cont_printf(m, dmsg, "NX " ); |
221 | else |
222 | pt_dump_cont_printf(m, dmsg, "x " ); |
223 | } |
224 | pt_dump_cont_printf(m, dmsg, "%s\n" , level_name[level]); |
225 | } |
226 | |
227 | static void note_wx(struct pg_state *st, unsigned long addr) |
228 | { |
229 | unsigned long npages; |
230 | |
231 | npages = (addr - st->start_address) / PAGE_SIZE; |
232 | |
233 | #ifdef CONFIG_PCI_BIOS |
234 | /* |
235 | * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. |
236 | * Inform about it, but avoid the warning. |
237 | */ |
238 | if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && |
239 | addr <= PAGE_OFFSET + BIOS_END) { |
240 | pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n" , npages); |
241 | return; |
242 | } |
243 | #endif |
244 | /* Account the WX pages */ |
245 | st->wx_pages += npages; |
246 | WARN_ONCE(__supported_pte_mask & _PAGE_NX, |
247 | "x86/mm: Found insecure W+X mapping at address %pS\n" , |
248 | (void *)st->start_address); |
249 | } |
250 | |
251 | static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) |
252 | { |
253 | struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); |
254 | pgprotval_t prot = val & PTE_FLAGS_MASK; |
255 | pgprotval_t effective; |
256 | |
257 | if (level > 0) { |
258 | pgprotval_t higher_prot = st->prot_levels[level - 1]; |
259 | |
260 | effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | |
261 | ((higher_prot | prot) & _PAGE_NX); |
262 | } else { |
263 | effective = prot; |
264 | } |
265 | |
266 | st->prot_levels[level] = effective; |
267 | } |
268 | |
269 | /* |
270 | * This function gets called on a break in a continuous series |
271 | * of PTE entries; the next one is different so we need to |
272 | * print what we collected so far. |
273 | */ |
274 | static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, |
275 | u64 val) |
276 | { |
277 | struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); |
278 | pgprotval_t new_prot, new_eff; |
279 | pgprotval_t cur, eff; |
280 | static const char units[] = "BKMGTPE" ; |
281 | struct seq_file *m = st->seq; |
282 | |
283 | new_prot = val & PTE_FLAGS_MASK; |
284 | if (!val) |
285 | new_eff = 0; |
286 | else |
287 | new_eff = st->prot_levels[level]; |
288 | |
289 | /* |
290 | * If we have a "break" in the series, we need to flush the state that |
291 | * we have now. "break" is either changing perms, levels or |
292 | * address space marker. |
293 | */ |
294 | cur = st->current_prot; |
295 | eff = st->effective_prot; |
296 | |
297 | if (st->level == -1) { |
298 | /* First entry */ |
299 | st->current_prot = new_prot; |
300 | st->effective_prot = new_eff; |
301 | st->level = level; |
302 | st->marker = address_markers; |
303 | st->lines = 0; |
304 | pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n" , |
305 | st->marker->name); |
306 | } else if (new_prot != cur || new_eff != eff || level != st->level || |
307 | addr >= st->marker[1].start_address) { |
308 | const char *unit = units; |
309 | unsigned long delta; |
310 | int width = sizeof(unsigned long) * 2; |
311 | |
312 | if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) |
313 | note_wx(st, addr); |
314 | |
315 | /* |
316 | * Now print the actual finished series |
317 | */ |
318 | if (!st->marker->max_lines || |
319 | st->lines < st->marker->max_lines) { |
320 | pt_dump_seq_printf(m, st->to_dmesg, |
321 | "0x%0*lx-0x%0*lx " , |
322 | width, st->start_address, |
323 | width, addr); |
324 | |
325 | delta = addr - st->start_address; |
326 | while (!(delta & 1023) && unit[1]) { |
327 | delta >>= 10; |
328 | unit++; |
329 | } |
330 | pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c " , |
331 | delta, *unit); |
332 | printk_prot(m, pr: st->current_prot, level: st->level, |
333 | dmsg: st->to_dmesg); |
334 | } |
335 | st->lines++; |
336 | |
337 | /* |
338 | * We print markers for special areas of address space, |
339 | * such as the start of vmalloc space etc. |
340 | * This helps in the interpretation. |
341 | */ |
342 | if (addr >= st->marker[1].start_address) { |
343 | if (st->marker->max_lines && |
344 | st->lines > st->marker->max_lines) { |
345 | unsigned long nskip = |
346 | st->lines - st->marker->max_lines; |
347 | pt_dump_seq_printf(m, st->to_dmesg, |
348 | "... %lu entr%s skipped ... \n" , |
349 | nskip, |
350 | nskip == 1 ? "y" : "ies" ); |
351 | } |
352 | st->marker++; |
353 | st->lines = 0; |
354 | pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n" , |
355 | st->marker->name); |
356 | } |
357 | |
358 | st->start_address = addr; |
359 | st->current_prot = new_prot; |
360 | st->effective_prot = new_eff; |
361 | st->level = level; |
362 | } |
363 | } |
364 | |
365 | bool ptdump_walk_pgd_level_core(struct seq_file *m, |
366 | struct mm_struct *mm, pgd_t *pgd, |
367 | bool checkwx, bool dmesg) |
368 | { |
369 | const struct ptdump_range ptdump_ranges[] = { |
370 | #ifdef CONFIG_X86_64 |
371 | {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, |
372 | {GUARD_HOLE_END_ADDR, ~0UL}, |
373 | #else |
374 | {0, ~0UL}, |
375 | #endif |
376 | {0, 0} |
377 | }; |
378 | |
379 | struct pg_state st = { |
380 | .ptdump = { |
381 | .note_page = note_page, |
382 | .effective_prot = effective_prot, |
383 | .range = ptdump_ranges |
384 | }, |
385 | .level = -1, |
386 | .to_dmesg = dmesg, |
387 | .check_wx = checkwx, |
388 | .seq = m |
389 | }; |
390 | |
391 | ptdump_walk_pgd(st: &st.ptdump, mm, pgd); |
392 | |
393 | if (!checkwx) |
394 | return true; |
395 | if (st.wx_pages) { |
396 | pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n" , |
397 | st.wx_pages); |
398 | |
399 | return false; |
400 | } else { |
401 | pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n" ); |
402 | |
403 | return true; |
404 | } |
405 | } |
406 | |
407 | void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) |
408 | { |
409 | ptdump_walk_pgd_level_core(m, mm, pgd: mm->pgd, checkwx: false, dmesg: true); |
410 | } |
411 | |
412 | void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, |
413 | bool user) |
414 | { |
415 | pgd_t *pgd = mm->pgd; |
416 | #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION |
417 | if (user && boot_cpu_has(X86_FEATURE_PTI)) |
418 | pgd = kernel_to_user_pgdp(pgdp: pgd); |
419 | #endif |
420 | ptdump_walk_pgd_level_core(m, mm, pgd, checkwx: false, dmesg: false); |
421 | } |
422 | EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); |
423 | |
/*
 * Check the table returned by kernel_to_user_pgdp() for INIT_PGD for
 * W+X mappings.
 *
 * Does nothing unless CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is set, NX
 * is supported and the PTI CPU feature is present.  Nothing is dumped;
 * the outcome is only logged by the core walker and its return value is
 * not used.
 */
void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	pgd_t *pgd = INIT_PGD;

	if (!(__supported_pte_mask & _PAGE_NX) ||
	    !boot_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("x86/mm: Checking user space page tables\n");
	pgd = kernel_to_user_pgdp(pgd);
	ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false);
#endif
}
438 | |
439 | bool ptdump_walk_pgd_level_checkwx(void) |
440 | { |
441 | if (!(__supported_pte_mask & _PAGE_NX)) |
442 | return true; |
443 | |
444 | return ptdump_walk_pgd_level_core(NULL, mm: &init_mm, INIT_PGD, checkwx: true, dmesg: false); |
445 | } |
446 | |
447 | static int __init pt_dump_init(void) |
448 | { |
449 | /* |
450 | * Various markers are not compile-time constants, so assign them |
451 | * here. |
452 | */ |
453 | #ifdef CONFIG_X86_64 |
454 | address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; |
455 | address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; |
456 | address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; |
457 | #ifdef CONFIG_MODIFY_LDT_SYSCALL |
458 | address_markers[LDT_NR].start_address = LDT_BASE_ADDR; |
459 | #endif |
460 | #ifdef CONFIG_KASAN |
461 | address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; |
462 | address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; |
463 | #endif |
464 | #endif |
465 | #ifdef CONFIG_X86_32 |
466 | address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; |
467 | address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; |
468 | # ifdef CONFIG_HIGHMEM |
469 | address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; |
470 | # endif |
471 | address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; |
472 | address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; |
473 | # ifdef CONFIG_MODIFY_LDT_SYSCALL |
474 | address_markers[LDT_NR].start_address = LDT_BASE_ADDR; |
475 | # endif |
476 | #endif |
477 | return 0; |
478 | } |
479 | __initcall(pt_dump_init); |
480 | |