fault.c source code [linux/arch/x86/mm/fault.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 1995 Linus Torvalds
4	* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
5	* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
6	*/
7	#include <linux/sched.h> /* test_thread_flag(), ... */
8	#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
9	#include <linux/kdebug.h> /* oops_begin/end, ... */
10	#include <linux/extable.h> /* search_exception_tables */
11	#include <linux/memblock.h> /* max_low_pfn */
12	#include <linux/kfence.h> /* kfence_handle_page_fault */
13	#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
14	#include <linux/mmiotrace.h> /* kmmio_handler, ... */
15	#include <linux/perf_event.h> /* perf_sw_event */
16	#include <linux/hugetlb.h> /* hstate_index_to_shift */
17	#include <linux/prefetch.h> /* prefetchw */
18	#include <linux/context_tracking.h> /* exception_enter(), ... */
19	#include <linux/uaccess.h> /* faulthandler_disabled() */
20	#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
21	#include <linux/mm_types.h>
22	#include <linux/mm.h> /* find_and_lock_vma() */
23
24	#include <asm/cpufeature.h> /* boot_cpu_has, ... */
25	#include <asm/traps.h> /* dotraplinkage, ... */
26	#include <asm/fixmap.h> /* VSYSCALL_ADDR */
27	#include <asm/vsyscall.h> /* emulate_vsyscall */
28	#include <asm/vm86.h> /* struct vm86 */
29	#include <asm/mmu_context.h> /* vma_pkey() */
30	#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
31	#include <asm/desc.h> /* store_idt(), ... */
32	#include <asm/cpu_entry_area.h> /* exception stack */
33	#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
34	#include <asm/kvm_para.h> /* kvm_handle_async_pf */
35	#include <asm/vdso.h> /* fixup_vdso_exception() */
36	#include <asm/irq_stack.h>
37	#include <asm/fred.h>
38	#include <asm/sev.h> /* snp_dump_hva_rmpentry() */
39
40	#define CREATE_TRACE_POINTS
41	#include <asm/trace/exceptions.h>
42
43	/*
44	* Returns 0 if mmiotrace is disabled, or if the fault is not
45	* handled by mmiotrace:
46	*/
47	static nokprobe_inline int
48	kmmio_fault(struct pt_regs regs, unsigned* long addr)
49	{
50	if (unlikely(is_kmmio_active()))
51	if (kmmio_handler(regs, addr) == `1`)
52	return -`1`;
53	return `0`;
54	}
55
/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.  This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 *
 * Classify one instruction byte:
 *  - returns non-zero when @opcode is a prefix byte, so the caller keeps
 *    scanning;
 *  - returns 0 when scanning must stop.  For the 0x0F escape byte the next
 *    byte is read from @instr and *@prefetch is set to 1 if the two bytes
 *    form 0x0F0D or 0x0F18, else to 0.  If that read fails, *@prefetch is
 *    left untouched.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In 64-bit mode 0x40..0x4F are valid REX prefixes
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (get_kernel_nofault(opcode, instr))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}
113
114	static bool is_amd_k8_pre_npt(void)
115	{
116	struct cpuinfo_x86 *c = &boot_cpu_data;
117
118	return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
119	c->x86_vendor == X86_VENDOR_AMD &&
120	c->x86 == `0xf` && c->x86_model < `0x40`);
121	}
122
123	static int
124	is_prefetch(struct pt_regs regs, unsigned* long error_code, unsigned long addr)
125	{
126	unsigned char *max_instr;
127	unsigned char *instr;
128	int prefetch = `0`;
129
130	/ Erratum #91 affects AMD K8, pre-NPT CPUs /
131	if (!is_amd_k8_pre_npt())
132	return `0`;
133
134	/*
135	* If it was a exec (instruction fetch) fault on NX page, then
136	* do not ignore the fault:
137	*/
138	if (error_code & X86_PF_INSTR)
139	return `0`;
140
141	instr = (void *)convert_ip_to_linear(current, regs);
142	max_instr = instr + `15`;
143
144	/*
145	* This code has historically always bailed out if IP points to a
146	* not-present page (e.g. due to a race). No one has ever
147	* complained about this.
148	*/
149	pagefault_disable();
150
151	while (instr < max_instr) {
152	unsigned char opcode;
153
154	if (user_mode(regs)) {
155	if (get_user(opcode, (unsigned char __user *) instr))
156	break;
157	} else {
158	if (get_kernel_nofault(opcode, instr))
159	break;
160	}
161
162	instr++;
163
164	if (!check_prefetch_opcode(regs, instr, opcode, prefetch: &prefetch))
165	break;
166	}
167
168	pagefault_enable();
169	return prefetch;
170	}
171
/*
 * List of PGD pages (pgd_list) and the spinlock taken while walking it
 * (pgd_lock); see arch_sync_kernel_mappings() for a user in this file.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
174
175	#ifdef CONFIG_X86_32
176	static inline pmd_t vmalloc_sync_one(pgd_t pgd, unsigned long address)
177	{
178	unsigned index = pgd_index(address);
179	pgd_t *pgd_k;
180	p4d_t p4d, p4d_k;
181	pud_t pud, pud_k;
182	pmd_t pmd, pmd_k;
183
184	pgd += index;
185	pgd_k = init_mm.pgd + index;
186
187	if (!pgd_present(*pgd_k))
188	return NULL;
189
190	/*
191	* set_pgd(pgd, *pgd_k); here would be useless on PAE
192	* and redundant with the set_pmd() on non-PAE. As would
193	* set_p4d/set_pud.
194	*/
195	p4d = p4d_offset(pgd, address);
196	p4d_k = p4d_offset(pgd_k, address);
197	if (!p4d_present(*p4d_k))
198	return NULL;
199
200	pud = pud_offset(p4d, address);
201	pud_k = pud_offset(p4d_k, address);
202	if (!pud_present(*pud_k))
203	return NULL;
204
205	pmd = pmd_offset(pud, address);
206	pmd_k = pmd_offset(pud_k, address);
207
208	if (pmd_present(pmd) != pmd_present(pmd_k))
209	set_pmd(pmd, *pmd_k);
210
211	if (!pmd_present(*pmd_k))
212	return NULL;
213	else
214	BUG_ON(pmd_pfn(pmd) != pmd_pfn(pmd_k));
215
216	return pmd_k;
217	}
218
219	/*
220	* Handle a fault on the vmalloc or module mapping area
221	*
222	* This is needed because there is a race condition between the time
223	* when the vmalloc mapping code updates the PMD to the point in time
224	* where it synchronizes this update with the other page-tables in the
225	* system.
226	*
227	* In this race window another thread/CPU can map an area on the same
228	* PMD, finds it already present and does not synchronize it with the
229	* rest of the system yet. As a result v[mz]alloc might return areas
230	* which are not mapped in every page-table in the system, causing an
231	* unhandled page-fault when they are accessed.
232	*/
233	static noinline int vmalloc_fault(unsigned long address)
234	{
235	unsigned long pgd_paddr;
236	pmd_t *pmd_k;
237	pte_t *pte_k;
238
239	/ Make sure we are in vmalloc area: /
240	if (!(address >= VMALLOC_START && address < VMALLOC_END))
241	return -`1`;
242
243	/*
244	* Synchronize this task's top level page-table
245	* with the 'reference' page table.
246	*
247	* Do _not_ use "current" here. We might be inside
248	* an interrupt in the middle of a task switch..
249	*/
250	pgd_paddr = read_cr3_pa();
251	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
252	if (!pmd_k)
253	return -`1`;
254
255	if (pmd_leaf(*pmd_k))
256	return `0`;
257
258	pte_k = pte_offset_kernel(pmd_k, address);
259	if (!pte_present(*pte_k))
260	return -`1`;
261
262	return `0`;
263	}
264	NOKPROBE_SYMBOL(vmalloc_fault);
265
266	void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
267	{
268	unsigned long addr;
269
270	for (addr = start & PMD_MASK;
271	addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
272	addr += PMD_SIZE) {
273	struct page *page;
274
275	spin_lock(&pgd_lock);
276	list_for_each_entry(page, &pgd_list, lru) {
277	spinlock_t *pgt_lock;
278
279	/ the pgt_lock only for Xen /
280	pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
281
282	spin_lock(pgt_lock);
283	vmalloc_sync_one(page_address(page), addr);
284	spin_unlock(pgt_lock);
285	}
286	spin_unlock(&pgd_lock);
287	}
288	}
289
290	static bool low_pfn(unsigned long pfn)
291	{
292	return pfn < max_low_pfn;
293	}
294
295	static void dump_pagetable(unsigned long address)
296	{
297	pgd_t *base = __va(read_cr3_pa());
298	pgd_t *pgd = &base[pgd_index(address)];
299	p4d_t *p4d;
300	pud_t *pud;
301	pmd_t *pmd;
302	pte_t *pte;
303
304	#ifdef CONFIG_X86_PAE
305	pr_info("pdpt = %016Lx ", pgd_val(pgd));
306	if (!low_pfn(pgd_val(pgd) >> PAGE_SHIFT) \|\| !pgd_present(pgd))
307	goto out;
308	#define pr_pde pr_cont
309	#else
310	#define pr_pde pr_info
311	#endif
312	p4d = p4d_offset(pgd, address);
313	pud = pud_offset(p4d, address);
314	pmd = pmd_offset(pud, address);
315	pr_pde("pde = %0Lx ", sizeof(pmd) `2`, (u64)pmd_val(*pmd));
316	#undef pr_pde
317
318	/*
319	* We must not directly access the pte in the highpte
320	* case if the page table is located in highmem.
321	* And let's rather not kmap-atomic the pte, just in case
322	* it's allocated already:
323	*/
324	if (!low_pfn(pmd_pfn(pmd)) \|\| !pmd_present(pmd) \|\| pmd_leaf(*pmd))
325	goto out;
326
327	pte = pte_offset_kernel(pmd, address);
328	pr_cont("pte = %0Lx ", sizeof(pte) `2`, (u64)pte_val(*pte));
329	out:
330	pr_cont("\n");
331	}
332
333	#else /* CONFIG_X86_64: */
334
335	#ifdef CONFIG_CPU_SUP_AMD
336	static const char errata93_warning[] =
337	KERN_ERR
338	"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
339	"******* Working around it, but it may cause SEGVs or burn power.\n"
340	"******* Please consider a BIOS update.\n"
341	"******* Disabling USB legacy in the BIOS may also help.\n";
342	#endif
343
/*
 * Probe whether one unsigned long can be read at @p without faulting.
 * Returns the result of get_kernel_nofault(): non-zero means the read
 * failed; the value read is discarded.
 */
static int bad_address(void *p)
{
	unsigned long dummy;

	return get_kernel_nofault(dummy, (unsigned long *)p);
}
350
351	static void dump_pagetable(unsigned long address)
352	{
353	pgd_t *base = __va(read_cr3_pa());
354	pgd_t *pgd = base + pgd_index(address);
355	p4d_t *p4d;
356	pud_t *pud;
357	pmd_t *pmd;
358	pte_t *pte;
359
360	if (bad_address(p: pgd))
361	goto bad;
362
363	pr_info("PGD %lx ", pgd_val(*pgd));
364
365	if (!pgd_present(pgd: *pgd))
366	goto out;
367
368	p4d = p4d_offset(pgd, address);
369	if (bad_address(p: p4d))
370	goto bad;
371
372	pr_cont("P4D %lx ", p4d_val(*p4d));
373	if (!p4d_present(p4d: p4d) \|\| p4d_leaf(p4d: p4d))
374	goto out;
375
376	pud = pud_offset(p4d, address);
377	if (bad_address(p: pud))
378	goto bad;
379
380	pr_cont("PUD %lx ", pud_val(*pud));
381	if (!pud_present(pud: pud) \|\| pud_leaf(pud: pud))
382	goto out;
383
384	pmd = pmd_offset(pud, address);
385	if (bad_address(p: pmd))
386	goto bad;
387
388	pr_cont("PMD %lx ", pmd_val(*pmd));
389	if (!pmd_present(pmd: pmd) \|\| pmd_leaf(pte: pmd))
390	goto out;
391
392	pte = pte_offset_kernel(pmd, address);
393	if (bad_address(p: pte))
394	goto bad;
395
396	pr_cont("PTE %lx", pte_val(*pte));
397	out:
398	pr_cont("\n");
399	return;
400	bad:
401	pr_info("BAD\n");
402	}
403
404	#endif /* CONFIG_X86_64 */
405
/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 *
 * Returns 1 after rewriting regs->ip to the address with its upper 32 bits
 * set, when that address falls in kernel text or the module area;
 * returns 0 otherwise.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
	    || boot_cpu_data.x86 != 0xf)
		return 0;

	if (user_mode(regs))
		return 0;

	/* Only an instruction fetch at the (truncated) IP qualifies. */
	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}
446
/*
 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 *
 * Returns 1 when, on a 64-bit kernel, CS is __USER32_CS or has bit 2 set
 * and @address has any of its upper 32 bits set; returns 0 otherwise.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}
463
/*
 * Pentium F0 0F C7 C8 bug workaround:
 *
 * When the workaround is compiled in, the CPU has the F00F bug, the fault
 * is not flagged X86_PF_USER and @address is the F00F IDT address, treat
 * the fault as an invalid opcode and return 1.  Returns 0 otherwise.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
	    idt_is_f00f_address(address)) {
		handle_invalid_op(regs);
		return 1;
	}
#endif
	return 0;
}
477
478	static void show_ldttss(const struct desc_ptr gdt, const* char *name, u16 index)
479	{
480	u32 offset = (index >> `3`) * sizeof(struct desc_struct);
481	unsigned long addr;
482	struct ldttss_desc desc;
483
484	if (index == `0`) {
485	pr_alert("%s: NULL\n", name);
486	return;
487	}
488
489	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
490	pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
491	return;
492	}
493
494	if (copy_from_kernel_nofault(dst: &desc, src: (void *)(gdt->address + offset),
495	size: sizeof(struct ldttss_desc))) {
496	pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
497	name, index);
498	return;
499	}
500
501	addr = desc.base0 \| (desc.base1 << `16`) \| ((unsigned long)desc.base2 << `24`);
502	#ifdef CONFIG_X86_64
503	addr \|= ((u64)desc.base3 << `32`);
504	#endif
505	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
506	name, index, addr, (desc.limit0 \| (desc.limit1 << `16`)));
507	}
508
509	static void
510	show_fault_oops(struct pt_regs regs, unsigned* long error_code, unsigned long address)
511	{
512	if (!oops_may_print())
513	return;
514
515	if (error_code & X86_PF_INSTR) {
516	unsigned int level;
517	pgd_t *pgd;
518	pte_t *pte;
519
520	pgd = __va(read_cr3_pa());
521	pgd += pgd_index(address);
522
523	pte = lookup_address_in_pgd(pgd, address, level: &level);
524
525	if (pte && pte_present(a: pte) && !pte_exec(pte: pte))
526	pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
527	from_kuid(&init_user_ns, current_uid()));
528	if (pte && pte_present(a: pte) && pte_exec(pte: pte) &&
529	(pgd_flags(pgd: *pgd) & _PAGE_USER) &&
530	(__read_cr4() & X86_CR4_SMEP))
531	pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
532	from_kuid(&init_user_ns, current_uid()));
533	}
534
535	if (address < PAGE_SIZE && !user_mode(regs))
536	pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
537	(void *)address);
538	else
539	pr_alert("BUG: unable to handle page fault for address: %px\n",
540	(void *)address);
541
542	pr_alert("#PF: %s %s in %s mode\n",
543	(error_code & X86_PF_USER) ? "user" : "supervisor",
544	(error_code & X86_PF_INSTR) ? "instruction fetch" :
545	(error_code & X86_PF_WRITE) ? "write access" :
546	"read access",
547	user_mode(regs) ? "user" : "kernel");
548	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
549	!(error_code & X86_PF_PROT) ? "not-present page" :
550	(error_code & X86_PF_RSVD) ? "reserved bit violation" :
551	(error_code & X86_PF_PK) ? "protection keys violation" :
552	(error_code & X86_PF_RMP) ? "RMP violation" :
553	"permissions violation");
554
555	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
556	struct desc_ptr idt, gdt;
557	u16 ldtr, tr;
558
559	/*
560	* This can happen for quite a few reasons. The more obvious
561	* ones are faults accessing the GDT, or LDT. Perhaps
562	* surprisingly, if the CPU tries to deliver a benign or
563	* contributory exception from user code and gets a page fault
564	* during delivery, the page fault can be delivered as though
565	* it originated directly from user code. This could happen
566	* due to wrong permissions on the IDT, GDT, LDT, TSS, or
567	* kernel or IST stack.
568	*/
569	store_idt(dtr: &idt);
570
571	/ Usable even on Xen PV -- it's just slow. /
572	native_store_gdt(dtr: &gdt);
573
574	pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
575	idt.address, idt.size, gdt.address, gdt.size);
576
577	store_ldt(ldtr);
578	show_ldttss(gdt: &gdt, name: "LDTR", index: ldtr);
579
580	store_tr(tr);
581	show_ldttss(gdt: &gdt, name: "TR", index: tr);
582	}
583
584	dump_pagetable(address);
585
586	if (error_code & X86_PF_RMP)
587	snp_dump_hva_rmpentry(address);
588	}
589
590	static noinline void
591	pgtable_bad(struct pt_regs regs, unsigned* long error_code,
592	unsigned long address)
593	{
594	struct task_struct *tsk;
595	unsigned long flags;
596	int sig;
597
598	flags = oops_begin();
599	tsk = current;
600	sig = SIGKILL;
601
602	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
603	tsk->comm, address);
604	dump_pagetable(address);
605
606	if (__die("Bad pagetable", regs, error_code))
607	sig = `0`;
608
609	oops_end(flags, regs, signr: sig);
610	}
611
612	static void sanitize_error_code(unsigned long address,
613	unsigned long *error_code)
614	{
615	/*
616	* To avoid leaking information about the kernel page
617	* table layout, pretend that user-mode accesses to
618	* kernel addresses are always protection faults.
619	*
620	* NB: This means that failed vsyscalls with vsyscall=none
621	* will have the PROT bit. This doesn't leak any
622	* information and does not appear to cause any problems.
623	*/
624	if (address >= TASK_SIZE_MAX)
625	*error_code \|= X86_PF_PROT;
626	}
627
628	static void set_signal_archinfo(unsigned long address,
629	unsigned long error_code)
630	{
631	struct task_struct *tsk = current;
632
633	tsk->thread.trap_nr = X86_TRAP_PF;
634	tsk->thread.error_code = error_code \| X86_PF_USER;
635	tsk->thread.cr2 = address;
636	}
637
638	static noinline void
639	page_fault_oops(struct pt_regs regs, unsigned* long error_code,
640	unsigned long address)
641	{
642	#ifdef CONFIG_VMAP_STACK
643	struct stack_info info;
644	#endif
645	unsigned long flags;
646	int sig;
647
648	if (user_mode(regs)) {
649	/*
650	* Implicit kernel access from user mode? Skip the stack
651	* overflow and EFI special cases.
652	*/
653	goto oops;
654	}
655
656	#ifdef CONFIG_VMAP_STACK
657	/*
658	* Stack overflow? During boot, we can fault near the initial
659	* stack in the direct map, but that's not an overflow -- check
660	* that we're in vmalloc space to avoid this.
661	*/
662	if (is_vmalloc_addr(x: (void *)address) &&
663	get_stack_guard_info(stack: (void *)address, info: &info)) {
664	/*
665	* We're likely to be running with very little stack space
666	* left. It's plausible that we'd hit this condition but
667	* double-fault even before we get this far, in which case
668	* we're fine: the double-fault handler will deal with it.
669	*
670	* We don't want to make it all the way into the oops code
671	* and then double-fault, though, because we're likely to
672	* break the console driver and lose most of the stack dump.
673	*/
674	call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
675	handle_stack_overflow,
676	ASM_CALL_ARG3,
677	, [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
678
679	unreachable();
680	}
681	#endif
682
683	/*
684	* Buggy firmware could access regions which might page fault. If
685	* this happens, EFI has a special OOPS path that will try to
686	* avoid hanging the system.
687	*/
688	if (IS_ENABLED(CONFIG_EFI))
689	efi_crash_gracefully_on_page_fault(phys_addr: address);
690
691	/ Only not-present faults should be handled by KFENCE. /
692	if (!(error_code & X86_PF_PROT) &&
693	kfence_handle_page_fault(addr: address, is_write: error_code & X86_PF_WRITE, regs))
694	return;
695
696	oops:
697	/*
698	* Oops. The kernel tried to access some bad page. We'll have to
699	* terminate things with extreme prejudice:
700	*/
701	flags = oops_begin();
702
703	show_fault_oops(regs, error_code, address);
704
705	if (task_stack_end_corrupted(current))
706	printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
707
708	sig = SIGKILL;
709	if (__die("Oops", regs, error_code))
710	sig = `0`;
711
712	/ Executive summary in case the body of the oops scrolled away /
713	printk(KERN_DEFAULT "CR2: %016lx\n", address);
714
715	oops_end(flags, regs, signr: sig);
716	}
717
718	static noinline void
719	kernelmode_fixup_or_oops(struct pt_regs regs, unsigned* long error_code,
720	unsigned long address, int signal, int si_code,
721	u32 pkey)
722	{
723	WARN_ON_ONCE(user_mode(regs));
724
725	/ Are we prepared to handle this kernel fault? /
726	if (fixup_exception(regs, X86_TRAP_PF, error_code, fault_addr: address)) {
727	/*
728	* Any interrupt that takes a fault gets the fixup. This makes
729	* the below recursive fault logic only apply to a faults from
730	* task context.
731	*/
732	if (in_interrupt())
733	return;
734
735	/*
736	* Per the above we're !in_interrupt(), aka. task context.
737	*
738	* In this case we need to make sure we're not recursively
739	* faulting through the emulate_vsyscall() logic.
740	*/
741	if (current->thread.sig_on_uaccess_err && signal) {
742	sanitize_error_code(address, error_code: &error_code);
743
744	set_signal_archinfo(address, error_code);
745
746	if (si_code == SEGV_PKUERR) {
747	force_sig_pkuerr(addr: (void __user *)address, pkey);
748	} else {
749	/ XXX: hwpoison faults will set the wrong code. /
750	force_sig_fault(sig: signal, code: si_code, addr: (void __user *)address);
751	}
752	}
753
754	/*
755	* Barring that, we can do the fixup and be happy.
756	*/
757	return;
758	}
759
760	/*
761	* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
762	* instruction.
763	*/
764	if (is_prefetch(regs, error_code, addr: address))
765	return;
766
767	page_fault_oops(regs, error_code, address);
768	}
769
770	/*
771	* Print out info about fatal segfaults, if the show_unhandled_signals
772	* sysctl is set:
773	*/
774	static inline void
775	show_signal_msg(struct pt_regs regs, unsigned* long error_code,
776	unsigned long address, struct task_struct *tsk)
777	{
778	const char *loglvl = task_pid_nr(tsk) > `1` ? KERN_INFO : KERN_EMERG;
779	/ This is a racy snapshot, but it's better than nothing. /
780	int cpu = raw_smp_processor_id();
781
782	if (!unhandled_signal(tsk, SIGSEGV))
783	return;
784
785	if (!printk_ratelimit())
786	return;
787
788	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
789	loglvl, tsk->comm, task_pid_nr(tsk), address,
790	(void )regs->ip, (void* *)regs->sp, error_code);
791
792	print_vma_addr(KERN_CONT " in ", rip: regs->ip);
793
794	/*
795	* Dump the likely CPU where the fatal segfault happened.
796	* This can help identify faulty hardware.
797	*/
798	printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
799	topology_core_id(cpu), topology_physical_package_id(cpu));
800
801
802	printk(KERN_CONT "\n");
803
804	show_opcodes(regs, loglvl);
805	}
806
807	static void
808	__bad_area_nosemaphore(struct pt_regs regs, unsigned* long error_code,
809	unsigned long address, u32 pkey, int si_code)
810	{
811	struct task_struct *tsk = current;
812
813	if (!user_mode(regs)) {
814	kernelmode_fixup_or_oops(regs, error_code, address,
815	SIGSEGV, si_code, pkey);
816	return;
817	}
818
819	if (!(error_code & X86_PF_USER)) {
820	/ Implicit user access to kernel memory -- just oops /
821	page_fault_oops(regs, error_code, address);
822	return;
823	}
824
825	/*
826	* User mode accesses just cause a SIGSEGV.
827	* It's possible to have interrupts off here:
828	*/
829	local_irq_enable();
830
831	/*
832	* Valid to do another page fault here because this one came
833	* from user space:
834	*/
835	if (is_prefetch(regs, error_code, addr: address))
836	return;
837
838	if (is_errata100(regs, address))
839	return;
840
841	sanitize_error_code(address, error_code: &error_code);
842
843	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, fault_addr: address))
844	return;
845
846	if (likely(show_unhandled_signals))
847	show_signal_msg(regs, error_code, address, tsk);
848
849	set_signal_archinfo(address, error_code);
850
851	if (si_code == SEGV_PKUERR)
852	force_sig_pkuerr(addr: (void __user *)address, pkey);
853	else
854	force_sig_fault(SIGSEGV, code: si_code, addr: (void __user *)address);
855
856	local_irq_disable();
857	}
858
859	static noinline void
860	bad_area_nosemaphore(struct pt_regs regs, unsigned* long error_code,
861	unsigned long address)
862	{
863	__bad_area_nosemaphore(regs, error_code, address, pkey: `0`, SEGV_MAPERR);
864	}
865
866	static void
867	__bad_area(struct pt_regs regs, unsigned* long error_code,
868	unsigned long address, u32 pkey, int si_code)
869	{
870	struct mm_struct *mm = current->mm;
871	/*
872	* Something tried to access memory that isn't in our memory map..
873	* Fix it, but check if it's kernel or user first..
874	*/
875	mmap_read_unlock(mm);
876
877	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
878	}
879
880	static inline bool bad_area_access_from_pkeys(unsigned long error_code,
881	struct vm_area_struct *vma)
882	{
883	/ This code is always called on the current mm /
884	bool foreign = false;
885
886	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
887	return false;
888	if (error_code & X86_PF_PK)
889	return true;
890	/ this checks permission keys on the VMA: /
891	if (!arch_vma_access_permitted(vma, write: (error_code & X86_PF_WRITE),
892	execute: (error_code & X86_PF_INSTR), foreign))
893	return true;
894	return false;
895	}
896
897	static noinline void
898	bad_area_access_error(struct pt_regs regs, unsigned* long error_code,
899	unsigned long address, struct vm_area_struct *vma)
900	{
901	/*
902	* This OSPKE check is not strictly necessary at runtime.
903	* But, doing it this way allows compiler optimizations
904	* if pkeys are compiled out.
905	*/
906	if (bad_area_access_from_pkeys(error_code, vma)) {
907	/*
908	* A protection key fault means that the PKRU value did not allow
909	* access to some PTE. Userspace can figure out what PKRU was
910	* from the XSAVE state. This function captures the pkey from
911	* the vma and passes it to userspace so userspace can discover
912	* which protection key was set on the PTE.
913	*
914	* If we get here, we know that the hardware signaled a X86_PF_PK
915	* fault and that there was a VMA once we got in the fault
916	* handler. It does not guarantee that the VMA we find here
917	* was the one that we faulted on.
918	*
919	* 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
920	* 2. T1 : set PKRU to deny access to pkey=4, touches page
921	* 3. T1 : faults...
922	* 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
923	* 5. T1 : enters fault handler, takes mmap_lock, etc...
924	* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
925	* faulted on a pte with its pkey=4.
926	*/
927	u32 pkey = vma_pkey(vma);
928
929	__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
930	} else {
931	__bad_area(regs, error_code, address, pkey: `0`, SEGV_ACCERR);
932	}
933	}
934
935	static void
936	do_sigbus(struct pt_regs regs, unsigned* long error_code, unsigned long address,
937	vm_fault_t fault)
938	{
939	/ Kernel mode? Handle exceptions or die: /
940	if (!user_mode(regs)) {
941	kernelmode_fixup_or_oops(regs, error_code, address,
942	SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
943	return;
944	}
945
946	/ User-space => ok to do another page fault: /
947	if (is_prefetch(regs, error_code, addr: address))
948	return;
949
950	sanitize_error_code(address, error_code: &error_code);
951
952	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, fault_addr: address))
953	return;
954
955	set_signal_archinfo(address, error_code);
956
957	#ifdef CONFIG_MEMORY_FAILURE
958	if (fault & (VM_FAULT_HWPOISON\|VM_FAULT_HWPOISON_LARGE)) {
959	struct task_struct *tsk = current;
960	unsigned lsb = `0`;
961
962	pr_err(
963	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
964	tsk->comm, tsk->pid, address);
965	if (fault & VM_FAULT_HWPOISON_LARGE)
966	lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
967	if (fault & VM_FAULT_HWPOISON)
968	lsb = PAGE_SHIFT;
969	force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
970	return;
971	}
972	#endif
973	force_sig_fault(SIGBUS, BUS_ADRERR, addr: (void __user *)address);
974	}
975
976	static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
977	{
978	if ((error_code & X86_PF_WRITE) && !pte_write(pte: *pte))
979	return `0`;
980
981	if ((error_code & X86_PF_INSTR) && !pte_exec(pte: *pte))
982	return `0`;
983
984	return `1`;
985	}
986
987	/*
988	* Handle a spurious fault caused by a stale TLB entry.
989	*
990	* This allows us to lazily refresh the TLB when increasing the
991	* permissions of a kernel page (RO -> RW or NX -> X). Doing it
992	* eagerly is very expensive since that implies doing a full
993	* cross-processor TLB flush, even if no stale TLB entries exist
994	* on other processors.
995	*
996	* Spurious faults may only occur if the TLB contains an entry with
997	* fewer permission than the page table entry. Non-present (P = 0)
998	* and reserved bit (R = 1) faults are never spurious.
999	*
1000	* There are no security implications to leaving a stale TLB when
1001	* increasing the permissions on a page.
1002	*
1003	* Returns non-zero if a spurious fault was handled, zero otherwise.
1004	*
1005	* See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
1006	* (Optional Invalidation).
1007	*/
1008	static noinline int
1009	spurious_kernel_fault(unsigned long error_code, unsigned long address)
1010	{
1011	pgd_t *pgd;
1012	p4d_t *p4d;
1013	pud_t *pud;
1014	pmd_t *pmd;
1015	pte_t *pte;
1016	int ret;
1017
1018	/*
1019	* Only writes to RO or instruction fetches from NX may cause
1020	* spurious faults.
1021	*
1022	* These could be from user or supervisor accesses but the TLB
1023	* is only lazily flushed after a kernel mapping protection
1024	* change, so user accesses are not expected to cause spurious
1025	* faults.
1026	*/
1027	if (error_code != (X86_PF_WRITE \| X86_PF_PROT) &&
1028	error_code != (X86_PF_INSTR \| X86_PF_PROT))
1029	return `0`;
1030
1031	pgd = init_mm.pgd + pgd_index(address);
1032	if (!pgd_present(pgd: *pgd))
1033	return `0`;
1034
1035	p4d = p4d_offset(pgd, address);
1036	if (!p4d_present(p4d: *p4d))
1037	return `0`;
1038
1039	if (p4d_leaf(p4d: *p4d))
1040	return spurious_kernel_fault_check(error_code, pte: (pte_t *) p4d);
1041
1042	pud = pud_offset(p4d, address);
1043	if (!pud_present(pud: *pud))
1044	return `0`;
1045
1046	if (pud_leaf(pud: *pud))
1047	return spurious_kernel_fault_check(error_code, pte: (pte_t *) pud);
1048
1049	pmd = pmd_offset(pud, address);
1050	if (!pmd_present(pmd: *pmd))
1051	return `0`;
1052
1053	if (pmd_leaf(pte: *pmd))
1054	return spurious_kernel_fault_check(error_code, pte: (pte_t *) pmd);
1055
1056	pte = pte_offset_kernel(pmd, address);
1057	if (!pte_present(a: *pte))
1058	return `0`;
1059
1060	ret = spurious_kernel_fault_check(error_code, pte);
1061	if (!ret)
1062	return `0`;
1063
1064	/*
1065	* Make sure we have permissions in PMD.
1066	* If not, then there's a bug in the page tables:
1067	*/
1068	ret = spurious_kernel_fault_check(error_code, pte: (pte_t *) pmd);
1069	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
1070
1071	return ret;
1072	}
1073	NOKPROBE_SYMBOL(spurious_kernel_fault);
1074
/* When non-zero, __bad_area_nosemaphore() logs unhandled user segfaults. */
int show_unhandled_signals = 1;
1076
1077	static inline int
1078	access_error(unsigned long error_code, struct vm_area_struct *vma)
1079	{
1080	/ This is only called for the current mm, so: /
1081	bool foreign = false;
1082
1083	/*
1084	* Read or write was blocked by protection keys. This is
1085	* always an unconditional error and can never result in
1086	* a follow-up action to resolve the fault, like a COW.
1087	*/
1088	if (error_code & X86_PF_PK)
1089	return `1`;
1090
1091	/*
1092	* SGX hardware blocked the access. This usually happens
1093	* when the enclave memory contents have been destroyed, like
1094	* after a suspend/resume cycle. In any case, the kernel can't
1095	* fix the cause of the fault. Handle the fault as an access
1096	* error even in cases where no actual access violation
1097	* occurred. This allows userspace to rebuild the enclave in
1098	* response to the signal.
1099	*/
1100	if (unlikely(error_code & X86_PF_SGX))
1101	return `1`;
1102
1103	/*
1104	* Make sure to check the VMA so that we do not perform
1105	* faults just to hit a X86_PF_PK as soon as we fill in a
1106	* page.
1107	*/
1108	if (!arch_vma_access_permitted(vma, write: (error_code & X86_PF_WRITE),
1109	execute: (error_code & X86_PF_INSTR), foreign))
1110	return `1`;
1111
1112	/*
1113	* Shadow stack accesses (PF_SHSTK=1) are only permitted to
1114	* shadow stack VMAs. All other accesses result in an error.
1115	*/
1116	if (error_code & X86_PF_SHSTK) {
1117	if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK)))
1118	return `1`;
1119	if (unlikely(!(vma->vm_flags & VM_WRITE)))
1120	return `1`;
1121	return `0`;
1122	}
1123
1124	if (error_code & X86_PF_WRITE) {
1125	/ write, present and write, not present: /
1126	if (unlikely(vma->vm_flags & VM_SHADOW_STACK))
1127	return `1`;
1128	if (unlikely(!(vma->vm_flags & VM_WRITE)))
1129	return `1`;
1130	return `0`;
1131	}
1132
1133	/ read, present: /
1134	if (unlikely(error_code & X86_PF_PROT))
1135	return `1`;
1136
1137	/ read, not present: /
1138	if (unlikely(!vma_is_accessible(vma)))
1139	return `1`;
1140
1141	return `0`;
1142	}
1143
1144	bool fault_in_kernel_space(unsigned long address)
1145	{
1146	/*
1147	* On 64-bit systems, the vsyscall page is at an address above
1148	* TASK_SIZE_MAX, but is not considered part of the kernel
1149	* address space.
1150	*/
1151	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(vaddr: address))
1152	return false;
1153
1154	return address >= TASK_SIZE_MAX;
1155	}
1156
1157	/*
1158	* Called for all faults where 'address' is part of the kernel address
1159	* space. Might get called for faults that originate from code that
1160	* ran in userspace or the kernel.
1161	*/
1162	static void
1163	do_kern_addr_fault(struct pt_regs regs, unsigned* long hw_error_code,
1164	unsigned long address)
1165	{
1166	/*
1167	* Protection keys exceptions only happen on user pages. We
1168	* have no user pages in the kernel portion of the address
1169	* space, so do not expect them here.
1170	*/
1171	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1172
1173	#ifdef CONFIG_X86_32
1174	/*
1175	* We can fault-in kernel-space virtual memory on-demand. The
1176	* 'reference' page table is init_mm.pgd.
1177	*
1178	* NOTE! We MUST NOT take any locks for this case. We may
1179	* be in an interrupt or a critical region, and should
1180	* only copy the information from the master page table,
1181	* nothing more.
1182	*
1183	* Before doing this on-demand faulting, ensure that the
1184	* fault is not any of the following:
1185	* 1. A fault on a PTE with a reserved bit set.
1186	* 2. A fault caused by a user-mode access. (Do not demand-
1187	* fault kernel memory due to user-mode accesses).
1188	* 3. A fault caused by a page-level protection violation.
1189	* (A demand fault would be on a non-present page which
1190	* would have X86_PF_PROT==0).
1191	*
1192	* This is only needed to close a race condition on x86-32 in
1193	* the vmalloc mapping/unmapping code. See the comment above
1194	* vmalloc_fault() for details. On x86-64 the race does not
1195	* exist as the vmalloc mappings don't need to be synchronized
1196	* there.
1197	*/
1198	if (!(hw_error_code & (X86_PF_RSVD \| X86_PF_USER \| X86_PF_PROT))) {
1199	if (vmalloc_fault(address) >= `0`)
1200	return;
1201	}
1202	#endif
1203
1204	if (is_f00f_bug(regs, error_code: hw_error_code, address))
1205	return;
1206
1207	/ Was the fault spurious, caused by lazy TLB invalidation? /
1208	if (spurious_kernel_fault(error_code: hw_error_code, address))
1209	return;
1210
1211	/ kprobes don't want to hook the spurious faults: /
1212	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
1213	return;
1214
1215	/*
1216	* Note, despite being a "bad area", there are quite a few
1217	* acceptable reasons to get here, such as erratum fixups
1218	* and handling kernel code that can fault, like get_user().
1219	*
1220	* Don't take the mm semaphore here. If we fixup a prefetch
1221	* fault we could otherwise deadlock:
1222	*/
1223	bad_area_nosemaphore(regs, error_code: hw_error_code, address);
1224	}
1225	NOKPROBE_SYMBOL(do_kern_addr_fault);
1226
1227	/*
1228	* Handle faults in the user portion of the address space. Nothing in here
1229	* should check X86_PF_USER without a specific justification: for almost
1230	* all purposes, we should treat a normal kernel access to user memory
1231	* (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
1232	* The one exception is AC flag handling, which is, per the x86
1233	* architecture, special for WRUSS.
1234	*/
1235	static inline
1236	void do_user_addr_fault(struct pt_regs *regs,
1237	unsigned long error_code,
1238	unsigned long address)
1239	{
1240	struct vm_area_struct *vma;
1241	struct task_struct *tsk;
1242	struct mm_struct *mm;
1243	vm_fault_t fault;
1244	unsigned int flags = FAULT_FLAG_DEFAULT;
1245
1246	tsk = current;
1247	mm = tsk->mm;
1248
1249	if (unlikely((error_code & (X86_PF_USER \| X86_PF_INSTR)) == X86_PF_INSTR)) {
1250	/*
1251	* Whoops, this is kernel mode code trying to execute from
1252	* user memory. Unless this is AMD erratum #93, which
1253	* corrupts RIP such that it looks like a user address,
1254	* this is unrecoverable. Don't even try to look up the
1255	* VMA or look for extable entries.
1256	*/
1257	if (is_errata93(regs, address))
1258	return;
1259
1260	page_fault_oops(regs, error_code, address);
1261	return;
1262	}
1263
1264	/ kprobes don't want to hook the spurious faults: /
1265	if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
1266	return;
1267
1268	/*
1269	* Reserved bits are never expected to be set on
1270	* entries in the user portion of the page tables.
1271	*/
1272	if (unlikely(error_code & X86_PF_RSVD))
1273	pgtable_bad(regs, error_code, address);
1274
1275	/*
1276	* If SMAP is on, check for invalid kernel (supervisor) access to user
1277	* pages in the user address space. The odd case here is WRUSS,
1278	* which, according to the preliminary documentation, does not respect
1279	* SMAP and will have the USER bit set so, in all cases, SMAP
1280	* enforcement appears to be consistent with the USER bit.
1281	*/
1282	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
1283	!(error_code & X86_PF_USER) &&
1284	!(regs->flags & X86_EFLAGS_AC))) {
1285	/*
1286	* No extable entry here. This was a kernel access to an
1287	* invalid pointer. get_kernel_nofault() will not get here.
1288	*/
1289	page_fault_oops(regs, error_code, address);
1290	return;
1291	}
1292
1293	/*
1294	* If we're in an interrupt, have no user context or are running
1295	* in a region with pagefaults disabled then we must not take the fault
1296	*/
1297	if (unlikely(faulthandler_disabled() \|\| !mm)) {
1298	bad_area_nosemaphore(regs, error_code, address);
1299	return;
1300	}
1301
1302	/ Legacy check - remove this after verifying that it doesn't trigger /
1303	if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
1304	bad_area_nosemaphore(regs, error_code, address);
1305	return;
1306	}
1307
1308	local_irq_enable();
1309
1310	perf_sw_event(event_id: PERF_COUNT_SW_PAGE_FAULTS, nr: `1`, regs, addr: address);
1311
1312	/*
1313	* Read-only permissions can not be expressed in shadow stack PTEs.
1314	* Treat all shadow stack accesses as WRITE faults. This ensures
1315	* that the MM will prepare everything (e.g., break COW) such that
1316	* maybe_mkwrite() can create a proper shadow stack PTE.
1317	*/
1318	if (error_code & X86_PF_SHSTK)
1319	flags \|= FAULT_FLAG_WRITE;
1320	if (error_code & X86_PF_WRITE)
1321	flags \|= FAULT_FLAG_WRITE;
1322	if (error_code & X86_PF_INSTR)
1323	flags \|= FAULT_FLAG_INSTRUCTION;
1324
1325	/*
1326	* We set FAULT_FLAG_USER based on the register state, not
1327	* based on X86_PF_USER. User space accesses that cause
1328	* system page faults are still user accesses.
1329	*/
1330	if (user_mode(regs))
1331	flags \|= FAULT_FLAG_USER;
1332
1333	#ifdef CONFIG_X86_64
1334	/*
1335	* Faults in the vsyscall page might need emulation. The
1336	* vsyscall page is at a high address (>PAGE_OFFSET), but is
1337	* considered to be part of the user address space.
1338	*
1339	* The vsyscall page does not have a "real" VMA, so do this
1340	* emulation before we go searching for VMAs.
1341	*
1342	* PKRU never rejects instruction fetches, so we don't need
1343	* to consider the PF_PK bit.
1344	*/
1345	if (is_vsyscall_vaddr(vaddr: address)) {
1346	if (emulate_vsyscall(error_code, regs, address))
1347	return;
1348	}
1349	#endif
1350
1351	if (!(flags & FAULT_FLAG_USER))
1352	goto lock_mmap;
1353
1354	vma = lock_vma_under_rcu(mm, address);
1355	if (!vma)
1356	goto lock_mmap;
1357
1358	if (unlikely(access_error(error_code, vma))) {
1359	vma_end_read(vma);
1360	goto lock_mmap;
1361	}
1362	fault = handle_mm_fault(vma, address, flags: flags \| FAULT_FLAG_VMA_LOCK, regs);
1363	if (!(fault & (VM_FAULT_RETRY \| VM_FAULT_COMPLETED)))
1364	vma_end_read(vma);
1365
1366	if (!(fault & VM_FAULT_RETRY)) {
1367	count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
1368	goto done;
1369	}
1370	count_vm_vma_lock_event(VMA_LOCK_RETRY);
1371	if (fault & VM_FAULT_MAJOR)
1372	flags \|= FAULT_FLAG_TRIED;
1373
1374	/ Quick path to respond to signals /
1375	if (fault_signal_pending(fault_flags: fault, regs)) {
1376	if (!user_mode(regs))
1377	kernelmode_fixup_or_oops(regs, error_code, address,
1378	SIGBUS, BUS_ADRERR,
1379	ARCH_DEFAULT_PKEY);
1380	return;
1381	}
1382	lock_mmap:
1383
1384	retry:
1385	vma = lock_mm_and_find_vma(mm, address, regs);
1386	if (unlikely(!vma)) {
1387	bad_area_nosemaphore(regs, error_code, address);
1388	return;
1389	}
1390
1391	/*
1392	* Ok, we have a good vm_area for this memory access, so
1393	* we can handle it..
1394	*/
1395	if (unlikely(access_error(error_code, vma))) {
1396	bad_area_access_error(regs, error_code, address, vma);
1397	return;
1398	}
1399
1400	/*
1401	* If for any reason at all we couldn't handle the fault,
1402	* make sure we exit gracefully rather than endlessly redo
1403	* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
1404	* we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
1405	*
1406	* Note that handle_userfault() may also release and reacquire mmap_lock
1407	* (and not return with VM_FAULT_RETRY), when returning to userland to
1408	* repeat the page fault later with a VM_FAULT_NOPAGE retval
1409	* (potentially after handling any pending signal during the return to
1410	* userland). The return to userland is identified whenever
1411	* FAULT_FLAG_USER\|FAULT_FLAG_KILLABLE are both set in flags.
1412	*/
1413	fault = handle_mm_fault(vma, address, flags, regs);
1414
1415	if (fault_signal_pending(fault_flags: fault, regs)) {
1416	/*
1417	* Quick path to respond to signals. The core mm code
1418	* has unlocked the mm for us if we get here.
1419	*/
1420	if (!user_mode(regs))
1421	kernelmode_fixup_or_oops(regs, error_code, address,
1422	SIGBUS, BUS_ADRERR,
1423	ARCH_DEFAULT_PKEY);
1424	return;
1425	}
1426
1427	/ The fault is fully completed (including releasing mmap lock) /
1428	if (fault & VM_FAULT_COMPLETED)
1429	return;
1430
1431	/*
1432	* If we need to retry the mmap_lock has already been released,
1433	* and if there is a fatal signal pending there is no guarantee
1434	* that we made any progress. Handle this case first.
1435	*/
1436	if (unlikely(fault & VM_FAULT_RETRY)) {
1437	flags \|= FAULT_FLAG_TRIED;
1438	goto retry;
1439	}
1440
1441	mmap_read_unlock(mm);
1442	done:
1443	if (likely(!(fault & VM_FAULT_ERROR)))
1444	return;
1445
1446	if (fatal_signal_pending(current) && !user_mode(regs)) {
1447	kernelmode_fixup_or_oops(regs, error_code, address,
1448	signal: `0`, si_code: `0`, ARCH_DEFAULT_PKEY);
1449	return;
1450	}
1451
1452	if (fault & VM_FAULT_OOM) {
1453	/ Kernel mode? Handle exceptions or die: /
1454	if (!user_mode(regs)) {
1455	kernelmode_fixup_or_oops(regs, error_code, address,
1456	SIGSEGV, SEGV_MAPERR,
1457	ARCH_DEFAULT_PKEY);
1458	return;
1459	}
1460
1461	/*
1462	* We ran out of memory, call the OOM killer, and return the
1463	* userspace (which will retry the fault, or kill us if we got
1464	* oom-killed):
1465	*/
1466	pagefault_out_of_memory();
1467	} else {
1468	if (fault & (VM_FAULT_SIGBUS\|VM_FAULT_HWPOISON\|
1469	VM_FAULT_HWPOISON_LARGE))
1470	do_sigbus(regs, error_code, address, fault);
1471	else if (fault & VM_FAULT_SIGSEGV)
1472	bad_area_nosemaphore(regs, error_code, address);
1473	else
1474	BUG();
1475	}
1476	}
1477	NOKPROBE_SYMBOL(do_user_addr_fault);
1478
1479	static __always_inline void
1480	trace_page_fault_entries(struct pt_regs regs, unsigned* long error_code,
1481	unsigned long address)
1482	{
1483	if (!trace_pagefault_enabled())
1484	return;
1485
1486	if (user_mode(regs))
1487	trace_page_fault_user(address, regs, error_code);
1488	else
1489	trace_page_fault_kernel(address, regs, error_code);
1490	}
1491
1492	static __always_inline void
1493	handle_page_fault(struct pt_regs regs, unsigned* long error_code,
1494	unsigned long address)
1495	{
1496	trace_page_fault_entries(regs, error_code, address);
1497
1498	if (unlikely(kmmio_fault(regs, address)))
1499	return;
1500
1501	/ Was the fault on kernel-controlled part of the address space? /
1502	if (unlikely(fault_in_kernel_space(address))) {
1503	do_kern_addr_fault(regs, hw_error_code: error_code, address);
1504	} else {
1505	do_user_addr_fault(regs, error_code, address);
1506	/*
1507	* User address page fault handling might have reenabled
1508	* interrupts. Fixing up all potential exit points of
1509	* do_user_addr_fault() and its leaf functions is just not
1510	* doable w/o creating an unholy mess or turning the code
1511	* upside down.
1512	*/
1513	local_irq_disable();
1514	}
1515	}
1516
1517	DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
1518	{
1519	irqentry_state_t state;
1520	unsigned long address;
1521
1522	address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();
1523
1524	prefetchw(x: &current->mm->mmap_lock);
1525
1526	/*
1527	* KVM uses #PF vector to deliver 'page not present' events to guests
1528	* (asynchronous page fault mechanism). The event happens when a
1529	* userspace task is trying to access some valid (from guest's point of
1530	* view) memory which is not currently mapped by the host (e.g. the
1531	* memory is swapped out). Note, the corresponding "page ready" event
1532	* which is injected when the memory becomes available, is delivered via
1533	* an interrupt mechanism and not a #PF exception
1534	* (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
1535	*
1536	* We are relying on the interrupted context being sane (valid RSP,
1537	* relevant locks not held, etc.), which is fine as long as the
1538	* interrupted context had IF=1. We are also relying on the KVM
1539	* async pf type field and CR2 being read consistently instead of
1540	* getting values from real and async page faults mixed up.
1541	*
1542	* Fingers crossed.
1543	*
1544	* The async #PF handling code takes care of idtentry handling
1545	* itself.
1546	*/
1547	if (kvm_handle_async_pf(regs, token: (u32)address))
1548	return;
1549
1550	/*
1551	* Entry handling for valid #PF from kernel mode is slightly
1552	* different: RCU is already watching and ct_irq_enter() must not
1553	* be invoked because a kernel fault on a user space address might
1554	* sleep.
1555	*
1556	* In case the fault hit a RCU idle region the conditional entry
1557	* code reenabled RCU to avoid subsequent wreckage which helps
1558	* debuggability.
1559	*/
1560	state = irqentry_enter(regs);
1561
1562	instrumentation_begin();
1563	handle_page_fault(regs, error_code, address);
1564	instrumentation_end();
1565
1566	irqentry_exit(regs, state);
1567	}
1568

source code of linux/arch/x86/mm/fault.c