1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (C) 1995 Linus Torvalds |
4 | * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. |
5 | * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar |
6 | */ |
7 | #include <linux/sched.h> /* test_thread_flag(), ... */ |
8 | #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ |
9 | #include <linux/kdebug.h> /* oops_begin/end, ... */ |
10 | #include <linux/extable.h> /* search_exception_tables */ |
11 | #include <linux/memblock.h> /* max_low_pfn */ |
12 | #include <linux/kfence.h> /* kfence_handle_page_fault */ |
13 | #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ |
14 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
15 | #include <linux/perf_event.h> /* perf_sw_event */ |
16 | #include <linux/hugetlb.h> /* hstate_index_to_shift */ |
17 | #include <linux/prefetch.h> /* prefetchw */ |
18 | #include <linux/context_tracking.h> /* exception_enter(), ... */ |
19 | #include <linux/uaccess.h> /* faulthandler_disabled() */ |
20 | #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ |
21 | #include <linux/mm_types.h> |
22 | #include <linux/mm.h> /* find_and_lock_vma() */ |
23 | |
24 | #include <asm/cpufeature.h> /* boot_cpu_has, ... */ |
25 | #include <asm/traps.h> /* dotraplinkage, ... */ |
26 | #include <asm/fixmap.h> /* VSYSCALL_ADDR */ |
27 | #include <asm/vsyscall.h> /* emulate_vsyscall */ |
28 | #include <asm/vm86.h> /* struct vm86 */ |
29 | #include <asm/mmu_context.h> /* vma_pkey() */ |
30 | #include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ |
31 | #include <asm/desc.h> /* store_idt(), ... */ |
32 | #include <asm/cpu_entry_area.h> /* exception stack */ |
33 | #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ |
34 | #include <asm/kvm_para.h> /* kvm_handle_async_pf */ |
35 | #include <asm/vdso.h> /* fixup_vdso_exception() */ |
36 | #include <asm/irq_stack.h> |
37 | |
38 | #define CREATE_TRACE_POINTS |
39 | #include <asm/trace/exceptions.h> |
40 | |
41 | /* |
42 | * Returns 0 if mmiotrace is disabled, or if the fault is not |
43 | * handled by mmiotrace: |
44 | */ |
45 | static nokprobe_inline int |
46 | kmmio_fault(struct pt_regs *regs, unsigned long addr) |
47 | { |
48 | if (unlikely(is_kmmio_active())) |
49 | if (kmmio_handler(regs, addr) == 1) |
50 | return -1; |
51 | return 0; |
52 | } |
53 | |
54 | /* |
55 | * Prefetch quirks: |
56 | * |
57 | * 32-bit mode: |
58 | * |
59 | * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. |
60 | * Check that here and ignore it. This is AMD erratum #91. |
61 | * |
62 | * 64-bit mode: |
63 | * |
64 | * Sometimes the CPU reports invalid exceptions on prefetch. |
65 | * Check that here and ignore it. |
66 | * |
67 | * Opcode checker based on code by Richard Brunner. |
68 | */ |
69 | static inline int |
70 | check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, |
71 | unsigned char opcode, int *prefetch) |
72 | { |
73 | unsigned char instr_hi = opcode & 0xf0; |
74 | unsigned char instr_lo = opcode & 0x0f; |
75 | |
76 | switch (instr_hi) { |
77 | case 0x20: |
78 | case 0x30: |
79 | /* |
80 | * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. |
81 | * In X86_64 long mode, the CPU will signal invalid |
82 | * opcode if some of these prefixes are present so |
83 | * X86_64 will never get here anyway |
84 | */ |
85 | return ((instr_lo & 7) == 0x6); |
86 | #ifdef CONFIG_X86_64 |
87 | case 0x40: |
88 | /* |
89 | * In 64-bit mode 0x40..0x4F are valid REX prefixes |
90 | */ |
91 | return (!user_mode(regs) || user_64bit_mode(regs)); |
92 | #endif |
93 | case 0x60: |
94 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ |
95 | return (instr_lo & 0xC) == 0x4; |
96 | case 0xF0: |
97 | /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ |
98 | return !instr_lo || (instr_lo>>1) == 1; |
99 | case 0x00: |
100 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ |
101 | if (get_kernel_nofault(opcode, instr)) |
102 | return 0; |
103 | |
104 | *prefetch = (instr_lo == 0xF) && |
105 | (opcode == 0x0D || opcode == 0x18); |
106 | return 0; |
107 | default: |
108 | return 0; |
109 | } |
110 | } |
111 | |
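/*
 * AMD erratum #91 only affects K8 (family 0xf) parts from before the
 * NPT-capable revisions; the model check below is how those older
 * revisions are identified.
 */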
112 | static bool is_amd_k8_pre_npt(void) |
113 | { |
114 | struct cpuinfo_x86 *c = &boot_cpu_data; |
115 | |
116 | return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && |
117 | c->x86_vendor == X86_VENDOR_AMD && |
118 | c->x86 == 0xf && c->x86_model < 0x40); |
119 | } |
120 | |
121 | static int |
122 | is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) |
123 | { |
124 | unsigned char *max_instr; |
125 | unsigned char *instr; |
126 | int prefetch = 0; |
127 | |
128 | /* Erratum #91 affects AMD K8, pre-NPT CPUs */ |
129 | if (!is_amd_k8_pre_npt()) |
130 | return 0; |
131 | |
132 | /* |
	 * If it was an exec (instruction fetch) fault on an NX page, then
134 | * do not ignore the fault: |
135 | */ |
136 | if (error_code & X86_PF_INSTR) |
137 | return 0; |
138 | |
139 | instr = (void *)convert_ip_to_linear(current, regs); |
140 | max_instr = instr + 15; |
141 | |
142 | /* |
143 | * This code has historically always bailed out if IP points to a |
144 | * not-present page (e.g. due to a race). No one has ever |
145 | * complained about this. |
146 | */ |
147 | pagefault_disable(); |
148 | |
149 | while (instr < max_instr) { |
150 | unsigned char opcode; |
151 | |
152 | if (user_mode(regs)) { |
153 | if (get_user(opcode, (unsigned char __user *) instr)) |
154 | break; |
155 | } else { |
156 | if (get_kernel_nofault(opcode, instr)) |
157 | break; |
158 | } |
159 | |
160 | instr++; |
161 | |
		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
163 | break; |
164 | } |
165 | |
166 | pagefault_enable(); |
167 | return prefetch; |
168 | } |
169 | |
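/*
 * pgd_list tracks the page directories into which kernel mapping changes
 * may need to be propagated (see arch_sync_kernel_mappings() below and
 * the Xen PV code); pgd_lock serializes walks and updates of that list.
 */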
170 | DEFINE_SPINLOCK(pgd_lock); |
171 | LIST_HEAD(pgd_list); |
172 | |
173 | #ifdef CONFIG_X86_32 |
174 | static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) |
175 | { |
176 | unsigned index = pgd_index(address); |
177 | pgd_t *pgd_k; |
178 | p4d_t *p4d, *p4d_k; |
179 | pud_t *pud, *pud_k; |
180 | pmd_t *pmd, *pmd_k; |
181 | |
182 | pgd += index; |
183 | pgd_k = init_mm.pgd + index; |
184 | |
185 | if (!pgd_present(*pgd_k)) |
186 | return NULL; |
187 | |
188 | /* |
189 | * set_pgd(pgd, *pgd_k); here would be useless on PAE |
190 | * and redundant with the set_pmd() on non-PAE. As would |
191 | * set_p4d/set_pud. |
192 | */ |
193 | p4d = p4d_offset(pgd, address); |
194 | p4d_k = p4d_offset(pgd_k, address); |
195 | if (!p4d_present(*p4d_k)) |
196 | return NULL; |
197 | |
198 | pud = pud_offset(p4d, address); |
199 | pud_k = pud_offset(p4d_k, address); |
200 | if (!pud_present(*pud_k)) |
201 | return NULL; |
202 | |
203 | pmd = pmd_offset(pud, address); |
204 | pmd_k = pmd_offset(pud_k, address); |
205 | |
206 | if (pmd_present(*pmd) != pmd_present(*pmd_k)) |
207 | set_pmd(pmd, *pmd_k); |
208 | |
209 | if (!pmd_present(*pmd_k)) |
210 | return NULL; |
211 | else |
212 | BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); |
213 | |
214 | return pmd_k; |
215 | } |
216 | |
217 | /* |
218 | * Handle a fault on the vmalloc or module mapping area |
219 | * |
220 | * This is needed because there is a race condition between the time |
221 | * when the vmalloc mapping code updates the PMD to the point in time |
222 | * where it synchronizes this update with the other page-tables in the |
223 | * system. |
224 | * |
225 | * In this race window another thread/CPU can map an area on the same |
 * PMD, find it already present, and not synchronize it with the
227 | * rest of the system yet. As a result v[mz]alloc might return areas |
228 | * which are not mapped in every page-table in the system, causing an |
229 | * unhandled page-fault when they are accessed. |
230 | */ |
231 | static noinline int vmalloc_fault(unsigned long address) |
232 | { |
233 | unsigned long pgd_paddr; |
234 | pmd_t *pmd_k; |
235 | pte_t *pte_k; |
236 | |
237 | /* Make sure we are in vmalloc area: */ |
238 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
239 | return -1; |
240 | |
241 | /* |
242 | * Synchronize this task's top level page-table |
243 | * with the 'reference' page table. |
244 | * |
245 | * Do _not_ use "current" here. We might be inside |
246 | * an interrupt in the middle of a task switch.. |
247 | */ |
248 | pgd_paddr = read_cr3_pa(); |
249 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); |
250 | if (!pmd_k) |
251 | return -1; |
252 | |
253 | if (pmd_large(*pmd_k)) |
254 | return 0; |
255 | |
256 | pte_k = pte_offset_kernel(pmd_k, address); |
257 | if (!pte_present(*pte_k)) |
258 | return -1; |
259 | |
260 | return 0; |
261 | } |
262 | NOKPROBE_SYMBOL(vmalloc_fault); |
263 | |
264 | void arch_sync_kernel_mappings(unsigned long start, unsigned long end) |
265 | { |
266 | unsigned long addr; |
267 | |
268 | for (addr = start & PMD_MASK; |
269 | addr >= TASK_SIZE_MAX && addr < VMALLOC_END; |
270 | addr += PMD_SIZE) { |
271 | struct page *page; |
272 | |
273 | spin_lock(&pgd_lock); |
274 | list_for_each_entry(page, &pgd_list, lru) { |
275 | spinlock_t *pgt_lock; |
276 | |
277 | /* the pgt_lock only for Xen */ |
278 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; |
279 | |
280 | spin_lock(pgt_lock); |
281 | vmalloc_sync_one(page_address(page), addr); |
282 | spin_unlock(pgt_lock); |
283 | } |
284 | spin_unlock(&pgd_lock); |
285 | } |
286 | } |
287 | |
288 | static bool low_pfn(unsigned long pfn) |
289 | { |
290 | return pfn < max_low_pfn; |
291 | } |
292 | |
293 | static void dump_pagetable(unsigned long address) |
294 | { |
295 | pgd_t *base = __va(read_cr3_pa()); |
296 | pgd_t *pgd = &base[pgd_index(address)]; |
297 | p4d_t *p4d; |
298 | pud_t *pud; |
299 | pmd_t *pmd; |
300 | pte_t *pte; |
301 | |
302 | #ifdef CONFIG_X86_PAE |
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
304 | if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) |
305 | goto out; |
306 | #define pr_pde pr_cont |
307 | #else |
308 | #define pr_pde pr_info |
309 | #endif |
310 | p4d = p4d_offset(pgd, address); |
311 | pud = pud_offset(p4d, address); |
312 | pmd = pmd_offset(pud, address); |
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
314 | #undef pr_pde |
315 | |
316 | /* |
317 | * We must not directly access the pte in the highpte |
318 | * case if the page table is located in highmem. |
319 | * And let's rather not kmap-atomic the pte, just in case |
320 | * it's allocated already: |
321 | */ |
322 | if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) |
323 | goto out; |
324 | |
325 | pte = pte_offset_kernel(pmd, address); |
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
329 | } |
330 | |
331 | #else /* CONFIG_X86_64: */ |
332 | |
333 | #ifdef CONFIG_CPU_SUP_AMD |
334 | static const char errata93_warning[] = |
335 | KERN_ERR |
336 | "******* Your BIOS seems to not contain a fix for K8 errata #93\n" |
337 | "******* Working around it, but it may cause SEGVs or burn power.\n" |
338 | "******* Please consider a BIOS update.\n" |
"******* Disabling USB legacy in the BIOS may also help.\n";
340 | #endif |
341 | |
342 | static int bad_address(void *p) |
343 | { |
344 | unsigned long dummy; |
345 | |
346 | return get_kernel_nofault(dummy, (unsigned long *)p); |
347 | } |
348 | |
349 | static void dump_pagetable(unsigned long address) |
350 | { |
351 | pgd_t *base = __va(read_cr3_pa()); |
352 | pgd_t *pgd = base + pgd_index(address); |
353 | p4d_t *p4d; |
354 | pud_t *pud; |
355 | pmd_t *pmd; |
356 | pte_t *pte; |
357 | |
	if (bad_address(pgd))
		goto bad;

	pr_info("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (bad_address(p4d))
		goto bad;

	pr_cont("P4D %lx ", p4d_val(*p4d));
	if (!p4d_present(*p4d) || p4d_large(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (bad_address(pud))
		goto bad;

	pr_cont("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	pr_cont("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	pr_cont("PTE %lx", pte_val(*pte));
out:
	pr_cont("\n");
	return;
bad:
	pr_info("BAD\n");
400 | } |
401 | |
402 | #endif /* CONFIG_X86_64 */ |
403 | |
404 | /* |
405 | * Workaround for K8 erratum #93 & buggy BIOS. |
406 | * |
407 | * BIOS SMM functions are required to use a specific workaround |
408 | * to avoid corruption of the 64bit RIP register on C stepping K8. |
409 | * |
 * A lot of BIOSes that didn't get tested properly miss this.
411 | * |
412 | * The OS sees this as a page fault with the upper 32bits of RIP cleared. |
413 | * Try to work around it here. |
414 | * |
415 | * Note we only handle faults in kernel here. |
416 | * Does nothing on 32-bit. |
417 | */ |
418 | static int is_errata93(struct pt_regs *regs, unsigned long address) |
419 | { |
420 | #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) |
421 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD |
422 | || boot_cpu_data.x86 != 0xf) |
423 | return 0; |
424 | |
425 | if (user_mode(regs)) |
426 | return 0; |
427 | |
428 | if (address != regs->ip) |
429 | return 0; |
430 | |
431 | if ((address >> 32) != 0) |
432 | return 0; |
433 | |
434 | address |= 0xffffffffUL << 32; |
435 | if ((address >= (u64)_stext && address <= (u64)_etext) || |
436 | (address >= MODULES_VADDR && address <= MODULES_END)) { |
437 | printk_once(errata93_warning); |
438 | regs->ip = address; |
439 | return 1; |
440 | } |
441 | #endif |
442 | return 0; |
443 | } |
444 | |
445 | /* |
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
447 | * to illegal addresses >4GB. |
448 | * |
449 | * We catch this in the page fault handler because these addresses |
450 | * are not reachable. Just detect this case and return. Any code |
451 | * segment in LDT is compatibility mode. |
452 | */ |
453 | static int is_errata100(struct pt_regs *regs, unsigned long address) |
454 | { |
455 | #ifdef CONFIG_X86_64 |
456 | if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) |
457 | return 1; |
458 | #endif |
459 | return 0; |
460 | } |
461 | |
462 | /* Pentium F0 0F C7 C8 bug workaround: */ |
463 | static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, |
464 | unsigned long address) |
465 | { |
466 | #ifdef CONFIG_X86_F00F_BUG |
467 | if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && |
468 | idt_is_f00f_address(address)) { |
469 | handle_invalid_op(regs); |
470 | return 1; |
471 | } |
472 | #endif |
473 | return 0; |
474 | } |
475 | |
476 | static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) |
477 | { |
478 | u32 offset = (index >> 3) * sizeof(struct desc_struct); |
479 | unsigned long addr; |
480 | struct ldttss_desc desc; |
481 | |
	if (index == 0) {
		pr_alert("%s: NULL\n", name);
		return;
	}

	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
		return;
	}

	if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
				     sizeof(struct ldttss_desc))) {
		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
			 name, index);
		return;
	}

	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
	addr |= ((u64)desc.base3 << 32);
#endif
	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
505 | } |
506 | |
507 | static void |
508 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) |
509 | { |
510 | if (!oops_may_print()) |
511 | return; |
512 | |
513 | if (error_code & X86_PF_INSTR) { |
514 | unsigned int level; |
515 | pgd_t *pgd; |
516 | pte_t *pte; |
517 | |
518 | pgd = __va(read_cr3_pa()); |
519 | pgd += pgd_index(address); |
520 | |
		pte = lookup_address_in_pgd(pgd, address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
				(pgd_flags(*pgd) & _PAGE_USER) &&
				(__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
531 | } |
532 | |
	if (address < PAGE_SIZE && !user_mode(regs))
		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
			(void *)address);
	else
		pr_alert("BUG: unable to handle page fault for address: %px\n",
			(void *)address);

	pr_alert("#PF: %s %s in %s mode\n",
		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
		 (error_code & X86_PF_WRITE) ? "write access" :
					       "read access",
			     user_mode(regs) ? "user" : "kernel");
	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
		 !(error_code & X86_PF_PROT) ? "not-present page" :
		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
		 (error_code & X86_PF_PK)    ? "protection keys violation" :
					       "permissions violation");
551 | |
552 | if (!(error_code & X86_PF_USER) && user_mode(regs)) { |
553 | struct desc_ptr idt, gdt; |
554 | u16 ldtr, tr; |
555 | |
556 | /* |
557 | * This can happen for quite a few reasons. The more obvious |
558 | * ones are faults accessing the GDT, or LDT. Perhaps |
559 | * surprisingly, if the CPU tries to deliver a benign or |
560 | * contributory exception from user code and gets a page fault |
561 | * during delivery, the page fault can be delivered as though |
562 | * it originated directly from user code. This could happen |
563 | * due to wrong permissions on the IDT, GDT, LDT, TSS, or |
564 | * kernel or IST stack. |
565 | */ |
		store_idt(&idt);

		/* Usable even on Xen PV -- it's just slow. */
		native_store_gdt(&gdt);

		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
			 idt.address, idt.size, gdt.address, gdt.size);

		store_ldt(ldtr);
		show_ldttss(&gdt, "LDTR", ldtr);

		store_tr(tr);
		show_ldttss(&gdt, "TR", tr);
579 | } |
580 | |
581 | dump_pagetable(address); |
582 | } |
583 | |
584 | static noinline void |
585 | pgtable_bad(struct pt_regs *regs, unsigned long error_code, |
586 | unsigned long address) |
587 | { |
588 | struct task_struct *tsk; |
589 | unsigned long flags; |
590 | int sig; |
591 | |
592 | flags = oops_begin(); |
593 | tsk = current; |
594 | sig = SIGKILL; |
595 | |
	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
604 | } |
605 | |
606 | static void sanitize_error_code(unsigned long address, |
607 | unsigned long *error_code) |
608 | { |
609 | /* |
610 | * To avoid leaking information about the kernel page |
611 | * table layout, pretend that user-mode accesses to |
612 | * kernel addresses are always protection faults. |
613 | * |
614 | * NB: This means that failed vsyscalls with vsyscall=none |
615 | * will have the PROT bit. This doesn't leak any |
616 | * information and does not appear to cause any problems. |
617 | */ |
618 | if (address >= TASK_SIZE_MAX) |
619 | *error_code |= X86_PF_PROT; |
620 | } |
621 | |
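/*
 * Record the fault details in the task so the signal delivery code can
 * fill in siginfo.  X86_PF_USER is always set in the recorded error
 * code because the fault is being reported to user space via a signal.
 */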
622 | static void set_signal_archinfo(unsigned long address, |
623 | unsigned long error_code) |
624 | { |
625 | struct task_struct *tsk = current; |
626 | |
627 | tsk->thread.trap_nr = X86_TRAP_PF; |
628 | tsk->thread.error_code = error_code | X86_PF_USER; |
629 | tsk->thread.cr2 = address; |
630 | } |
631 | |
632 | static noinline void |
633 | page_fault_oops(struct pt_regs *regs, unsigned long error_code, |
634 | unsigned long address) |
635 | { |
636 | #ifdef CONFIG_VMAP_STACK |
637 | struct stack_info info; |
638 | #endif |
639 | unsigned long flags; |
640 | int sig; |
641 | |
642 | if (user_mode(regs)) { |
643 | /* |
644 | * Implicit kernel access from user mode? Skip the stack |
645 | * overflow and EFI special cases. |
646 | */ |
647 | goto oops; |
648 | } |
649 | |
650 | #ifdef CONFIG_VMAP_STACK |
651 | /* |
652 | * Stack overflow? During boot, we can fault near the initial |
653 | * stack in the direct map, but that's not an overflow -- check |
654 | * that we're in vmalloc space to avoid this. |
655 | */ |
	if (is_vmalloc_addr((void *)address) &&
	    get_stack_guard_info((void *)address, &info)) {
658 | /* |
659 | * We're likely to be running with very little stack space |
660 | * left. It's plausible that we'd hit this condition but |
661 | * double-fault even before we get this far, in which case |
662 | * we're fine: the double-fault handler will deal with it. |
663 | * |
664 | * We don't want to make it all the way into the oops code |
665 | * and then double-fault, though, because we're likely to |
666 | * break the console driver and lose most of the stack dump. |
667 | */ |
668 | call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), |
669 | handle_stack_overflow, |
670 | ASM_CALL_ARG3, |
671 | , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); |
672 | |
673 | unreachable(); |
674 | } |
675 | #endif |
676 | |
677 | /* |
678 | * Buggy firmware could access regions which might page fault. If |
679 | * this happens, EFI has a special OOPS path that will try to |
680 | * avoid hanging the system. |
681 | */ |
	if (IS_ENABLED(CONFIG_EFI))
		efi_crash_gracefully_on_page_fault(address);

	/* Only not-present faults should be handled by KFENCE. */
	if (!(error_code & X86_PF_PROT) &&
	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
		return;
689 | |
690 | oops: |
691 | /* |
692 | * Oops. The kernel tried to access some bad page. We'll have to |
693 | * terminate things with extreme prejudice: |
694 | */ |
695 | flags = oops_begin(); |
696 | |
697 | show_fault_oops(regs, error_code, address); |
698 | |
	if (task_stack_end_corrupted(current))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
710 | } |
711 | |
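/*
 * Handle a page fault taken in kernel mode when no user-space recovery is
 * possible: try the exception-table fixups first, and oops when neither a
 * fixup nor the AMD prefetch erratum (#91) applies.
 */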
712 | static noinline void |
713 | kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, |
714 | unsigned long address, int signal, int si_code, |
715 | u32 pkey) |
716 | { |
717 | WARN_ON_ONCE(user_mode(regs)); |
718 | |
719 | /* Are we prepared to handle this kernel fault? */ |
	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
721 | /* |
722 | * Any interrupt that takes a fault gets the fixup. This makes |
		 * the below recursive fault logic apply only to faults from
724 | * task context. |
725 | */ |
726 | if (in_interrupt()) |
727 | return; |
728 | |
729 | /* |
730 | * Per the above we're !in_interrupt(), aka. task context. |
731 | * |
732 | * In this case we need to make sure we're not recursively |
733 | * faulting through the emulate_vsyscall() logic. |
734 | */ |
735 | if (current->thread.sig_on_uaccess_err && signal) { |
			sanitize_error_code(address, &error_code);

			set_signal_archinfo(address, error_code);

			if (si_code == SEGV_PKUERR) {
				force_sig_pkuerr((void __user *)address, pkey);
			} else {
				/* XXX: hwpoison faults will set the wrong code. */
				force_sig_fault(signal, si_code, (void __user *)address);
745 | } |
746 | } |
747 | |
748 | /* |
749 | * Barring that, we can do the fixup and be happy. |
750 | */ |
751 | return; |
752 | } |
753 | |
754 | /* |
755 | * AMD erratum #91 manifests as a spurious page fault on a PREFETCH |
756 | * instruction. |
757 | */ |
	if (is_prefetch(regs, error_code, address))
759 | return; |
760 | |
761 | page_fault_oops(regs, error_code, address); |
762 | } |
763 | |
764 | /* |
765 | * Print out info about fatal segfaults, if the show_unhandled_signals |
766 | * sysctl is set: |
767 | */ |
768 | static inline void |
769 | show_signal_msg(struct pt_regs *regs, unsigned long error_code, |
770 | unsigned long address, struct task_struct *tsk) |
771 | { |
772 | const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; |
773 | /* This is a racy snapshot, but it's better than nothing. */ |
774 | int cpu = raw_smp_processor_id(); |
775 | |
776 | if (!unhandled_signal(tsk, SIGSEGV)) |
777 | return; |
778 | |
779 | if (!printk_ratelimit()) |
780 | return; |
781 | |
	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
		loglvl, tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);
787 | |
788 | /* |
789 | * Dump the likely CPU where the fatal segfault happened. |
790 | * This can help identify faulty hardware. |
791 | */ |
	printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
		topology_core_id(cpu), topology_physical_package_id(cpu));

	printk(KERN_CONT "\n");
797 | |
798 | show_opcodes(regs, loglvl); |
799 | } |
800 | |
801 | /* |
 * The (legacy) vsyscall page is the only page in the kernel portion
 * of the address space that has user-accessible permissions.
804 | */ |
805 | static bool is_vsyscall_vaddr(unsigned long vaddr) |
806 | { |
807 | return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); |
808 | } |
809 | |
810 | static void |
811 | __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, |
812 | unsigned long address, u32 pkey, int si_code) |
813 | { |
814 | struct task_struct *tsk = current; |
815 | |
816 | if (!user_mode(regs)) { |
817 | kernelmode_fixup_or_oops(regs, error_code, address, |
818 | SIGSEGV, si_code, pkey); |
819 | return; |
820 | } |
821 | |
822 | if (!(error_code & X86_PF_USER)) { |
823 | /* Implicit user access to kernel memory -- just oops */ |
824 | page_fault_oops(regs, error_code, address); |
825 | return; |
826 | } |
827 | |
828 | /* |
829 | * User mode accesses just cause a SIGSEGV. |
830 | * It's possible to have interrupts off here: |
831 | */ |
832 | local_irq_enable(); |
833 | |
834 | /* |
835 | * Valid to do another page fault here because this one came |
836 | * from user space: |
837 | */ |
	if (is_prefetch(regs, error_code, address))
839 | return; |
840 | |
841 | if (is_errata100(regs, address)) |
842 | return; |
843 | |
	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
847 | return; |
848 | |
849 | if (likely(show_unhandled_signals)) |
850 | show_signal_msg(regs, error_code, address, tsk); |
851 | |
852 | set_signal_archinfo(address, error_code); |
853 | |
	if (si_code == SEGV_PKUERR)
		force_sig_pkuerr((void __user *)address, pkey);
	else
		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
858 | |
859 | local_irq_disable(); |
860 | } |
861 | |
862 | static noinline void |
863 | bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, |
864 | unsigned long address) |
865 | { |
	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
867 | } |
868 | |
869 | static void |
870 | __bad_area(struct pt_regs *regs, unsigned long error_code, |
871 | unsigned long address, u32 pkey, int si_code) |
872 | { |
873 | struct mm_struct *mm = current->mm; |
874 | /* |
875 | * Something tried to access memory that isn't in our memory map.. |
876 | * Fix it, but check if it's kernel or user first.. |
877 | */ |
878 | mmap_read_unlock(mm); |
879 | |
880 | __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); |
881 | } |
882 | |
883 | static inline bool bad_area_access_from_pkeys(unsigned long error_code, |
884 | struct vm_area_struct *vma) |
885 | { |
886 | /* This code is always called on the current mm */ |
887 | bool foreign = false; |
888 | |
889 | if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) |
890 | return false; |
891 | if (error_code & X86_PF_PK) |
892 | return true; |
893 | /* this checks permission keys on the VMA: */ |
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
896 | return true; |
897 | return false; |
898 | } |
899 | |
900 | static noinline void |
901 | bad_area_access_error(struct pt_regs *regs, unsigned long error_code, |
902 | unsigned long address, struct vm_area_struct *vma) |
903 | { |
904 | /* |
905 | * This OSPKE check is not strictly necessary at runtime. |
906 | * But, doing it this way allows compiler optimizations |
907 | * if pkeys are compiled out. |
908 | */ |
909 | if (bad_area_access_from_pkeys(error_code, vma)) { |
910 | /* |
911 | * A protection key fault means that the PKRU value did not allow |
912 | * access to some PTE. Userspace can figure out what PKRU was |
913 | * from the XSAVE state. This function captures the pkey from |
914 | * the vma and passes it to userspace so userspace can discover |
915 | * which protection key was set on the PTE. |
916 | * |
917 | * If we get here, we know that the hardware signaled a X86_PF_PK |
918 | * fault and that there was a VMA once we got in the fault |
919 | * handler. It does *not* guarantee that the VMA we find here |
920 | * was the one that we faulted on. |
921 | * |
922 | * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); |
923 | * 2. T1 : set PKRU to deny access to pkey=4, touches page |
924 | * 3. T1 : faults... |
925 | * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); |
926 | * 5. T1 : enters fault handler, takes mmap_lock, etc... |
927 | * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really |
928 | * faulted on a pte with its pkey=4. |
929 | */ |
930 | u32 pkey = vma_pkey(vma); |
931 | |
932 | __bad_area(regs, error_code, address, pkey, SEGV_PKUERR); |
933 | } else { |
		__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
935 | } |
936 | } |
937 | |
938 | static void |
939 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, |
940 | vm_fault_t fault) |
941 | { |
942 | /* Kernel mode? Handle exceptions or die: */ |
943 | if (!user_mode(regs)) { |
944 | kernelmode_fixup_or_oops(regs, error_code, address, |
945 | SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); |
946 | return; |
947 | } |
948 | |
949 | /* User-space => ok to do another page fault: */ |
	if (is_prefetch(regs, error_code, address))
		return;

	sanitize_error_code(address, &error_code);

	if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
956 | return; |
957 | |
958 | set_signal_archinfo(address, error_code); |
959 | |
960 | #ifdef CONFIG_MEMORY_FAILURE |
961 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { |
962 | struct task_struct *tsk = current; |
963 | unsigned lsb = 0; |
964 | |
		pr_err(
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
968 | if (fault & VM_FAULT_HWPOISON_LARGE) |
969 | lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); |
970 | if (fault & VM_FAULT_HWPOISON) |
971 | lsb = PAGE_SHIFT; |
972 | force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); |
973 | return; |
974 | } |
975 | #endif |
	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
977 | } |
978 | |
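/*
 * Check whether the access that faulted would now be permitted by the
 * given page table entry; used by spurious_kernel_fault() below for the
 * PTE as well as for huge P4D/PUD/PMD entries.
 */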
979 | static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) |
980 | { |
	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
985 | return 0; |
986 | |
987 | return 1; |
988 | } |
989 | |
990 | /* |
991 | * Handle a spurious fault caused by a stale TLB entry. |
992 | * |
993 | * This allows us to lazily refresh the TLB when increasing the |
994 | * permissions of a kernel page (RO -> RW or NX -> X). Doing it |
995 | * eagerly is very expensive since that implies doing a full |
996 | * cross-processor TLB flush, even if no stale TLB entries exist |
997 | * on other processors. |
998 | * |
999 | * Spurious faults may only occur if the TLB contains an entry with |
 * fewer permissions than the page table entry. Non-present (P = 0)
1001 | * and reserved bit (R = 1) faults are never spurious. |
1002 | * |
1003 | * There are no security implications to leaving a stale TLB when |
1004 | * increasing the permissions on a page. |
1005 | * |
1006 | * Returns non-zero if a spurious fault was handled, zero otherwise. |
1007 | * |
1008 | * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 |
1009 | * (Optional Invalidation). |
1010 | */ |
1011 | static noinline int |
1012 | spurious_kernel_fault(unsigned long error_code, unsigned long address) |
1013 | { |
1014 | pgd_t *pgd; |
1015 | p4d_t *p4d; |
1016 | pud_t *pud; |
1017 | pmd_t *pmd; |
1018 | pte_t *pte; |
1019 | int ret; |
1020 | |
1021 | /* |
1022 | * Only writes to RO or instruction fetches from NX may cause |
1023 | * spurious faults. |
1024 | * |
1025 | * These could be from user or supervisor accesses but the TLB |
1026 | * is only lazily flushed after a kernel mapping protection |
1027 | * change, so user accesses are not expected to cause spurious |
1028 | * faults. |
1029 | */ |
1030 | if (error_code != (X86_PF_WRITE | X86_PF_PROT) && |
1031 | error_code != (X86_PF_INSTR | X86_PF_PROT)) |
1032 | return 0; |
1033 | |
	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_large(*p4d))
		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_kernel_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_kernel_fault_check(error_code, pte);
1064 | if (!ret) |
1065 | return 0; |
1066 | |
1067 | /* |
1068 | * Make sure we have permissions in PMD. |
1069 | * If not, then there's a bug in the page tables: |
1070 | */ |
	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
1073 | |
1074 | return ret; |
1075 | } |
1076 | NOKPROBE_SYMBOL(spurious_kernel_fault); |
1077 | |
1078 | int show_unhandled_signals = 1; |
1079 | |
1080 | static inline int |
1081 | access_error(unsigned long error_code, struct vm_area_struct *vma) |
1082 | { |
1083 | /* This is only called for the current mm, so: */ |
1084 | bool foreign = false; |
1085 | |
1086 | /* |
1087 | * Read or write was blocked by protection keys. This is |
1088 | * always an unconditional error and can never result in |
1089 | * a follow-up action to resolve the fault, like a COW. |
1090 | */ |
1091 | if (error_code & X86_PF_PK) |
1092 | return 1; |
1093 | |
1094 | /* |
1095 | * SGX hardware blocked the access. This usually happens |
1096 | * when the enclave memory contents have been destroyed, like |
1097 | * after a suspend/resume cycle. In any case, the kernel can't |
1098 | * fix the cause of the fault. Handle the fault as an access |
1099 | * error even in cases where no actual access violation |
1100 | * occurred. This allows userspace to rebuild the enclave in |
1101 | * response to the signal. |
1102 | */ |
1103 | if (unlikely(error_code & X86_PF_SGX)) |
1104 | return 1; |
1105 | |
1106 | /* |
1107 | * Make sure to check the VMA so that we do not perform |
1108 | * faults just to hit a X86_PF_PK as soon as we fill in a |
1109 | * page. |
1110 | */ |
	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				       (error_code & X86_PF_INSTR), foreign))
1113 | return 1; |
1114 | |
1115 | /* |
1116 | * Shadow stack accesses (PF_SHSTK=1) are only permitted to |
1117 | * shadow stack VMAs. All other accesses result in an error. |
1118 | */ |
1119 | if (error_code & X86_PF_SHSTK) { |
1120 | if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) |
1121 | return 1; |
1122 | if (unlikely(!(vma->vm_flags & VM_WRITE))) |
1123 | return 1; |
1124 | return 0; |
1125 | } |
1126 | |
1127 | if (error_code & X86_PF_WRITE) { |
1128 | /* write, present and write, not present: */ |
1129 | if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) |
1130 | return 1; |
1131 | if (unlikely(!(vma->vm_flags & VM_WRITE))) |
1132 | return 1; |
1133 | return 0; |
1134 | } |
1135 | |
1136 | /* read, present: */ |
1137 | if (unlikely(error_code & X86_PF_PROT)) |
1138 | return 1; |
1139 | |
1140 | /* read, not present: */ |
1141 | if (unlikely(!vma_is_accessible(vma))) |
1142 | return 1; |
1143 | |
1144 | return 0; |
1145 | } |
1146 | |
1147 | bool fault_in_kernel_space(unsigned long address) |
1148 | { |
1149 | /* |
1150 | * On 64-bit systems, the vsyscall page is at an address above |
1151 | * TASK_SIZE_MAX, but is not considered part of the kernel |
1152 | * address space. |
1153 | */ |
	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
1155 | return false; |
1156 | |
1157 | return address >= TASK_SIZE_MAX; |
1158 | } |
1159 | |
1160 | /* |
1161 | * Called for all faults where 'address' is part of the kernel address |
1162 | * space. Might get called for faults that originate from *code* that |
1163 | * ran in userspace or the kernel. |
1164 | */ |
1165 | static void |
1166 | do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, |
1167 | unsigned long address) |
1168 | { |
1169 | /* |
1170 | * Protection keys exceptions only happen on user pages. We |
1171 | * have no user pages in the kernel portion of the address |
1172 | * space, so do not expect them here. |
1173 | */ |
1174 | WARN_ON_ONCE(hw_error_code & X86_PF_PK); |
1175 | |
1176 | #ifdef CONFIG_X86_32 |
1177 | /* |
1178 | * We can fault-in kernel-space virtual memory on-demand. The |
1179 | * 'reference' page table is init_mm.pgd. |
1180 | * |
1181 | * NOTE! We MUST NOT take any locks for this case. We may |
1182 | * be in an interrupt or a critical region, and should |
1183 | * only copy the information from the master page table, |
1184 | * nothing more. |
1185 | * |
1186 | * Before doing this on-demand faulting, ensure that the |
1187 | * fault is not any of the following: |
1188 | * 1. A fault on a PTE with a reserved bit set. |
1189 | * 2. A fault caused by a user-mode access. (Do not demand- |
1190 | * fault kernel memory due to user-mode accesses). |
1191 | * 3. A fault caused by a page-level protection violation. |
1192 | * (A demand fault would be on a non-present page which |
1193 | * would have X86_PF_PROT==0). |
1194 | * |
1195 | * This is only needed to close a race condition on x86-32 in |
1196 | * the vmalloc mapping/unmapping code. See the comment above |
1197 | * vmalloc_fault() for details. On x86-64 the race does not |
1198 | * exist as the vmalloc mappings don't need to be synchronized |
1199 | * there. |
1200 | */ |
1201 | if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { |
1202 | if (vmalloc_fault(address) >= 0) |
1203 | return; |
1204 | } |
1205 | #endif |
1206 | |
	if (is_f00f_bug(regs, hw_error_code, address))
1208 | return; |
1209 | |
1210 | /* Was the fault spurious, caused by lazy TLB invalidation? */ |
	if (spurious_kernel_fault(hw_error_code, address))
1212 | return; |
1213 | |
1214 | /* kprobes don't want to hook the spurious faults: */ |
1215 | if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) |
1216 | return; |
1217 | |
1218 | /* |
1219 | * Note, despite being a "bad area", there are quite a few |
1220 | * acceptable reasons to get here, such as erratum fixups |
1221 | * and handling kernel code that can fault, like get_user(). |
1222 | * |
1223 | * Don't take the mm semaphore here. If we fixup a prefetch |
1224 | * fault we could otherwise deadlock: |
1225 | */ |
	bad_area_nosemaphore(regs, hw_error_code, address);
1227 | } |
1228 | NOKPROBE_SYMBOL(do_kern_addr_fault); |
1229 | |
1230 | /* |
1231 | * Handle faults in the user portion of the address space. Nothing in here |
1232 | * should check X86_PF_USER without a specific justification: for almost |
1233 | * all purposes, we should treat a normal kernel access to user memory |
1234 | * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. |
1235 | * The one exception is AC flag handling, which is, per the x86 |
1236 | * architecture, special for WRUSS. |
1237 | */ |
1238 | static inline |
1239 | void do_user_addr_fault(struct pt_regs *regs, |
1240 | unsigned long error_code, |
1241 | unsigned long address) |
1242 | { |
1243 | struct vm_area_struct *vma; |
1244 | struct task_struct *tsk; |
1245 | struct mm_struct *mm; |
1246 | vm_fault_t fault; |
1247 | unsigned int flags = FAULT_FLAG_DEFAULT; |
1248 | |
1249 | tsk = current; |
1250 | mm = tsk->mm; |
1251 | |
1252 | if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { |
1253 | /* |
1254 | * Whoops, this is kernel mode code trying to execute from |
1255 | * user memory. Unless this is AMD erratum #93, which |
1256 | * corrupts RIP such that it looks like a user address, |
1257 | * this is unrecoverable. Don't even try to look up the |
1258 | * VMA or look for extable entries. |
1259 | */ |
1260 | if (is_errata93(regs, address)) |
1261 | return; |
1262 | |
1263 | page_fault_oops(regs, error_code, address); |
1264 | return; |
1265 | } |
1266 | |
1267 | /* kprobes don't want to hook the spurious faults: */ |
1268 | if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) |
1269 | return; |
1270 | |
1271 | /* |
1272 | * Reserved bits are never expected to be set on |
1273 | * entries in the user portion of the page tables. |
1274 | */ |
1275 | if (unlikely(error_code & X86_PF_RSVD)) |
1276 | pgtable_bad(regs, error_code, address); |
1277 | |
1278 | /* |
1279 | * If SMAP is on, check for invalid kernel (supervisor) access to user |
1280 | * pages in the user address space. The odd case here is WRUSS, |
1281 | * which, according to the preliminary documentation, does not respect |
1282 | * SMAP and will have the USER bit set so, in all cases, SMAP |
1283 | * enforcement appears to be consistent with the USER bit. |
1284 | */ |
1285 | if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && |
1286 | !(error_code & X86_PF_USER) && |
1287 | !(regs->flags & X86_EFLAGS_AC))) { |
1288 | /* |
1289 | * No extable entry here. This was a kernel access to an |
1290 | * invalid pointer. get_kernel_nofault() will not get here. |
1291 | */ |
1292 | page_fault_oops(regs, error_code, address); |
1293 | return; |
1294 | } |
1295 | |
1296 | /* |
1297 | * If we're in an interrupt, have no user context or are running |
1298 | * in a region with pagefaults disabled then we must not take the fault |
1299 | */ |
1300 | if (unlikely(faulthandler_disabled() || !mm)) { |
1301 | bad_area_nosemaphore(regs, error_code, address); |
1302 | return; |
1303 | } |
1304 | |
1305 | /* |
1306 | * It's safe to allow irq's after cr2 has been saved and the |
1307 | * vmalloc fault has been handled. |
1308 | * |
1309 | * User-mode registers count as a user access even for any |
1310 | * potential system fault or CPU buglet: |
1311 | */ |
1312 | if (user_mode(regs)) { |
1313 | local_irq_enable(); |
1314 | flags |= FAULT_FLAG_USER; |
1315 | } else { |
1316 | if (regs->flags & X86_EFLAGS_IF) |
1317 | local_irq_enable(); |
1318 | } |
1319 | |
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1321 | |
1322 | /* |
1323 | * Read-only permissions can not be expressed in shadow stack PTEs. |
1324 | * Treat all shadow stack accesses as WRITE faults. This ensures |
1325 | * that the MM will prepare everything (e.g., break COW) such that |
1326 | * maybe_mkwrite() can create a proper shadow stack PTE. |
1327 | */ |
1328 | if (error_code & X86_PF_SHSTK) |
1329 | flags |= FAULT_FLAG_WRITE; |
1330 | if (error_code & X86_PF_WRITE) |
1331 | flags |= FAULT_FLAG_WRITE; |
1332 | if (error_code & X86_PF_INSTR) |
1333 | flags |= FAULT_FLAG_INSTRUCTION; |
1334 | |
1335 | #ifdef CONFIG_X86_64 |
1336 | /* |
1337 | * Faults in the vsyscall page might need emulation. The |
1338 | * vsyscall page is at a high address (>PAGE_OFFSET), but is |
1339 | * considered to be part of the user address space. |
1340 | * |
1341 | * The vsyscall page does not have a "real" VMA, so do this |
1342 | * emulation before we go searching for VMAs. |
1343 | * |
1344 | * PKRU never rejects instruction fetches, so we don't need |
1345 | * to consider the PF_PK bit. |
1346 | */ |
	if (is_vsyscall_vaddr(address)) {
1348 | if (emulate_vsyscall(error_code, regs, address)) |
1349 | return; |
1350 | } |
1351 | #endif |
1352 | |
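	/*
	 * Fast path: try to resolve the fault under the per-VMA lock,
	 * without taking mmap_lock.  Fall back to the mmap_lock slow path
	 * if no VMA is found, the access check fails, or the fault needs
	 * to be retried.
	 */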
1353 | if (!(flags & FAULT_FLAG_USER)) |
1354 | goto lock_mmap; |
1355 | |
1356 | vma = lock_vma_under_rcu(mm, address); |
1357 | if (!vma) |
1358 | goto lock_mmap; |
1359 | |
1360 | if (unlikely(access_error(error_code, vma))) { |
1361 | vma_end_read(vma); |
1362 | goto lock_mmap; |
1363 | } |
	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
1365 | if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) |
1366 | vma_end_read(vma); |
1367 | |
1368 | if (!(fault & VM_FAULT_RETRY)) { |
1369 | count_vm_vma_lock_event(VMA_LOCK_SUCCESS); |
1370 | goto done; |
1371 | } |
1372 | count_vm_vma_lock_event(VMA_LOCK_RETRY); |
1373 | |
1374 | /* Quick path to respond to signals */ |
	if (fault_signal_pending(fault, regs)) {
1376 | if (!user_mode(regs)) |
1377 | kernelmode_fixup_or_oops(regs, error_code, address, |
1378 | SIGBUS, BUS_ADRERR, |
1379 | ARCH_DEFAULT_PKEY); |
1380 | return; |
1381 | } |
1382 | lock_mmap: |
1383 | |
1384 | retry: |
1385 | vma = lock_mm_and_find_vma(mm, address, regs); |
1386 | if (unlikely(!vma)) { |
1387 | bad_area_nosemaphore(regs, error_code, address); |
1388 | return; |
1389 | } |
1390 | |
1391 | /* |
1392 | * Ok, we have a good vm_area for this memory access, so |
1393 | * we can handle it.. |
1394 | */ |
1395 | if (unlikely(access_error(error_code, vma))) { |
1396 | bad_area_access_error(regs, error_code, address, vma); |
1397 | return; |
1398 | } |
1399 | |
1400 | /* |
1401 | * If for any reason at all we couldn't handle the fault, |
1402 | * make sure we exit gracefully rather than endlessly redo |
1403 | * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if |
1404 | * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. |
1405 | * |
1406 | * Note that handle_userfault() may also release and reacquire mmap_lock |
1407 | * (and not return with VM_FAULT_RETRY), when returning to userland to |
1408 | * repeat the page fault later with a VM_FAULT_NOPAGE retval |
1409 | * (potentially after handling any pending signal during the return to |
1410 | * userland). The return to userland is identified whenever |
1411 | * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. |
1412 | */ |
1413 | fault = handle_mm_fault(vma, address, flags, regs); |
1414 | |
	if (fault_signal_pending(fault, regs)) {
1416 | /* |
1417 | * Quick path to respond to signals. The core mm code |
1418 | * has unlocked the mm for us if we get here. |
1419 | */ |
1420 | if (!user_mode(regs)) |
1421 | kernelmode_fixup_or_oops(regs, error_code, address, |
1422 | SIGBUS, BUS_ADRERR, |
1423 | ARCH_DEFAULT_PKEY); |
1424 | return; |
1425 | } |
1426 | |
1427 | /* The fault is fully completed (including releasing mmap lock) */ |
1428 | if (fault & VM_FAULT_COMPLETED) |
1429 | return; |
1430 | |
1431 | /* |
1432 | * If we need to retry the mmap_lock has already been released, |
1433 | * and if there is a fatal signal pending there is no guarantee |
1434 | * that we made any progress. Handle this case first. |
1435 | */ |
1436 | if (unlikely(fault & VM_FAULT_RETRY)) { |
1437 | flags |= FAULT_FLAG_TRIED; |
1438 | goto retry; |
1439 | } |
1440 | |
1441 | mmap_read_unlock(mm); |
1442 | done: |
1443 | if (likely(!(fault & VM_FAULT_ERROR))) |
1444 | return; |
1445 | |
1446 | if (fatal_signal_pending(current) && !user_mode(regs)) { |
1447 | kernelmode_fixup_or_oops(regs, error_code, address, |
					 0, 0, ARCH_DEFAULT_PKEY);
1449 | return; |
1450 | } |
1451 | |
1452 | if (fault & VM_FAULT_OOM) { |
1453 | /* Kernel mode? Handle exceptions or die: */ |
1454 | if (!user_mode(regs)) { |
1455 | kernelmode_fixup_or_oops(regs, error_code, address, |
1456 | SIGSEGV, SEGV_MAPERR, |
1457 | ARCH_DEFAULT_PKEY); |
1458 | return; |
1459 | } |
1460 | |
1461 | /* |
		 * We ran out of memory, call the OOM killer, and return to
1463 | * userspace (which will retry the fault, or kill us if we got |
1464 | * oom-killed): |
1465 | */ |
1466 | pagefault_out_of_memory(); |
1467 | } else { |
1468 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| |
1469 | VM_FAULT_HWPOISON_LARGE)) |
1470 | do_sigbus(regs, error_code, address, fault); |
1471 | else if (fault & VM_FAULT_SIGSEGV) |
1472 | bad_area_nosemaphore(regs, error_code, address); |
1473 | else |
1474 | BUG(); |
1475 | } |
1476 | } |
1477 | NOKPROBE_SYMBOL(do_user_addr_fault); |
1478 | |
1479 | static __always_inline void |
1480 | trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, |
1481 | unsigned long address) |
1482 | { |
1483 | if (!trace_pagefault_enabled()) |
1484 | return; |
1485 | |
1486 | if (user_mode(regs)) |
1487 | trace_page_fault_user(address, regs, error_code); |
1488 | else |
1489 | trace_page_fault_kernel(address, regs, error_code); |
1490 | } |
1491 | |
1492 | static __always_inline void |
1493 | handle_page_fault(struct pt_regs *regs, unsigned long error_code, |
1494 | unsigned long address) |
1495 | { |
1496 | trace_page_fault_entries(regs, error_code, address); |
1497 | |
1498 | if (unlikely(kmmio_fault(regs, address))) |
1499 | return; |
1500 | |
1501 | /* Was the fault on kernel-controlled part of the address space? */ |
1502 | if (unlikely(fault_in_kernel_space(address))) { |
		do_kern_addr_fault(regs, error_code, address);
1504 | } else { |
1505 | do_user_addr_fault(regs, error_code, address); |
1506 | /* |
1507 | * User address page fault handling might have reenabled |
1508 | * interrupts. Fixing up all potential exit points of |
1509 | * do_user_addr_fault() and its leaf functions is just not |
1510 | * doable w/o creating an unholy mess or turning the code |
1511 | * upside down. |
1512 | */ |
1513 | local_irq_disable(); |
1514 | } |
1515 | } |
1516 | |
1517 | DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) |
1518 | { |
1519 | unsigned long address = read_cr2(); |
1520 | irqentry_state_t state; |
1521 | |
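	/*
	 * Start pulling the mmap_lock cache line in early: almost any
	 * fault on a user address will need to take it shortly.
	 */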
	prefetchw(&current->mm->mmap_lock);
1523 | |
1524 | /* |
1525 | * KVM uses #PF vector to deliver 'page not present' events to guests |
1526 | * (asynchronous page fault mechanism). The event happens when a |
1527 | * userspace task is trying to access some valid (from guest's point of |
1528 | * view) memory which is not currently mapped by the host (e.g. the |
1529 | * memory is swapped out). Note, the corresponding "page ready" event |
1530 | * which is injected when the memory becomes available, is delivered via |
1531 | * an interrupt mechanism and not a #PF exception |
1532 | * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). |
1533 | * |
1534 | * We are relying on the interrupted context being sane (valid RSP, |
1535 | * relevant locks not held, etc.), which is fine as long as the |
1536 | * interrupted context had IF=1. We are also relying on the KVM |
1537 | * async pf type field and CR2 being read consistently instead of |
1538 | * getting values from real and async page faults mixed up. |
1539 | * |
1540 | * Fingers crossed. |
1541 | * |
1542 | * The async #PF handling code takes care of idtentry handling |
1543 | * itself. |
1544 | */ |
	if (kvm_handle_async_pf(regs, (u32)address))
1546 | return; |
1547 | |
1548 | /* |
1549 | * Entry handling for valid #PF from kernel mode is slightly |
1550 | * different: RCU is already watching and ct_irq_enter() must not |
1551 | * be invoked because a kernel fault on a user space address might |
1552 | * sleep. |
1553 | * |
1554 | * In case the fault hit a RCU idle region the conditional entry |
1555 | * code reenabled RCU to avoid subsequent wreckage which helps |
1556 | * debuggability. |
1557 | */ |
1558 | state = irqentry_enter(regs); |
1559 | |
1560 | instrumentation_begin(); |
1561 | handle_page_fault(regs, error_code, address); |
1562 | instrumentation_end(); |
1563 | |
1564 | irqentry_exit(regs, state); |
1565 | } |
1566 | |