1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * fs/userfaultfd.c
4 *
5 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 * Copyright (C) 2008-2009 Red Hat, Inc.
7 * Copyright (C) 2015 Red Hat, Inc.
8 *
9 * Some part derived from fs/eventfd.c (anon inode setup) and
10 * mm/ksm.c (mm hashing).
11 */
12
13#include <linux/list.h>
14#include <linux/hashtable.h>
15#include <linux/sched/signal.h>
16#include <linux/sched/mm.h>
17#include <linux/mm.h>
18#include <linux/mm_inline.h>
19#include <linux/mmu_notifier.h>
20#include <linux/poll.h>
21#include <linux/slab.h>
22#include <linux/seq_file.h>
23#include <linux/file.h>
24#include <linux/bug.h>
25#include <linux/anon_inodes.h>
26#include <linux/syscalls.h>
27#include <linux/userfaultfd_k.h>
28#include <linux/mempolicy.h>
29#include <linux/ioctl.h>
30#include <linux/security.h>
31#include <linux/hugetlb.h>
32#include <linux/swapops.h>
33#include <linux/miscdevice.h>
34
35static int sysctl_unprivileged_userfaultfd __read_mostly;
36
37#ifdef CONFIG_SYSCTL
38static struct ctl_table vm_userfaultfd_table[] = {
39 {
40 .procname = "unprivileged_userfaultfd",
41 .data = &sysctl_unprivileged_userfaultfd,
42 .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
43 .mode = 0644,
44 .proc_handler = proc_dointvec_minmax,
45 .extra1 = SYSCTL_ZERO,
46 .extra2 = SYSCTL_ONE,
47 },
48};
49#endif
50
51static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
52
53struct userfaultfd_fork_ctx {
54 struct userfaultfd_ctx *orig;
55 struct userfaultfd_ctx *new;
56 struct list_head list;
57};
58
59struct userfaultfd_unmap_ctx {
60 struct userfaultfd_ctx *ctx;
61 unsigned long start;
62 unsigned long end;
63 struct list_head list;
64};
65
66struct userfaultfd_wait_queue {
67 struct uffd_msg msg;
68 wait_queue_entry_t wq;
69 struct userfaultfd_ctx *ctx;
70 bool waken;
71};
72
73struct userfaultfd_wake_range {
74 unsigned long start;
75 unsigned long len;
76};
77
78/* internal indication that UFFD_API ioctl was successfully executed */
79#define UFFD_FEATURE_INITIALIZED (1u << 31)
80
81static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
82{
83 return ctx->features & UFFD_FEATURE_INITIALIZED;
84}
85
86static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
87{
88 return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
89}
90
91/*
92 * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
93 * meaningful when userfaultfd_wp()==true on the vma and when it's
94 * anonymous.
95 */
96bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
97{
98 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
99
100 if (!ctx)
101 return false;
102
103 return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
104}
105
106static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
107 vm_flags_t flags)
108{
109 const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
110
111 vm_flags_reset(vma, flags);
112 /*
113 * For shared mappings, we want to enable writenotify while
114 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
115 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
116 */
117 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
118 vma_set_page_prot(vma);
119}
120
121static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
122 int wake_flags, void *key)
123{
124 struct userfaultfd_wake_range *range = key;
125 int ret;
126 struct userfaultfd_wait_queue *uwq;
127 unsigned long start, len;
128
129 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
130 ret = 0;
131 /* len == 0 means wake all */
132 start = range->start;
133 len = range->len;
134 if (len && (start > uwq->msg.arg.pagefault.address ||
135 start + len <= uwq->msg.arg.pagefault.address))
136 goto out;
137 WRITE_ONCE(uwq->waken, true);
138 /*
139 * The Program-Order guarantees provided by the scheduler
140 * ensure uwq->waken is visible before the task is woken.
141 */
142 ret = wake_up_state(tsk: wq->private, state: mode);
143 if (ret) {
144 /*
145 * Wake only once, autoremove behavior.
146 *
147 * After the effect of list_del_init is visible to the other
148 * CPUs, the waitqueue may disappear from under us, see the
149 * !list_empty_careful() in handle_userfault().
150 *
151 * try_to_wake_up() has an implicit smp_mb(), and the
152 * wq->private is read before calling the extern function
153 * "wake_up_state" (which in turns calls try_to_wake_up).
154 */
155 list_del_init(entry: &wq->entry);
156 }
157out:
158 return ret;
159}
160
161/**
162 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
163 * context.
164 * @ctx: [in] Pointer to the userfaultfd context.
165 */
166static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
167{
168 refcount_inc(r: &ctx->refcount);
169}
170
171/**
172 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
173 * context.
174 * @ctx: [in] Pointer to userfaultfd context.
175 *
176 * The userfaultfd context reference must have been previously acquired either
177 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
178 */
179static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
180{
181 if (refcount_dec_and_test(r: &ctx->refcount)) {
182 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
183 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
184 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
185 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
186 VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
187 VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
188 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
189 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
190 mmdrop(mm: ctx->mm);
191 kmem_cache_free(s: userfaultfd_ctx_cachep, objp: ctx);
192 }
193}
194
195static inline void msg_init(struct uffd_msg *msg)
196{
197 BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
198 /*
199 * Must use memset to zero out the paddings or kernel data is
200 * leaked to userland.
201 */
202 memset(msg, 0, sizeof(struct uffd_msg));
203}
204
205static inline struct uffd_msg userfault_msg(unsigned long address,
206 unsigned long real_address,
207 unsigned int flags,
208 unsigned long reason,
209 unsigned int features)
210{
211 struct uffd_msg msg;
212
213 msg_init(msg: &msg);
214 msg.event = UFFD_EVENT_PAGEFAULT;
215
216 msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
217 real_address : address;
218
219 /*
220 * These flags indicate why the userfault occurred:
221 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
222 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
223 * - Neither of these flags being set indicates a MISSING fault.
224 *
225 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
226 * fault. Otherwise, it was a read fault.
227 */
228 if (flags & FAULT_FLAG_WRITE)
229 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
230 if (reason & VM_UFFD_WP)
231 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
232 if (reason & VM_UFFD_MINOR)
233 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
234 if (features & UFFD_FEATURE_THREAD_ID)
235 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
236 return msg;
237}
238
239#ifdef CONFIG_HUGETLB_PAGE
240/*
241 * Same functionality as userfaultfd_must_wait below with modifications for
242 * hugepmd ranges.
243 */
244static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
245 struct vm_fault *vmf,
246 unsigned long reason)
247{
248 struct vm_area_struct *vma = vmf->vma;
249 pte_t *ptep, pte;
250 bool ret = true;
251
252 assert_fault_locked(vmf);
253
254 ptep = hugetlb_walk(vma, addr: vmf->address, sz: vma_mmu_pagesize(vma));
255 if (!ptep)
256 goto out;
257
258 ret = false;
259 pte = huge_ptep_get(ptep);
260
261 /*
262 * Lockless access: we're in a wait_event so it's ok if it
263 * changes under us. PTE markers should be handled the same as none
264 * ptes here.
265 */
266 if (huge_pte_none_mostly(pte))
267 ret = true;
268 if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
269 ret = true;
270out:
271 return ret;
272}
273#else
274static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
275 struct vm_fault *vmf,
276 unsigned long reason)
277{
278 return false; /* should never get here */
279}
280#endif /* CONFIG_HUGETLB_PAGE */
281
282/*
283 * Verify the pagetables are still not ok after having reigstered into
284 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
285 * userfault that has already been resolved, if userfaultfd_read and
286 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
287 * threads.
288 */
289static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
290 struct vm_fault *vmf,
291 unsigned long reason)
292{
293 struct mm_struct *mm = ctx->mm;
294 unsigned long address = vmf->address;
295 pgd_t *pgd;
296 p4d_t *p4d;
297 pud_t *pud;
298 pmd_t *pmd, _pmd;
299 pte_t *pte;
300 pte_t ptent;
301 bool ret = true;
302
303 assert_fault_locked(vmf);
304
305 pgd = pgd_offset(mm, address);
306 if (!pgd_present(pgd: *pgd))
307 goto out;
308 p4d = p4d_offset(pgd, address);
309 if (!p4d_present(p4d: *p4d))
310 goto out;
311 pud = pud_offset(p4d, address);
312 if (!pud_present(pud: *pud))
313 goto out;
314 pmd = pmd_offset(pud, address);
315again:
316 _pmd = pmdp_get_lockless(pmdp: pmd);
317 if (pmd_none(pmd: _pmd))
318 goto out;
319
320 ret = false;
321 if (!pmd_present(pmd: _pmd) || pmd_devmap(pmd: _pmd))
322 goto out;
323
324 if (pmd_trans_huge(pmd: _pmd)) {
325 if (!pmd_write(pmd: _pmd) && (reason & VM_UFFD_WP))
326 ret = true;
327 goto out;
328 }
329
330 pte = pte_offset_map(pmd, addr: address);
331 if (!pte) {
332 ret = true;
333 goto again;
334 }
335 /*
336 * Lockless access: we're in a wait_event so it's ok if it
337 * changes under us. PTE markers should be handled the same as none
338 * ptes here.
339 */
340 ptent = ptep_get(ptep: pte);
341 if (pte_none_mostly(pte: ptent))
342 ret = true;
343 if (!pte_write(pte: ptent) && (reason & VM_UFFD_WP))
344 ret = true;
345 pte_unmap(pte);
346
347out:
348 return ret;
349}
350
351static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
352{
353 if (flags & FAULT_FLAG_INTERRUPTIBLE)
354 return TASK_INTERRUPTIBLE;
355
356 if (flags & FAULT_FLAG_KILLABLE)
357 return TASK_KILLABLE;
358
359 return TASK_UNINTERRUPTIBLE;
360}
361
362/*
363 * The locking rules involved in returning VM_FAULT_RETRY depending on
364 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
365 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
366 * recommendation in __lock_page_or_retry is not an understatement.
367 *
368 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
369 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
370 * not set.
371 *
372 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
373 * set, VM_FAULT_RETRY can still be returned if and only if there are
374 * fatal_signal_pending()s, and the mmap_lock must be released before
375 * returning it.
376 */
377vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
378{
379 struct vm_area_struct *vma = vmf->vma;
380 struct mm_struct *mm = vma->vm_mm;
381 struct userfaultfd_ctx *ctx;
382 struct userfaultfd_wait_queue uwq;
383 vm_fault_t ret = VM_FAULT_SIGBUS;
384 bool must_wait;
385 unsigned int blocking_state;
386
387 /*
388 * We don't do userfault handling for the final child pid update.
389 *
390 * We also don't do userfault handling during
391 * coredumping. hugetlbfs has the special
392 * hugetlb_follow_page_mask() to skip missing pages in the
393 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
394 * the no_page_table() helper in follow_page_mask(), but the
395 * shmem_vm_ops->fault method is invoked even during
396 * coredumping and it ends up here.
397 */
398 if (current->flags & (PF_EXITING|PF_DUMPCORE))
399 goto out;
400
401 assert_fault_locked(vmf);
402
403 ctx = vma->vm_userfaultfd_ctx.ctx;
404 if (!ctx)
405 goto out;
406
407 BUG_ON(ctx->mm != mm);
408
409 /* Any unrecognized flag is a bug. */
410 VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
411 /* 0 or > 1 flags set is a bug; we expect exactly 1. */
412 VM_BUG_ON(!reason || (reason & (reason - 1)));
413
414 if (ctx->features & UFFD_FEATURE_SIGBUS)
415 goto out;
416 if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
417 goto out;
418
419 /*
420 * If it's already released don't get it. This avoids to loop
421 * in __get_user_pages if userfaultfd_release waits on the
422 * caller of handle_userfault to release the mmap_lock.
423 */
424 if (unlikely(READ_ONCE(ctx->released))) {
425 /*
426 * Don't return VM_FAULT_SIGBUS in this case, so a non
427 * cooperative manager can close the uffd after the
428 * last UFFDIO_COPY, without risking to trigger an
429 * involuntary SIGBUS if the process was starting the
430 * userfaultfd while the userfaultfd was still armed
431 * (but after the last UFFDIO_COPY). If the uffd
432 * wasn't already closed when the userfault reached
433 * this point, that would normally be solved by
434 * userfaultfd_must_wait returning 'false'.
435 *
436 * If we were to return VM_FAULT_SIGBUS here, the non
437 * cooperative manager would be instead forced to
438 * always call UFFDIO_UNREGISTER before it can safely
439 * close the uffd.
440 */
441 ret = VM_FAULT_NOPAGE;
442 goto out;
443 }
444
445 /*
446 * Check that we can return VM_FAULT_RETRY.
447 *
448 * NOTE: it should become possible to return VM_FAULT_RETRY
449 * even if FAULT_FLAG_TRIED is set without leading to gup()
450 * -EBUSY failures, if the userfaultfd is to be extended for
451 * VM_UFFD_WP tracking and we intend to arm the userfault
452 * without first stopping userland access to the memory. For
453 * VM_UFFD_MISSING userfaults this is enough for now.
454 */
455 if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
456 /*
457 * Validate the invariant that nowait must allow retry
458 * to be sure not to return SIGBUS erroneously on
459 * nowait invocations.
460 */
461 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
462#ifdef CONFIG_DEBUG_VM
463 if (printk_ratelimit()) {
464 printk(KERN_WARNING
465 "FAULT_FLAG_ALLOW_RETRY missing %x\n",
466 vmf->flags);
467 dump_stack();
468 }
469#endif
470 goto out;
471 }
472
473 /*
474 * Handle nowait, not much to do other than tell it to retry
475 * and wait.
476 */
477 ret = VM_FAULT_RETRY;
478 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
479 goto out;
480
481 /* take the reference before dropping the mmap_lock */
482 userfaultfd_ctx_get(ctx);
483
484 init_waitqueue_func_entry(wq_entry: &uwq.wq, func: userfaultfd_wake_function);
485 uwq.wq.private = current;
486 uwq.msg = userfault_msg(address: vmf->address, real_address: vmf->real_address, flags: vmf->flags,
487 reason, features: ctx->features);
488 uwq.ctx = ctx;
489 uwq.waken = false;
490
491 blocking_state = userfaultfd_get_blocking_state(flags: vmf->flags);
492
493 /*
494 * Take the vma lock now, in order to safely call
495 * userfaultfd_huge_must_wait() later. Since acquiring the
496 * (sleepable) vma lock can modify the current task state, that
497 * must be before explicitly calling set_current_state().
498 */
499 if (is_vm_hugetlb_page(vma))
500 hugetlb_vma_lock_read(vma);
501
502 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
503 /*
504 * After the __add_wait_queue the uwq is visible to userland
505 * through poll/read().
506 */
507 __add_wait_queue(wq_head: &ctx->fault_pending_wqh, wq_entry: &uwq.wq);
508 /*
509 * The smp_mb() after __set_current_state prevents the reads
510 * following the spin_unlock to happen before the list_add in
511 * __add_wait_queue.
512 */
513 set_current_state(blocking_state);
514 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
515
516 if (!is_vm_hugetlb_page(vma))
517 must_wait = userfaultfd_must_wait(ctx, vmf, reason);
518 else
519 must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
520 if (is_vm_hugetlb_page(vma))
521 hugetlb_vma_unlock_read(vma);
522 release_fault_lock(vmf);
523
524 if (likely(must_wait && !READ_ONCE(ctx->released))) {
525 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
526 schedule();
527 }
528
529 __set_current_state(TASK_RUNNING);
530
531 /*
532 * Here we race with the list_del; list_add in
533 * userfaultfd_ctx_read(), however because we don't ever run
534 * list_del_init() to refile across the two lists, the prev
535 * and next pointers will never point to self. list_add also
536 * would never let any of the two pointers to point to
537 * self. So list_empty_careful won't risk to see both pointers
538 * pointing to self at any time during the list refile. The
539 * only case where list_del_init() is called is the full
540 * removal in the wake function and there we don't re-list_add
541 * and it's fine not to block on the spinlock. The uwq on this
542 * kernel stack can be released after the list_del_init.
543 */
544 if (!list_empty_careful(head: &uwq.wq.entry)) {
545 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
546 /*
547 * No need of list_del_init(), the uwq on the stack
548 * will be freed shortly anyway.
549 */
550 list_del(entry: &uwq.wq.entry);
551 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
552 }
553
554 /*
555 * ctx may go away after this if the userfault pseudo fd is
556 * already released.
557 */
558 userfaultfd_ctx_put(ctx);
559
560out:
561 return ret;
562}
563
564static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
565 struct userfaultfd_wait_queue *ewq)
566{
567 struct userfaultfd_ctx *release_new_ctx;
568
569 if (WARN_ON_ONCE(current->flags & PF_EXITING))
570 goto out;
571
572 ewq->ctx = ctx;
573 init_waitqueue_entry(wq_entry: &ewq->wq, current);
574 release_new_ctx = NULL;
575
576 spin_lock_irq(lock: &ctx->event_wqh.lock);
577 /*
578 * After the __add_wait_queue the uwq is visible to userland
579 * through poll/read().
580 */
581 __add_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
582 for (;;) {
583 set_current_state(TASK_KILLABLE);
584 if (ewq->msg.event == 0)
585 break;
586 if (READ_ONCE(ctx->released) ||
587 fatal_signal_pending(current)) {
588 /*
589 * &ewq->wq may be queued in fork_event, but
590 * __remove_wait_queue ignores the head
591 * parameter. It would be a problem if it
592 * didn't.
593 */
594 __remove_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
595 if (ewq->msg.event == UFFD_EVENT_FORK) {
596 struct userfaultfd_ctx *new;
597
598 new = (struct userfaultfd_ctx *)
599 (unsigned long)
600 ewq->msg.arg.reserved.reserved1;
601 release_new_ctx = new;
602 }
603 break;
604 }
605
606 spin_unlock_irq(lock: &ctx->event_wqh.lock);
607
608 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
609 schedule();
610
611 spin_lock_irq(lock: &ctx->event_wqh.lock);
612 }
613 __set_current_state(TASK_RUNNING);
614 spin_unlock_irq(lock: &ctx->event_wqh.lock);
615
616 if (release_new_ctx) {
617 struct vm_area_struct *vma;
618 struct mm_struct *mm = release_new_ctx->mm;
619 VMA_ITERATOR(vmi, mm, 0);
620
621 /* the various vma->vm_userfaultfd_ctx still points to it */
622 mmap_write_lock(mm);
623 for_each_vma(vmi, vma) {
624 if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
625 vma_start_write(vma);
626 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
627 userfaultfd_set_vm_flags(vma,
628 flags: vma->vm_flags & ~__VM_UFFD_FLAGS);
629 }
630 }
631 mmap_write_unlock(mm);
632
633 userfaultfd_ctx_put(ctx: release_new_ctx);
634 }
635
636 /*
637 * ctx may go away after this if the userfault pseudo fd is
638 * already released.
639 */
640out:
641 atomic_dec(v: &ctx->mmap_changing);
642 VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
643 userfaultfd_ctx_put(ctx);
644}
645
646static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
647 struct userfaultfd_wait_queue *ewq)
648{
649 ewq->msg.event = 0;
650 wake_up_locked(&ctx->event_wqh);
651 __remove_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
652}
653
654int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
655{
656 struct userfaultfd_ctx *ctx = NULL, *octx;
657 struct userfaultfd_fork_ctx *fctx;
658
659 octx = vma->vm_userfaultfd_ctx.ctx;
660 if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
661 vma_start_write(vma);
662 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
663 userfaultfd_set_vm_flags(vma, flags: vma->vm_flags & ~__VM_UFFD_FLAGS);
664 return 0;
665 }
666
667 list_for_each_entry(fctx, fcs, list)
668 if (fctx->orig == octx) {
669 ctx = fctx->new;
670 break;
671 }
672
673 if (!ctx) {
674 fctx = kmalloc(size: sizeof(*fctx), GFP_KERNEL);
675 if (!fctx)
676 return -ENOMEM;
677
678 ctx = kmem_cache_alloc(cachep: userfaultfd_ctx_cachep, GFP_KERNEL);
679 if (!ctx) {
680 kfree(objp: fctx);
681 return -ENOMEM;
682 }
683
684 refcount_set(r: &ctx->refcount, n: 1);
685 ctx->flags = octx->flags;
686 ctx->features = octx->features;
687 ctx->released = false;
688 init_rwsem(&ctx->map_changing_lock);
689 atomic_set(v: &ctx->mmap_changing, i: 0);
690 ctx->mm = vma->vm_mm;
691 mmgrab(mm: ctx->mm);
692
693 userfaultfd_ctx_get(ctx: octx);
694 down_write(sem: &octx->map_changing_lock);
695 atomic_inc(v: &octx->mmap_changing);
696 up_write(sem: &octx->map_changing_lock);
697 fctx->orig = octx;
698 fctx->new = ctx;
699 list_add_tail(new: &fctx->list, head: fcs);
700 }
701
702 vma->vm_userfaultfd_ctx.ctx = ctx;
703 return 0;
704}
705
706static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
707{
708 struct userfaultfd_ctx *ctx = fctx->orig;
709 struct userfaultfd_wait_queue ewq;
710
711 msg_init(msg: &ewq.msg);
712
713 ewq.msg.event = UFFD_EVENT_FORK;
714 ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
715
716 userfaultfd_event_wait_completion(ctx, ewq: &ewq);
717}
718
719void dup_userfaultfd_complete(struct list_head *fcs)
720{
721 struct userfaultfd_fork_ctx *fctx, *n;
722
723 list_for_each_entry_safe(fctx, n, fcs, list) {
724 dup_fctx(fctx);
725 list_del(entry: &fctx->list);
726 kfree(objp: fctx);
727 }
728}
729
730void mremap_userfaultfd_prep(struct vm_area_struct *vma,
731 struct vm_userfaultfd_ctx *vm_ctx)
732{
733 struct userfaultfd_ctx *ctx;
734
735 ctx = vma->vm_userfaultfd_ctx.ctx;
736
737 if (!ctx)
738 return;
739
740 if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
741 vm_ctx->ctx = ctx;
742 userfaultfd_ctx_get(ctx);
743 down_write(sem: &ctx->map_changing_lock);
744 atomic_inc(v: &ctx->mmap_changing);
745 up_write(sem: &ctx->map_changing_lock);
746 } else {
747 /* Drop uffd context if remap feature not enabled */
748 vma_start_write(vma);
749 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
750 userfaultfd_set_vm_flags(vma, flags: vma->vm_flags & ~__VM_UFFD_FLAGS);
751 }
752}
753
754void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
755 unsigned long from, unsigned long to,
756 unsigned long len)
757{
758 struct userfaultfd_ctx *ctx = vm_ctx->ctx;
759 struct userfaultfd_wait_queue ewq;
760
761 if (!ctx)
762 return;
763
764 if (to & ~PAGE_MASK) {
765 userfaultfd_ctx_put(ctx);
766 return;
767 }
768
769 msg_init(msg: &ewq.msg);
770
771 ewq.msg.event = UFFD_EVENT_REMAP;
772 ewq.msg.arg.remap.from = from;
773 ewq.msg.arg.remap.to = to;
774 ewq.msg.arg.remap.len = len;
775
776 userfaultfd_event_wait_completion(ctx, ewq: &ewq);
777}
778
779bool userfaultfd_remove(struct vm_area_struct *vma,
780 unsigned long start, unsigned long end)
781{
782 struct mm_struct *mm = vma->vm_mm;
783 struct userfaultfd_ctx *ctx;
784 struct userfaultfd_wait_queue ewq;
785
786 ctx = vma->vm_userfaultfd_ctx.ctx;
787 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
788 return true;
789
790 userfaultfd_ctx_get(ctx);
791 down_write(sem: &ctx->map_changing_lock);
792 atomic_inc(v: &ctx->mmap_changing);
793 up_write(sem: &ctx->map_changing_lock);
794 mmap_read_unlock(mm);
795
796 msg_init(msg: &ewq.msg);
797
798 ewq.msg.event = UFFD_EVENT_REMOVE;
799 ewq.msg.arg.remove.start = start;
800 ewq.msg.arg.remove.end = end;
801
802 userfaultfd_event_wait_completion(ctx, ewq: &ewq);
803
804 return false;
805}
806
807static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
808 unsigned long start, unsigned long end)
809{
810 struct userfaultfd_unmap_ctx *unmap_ctx;
811
812 list_for_each_entry(unmap_ctx, unmaps, list)
813 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
814 unmap_ctx->end == end)
815 return true;
816
817 return false;
818}
819
820int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
821 unsigned long end, struct list_head *unmaps)
822{
823 struct userfaultfd_unmap_ctx *unmap_ctx;
824 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
825
826 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
827 has_unmap_ctx(ctx, unmaps, start, end))
828 return 0;
829
830 unmap_ctx = kzalloc(size: sizeof(*unmap_ctx), GFP_KERNEL);
831 if (!unmap_ctx)
832 return -ENOMEM;
833
834 userfaultfd_ctx_get(ctx);
835 down_write(sem: &ctx->map_changing_lock);
836 atomic_inc(v: &ctx->mmap_changing);
837 up_write(sem: &ctx->map_changing_lock);
838 unmap_ctx->ctx = ctx;
839 unmap_ctx->start = start;
840 unmap_ctx->end = end;
841 list_add_tail(new: &unmap_ctx->list, head: unmaps);
842
843 return 0;
844}
845
846void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
847{
848 struct userfaultfd_unmap_ctx *ctx, *n;
849 struct userfaultfd_wait_queue ewq;
850
851 list_for_each_entry_safe(ctx, n, uf, list) {
852 msg_init(msg: &ewq.msg);
853
854 ewq.msg.event = UFFD_EVENT_UNMAP;
855 ewq.msg.arg.remove.start = ctx->start;
856 ewq.msg.arg.remove.end = ctx->end;
857
858 userfaultfd_event_wait_completion(ctx: ctx->ctx, ewq: &ewq);
859
860 list_del(entry: &ctx->list);
861 kfree(objp: ctx);
862 }
863}
864
865static int userfaultfd_release(struct inode *inode, struct file *file)
866{
867 struct userfaultfd_ctx *ctx = file->private_data;
868 struct mm_struct *mm = ctx->mm;
869 struct vm_area_struct *vma, *prev;
870 /* len == 0 means wake all */
871 struct userfaultfd_wake_range range = { .len = 0, };
872 unsigned long new_flags;
873 VMA_ITERATOR(vmi, mm, 0);
874
875 WRITE_ONCE(ctx->released, true);
876
877 if (!mmget_not_zero(mm))
878 goto wakeup;
879
880 /*
881 * Flush page faults out of all CPUs. NOTE: all page faults
882 * must be retried without returning VM_FAULT_SIGBUS if
883 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
884 * changes while handle_userfault released the mmap_lock. So
885 * it's critical that released is set to true (above), before
886 * taking the mmap_lock for writing.
887 */
888 mmap_write_lock(mm);
889 prev = NULL;
890 for_each_vma(vmi, vma) {
891 cond_resched();
892 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
893 !!(vma->vm_flags & __VM_UFFD_FLAGS));
894 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
895 prev = vma;
896 continue;
897 }
898 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
899 vma = vma_modify_flags_uffd(vmi: &vmi, prev, vma, start: vma->vm_start,
900 end: vma->vm_end, new_flags,
901 NULL_VM_UFFD_CTX);
902
903 vma_start_write(vma);
904 userfaultfd_set_vm_flags(vma, flags: new_flags);
905 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
906
907 prev = vma;
908 }
909 mmap_write_unlock(mm);
910 mmput(mm);
911wakeup:
912 /*
913 * After no new page faults can wait on this fault_*wqh, flush
914 * the last page faults that may have been already waiting on
915 * the fault_*wqh.
916 */
917 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
918 __wake_up_locked_key(wq_head: &ctx->fault_pending_wqh, TASK_NORMAL, key: &range);
919 __wake_up(wq_head: &ctx->fault_wqh, TASK_NORMAL, nr: 1, key: &range);
920 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
921
922 /* Flush pending events that may still wait on event_wqh */
923 wake_up_all(&ctx->event_wqh);
924
925 wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
926 userfaultfd_ctx_put(ctx);
927 return 0;
928}
929
930/* fault_pending_wqh.lock must be hold by the caller */
931static inline struct userfaultfd_wait_queue *find_userfault_in(
932 wait_queue_head_t *wqh)
933{
934 wait_queue_entry_t *wq;
935 struct userfaultfd_wait_queue *uwq;
936
937 lockdep_assert_held(&wqh->lock);
938
939 uwq = NULL;
940 if (!waitqueue_active(wq_head: wqh))
941 goto out;
942 /* walk in reverse to provide FIFO behavior to read userfaults */
943 wq = list_last_entry(&wqh->head, typeof(*wq), entry);
944 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
945out:
946 return uwq;
947}
948
949static inline struct userfaultfd_wait_queue *find_userfault(
950 struct userfaultfd_ctx *ctx)
951{
952 return find_userfault_in(wqh: &ctx->fault_pending_wqh);
953}
954
955static inline struct userfaultfd_wait_queue *find_userfault_evt(
956 struct userfaultfd_ctx *ctx)
957{
958 return find_userfault_in(wqh: &ctx->event_wqh);
959}
960
961static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
962{
963 struct userfaultfd_ctx *ctx = file->private_data;
964 __poll_t ret;
965
966 poll_wait(filp: file, wait_address: &ctx->fd_wqh, p: wait);
967
968 if (!userfaultfd_is_initialized(ctx))
969 return EPOLLERR;
970
971 /*
972 * poll() never guarantees that read won't block.
973 * userfaults can be waken before they're read().
974 */
975 if (unlikely(!(file->f_flags & O_NONBLOCK)))
976 return EPOLLERR;
977 /*
978 * lockless access to see if there are pending faults
979 * __pollwait last action is the add_wait_queue but
980 * the spin_unlock would allow the waitqueue_active to
981 * pass above the actual list_add inside
982 * add_wait_queue critical section. So use a full
983 * memory barrier to serialize the list_add write of
984 * add_wait_queue() with the waitqueue_active read
985 * below.
986 */
987 ret = 0;
988 smp_mb();
989 if (waitqueue_active(wq_head: &ctx->fault_pending_wqh))
990 ret = EPOLLIN;
991 else if (waitqueue_active(wq_head: &ctx->event_wqh))
992 ret = EPOLLIN;
993
994 return ret;
995}
996
997static const struct file_operations userfaultfd_fops;
998
999static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1000 struct inode *inode,
1001 struct uffd_msg *msg)
1002{
1003 int fd;
1004
1005 fd = anon_inode_create_getfd(name: "[userfaultfd]", fops: &userfaultfd_fops, priv: new,
1006 O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), context_inode: inode);
1007 if (fd < 0)
1008 return fd;
1009
1010 msg->arg.reserved.reserved1 = 0;
1011 msg->arg.fork.ufd = fd;
1012 return 0;
1013}
1014
1015static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1016 struct uffd_msg *msg, struct inode *inode)
1017{
1018 ssize_t ret;
1019 DECLARE_WAITQUEUE(wait, current);
1020 struct userfaultfd_wait_queue *uwq;
1021 /*
1022 * Handling fork event requires sleeping operations, so
1023 * we drop the event_wqh lock, then do these ops, then
1024 * lock it back and wake up the waiter. While the lock is
1025 * dropped the ewq may go away so we keep track of it
1026 * carefully.
1027 */
1028 LIST_HEAD(fork_event);
1029 struct userfaultfd_ctx *fork_nctx = NULL;
1030
1031 /* always take the fd_wqh lock before the fault_pending_wqh lock */
1032 spin_lock_irq(lock: &ctx->fd_wqh.lock);
1033 __add_wait_queue(wq_head: &ctx->fd_wqh, wq_entry: &wait);
1034 for (;;) {
1035 set_current_state(TASK_INTERRUPTIBLE);
1036 spin_lock(lock: &ctx->fault_pending_wqh.lock);
1037 uwq = find_userfault(ctx);
1038 if (uwq) {
1039 /*
1040 * Use a seqcount to repeat the lockless check
1041 * in wake_userfault() to avoid missing
1042 * wakeups because during the refile both
1043 * waitqueue could become empty if this is the
1044 * only userfault.
1045 */
1046 write_seqcount_begin(&ctx->refile_seq);
1047
1048 /*
1049 * The fault_pending_wqh.lock prevents the uwq
1050 * to disappear from under us.
1051 *
1052 * Refile this userfault from
1053 * fault_pending_wqh to fault_wqh, it's not
1054 * pending anymore after we read it.
1055 *
1056 * Use list_del() by hand (as
1057 * userfaultfd_wake_function also uses
1058 * list_del_init() by hand) to be sure nobody
1059 * changes __remove_wait_queue() to use
1060 * list_del_init() in turn breaking the
1061 * !list_empty_careful() check in
1062 * handle_userfault(). The uwq->wq.head list
1063 * must never be empty at any time during the
1064 * refile, or the waitqueue could disappear
1065 * from under us. The "wait_queue_head_t"
1066 * parameter of __remove_wait_queue() is unused
1067 * anyway.
1068 */
1069 list_del(entry: &uwq->wq.entry);
1070 add_wait_queue(wq_head: &ctx->fault_wqh, wq_entry: &uwq->wq);
1071
1072 write_seqcount_end(&ctx->refile_seq);
1073
1074 /* careful to always initialize msg if ret == 0 */
1075 *msg = uwq->msg;
1076 spin_unlock(lock: &ctx->fault_pending_wqh.lock);
1077 ret = 0;
1078 break;
1079 }
1080 spin_unlock(lock: &ctx->fault_pending_wqh.lock);
1081
1082 spin_lock(lock: &ctx->event_wqh.lock);
1083 uwq = find_userfault_evt(ctx);
1084 if (uwq) {
1085 *msg = uwq->msg;
1086
1087 if (uwq->msg.event == UFFD_EVENT_FORK) {
1088 fork_nctx = (struct userfaultfd_ctx *)
1089 (unsigned long)
1090 uwq->msg.arg.reserved.reserved1;
1091 list_move(list: &uwq->wq.entry, head: &fork_event);
1092 /*
1093 * fork_nctx can be freed as soon as
1094 * we drop the lock, unless we take a
1095 * reference on it.
1096 */
1097 userfaultfd_ctx_get(ctx: fork_nctx);
1098 spin_unlock(lock: &ctx->event_wqh.lock);
1099 ret = 0;
1100 break;
1101 }
1102
1103 userfaultfd_event_complete(ctx, ewq: uwq);
1104 spin_unlock(lock: &ctx->event_wqh.lock);
1105 ret = 0;
1106 break;
1107 }
1108 spin_unlock(lock: &ctx->event_wqh.lock);
1109
1110 if (signal_pending(current)) {
1111 ret = -ERESTARTSYS;
1112 break;
1113 }
1114 if (no_wait) {
1115 ret = -EAGAIN;
1116 break;
1117 }
1118 spin_unlock_irq(lock: &ctx->fd_wqh.lock);
1119 schedule();
1120 spin_lock_irq(lock: &ctx->fd_wqh.lock);
1121 }
1122 __remove_wait_queue(wq_head: &ctx->fd_wqh, wq_entry: &wait);
1123 __set_current_state(TASK_RUNNING);
1124 spin_unlock_irq(lock: &ctx->fd_wqh.lock);
1125
1126 if (!ret && msg->event == UFFD_EVENT_FORK) {
1127 ret = resolve_userfault_fork(new: fork_nctx, inode, msg);
1128 spin_lock_irq(lock: &ctx->event_wqh.lock);
1129 if (!list_empty(head: &fork_event)) {
1130 /*
1131 * The fork thread didn't abort, so we can
1132 * drop the temporary refcount.
1133 */
1134 userfaultfd_ctx_put(ctx: fork_nctx);
1135
1136 uwq = list_first_entry(&fork_event,
1137 typeof(*uwq),
1138 wq.entry);
1139 /*
1140 * If fork_event list wasn't empty and in turn
1141 * the event wasn't already released by fork
1142 * (the event is allocated on fork kernel
1143 * stack), put the event back to its place in
1144 * the event_wq. fork_event head will be freed
1145 * as soon as we return so the event cannot
1146 * stay queued there no matter the current
1147 * "ret" value.
1148 */
1149 list_del(entry: &uwq->wq.entry);
1150 __add_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &uwq->wq);
1151
1152 /*
1153 * Leave the event in the waitqueue and report
1154 * error to userland if we failed to resolve
1155 * the userfault fork.
1156 */
1157 if (likely(!ret))
1158 userfaultfd_event_complete(ctx, ewq: uwq);
1159 } else {
1160 /*
1161 * Here the fork thread aborted and the
1162 * refcount from the fork thread on fork_nctx
1163 * has already been released. We still hold
1164 * the reference we took before releasing the
1165 * lock above. If resolve_userfault_fork
1166 * failed we've to drop it because the
1167 * fork_nctx has to be freed in such case. If
1168 * it succeeded we'll hold it because the new
1169 * uffd references it.
1170 */
1171 if (ret)
1172 userfaultfd_ctx_put(ctx: fork_nctx);
1173 }
1174 spin_unlock_irq(lock: &ctx->event_wqh.lock);
1175 }
1176
1177 return ret;
1178}
1179
1180static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1181 size_t count, loff_t *ppos)
1182{
1183 struct userfaultfd_ctx *ctx = file->private_data;
1184 ssize_t _ret, ret = 0;
1185 struct uffd_msg msg;
1186 int no_wait = file->f_flags & O_NONBLOCK;
1187 struct inode *inode = file_inode(f: file);
1188
1189 if (!userfaultfd_is_initialized(ctx))
1190 return -EINVAL;
1191
1192 for (;;) {
1193 if (count < sizeof(msg))
1194 return ret ? ret : -EINVAL;
1195 _ret = userfaultfd_ctx_read(ctx, no_wait, msg: &msg, inode);
1196 if (_ret < 0)
1197 return ret ? ret : _ret;
1198 if (copy_to_user(to: (__u64 __user *) buf, from: &msg, n: sizeof(msg)))
1199 return ret ? ret : -EFAULT;
1200 ret += sizeof(msg);
1201 buf += sizeof(msg);
1202 count -= sizeof(msg);
1203 /*
1204 * Allow to read more than one fault at time but only
1205 * block if waiting for the very first one.
1206 */
1207 no_wait = O_NONBLOCK;
1208 }
1209}
1210
1211static void __wake_userfault(struct userfaultfd_ctx *ctx,
1212 struct userfaultfd_wake_range *range)
1213{
1214 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
1215 /* wake all in the range and autoremove */
1216 if (waitqueue_active(wq_head: &ctx->fault_pending_wqh))
1217 __wake_up_locked_key(wq_head: &ctx->fault_pending_wqh, TASK_NORMAL,
1218 key: range);
1219 if (waitqueue_active(wq_head: &ctx->fault_wqh))
1220 __wake_up(wq_head: &ctx->fault_wqh, TASK_NORMAL, nr: 1, key: range);
1221 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
1222}
1223
1224static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1225 struct userfaultfd_wake_range *range)
1226{
1227 unsigned seq;
1228 bool need_wakeup;
1229
1230 /*
1231 * To be sure waitqueue_active() is not reordered by the CPU
1232 * before the pagetable update, use an explicit SMP memory
1233 * barrier here. PT lock release or mmap_read_unlock(mm) still
1234 * have release semantics that can allow the
1235 * waitqueue_active() to be reordered before the pte update.
1236 */
1237 smp_mb();
1238
1239 /*
1240 * Use waitqueue_active because it's very frequent to
1241 * change the address space atomically even if there are no
1242 * userfaults yet. So we take the spinlock only when we're
1243 * sure we've userfaults to wake.
1244 */
1245 do {
1246 seq = read_seqcount_begin(&ctx->refile_seq);
1247 need_wakeup = waitqueue_active(wq_head: &ctx->fault_pending_wqh) ||
1248 waitqueue_active(wq_head: &ctx->fault_wqh);
1249 cond_resched();
1250 } while (read_seqcount_retry(&ctx->refile_seq, seq));
1251 if (need_wakeup)
1252 __wake_userfault(ctx, range);
1253}
1254
1255static __always_inline int validate_unaligned_range(
1256 struct mm_struct *mm, __u64 start, __u64 len)
1257{
1258 __u64 task_size = mm->task_size;
1259
1260 if (len & ~PAGE_MASK)
1261 return -EINVAL;
1262 if (!len)
1263 return -EINVAL;
1264 if (start < mmap_min_addr)
1265 return -EINVAL;
1266 if (start >= task_size)
1267 return -EINVAL;
1268 if (len > task_size - start)
1269 return -EINVAL;
1270 if (start + len <= start)
1271 return -EINVAL;
1272 return 0;
1273}
1274
1275static __always_inline int validate_range(struct mm_struct *mm,
1276 __u64 start, __u64 len)
1277{
1278 if (start & ~PAGE_MASK)
1279 return -EINVAL;
1280
1281 return validate_unaligned_range(mm, start, len);
1282}
1283
1284static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1285 unsigned long arg)
1286{
1287 struct mm_struct *mm = ctx->mm;
1288 struct vm_area_struct *vma, *prev, *cur;
1289 int ret;
1290 struct uffdio_register uffdio_register;
1291 struct uffdio_register __user *user_uffdio_register;
1292 unsigned long vm_flags, new_flags;
1293 bool found;
1294 bool basic_ioctls;
1295 unsigned long start, end, vma_end;
1296 struct vma_iterator vmi;
1297 bool wp_async = userfaultfd_wp_async_ctx(ctx);
1298
1299 user_uffdio_register = (struct uffdio_register __user *) arg;
1300
1301 ret = -EFAULT;
1302 if (copy_from_user(to: &uffdio_register, from: user_uffdio_register,
1303 n: sizeof(uffdio_register)-sizeof(__u64)))
1304 goto out;
1305
1306 ret = -EINVAL;
1307 if (!uffdio_register.mode)
1308 goto out;
1309 if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1310 goto out;
1311 vm_flags = 0;
1312 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1313 vm_flags |= VM_UFFD_MISSING;
1314 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1315#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1316 goto out;
1317#endif
1318 vm_flags |= VM_UFFD_WP;
1319 }
1320 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1321#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1322 goto out;
1323#endif
1324 vm_flags |= VM_UFFD_MINOR;
1325 }
1326
1327 ret = validate_range(mm, start: uffdio_register.range.start,
1328 len: uffdio_register.range.len);
1329 if (ret)
1330 goto out;
1331
1332 start = uffdio_register.range.start;
1333 end = start + uffdio_register.range.len;
1334
1335 ret = -ENOMEM;
1336 if (!mmget_not_zero(mm))
1337 goto out;
1338
1339 ret = -EINVAL;
1340 mmap_write_lock(mm);
1341 vma_iter_init(vmi: &vmi, mm, addr: start);
1342 vma = vma_find(vmi: &vmi, max: end);
1343 if (!vma)
1344 goto out_unlock;
1345
1346 /*
1347 * If the first vma contains huge pages, make sure start address
1348 * is aligned to huge page size.
1349 */
1350 if (is_vm_hugetlb_page(vma)) {
1351 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1352
1353 if (start & (vma_hpagesize - 1))
1354 goto out_unlock;
1355 }
1356
1357 /*
1358 * Search for not compatible vmas.
1359 */
1360 found = false;
1361 basic_ioctls = false;
1362 cur = vma;
1363 do {
1364 cond_resched();
1365
1366 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1367 !!(cur->vm_flags & __VM_UFFD_FLAGS));
1368
1369 /* check not compatible vmas */
1370 ret = -EINVAL;
1371 if (!vma_can_userfault(vma: cur, vm_flags, wp_async))
1372 goto out_unlock;
1373
1374 /*
1375 * UFFDIO_COPY will fill file holes even without
1376 * PROT_WRITE. This check enforces that if this is a
1377 * MAP_SHARED, the process has write permission to the backing
1378 * file. If VM_MAYWRITE is set it also enforces that on a
1379 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1380 * F_WRITE_SEAL can be taken until the vma is destroyed.
1381 */
1382 ret = -EPERM;
1383 if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1384 goto out_unlock;
1385
1386 /*
1387 * If this vma contains ending address, and huge pages
1388 * check alignment.
1389 */
1390 if (is_vm_hugetlb_page(vma: cur) && end <= cur->vm_end &&
1391 end > cur->vm_start) {
1392 unsigned long vma_hpagesize = vma_kernel_pagesize(vma: cur);
1393
1394 ret = -EINVAL;
1395
1396 if (end & (vma_hpagesize - 1))
1397 goto out_unlock;
1398 }
1399 if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1400 goto out_unlock;
1401
1402 /*
1403 * Check that this vma isn't already owned by a
1404 * different userfaultfd. We can't allow more than one
1405 * userfaultfd to own a single vma simultaneously or we
1406 * wouldn't know which one to deliver the userfaults to.
1407 */
1408 ret = -EBUSY;
1409 if (cur->vm_userfaultfd_ctx.ctx &&
1410 cur->vm_userfaultfd_ctx.ctx != ctx)
1411 goto out_unlock;
1412
1413 /*
1414 * Note vmas containing huge pages
1415 */
1416 if (is_vm_hugetlb_page(vma: cur))
1417 basic_ioctls = true;
1418
1419 found = true;
1420 } for_each_vma_range(vmi, cur, end);
1421 BUG_ON(!found);
1422
1423 vma_iter_set(vmi: &vmi, addr: start);
1424 prev = vma_prev(vmi: &vmi);
1425 if (vma->vm_start < start)
1426 prev = vma;
1427
1428 ret = 0;
1429 for_each_vma_range(vmi, vma, end) {
1430 cond_resched();
1431
1432 BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
1433 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1434 vma->vm_userfaultfd_ctx.ctx != ctx);
1435 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1436
1437 /*
1438 * Nothing to do: this vma is already registered into this
1439 * userfaultfd and with the right tracking mode too.
1440 */
1441 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1442 (vma->vm_flags & vm_flags) == vm_flags)
1443 goto skip;
1444
1445 if (vma->vm_start > start)
1446 start = vma->vm_start;
1447 vma_end = min(end, vma->vm_end);
1448
1449 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
1450 vma = vma_modify_flags_uffd(vmi: &vmi, prev, vma, start, end: vma_end,
1451 new_flags,
1452 new_ctx: (struct vm_userfaultfd_ctx){ctx});
1453 if (IS_ERR(ptr: vma)) {
1454 ret = PTR_ERR(ptr: vma);
1455 break;
1456 }
1457
1458 /*
1459 * In the vma_merge() successful mprotect-like case 8:
1460 * the next vma was merged into the current one and
1461 * the current one has not been updated yet.
1462 */
1463 vma_start_write(vma);
1464 userfaultfd_set_vm_flags(vma, flags: new_flags);
1465 vma->vm_userfaultfd_ctx.ctx = ctx;
1466
1467 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1468 hugetlb_unshare_all_pmds(vma);
1469
1470 skip:
1471 prev = vma;
1472 start = vma->vm_end;
1473 }
1474
1475out_unlock:
1476 mmap_write_unlock(mm);
1477 mmput(mm);
1478 if (!ret) {
1479 __u64 ioctls_out;
1480
1481 ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1482 UFFD_API_RANGE_IOCTLS;
1483
1484 /*
1485 * Declare the WP ioctl only if the WP mode is
1486 * specified and all checks passed with the range
1487 */
1488 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1489 ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1490
1491 /* CONTINUE ioctl is only supported for MINOR ranges. */
1492 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1493 ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1494
1495 /*
1496 * Now that we scanned all vmas we can already tell
1497 * userland which ioctls methods are guaranteed to
1498 * succeed on this range.
1499 */
1500 if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1501 ret = -EFAULT;
1502 }
1503out:
1504 return ret;
1505}
1506
1507static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1508 unsigned long arg)
1509{
1510 struct mm_struct *mm = ctx->mm;
1511 struct vm_area_struct *vma, *prev, *cur;
1512 int ret;
1513 struct uffdio_range uffdio_unregister;
1514 unsigned long new_flags;
1515 bool found;
1516 unsigned long start, end, vma_end;
1517 const void __user *buf = (void __user *)arg;
1518 struct vma_iterator vmi;
1519 bool wp_async = userfaultfd_wp_async_ctx(ctx);
1520
1521 ret = -EFAULT;
1522 if (copy_from_user(to: &uffdio_unregister, from: buf, n: sizeof(uffdio_unregister)))
1523 goto out;
1524
1525 ret = validate_range(mm, start: uffdio_unregister.start,
1526 len: uffdio_unregister.len);
1527 if (ret)
1528 goto out;
1529
1530 start = uffdio_unregister.start;
1531 end = start + uffdio_unregister.len;
1532
1533 ret = -ENOMEM;
1534 if (!mmget_not_zero(mm))
1535 goto out;
1536
1537 mmap_write_lock(mm);
1538 ret = -EINVAL;
1539 vma_iter_init(vmi: &vmi, mm, addr: start);
1540 vma = vma_find(vmi: &vmi, max: end);
1541 if (!vma)
1542 goto out_unlock;
1543
1544 /*
1545 * If the first vma contains huge pages, make sure start address
1546 * is aligned to huge page size.
1547 */
1548 if (is_vm_hugetlb_page(vma)) {
1549 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1550
1551 if (start & (vma_hpagesize - 1))
1552 goto out_unlock;
1553 }
1554
1555 /*
1556 * Search for not compatible vmas.
1557 */
1558 found = false;
1559 cur = vma;
1560 do {
1561 cond_resched();
1562
1563 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1564 !!(cur->vm_flags & __VM_UFFD_FLAGS));
1565
1566 /*
1567 * Check not compatible vmas, not strictly required
1568 * here as not compatible vmas cannot have an
1569 * userfaultfd_ctx registered on them, but this
1570 * provides for more strict behavior to notice
1571 * unregistration errors.
1572 */
1573 if (!vma_can_userfault(vma: cur, vm_flags: cur->vm_flags, wp_async))
1574 goto out_unlock;
1575
1576 found = true;
1577 } for_each_vma_range(vmi, cur, end);
1578 BUG_ON(!found);
1579
1580 vma_iter_set(vmi: &vmi, addr: start);
1581 prev = vma_prev(vmi: &vmi);
1582 if (vma->vm_start < start)
1583 prev = vma;
1584
1585 ret = 0;
1586 for_each_vma_range(vmi, vma, end) {
1587 cond_resched();
1588
1589 BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
1590
1591 /*
1592 * Nothing to do: this vma is already registered into this
1593 * userfaultfd and with the right tracking mode too.
1594 */
1595 if (!vma->vm_userfaultfd_ctx.ctx)
1596 goto skip;
1597
1598 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1599
1600 if (vma->vm_start > start)
1601 start = vma->vm_start;
1602 vma_end = min(end, vma->vm_end);
1603
1604 if (userfaultfd_missing(vma)) {
1605 /*
1606 * Wake any concurrent pending userfault while
1607 * we unregister, so they will not hang
1608 * permanently and it avoids userland to call
1609 * UFFDIO_WAKE explicitly.
1610 */
1611 struct userfaultfd_wake_range range;
1612 range.start = start;
1613 range.len = vma_end - start;
1614 wake_userfault(ctx: vma->vm_userfaultfd_ctx.ctx, range: &range);
1615 }
1616
1617 /* Reset ptes for the whole vma range if wr-protected */
1618 if (userfaultfd_wp(vma))
1619 uffd_wp_range(vma, start, len: vma_end - start, enable_wp: false);
1620
1621 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1622 vma = vma_modify_flags_uffd(vmi: &vmi, prev, vma, start, end: vma_end,
1623 new_flags, NULL_VM_UFFD_CTX);
1624 if (IS_ERR(ptr: vma)) {
1625 ret = PTR_ERR(ptr: vma);
1626 break;
1627 }
1628
1629 /*
1630 * In the vma_merge() successful mprotect-like case 8:
1631 * the next vma was merged into the current one and
1632 * the current one has not been updated yet.
1633 */
1634 vma_start_write(vma);
1635 userfaultfd_set_vm_flags(vma, flags: new_flags);
1636 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1637
1638 skip:
1639 prev = vma;
1640 start = vma->vm_end;
1641 }
1642
1643out_unlock:
1644 mmap_write_unlock(mm);
1645 mmput(mm);
1646out:
1647 return ret;
1648}
1649
1650/*
1651 * userfaultfd_wake may be used in combination with the
1652 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1653 */
1654static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1655 unsigned long arg)
1656{
1657 int ret;
1658 struct uffdio_range uffdio_wake;
1659 struct userfaultfd_wake_range range;
1660 const void __user *buf = (void __user *)arg;
1661
1662 ret = -EFAULT;
1663 if (copy_from_user(to: &uffdio_wake, from: buf, n: sizeof(uffdio_wake)))
1664 goto out;
1665
1666 ret = validate_range(mm: ctx->mm, start: uffdio_wake.start, len: uffdio_wake.len);
1667 if (ret)
1668 goto out;
1669
1670 range.start = uffdio_wake.start;
1671 range.len = uffdio_wake.len;
1672
1673 /*
1674 * len == 0 means wake all and we don't want to wake all here,
1675 * so check it again to be sure.
1676 */
1677 VM_BUG_ON(!range.len);
1678
1679 wake_userfault(ctx, range: &range);
1680 ret = 0;
1681
1682out:
1683 return ret;
1684}
1685
1686static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1687 unsigned long arg)
1688{
1689 __s64 ret;
1690 struct uffdio_copy uffdio_copy;
1691 struct uffdio_copy __user *user_uffdio_copy;
1692 struct userfaultfd_wake_range range;
1693 uffd_flags_t flags = 0;
1694
1695 user_uffdio_copy = (struct uffdio_copy __user *) arg;
1696
1697 ret = -EAGAIN;
1698 if (atomic_read(v: &ctx->mmap_changing))
1699 goto out;
1700
1701 ret = -EFAULT;
1702 if (copy_from_user(to: &uffdio_copy, from: user_uffdio_copy,
1703 /* don't copy "copy" last field */
1704 n: sizeof(uffdio_copy)-sizeof(__s64)))
1705 goto out;
1706
1707 ret = validate_unaligned_range(mm: ctx->mm, start: uffdio_copy.src,
1708 len: uffdio_copy.len);
1709 if (ret)
1710 goto out;
1711 ret = validate_range(mm: ctx->mm, start: uffdio_copy.dst, len: uffdio_copy.len);
1712 if (ret)
1713 goto out;
1714
1715 ret = -EINVAL;
1716 if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1717 goto out;
1718 if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1719 flags |= MFILL_ATOMIC_WP;
1720 if (mmget_not_zero(mm: ctx->mm)) {
1721 ret = mfill_atomic_copy(ctx, dst_start: uffdio_copy.dst, src_start: uffdio_copy.src,
1722 len: uffdio_copy.len, flags);
1723 mmput(ctx->mm);
1724 } else {
1725 return -ESRCH;
1726 }
1727 if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1728 return -EFAULT;
1729 if (ret < 0)
1730 goto out;
1731 BUG_ON(!ret);
1732 /* len == 0 would wake all */
1733 range.len = ret;
1734 if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1735 range.start = uffdio_copy.dst;
1736 wake_userfault(ctx, range: &range);
1737 }
1738 ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1739out:
1740 return ret;
1741}
1742
1743static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1744 unsigned long arg)
1745{
1746 __s64 ret;
1747 struct uffdio_zeropage uffdio_zeropage;
1748 struct uffdio_zeropage __user *user_uffdio_zeropage;
1749 struct userfaultfd_wake_range range;
1750
1751 user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1752
1753 ret = -EAGAIN;
1754 if (atomic_read(v: &ctx->mmap_changing))
1755 goto out;
1756
1757 ret = -EFAULT;
1758 if (copy_from_user(to: &uffdio_zeropage, from: user_uffdio_zeropage,
1759 /* don't copy "zeropage" last field */
1760 n: sizeof(uffdio_zeropage)-sizeof(__s64)))
1761 goto out;
1762
1763 ret = validate_range(mm: ctx->mm, start: uffdio_zeropage.range.start,
1764 len: uffdio_zeropage.range.len);
1765 if (ret)
1766 goto out;
1767 ret = -EINVAL;
1768 if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1769 goto out;
1770
1771 if (mmget_not_zero(mm: ctx->mm)) {
1772 ret = mfill_atomic_zeropage(ctx, dst_start: uffdio_zeropage.range.start,
1773 len: uffdio_zeropage.range.len);
1774 mmput(ctx->mm);
1775 } else {
1776 return -ESRCH;
1777 }
1778 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1779 return -EFAULT;
1780 if (ret < 0)
1781 goto out;
1782 /* len == 0 would wake all */
1783 BUG_ON(!ret);
1784 range.len = ret;
1785 if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1786 range.start = uffdio_zeropage.range.start;
1787 wake_userfault(ctx, range: &range);
1788 }
1789 ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1790out:
1791 return ret;
1792}
1793
1794static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1795 unsigned long arg)
1796{
1797 int ret;
1798 struct uffdio_writeprotect uffdio_wp;
1799 struct uffdio_writeprotect __user *user_uffdio_wp;
1800 struct userfaultfd_wake_range range;
1801 bool mode_wp, mode_dontwake;
1802
1803 if (atomic_read(v: &ctx->mmap_changing))
1804 return -EAGAIN;
1805
1806 user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1807
1808 if (copy_from_user(to: &uffdio_wp, from: user_uffdio_wp,
1809 n: sizeof(struct uffdio_writeprotect)))
1810 return -EFAULT;
1811
1812 ret = validate_range(mm: ctx->mm, start: uffdio_wp.range.start,
1813 len: uffdio_wp.range.len);
1814 if (ret)
1815 return ret;
1816
1817 if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1818 UFFDIO_WRITEPROTECT_MODE_WP))
1819 return -EINVAL;
1820
1821 mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1822 mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1823
1824 if (mode_wp && mode_dontwake)
1825 return -EINVAL;
1826
1827 if (mmget_not_zero(mm: ctx->mm)) {
1828 ret = mwriteprotect_range(ctx, start: uffdio_wp.range.start,
1829 len: uffdio_wp.range.len, enable_wp: mode_wp);
1830 mmput(ctx->mm);
1831 } else {
1832 return -ESRCH;
1833 }
1834
1835 if (ret)
1836 return ret;
1837
1838 if (!mode_wp && !mode_dontwake) {
1839 range.start = uffdio_wp.range.start;
1840 range.len = uffdio_wp.range.len;
1841 wake_userfault(ctx, range: &range);
1842 }
1843 return ret;
1844}
1845
1846static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1847{
1848 __s64 ret;
1849 struct uffdio_continue uffdio_continue;
1850 struct uffdio_continue __user *user_uffdio_continue;
1851 struct userfaultfd_wake_range range;
1852 uffd_flags_t flags = 0;
1853
1854 user_uffdio_continue = (struct uffdio_continue __user *)arg;
1855
1856 ret = -EAGAIN;
1857 if (atomic_read(v: &ctx->mmap_changing))
1858 goto out;
1859
1860 ret = -EFAULT;
1861 if (copy_from_user(to: &uffdio_continue, from: user_uffdio_continue,
1862 /* don't copy the output fields */
1863 n: sizeof(uffdio_continue) - (sizeof(__s64))))
1864 goto out;
1865
1866 ret = validate_range(mm: ctx->mm, start: uffdio_continue.range.start,
1867 len: uffdio_continue.range.len);
1868 if (ret)
1869 goto out;
1870
1871 ret = -EINVAL;
1872 if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1873 UFFDIO_CONTINUE_MODE_WP))
1874 goto out;
1875 if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1876 flags |= MFILL_ATOMIC_WP;
1877
1878 if (mmget_not_zero(mm: ctx->mm)) {
1879 ret = mfill_atomic_continue(ctx, dst_start: uffdio_continue.range.start,
1880 len: uffdio_continue.range.len, flags);
1881 mmput(ctx->mm);
1882 } else {
1883 return -ESRCH;
1884 }
1885
1886 if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1887 return -EFAULT;
1888 if (ret < 0)
1889 goto out;
1890
1891 /* len == 0 would wake all */
1892 BUG_ON(!ret);
1893 range.len = ret;
1894 if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1895 range.start = uffdio_continue.range.start;
1896 wake_userfault(ctx, range: &range);
1897 }
1898 ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1899
1900out:
1901 return ret;
1902}

static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_poison uffdio_poison;
	struct uffdio_poison __user *user_uffdio_poison;
	struct userfaultfd_wake_range range;

	user_uffdio_poison = (struct uffdio_poison __user *)arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
			   /* don't copy the output fields */
			   sizeof(uffdio_poison) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, uffdio_poison.range.start,
			     uffdio_poison.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
					  uffdio_poison.range.len, 0);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
		range.start = uffdio_poison.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
	return ret;
}
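
/*
 * An illustrative userspace sketch of UFFDIO_POISON as handled above,
 * assuming <linux/userfaultfd.h>. Poisoning the range makes later
 * accesses report a memory error to the accessing task instead of
 * raising another userfault event; the helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static long poison_range(int uffd, unsigned long addr,
 *				 unsigned long len)
 *	{
 *		struct uffdio_poison poison = {
 *			.range = { .start = addr, .len = len },
 *			.mode = 0,
 *		};
 *
 *		if (ioctl(uffd, UFFDIO_POISON, &poison))
 *			return -1;
 *		return poison.updated;	// bytes handled, written back above
 *	}
 */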

bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
	return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide. Set
	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
	 */
	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

static int userfaultfd_move(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_move uffdio_move;
	struct uffdio_move __user *user_uffdio_move;
	struct userfaultfd_wake_range range;
	struct mm_struct *mm = ctx->mm;

	user_uffdio_move = (struct uffdio_move __user *) arg;

	if (atomic_read(&ctx->mmap_changing))
		return -EAGAIN;

	if (copy_from_user(&uffdio_move, user_uffdio_move,
			   /* don't copy the output field "move" */
			   sizeof(uffdio_move) - sizeof(__s64)))
		return -EFAULT;

	/* Do not allow cross-mm moves. */
	if (mm != current->mm)
		return -EINVAL;

	ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
	if (ret)
		return ret;

	ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
	if (ret)
		return ret;

	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES |
				 UFFDIO_MOVE_MODE_DONTWAKE))
		return -EINVAL;

	if (mmget_not_zero(mm)) {
		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
				 uffdio_move.len, uffdio_move.mode);
		mmput(mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_move->move)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	VM_WARN_ON(!ret);
	range.len = ret;
	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
		range.start = uffdio_move.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_move.len ? 0 : -EAGAIN;

out:
	return ret;
}
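
/*
 * An illustrative userspace sketch of UFFDIO_MOVE as handled above,
 * assuming <linux/userfaultfd.h>. Both src and dst must be in the
 * caller's own mm (cross-mm moves are rejected above); the helper name
 * is invented and error handling is omitted.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static long move_range(int uffd, unsigned long src,
 *			       unsigned long dst, unsigned long len)
 *	{
 *		struct uffdio_move mv = {
 *			.src = src,
 *			.dst = dst,
 *			.len = len,
 *			.mode = 0,	// or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES
 *		};
 *
 *		if (ioctl(uffd, UFFDIO_MOVE, &mv))
 *			return -1;
 *		return mv.move;		// bytes moved, written back above
 *	}
 */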

/*
 * Userland asks for a certain API version, and we return which feature
 * bits and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if the version is unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	unsigned int ctx_features;
	int ret;
	__u64 features;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	ret = -EINVAL;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
		goto err_out;
	ret = -EPERM;
	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
		goto err_out;

	/* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
	if (features & UFFD_FEATURE_WP_ASYNC)
		features |= UFFD_FEATURE_WP_UNPOPULATED;

	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
	uffdio_api.features &=
		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;

	/* only enable the requested features for this uffd context */
	ctx_features = uffd_ctx_features(features);
	ret = -EINVAL;
	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
		goto err_out;

	ret = 0;
out:
	return ret;
err_out:
	memset(&uffdio_api, 0, sizeof(uffdio_api));
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		ret = -EFAULT;
	goto out;
}
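
/*
 * A hedged sketch of the userspace side of the UFFDIO_API handshake
 * implemented above (UAPI from <linux/userfaultfd.h>). The handshake
 * must complete before any other ioctl is accepted; see the
 * userfaultfd_is_initialized() check in userfaultfd_ioctl() below. The
 * helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static int uffd_handshake(int uffd, __u64 wanted_features)
 *	{
 *		struct uffdio_api api = {
 *			.api = UFFD_API,
 *			// Request only the features we intend to use; on
 *			// return api.features reports everything this
 *			// kernel supports.
 *			.features = wanted_features,
 *		};
 *
 *		return ioctl(uffd, UFFDIO_API, &api);
 *	}
 */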

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
		return -EINVAL;

	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	case UFFDIO_MOVE:
		ret = userfaultfd_move(ctx, arg);
		break;
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	case UFFDIO_CONTINUE:
		ret = userfaultfd_continue(ctx, arg);
		break;
	case UFFDIO_POISON:
		ret = userfaultfd_poison(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols are added in the future, they will all be
	 * shown here, separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo = userfaultfd_show_fdinfo,
#endif
	.release = userfaultfd_release,
	.poll = userfaultfd_poll,
	.read = userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.llseek = noop_llseek,
};

static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

static int new_userfaultfd(int flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency. */
	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	refcount_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->released = false;
	init_rwsem(&ctx->map_changing_lock);
	atomic_set(&ctx->mmap_changing, 0);
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	mmgrab(ctx->mm);

	/* Create a new inode so that the LSM can block the creation. */
	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
	if (fd < 0) {
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}

static inline bool userfaultfd_syscall_allowed(int flags)
{
	/* Userspace-only page faults are always allowed */
	if (flags & UFFD_USER_MODE_ONLY)
		return true;

	/*
	 * The user is requesting a userfaultfd which can handle kernel faults.
	 * Privileged users are always allowed to do this.
	 */
	if (capable(CAP_SYS_PTRACE))
		return true;

	/* Otherwise, access to kernel fault handling is sysctl controlled. */
	return sysctl_unprivileged_userfaultfd;
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	if (!userfaultfd_syscall_allowed(flags))
		return -EPERM;

	return new_userfaultfd(flags);
}
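
/*
 * An illustrative way to reach the syscall above from userspace. C
 * libraries commonly do not provide a userfaultfd() wrapper, so callers
 * go through syscall(2); UFFD_USER_MODE_ONLY keeps the descriptor usable
 * without CAP_SYS_PTRACE or the vm.unprivileged_userfaultfd sysctl, per
 * the policy above. The helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *
 *	static int open_uffd(void)
 *	{
 *		return syscall(__NR_userfaultfd,
 *			       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 *	}
 */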

static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
	if (cmd != USERFAULTFD_IOC_NEW)
		return -EINVAL;

	return new_userfaultfd(flags);
}
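
/*
 * A minimal sketch of the /dev/userfaultfd path served by
 * userfaultfd_dev_ioctl() above. Access here is governed by file
 * permissions on the device node rather than by
 * userfaultfd_syscall_allowed(); the open flags below are an assumption
 * and the helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int open_uffd_via_dev(void)
 *	{
 *		int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
 *		int uffd;
 *
 *		if (dev < 0)
 *			return -1;
 *		uffd = ioctl(dev, USERFAULTFD_IOC_NEW,
 *			     O_CLOEXEC | O_NONBLOCK);
 *		close(dev);
 *		return uffd;
 *	}
 */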

static const struct file_operations userfaultfd_dev_fops = {
	.unlocked_ioctl = userfaultfd_dev_ioctl,
	.compat_ioctl = userfaultfd_dev_ioctl,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static struct miscdevice userfaultfd_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "userfaultfd",
	.fops = &userfaultfd_dev_fops
};

static int __init userfaultfd_init(void)
{
	int ret;

	ret = misc_register(&userfaultfd_misc);
	if (ret)
		return ret;

	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
	register_sysctl_init("vm", vm_userfaultfd_table);
#endif
	return 0;
}
__initcall(userfaultfd_init);