1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * fs/userfaultfd.c
4 *
5 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 * Copyright (C) 2008-2009 Red Hat, Inc.
7 * Copyright (C) 2015 Red Hat, Inc.
8 *
9 * Some part derived from fs/eventfd.c (anon inode setup) and
10 * mm/ksm.c (mm hashing).
11 */
12
13#include <linux/list.h>
14#include <linux/hashtable.h>
15#include <linux/sched/signal.h>
16#include <linux/sched/mm.h>
17#include <linux/mm.h>
18#include <linux/mm_inline.h>
19#include <linux/mmu_notifier.h>
20#include <linux/poll.h>
21#include <linux/slab.h>
22#include <linux/seq_file.h>
23#include <linux/file.h>
24#include <linux/bug.h>
25#include <linux/anon_inodes.h>
26#include <linux/syscalls.h>
27#include <linux/userfaultfd_k.h>
28#include <linux/mempolicy.h>
29#include <linux/ioctl.h>
30#include <linux/security.h>
31#include <linux/hugetlb.h>
32#include <linux/swapops.h>
33#include <linux/miscdevice.h>
34
35static int sysctl_unprivileged_userfaultfd __read_mostly;
36
37#ifdef CONFIG_SYSCTL
38static struct ctl_table vm_userfaultfd_table[] = {
39 {
40 .procname = "unprivileged_userfaultfd",
41 .data = &sysctl_unprivileged_userfaultfd,
42 .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
43 .mode = 0644,
44 .proc_handler = proc_dointvec_minmax,
45 .extra1 = SYSCTL_ZERO,
46 .extra2 = SYSCTL_ONE,
47 },
48};
49#endif
50
51static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
52
53struct userfaultfd_fork_ctx {
54 struct userfaultfd_ctx *orig;
55 struct userfaultfd_ctx *new;
56 struct list_head list;
57};
58
59struct userfaultfd_unmap_ctx {
60 struct userfaultfd_ctx *ctx;
61 unsigned long start;
62 unsigned long end;
63 struct list_head list;
64};
65
66struct userfaultfd_wait_queue {
67 struct uffd_msg msg;
68 wait_queue_entry_t wq;
69 struct userfaultfd_ctx *ctx;
70 bool waken;
71};
72
73struct userfaultfd_wake_range {
74 unsigned long start;
75 unsigned long len;
76};
77
78/* internal indication that UFFD_API ioctl was successfully executed */
79#define UFFD_FEATURE_INITIALIZED (1u << 31)
80
81static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
82{
83 return ctx->features & UFFD_FEATURE_INITIALIZED;
84}
85
86static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
87{
88 return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
89}
90
91/*
92 * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
93 * meaningful when userfaultfd_wp()==true on the vma and when it's
94 * anonymous.
95 */
96bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
97{
98 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
99
100 if (!ctx)
101 return false;
102
103 return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
104}
105
106static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
107 vm_flags_t flags)
108{
109 const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
110
111 vm_flags_reset(vma, flags);
112 /*
113 * For shared mappings, we want to enable writenotify while
114 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
115 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
116 */
117 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
118 vma_set_page_prot(vma);
119}
120
121static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
122 int wake_flags, void *key)
123{
124 struct userfaultfd_wake_range *range = key;
125 int ret;
126 struct userfaultfd_wait_queue *uwq;
127 unsigned long start, len;
128
129 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
130 ret = 0;
131 /* len == 0 means wake all */
132 start = range->start;
133 len = range->len;
134 if (len && (start > uwq->msg.arg.pagefault.address ||
135 start + len <= uwq->msg.arg.pagefault.address))
136 goto out;
137 WRITE_ONCE(uwq->waken, true);
138 /*
139 * The Program-Order guarantees provided by the scheduler
140 * ensure uwq->waken is visible before the task is woken.
141 */
142 ret = wake_up_state(tsk: wq->private, state: mode);
143 if (ret) {
144 /*
145 * Wake only once, autoremove behavior.
146 *
147 * After the effect of list_del_init is visible to the other
148 * CPUs, the waitqueue may disappear from under us, see the
149 * !list_empty_careful() in handle_userfault().
150 *
151 * try_to_wake_up() has an implicit smp_mb(), and the
152 * wq->private is read before calling the extern function
153 * "wake_up_state" (which in turns calls try_to_wake_up).
154 */
155 list_del_init(entry: &wq->entry);
156 }
157out:
158 return ret;
159}
160
161/**
162 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
163 * context.
164 * @ctx: [in] Pointer to the userfaultfd context.
165 */
166static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
167{
168 refcount_inc(r: &ctx->refcount);
169}
170
171/**
172 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
173 * context.
174 * @ctx: [in] Pointer to userfaultfd context.
175 *
176 * The userfaultfd context reference must have been previously acquired either
177 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
178 */
179static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
180{
181 if (refcount_dec_and_test(r: &ctx->refcount)) {
182 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
183 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
184 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
185 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
186 VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
187 VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
188 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
189 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
190 mmdrop(mm: ctx->mm);
191 kmem_cache_free(s: userfaultfd_ctx_cachep, objp: ctx);
192 }
193}
194
195static inline void msg_init(struct uffd_msg *msg)
196{
197 BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
198 /*
199 * Must use memset to zero out the paddings or kernel data is
200 * leaked to userland.
201 */
202 memset(msg, 0, sizeof(struct uffd_msg));
203}
204
205static inline struct uffd_msg userfault_msg(unsigned long address,
206 unsigned long real_address,
207 unsigned int flags,
208 unsigned long reason,
209 unsigned int features)
210{
211 struct uffd_msg msg;
212
213 msg_init(msg: &msg);
214 msg.event = UFFD_EVENT_PAGEFAULT;
215
216 msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
217 real_address : address;
218
219 /*
220 * These flags indicate why the userfault occurred:
221 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
222 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
223 * - Neither of these flags being set indicates a MISSING fault.
224 *
225 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
226 * fault. Otherwise, it was a read fault.
227 */
228 if (flags & FAULT_FLAG_WRITE)
229 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
230 if (reason & VM_UFFD_WP)
231 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
232 if (reason & VM_UFFD_MINOR)
233 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
234 if (features & UFFD_FEATURE_THREAD_ID)
235 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
236 return msg;
237}
238
239#ifdef CONFIG_HUGETLB_PAGE
240/*
241 * Same functionality as userfaultfd_must_wait below with modifications for
242 * hugepmd ranges.
243 */
244static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
245 struct vm_fault *vmf,
246 unsigned long reason)
247{
248 struct vm_area_struct *vma = vmf->vma;
249 pte_t *ptep, pte;
250 bool ret = true;
251
252 assert_fault_locked(vmf);
253
254 ptep = hugetlb_walk(vma, addr: vmf->address, sz: vma_mmu_pagesize(vma));
255 if (!ptep)
256 goto out;
257
258 ret = false;
259 pte = huge_ptep_get(ptep);
260
261 /*
262 * Lockless access: we're in a wait_event so it's ok if it
263 * changes under us. PTE markers should be handled the same as none
264 * ptes here.
265 */
266 if (huge_pte_none_mostly(pte))
267 ret = true;
268 if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
269 ret = true;
270out:
271 return ret;
272}
273#else
274static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
275 struct vm_fault *vmf,
276 unsigned long reason)
277{
278 return false; /* should never get here */
279}
280#endif /* CONFIG_HUGETLB_PAGE */
281
282/*
283 * Verify the pagetables are still not ok after having reigstered into
284 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
285 * userfault that has already been resolved, if userfaultfd_read and
286 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
287 * threads.
288 */
289static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
290 struct vm_fault *vmf,
291 unsigned long reason)
292{
293 struct mm_struct *mm = ctx->mm;
294 unsigned long address = vmf->address;
295 pgd_t *pgd;
296 p4d_t *p4d;
297 pud_t *pud;
298 pmd_t *pmd, _pmd;
299 pte_t *pte;
300 pte_t ptent;
301 bool ret = true;
302
303 assert_fault_locked(vmf);
304
305 pgd = pgd_offset(mm, address);
306 if (!pgd_present(pgd: *pgd))
307 goto out;
308 p4d = p4d_offset(pgd, address);
309 if (!p4d_present(p4d: *p4d))
310 goto out;
311 pud = pud_offset(p4d, address);
312 if (!pud_present(pud: *pud))
313 goto out;
314 pmd = pmd_offset(pud, address);
315again:
316 _pmd = pmdp_get_lockless(pmdp: pmd);
317 if (pmd_none(pmd: _pmd))
318 goto out;
319
320 ret = false;
321 if (!pmd_present(pmd: _pmd) || pmd_devmap(pmd: _pmd))
322 goto out;
323
324 if (pmd_trans_huge(pmd: _pmd)) {
325 if (!pmd_write(pmd: _pmd) && (reason & VM_UFFD_WP))
326 ret = true;
327 goto out;
328 }
329
330 pte = pte_offset_map(pmd, addr: address);
331 if (!pte) {
332 ret = true;
333 goto again;
334 }
335 /*
336 * Lockless access: we're in a wait_event so it's ok if it
337 * changes under us. PTE markers should be handled the same as none
338 * ptes here.
339 */
340 ptent = ptep_get(ptep: pte);
341 if (pte_none_mostly(pte: ptent))
342 ret = true;
343 if (!pte_write(pte: ptent) && (reason & VM_UFFD_WP))
344 ret = true;
345 pte_unmap(pte);
346
347out:
348 return ret;
349}
350
351static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
352{
353 if (flags & FAULT_FLAG_INTERRUPTIBLE)
354 return TASK_INTERRUPTIBLE;
355
356 if (flags & FAULT_FLAG_KILLABLE)
357 return TASK_KILLABLE;
358
359 return TASK_UNINTERRUPTIBLE;
360}
361
362/*
363 * The locking rules involved in returning VM_FAULT_RETRY depending on
364 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
365 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
366 * recommendation in __lock_page_or_retry is not an understatement.
367 *
368 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
369 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
370 * not set.
371 *
372 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
373 * set, VM_FAULT_RETRY can still be returned if and only if there are
374 * fatal_signal_pending()s, and the mmap_lock must be released before
375 * returning it.
376 */
377vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
378{
379 struct vm_area_struct *vma = vmf->vma;
380 struct mm_struct *mm = vma->vm_mm;
381 struct userfaultfd_ctx *ctx;
382 struct userfaultfd_wait_queue uwq;
383 vm_fault_t ret = VM_FAULT_SIGBUS;
384 bool must_wait;
385 unsigned int blocking_state;
386
387 /*
388 * We don't do userfault handling for the final child pid update.
389 *
390 * We also don't do userfault handling during
391 * coredumping. hugetlbfs has the special
392 * hugetlb_follow_page_mask() to skip missing pages in the
393 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
394 * the no_page_table() helper in follow_page_mask(), but the
395 * shmem_vm_ops->fault method is invoked even during
396 * coredumping and it ends up here.
397 */
398 if (current->flags & (PF_EXITING|PF_DUMPCORE))
399 goto out;
400
401 assert_fault_locked(vmf);
402
403 ctx = vma->vm_userfaultfd_ctx.ctx;
404 if (!ctx)
405 goto out;
406
407 BUG_ON(ctx->mm != mm);
408
409 /* Any unrecognized flag is a bug. */
410 VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
411 /* 0 or > 1 flags set is a bug; we expect exactly 1. */
412 VM_BUG_ON(!reason || (reason & (reason - 1)));
413
414 if (ctx->features & UFFD_FEATURE_SIGBUS)
415 goto out;
416 if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
417 goto out;
418
419 /*
420 * If it's already released don't get it. This avoids to loop
421 * in __get_user_pages if userfaultfd_release waits on the
422 * caller of handle_userfault to release the mmap_lock.
423 */
424 if (unlikely(READ_ONCE(ctx->released))) {
425 /*
426 * Don't return VM_FAULT_SIGBUS in this case, so a non
427 * cooperative manager can close the uffd after the
428 * last UFFDIO_COPY, without risking to trigger an
429 * involuntary SIGBUS if the process was starting the
430 * userfaultfd while the userfaultfd was still armed
431 * (but after the last UFFDIO_COPY). If the uffd
432 * wasn't already closed when the userfault reached
433 * this point, that would normally be solved by
434 * userfaultfd_must_wait returning 'false'.
435 *
436 * If we were to return VM_FAULT_SIGBUS here, the non
437 * cooperative manager would be instead forced to
438 * always call UFFDIO_UNREGISTER before it can safely
439 * close the uffd.
440 */
441 ret = VM_FAULT_NOPAGE;
442 goto out;
443 }
444
445 /*
446 * Check that we can return VM_FAULT_RETRY.
447 *
448 * NOTE: it should become possible to return VM_FAULT_RETRY
449 * even if FAULT_FLAG_TRIED is set without leading to gup()
450 * -EBUSY failures, if the userfaultfd is to be extended for
451 * VM_UFFD_WP tracking and we intend to arm the userfault
452 * without first stopping userland access to the memory. For
453 * VM_UFFD_MISSING userfaults this is enough for now.
454 */
455 if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
456 /*
457 * Validate the invariant that nowait must allow retry
458 * to be sure not to return SIGBUS erroneously on
459 * nowait invocations.
460 */
461 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
462#ifdef CONFIG_DEBUG_VM
463 if (printk_ratelimit()) {
464 printk(KERN_WARNING
465 "FAULT_FLAG_ALLOW_RETRY missing %x\n",
466 vmf->flags);
467 dump_stack();
468 }
469#endif
470 goto out;
471 }
472
473 /*
474 * Handle nowait, not much to do other than tell it to retry
475 * and wait.
476 */
477 ret = VM_FAULT_RETRY;
478 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
479 goto out;
480
481 /* take the reference before dropping the mmap_lock */
482 userfaultfd_ctx_get(ctx);
483
484 init_waitqueue_func_entry(wq_entry: &uwq.wq, func: userfaultfd_wake_function);
485 uwq.wq.private = current;
486 uwq.msg = userfault_msg(address: vmf->address, real_address: vmf->real_address, flags: vmf->flags,
487 reason, features: ctx->features);
488 uwq.ctx = ctx;
489 uwq.waken = false;
490
491 blocking_state = userfaultfd_get_blocking_state(flags: vmf->flags);
492
493 /*
494 * Take the vma lock now, in order to safely call
495 * userfaultfd_huge_must_wait() later. Since acquiring the
496 * (sleepable) vma lock can modify the current task state, that
497 * must be before explicitly calling set_current_state().
498 */
499 if (is_vm_hugetlb_page(vma))
500 hugetlb_vma_lock_read(vma);
501
502 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
503 /*
504 * After the __add_wait_queue the uwq is visible to userland
505 * through poll/read().
506 */
507 __add_wait_queue(wq_head: &ctx->fault_pending_wqh, wq_entry: &uwq.wq);
508 /*
509 * The smp_mb() after __set_current_state prevents the reads
510 * following the spin_unlock to happen before the list_add in
511 * __add_wait_queue.
512 */
513 set_current_state(blocking_state);
514 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
515
516 if (!is_vm_hugetlb_page(vma))
517 must_wait = userfaultfd_must_wait(ctx, vmf, reason);
518 else
519 must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
520 if (is_vm_hugetlb_page(vma))
521 hugetlb_vma_unlock_read(vma);
522 release_fault_lock(vmf);
523
524 if (likely(must_wait && !READ_ONCE(ctx->released))) {
525 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
526 schedule();
527 }
528
529 __set_current_state(TASK_RUNNING);
530
531 /*
532 * Here we race with the list_del; list_add in
533 * userfaultfd_ctx_read(), however because we don't ever run
534 * list_del_init() to refile across the two lists, the prev
535 * and next pointers will never point to self. list_add also
536 * would never let any of the two pointers to point to
537 * self. So list_empty_careful won't risk to see both pointers
538 * pointing to self at any time during the list refile. The
539 * only case where list_del_init() is called is the full
540 * removal in the wake function and there we don't re-list_add
541 * and it's fine not to block on the spinlock. The uwq on this
542 * kernel stack can be released after the list_del_init.
543 */
544 if (!list_empty_careful(head: &uwq.wq.entry)) {
545 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
546 /*
547 * No need of list_del_init(), the uwq on the stack
548 * will be freed shortly anyway.
549 */
550 list_del(entry: &uwq.wq.entry);
551 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
552 }
553
554 /*
555 * ctx may go away after this if the userfault pseudo fd is
556 * already released.
557 */
558 userfaultfd_ctx_put(ctx);
559
560out:
561 return ret;
562}
563
564static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
565 struct userfaultfd_wait_queue *ewq)
566{
567 struct userfaultfd_ctx *release_new_ctx;
568
569 if (WARN_ON_ONCE(current->flags & PF_EXITING))
570 goto out;
571
572 ewq->ctx = ctx;
573 init_waitqueue_entry(wq_entry: &ewq->wq, current);
574 release_new_ctx = NULL;
575
576 spin_lock_irq(lock: &ctx->event_wqh.lock);
577 /*
578 * After the __add_wait_queue the uwq is visible to userland
579 * through poll/read().
580 */
581 __add_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
582 for (;;) {
583 set_current_state(TASK_KILLABLE);
584 if (ewq->msg.event == 0)
585 break;
586 if (READ_ONCE(ctx->released) ||
587 fatal_signal_pending(current)) {
588 /*
589 * &ewq->wq may be queued in fork_event, but
590 * __remove_wait_queue ignores the head
591 * parameter. It would be a problem if it
592 * didn't.
593 */
594 __remove_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
595 if (ewq->msg.event == UFFD_EVENT_FORK) {
596 struct userfaultfd_ctx *new;
597
598 new = (struct userfaultfd_ctx *)
599 (unsigned long)
600 ewq->msg.arg.reserved.reserved1;
601 release_new_ctx = new;
602 }
603 break;
604 }
605
606 spin_unlock_irq(lock: &ctx->event_wqh.lock);
607
608 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
609 schedule();
610
611 spin_lock_irq(lock: &ctx->event_wqh.lock);
612 }
613 __set_current_state(TASK_RUNNING);
614 spin_unlock_irq(lock: &ctx->event_wqh.lock);
615
616 if (release_new_ctx) {
617 struct vm_area_struct *vma;
618 struct mm_struct *mm = release_new_ctx->mm;
619 VMA_ITERATOR(vmi, mm, 0);
620
621 /* the various vma->vm_userfaultfd_ctx still points to it */
622 mmap_write_lock(mm);
623 for_each_vma(vmi, vma) {
624 if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
625 vma_start_write(vma);
626 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
627 userfaultfd_set_vm_flags(vma,
628 flags: vma->vm_flags & ~__VM_UFFD_FLAGS);
629 }
630 }
631 mmap_write_unlock(mm);
632
633 userfaultfd_ctx_put(ctx: release_new_ctx);
634 }
635
636 /*
637 * ctx may go away after this if the userfault pseudo fd is
638 * already released.
639 */
640out:
641 atomic_dec(v: &ctx->mmap_changing);
642 VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
643 userfaultfd_ctx_put(ctx);
644}
645
646static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
647 struct userfaultfd_wait_queue *ewq)
648{
649 ewq->msg.event = 0;
650 wake_up_locked(&ctx->event_wqh);
651 __remove_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &ewq->wq);
652}
653
654int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
655{
656 struct userfaultfd_ctx *ctx = NULL, *octx;
657 struct userfaultfd_fork_ctx *fctx;
658
659 octx = vma->vm_userfaultfd_ctx.ctx;
660 if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
661 vma_start_write(vma);
662 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
663 userfaultfd_set_vm_flags(vma, flags: vma->vm_flags & ~__VM_UFFD_FLAGS);
664 return 0;
665 }
666
667 list_for_each_entry(fctx, fcs, list)
668 if (fctx->orig == octx) {
669 ctx = fctx->new;
670 break;
671 }
672
673 if (!ctx) {
674 fctx = kmalloc(size: sizeof(*fctx), GFP_KERNEL);
675 if (!fctx)
676 return -ENOMEM;
677
678 ctx = kmem_cache_alloc(cachep: userfaultfd_ctx_cachep, GFP_KERNEL);
679 if (!ctx) {
680 kfree(objp: fctx);
681 return -ENOMEM;
682 }
683
684 refcount_set(r: &ctx->refcount, n: 1);
685 ctx->flags = octx->flags;
686 ctx->features = octx->features;
687 ctx->released = false;
688 init_rwsem(&ctx->map_changing_lock);
689 atomic_set(v: &ctx->mmap_changing, i: 0);
690 ctx->mm = vma->vm_mm;
691 mmgrab(mm: ctx->mm);
692
693 userfaultfd_ctx_get(ctx: octx);
694 down_write(sem: &octx->map_changing_lock);
695 atomic_inc(v: &octx->mmap_changing);
696 up_write(sem: &octx->map_changing_lock);
697 fctx->orig = octx;
698 fctx->new = ctx;
699 list_add_tail(new: &fctx->list, head: fcs);
700 }
701
702 vma->vm_userfaultfd_ctx.ctx = ctx;
703 return 0;
704}
705
706static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
707{
708 struct userfaultfd_ctx *ctx = fctx->orig;
709 struct userfaultfd_wait_queue ewq;
710
711 msg_init(msg: &ewq.msg);
712
713 ewq.msg.event = UFFD_EVENT_FORK;
714 ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
715
716 userfaultfd_event_wait_completion(ctx, ewq: &ewq);
717}
718
719void dup_userfaultfd_complete(struct list_head *fcs)
720{
721 struct userfaultfd_fork_ctx *fctx, *n;
722
723 list_for_each_entry_safe(fctx, n, fcs, list) {
724 dup_fctx(fctx);
725 list_del(entry: &fctx->list);
726 kfree(objp: fctx);
727 }
728}
729
730void mremap_userfaultfd_prep(struct vm_area_struct *vma,
731 struct vm_userfaultfd_ctx *vm_ctx)
732{
733 struct userfaultfd_ctx *ctx;
734
735 ctx = vma->vm_userfaultfd_ctx.ctx;
736
737 if (!ctx)
738 return;
739
740 if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
741 vm_ctx->ctx = ctx;
742 userfaultfd_ctx_get(ctx);
743 down_write(sem: &ctx->map_changing_lock);
744 atomic_inc(v: &ctx->mmap_changing);
745 up_write(sem: &ctx->map_changing_lock);
746 } else {
747 /* Drop uffd context if remap feature not enabled */
748 vma_start_write(vma);
749 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
750 userfaultfd_set_vm_flags(vma, flags: vma->vm_flags & ~__VM_UFFD_FLAGS);
751 }
752}
753
754void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
755 unsigned long from, unsigned long to,
756 unsigned long len)
757{
758 struct userfaultfd_ctx *ctx = vm_ctx->ctx;
759 struct userfaultfd_wait_queue ewq;
760
761 if (!ctx)
762 return;
763
764 if (to & ~PAGE_MASK) {
765 userfaultfd_ctx_put(ctx);
766 return;
767 }
768
769 msg_init(msg: &ewq.msg);
770
771 ewq.msg.event = UFFD_EVENT_REMAP;
772 ewq.msg.arg.remap.from = from;
773 ewq.msg.arg.remap.to = to;
774 ewq.msg.arg.remap.len = len;
775
776 userfaultfd_event_wait_completion(ctx, ewq: &ewq);
777}
778
779bool userfaultfd_remove(struct vm_area_struct *vma,
780 unsigned long start, unsigned long end)
781{
782 struct mm_struct *mm = vma->vm_mm;
783 struct userfaultfd_ctx *ctx;
784 struct userfaultfd_wait_queue ewq;
785
786 ctx = vma->vm_userfaultfd_ctx.ctx;
787 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
788 return true;
789
790 userfaultfd_ctx_get(ctx);
791 down_write(sem: &ctx->map_changing_lock);
792 atomic_inc(v: &ctx->mmap_changing);
793 up_write(sem: &ctx->map_changing_lock);
794 mmap_read_unlock(mm);
795
796 msg_init(msg: &ewq.msg);
797
798 ewq.msg.event = UFFD_EVENT_REMOVE;
799 ewq.msg.arg.remove.start = start;
800 ewq.msg.arg.remove.end = end;
801
802 userfaultfd_event_wait_completion(ctx, ewq: &ewq);
803
804 return false;
805}
806
807static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
808 unsigned long start, unsigned long end)
809{
810 struct userfaultfd_unmap_ctx *unmap_ctx;
811
812 list_for_each_entry(unmap_ctx, unmaps, list)
813 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
814 unmap_ctx->end == end)
815 return true;
816
817 return false;
818}
819
820int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
821 unsigned long end, struct list_head *unmaps)
822{
823 struct userfaultfd_unmap_ctx *unmap_ctx;
824 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
825
826 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
827 has_unmap_ctx(ctx, unmaps, start, end))
828 return 0;
829
830 unmap_ctx = kzalloc(size: sizeof(*unmap_ctx), GFP_KERNEL);
831 if (!unmap_ctx)
832 return -ENOMEM;
833
834 userfaultfd_ctx_get(ctx);
835 down_write(sem: &ctx->map_changing_lock);
836 atomic_inc(v: &ctx->mmap_changing);
837 up_write(sem: &ctx->map_changing_lock);
838 unmap_ctx->ctx = ctx;
839 unmap_ctx->start = start;
840 unmap_ctx->end = end;
841 list_add_tail(new: &unmap_ctx->list, head: unmaps);
842
843 return 0;
844}
845
846void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
847{
848 struct userfaultfd_unmap_ctx *ctx, *n;
849 struct userfaultfd_wait_queue ewq;
850
851 list_for_each_entry_safe(ctx, n, uf, list) {
852 msg_init(msg: &ewq.msg);
853
854 ewq.msg.event = UFFD_EVENT_UNMAP;
855 ewq.msg.arg.remove.start = ctx->start;
856 ewq.msg.arg.remove.end = ctx->end;
857
858 userfaultfd_event_wait_completion(ctx: ctx->ctx, ewq: &ewq);
859
860 list_del(entry: &ctx->list);
861 kfree(objp: ctx);
862 }
863}
864
865static int userfaultfd_release(struct inode *inode, struct file *file)
866{
867 struct userfaultfd_ctx *ctx = file->private_data;
868 struct mm_struct *mm = ctx->mm;
869 struct vm_area_struct *vma, *prev;
870 /* len == 0 means wake all */
871 struct userfaultfd_wake_range range = { .len = 0, };
872 unsigned long new_flags;
873 VMA_ITERATOR(vmi, mm, 0);
874
875 WRITE_ONCE(ctx->released, true);
876
877 if (!mmget_not_zero(mm))
878 goto wakeup;
879
880 /*
881 * Flush page faults out of all CPUs. NOTE: all page faults
882 * must be retried without returning VM_FAULT_SIGBUS if
883 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
884 * changes while handle_userfault released the mmap_lock. So
885 * it's critical that released is set to true (above), before
886 * taking the mmap_lock for writing.
887 */
888 mmap_write_lock(mm);
889 prev = NULL;
890 for_each_vma(vmi, vma) {
891 cond_resched();
892 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
893 !!(vma->vm_flags & __VM_UFFD_FLAGS));
894 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
895 prev = vma;
896 continue;
897 }
898 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
899 vma = vma_modify_flags_uffd(vmi: &vmi, prev, vma, start: vma->vm_start,
900 end: vma->vm_end, new_flags,
901 NULL_VM_UFFD_CTX);
902
903 vma_start_write(vma);
904 userfaultfd_set_vm_flags(vma, flags: new_flags);
905 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
906
907 prev = vma;
908 }
909 mmap_write_unlock(mm);
910 mmput(mm);
911wakeup:
912 /*
913 * After no new page faults can wait on this fault_*wqh, flush
914 * the last page faults that may have been already waiting on
915 * the fault_*wqh.
916 */
917 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
918 __wake_up_locked_key(wq_head: &ctx->fault_pending_wqh, TASK_NORMAL, key: &range);
919 __wake_up(wq_head: &ctx->fault_wqh, TASK_NORMAL, nr: 1, key: &range);
920 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
921
922 /* Flush pending events that may still wait on event_wqh */
923 wake_up_all(&ctx->event_wqh);
924
925 wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
926 userfaultfd_ctx_put(ctx);
927 return 0;
928}
929
930/* fault_pending_wqh.lock must be hold by the caller */
931static inline struct userfaultfd_wait_queue *find_userfault_in(
932 wait_queue_head_t *wqh)
933{
934 wait_queue_entry_t *wq;
935 struct userfaultfd_wait_queue *uwq;
936
937 lockdep_assert_held(&wqh->lock);
938
939 uwq = NULL;
940 if (!waitqueue_active(wq_head: wqh))
941 goto out;
942 /* walk in reverse to provide FIFO behavior to read userfaults */
943 wq = list_last_entry(&wqh->head, typeof(*wq), entry);
944 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
945out:
946 return uwq;
947}
948
949static inline struct userfaultfd_wait_queue *find_userfault(
950 struct userfaultfd_ctx *ctx)
951{
952 return find_userfault_in(wqh: &ctx->fault_pending_wqh);
953}
954
955static inline struct userfaultfd_wait_queue *find_userfault_evt(
956 struct userfaultfd_ctx *ctx)
957{
958 return find_userfault_in(wqh: &ctx->event_wqh);
959}
960
961static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
962{
963 struct userfaultfd_ctx *ctx = file->private_data;
964 __poll_t ret;
965
966 poll_wait(filp: file, wait_address: &ctx->fd_wqh, p: wait);
967
968 if (!userfaultfd_is_initialized(ctx))
969 return EPOLLERR;
970
971 /*
972 * poll() never guarantees that read won't block.
973 * userfaults can be waken before they're read().
974 */
975 if (unlikely(!(file->f_flags & O_NONBLOCK)))
976 return EPOLLERR;
977 /*
978 * lockless access to see if there are pending faults
979 * __pollwait last action is the add_wait_queue but
980 * the spin_unlock would allow the waitqueue_active to
981 * pass above the actual list_add inside
982 * add_wait_queue critical section. So use a full
983 * memory barrier to serialize the list_add write of
984 * add_wait_queue() with the waitqueue_active read
985 * below.
986 */
987 ret = 0;
988 smp_mb();
989 if (waitqueue_active(wq_head: &ctx->fault_pending_wqh))
990 ret = EPOLLIN;
991 else if (waitqueue_active(wq_head: &ctx->event_wqh))
992 ret = EPOLLIN;
993
994 return ret;
995}
996
997static const struct file_operations userfaultfd_fops;
998
999static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1000 struct inode *inode,
1001 struct uffd_msg *msg)
1002{
1003 int fd;
1004
1005 fd = anon_inode_create_getfd(name: "[userfaultfd]", fops: &userfaultfd_fops, priv: new,
1006 O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), context_inode: inode);
1007 if (fd < 0)
1008 return fd;
1009
1010 msg->arg.reserved.reserved1 = 0;
1011 msg->arg.fork.ufd = fd;
1012 return 0;
1013}
1014
1015static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1016 struct uffd_msg *msg, struct inode *inode)
1017{
1018 ssize_t ret;
1019 DECLARE_WAITQUEUE(wait, current);
1020 struct userfaultfd_wait_queue *uwq;
1021 /*
1022 * Handling fork event requires sleeping operations, so
1023 * we drop the event_wqh lock, then do these ops, then
1024 * lock it back and wake up the waiter. While the lock is
1025 * dropped the ewq may go away so we keep track of it
1026 * carefully.
1027 */
1028 LIST_HEAD(fork_event);
1029 struct userfaultfd_ctx *fork_nctx = NULL;
1030
1031 /* always take the fd_wqh lock before the fault_pending_wqh lock */
1032 spin_lock_irq(lock: &ctx->fd_wqh.lock);
1033 __add_wait_queue(wq_head: &ctx->fd_wqh, wq_entry: &wait);
1034 for (;;) {
1035 set_current_state(TASK_INTERRUPTIBLE);
1036 spin_lock(lock: &ctx->fault_pending_wqh.lock);
1037 uwq = find_userfault(ctx);
1038 if (uwq) {
1039 /*
1040 * Use a seqcount to repeat the lockless check
1041 * in wake_userfault() to avoid missing
1042 * wakeups because during the refile both
1043 * waitqueue could become empty if this is the
1044 * only userfault.
1045 */
1046 write_seqcount_begin(&ctx->refile_seq);
1047
1048 /*
1049 * The fault_pending_wqh.lock prevents the uwq
1050 * to disappear from under us.
1051 *
1052 * Refile this userfault from
1053 * fault_pending_wqh to fault_wqh, it's not
1054 * pending anymore after we read it.
1055 *
1056 * Use list_del() by hand (as
1057 * userfaultfd_wake_function also uses
1058 * list_del_init() by hand) to be sure nobody
1059 * changes __remove_wait_queue() to use
1060 * list_del_init() in turn breaking the
1061 * !list_empty_careful() check in
1062 * handle_userfault(). The uwq->wq.head list
1063 * must never be empty at any time during the
1064 * refile, or the waitqueue could disappear
1065 * from under us. The "wait_queue_head_t"
1066 * parameter of __remove_wait_queue() is unused
1067 * anyway.
1068 */
1069 list_del(entry: &uwq->wq.entry);
1070 add_wait_queue(wq_head: &ctx->fault_wqh, wq_entry: &uwq->wq);
1071
1072 write_seqcount_end(&ctx->refile_seq);
1073
1074 /* careful to always initialize msg if ret == 0 */
1075 *msg = uwq->msg;
1076 spin_unlock(lock: &ctx->fault_pending_wqh.lock);
1077 ret = 0;
1078 break;
1079 }
1080 spin_unlock(lock: &ctx->fault_pending_wqh.lock);
1081
1082 spin_lock(lock: &ctx->event_wqh.lock);
1083 uwq = find_userfault_evt(ctx);
1084 if (uwq) {
1085 *msg = uwq->msg;
1086
1087 if (uwq->msg.event == UFFD_EVENT_FORK) {
1088 fork_nctx = (struct userfaultfd_ctx *)
1089 (unsigned long)
1090 uwq->msg.arg.reserved.reserved1;
1091 list_move(list: &uwq->wq.entry, head: &fork_event);
1092 /*
1093 * fork_nctx can be freed as soon as
1094 * we drop the lock, unless we take a
1095 * reference on it.
1096 */
1097 userfaultfd_ctx_get(ctx: fork_nctx);
1098 spin_unlock(lock: &ctx->event_wqh.lock);
1099 ret = 0;
1100 break;
1101 }
1102
1103 userfaultfd_event_complete(ctx, ewq: uwq);
1104 spin_unlock(lock: &ctx->event_wqh.lock);
1105 ret = 0;
1106 break;
1107 }
1108 spin_unlock(lock: &ctx->event_wqh.lock);
1109
1110 if (signal_pending(current)) {
1111 ret = -ERESTARTSYS;
1112 break;
1113 }
1114 if (no_wait) {
1115 ret = -EAGAIN;
1116 break;
1117 }
1118 spin_unlock_irq(lock: &ctx->fd_wqh.lock);
1119 schedule();
1120 spin_lock_irq(lock: &ctx->fd_wqh.lock);
1121 }
1122 __remove_wait_queue(wq_head: &ctx->fd_wqh, wq_entry: &wait);
1123 __set_current_state(TASK_RUNNING);
1124 spin_unlock_irq(lock: &ctx->fd_wqh.lock);
1125
1126 if (!ret && msg->event == UFFD_EVENT_FORK) {
1127 ret = resolve_userfault_fork(new: fork_nctx, inode, msg);
1128 spin_lock_irq(lock: &ctx->event_wqh.lock);
1129 if (!list_empty(head: &fork_event)) {
1130 /*
1131 * The fork thread didn't abort, so we can
1132 * drop the temporary refcount.
1133 */
1134 userfaultfd_ctx_put(ctx: fork_nctx);
1135
1136 uwq = list_first_entry(&fork_event,
1137 typeof(*uwq),
1138 wq.entry);
1139 /*
1140 * If fork_event list wasn't empty and in turn
1141 * the event wasn't already released by fork
1142 * (the event is allocated on fork kernel
1143 * stack), put the event back to its place in
1144 * the event_wq. fork_event head will be freed
1145 * as soon as we return so the event cannot
1146 * stay queued there no matter the current
1147 * "ret" value.
1148 */
1149 list_del(entry: &uwq->wq.entry);
1150 __add_wait_queue(wq_head: &ctx->event_wqh, wq_entry: &uwq->wq);
1151
1152 /*
1153 * Leave the event in the waitqueue and report
1154 * error to userland if we failed to resolve
1155 * the userfault fork.
1156 */
1157 if (likely(!ret))
1158 userfaultfd_event_complete(ctx, ewq: uwq);
1159 } else {
1160 /*
1161 * Here the fork thread aborted and the
1162 * refcount from the fork thread on fork_nctx
1163 * has already been released. We still hold
1164 * the reference we took before releasing the
1165 * lock above. If resolve_userfault_fork
1166 * failed we've to drop it because the
1167 * fork_nctx has to be freed in such case. If
1168 * it succeeded we'll hold it because the new
1169 * uffd references it.
1170 */
1171 if (ret)
1172 userfaultfd_ctx_put(ctx: fork_nctx);
1173 }
1174 spin_unlock_irq(lock: &ctx->event_wqh.lock);
1175 }
1176
1177 return ret;
1178}
1179
1180static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1181 size_t count, loff_t *ppos)
1182{
1183 struct userfaultfd_ctx *ctx = file->private_data;
1184 ssize_t _ret, ret = 0;
1185 struct uffd_msg msg;
1186 int no_wait = file->f_flags & O_NONBLOCK;
1187 struct inode *inode = file_inode(f: file);
1188
1189 if (!userfaultfd_is_initialized(ctx))
1190 return -EINVAL;
1191
1192 for (;;) {
1193 if (count < sizeof(msg))
1194 return ret ? ret : -EINVAL;
1195 _ret = userfaultfd_ctx_read(ctx, no_wait, msg: &msg, inode);
1196 if (_ret < 0)
1197 return ret ? ret : _ret;
1198 if (copy_to_user(to: (__u64 __user *) buf, from: &msg, n: sizeof(msg)))
1199 return ret ? ret : -EFAULT;
1200 ret += sizeof(msg);
1201 buf += sizeof(msg);
1202 count -= sizeof(msg);
1203 /*
1204 * Allow to read more than one fault at time but only
1205 * block if waiting for the very first one.
1206 */
1207 no_wait = O_NONBLOCK;
1208 }
1209}
1210
1211static void __wake_userfault(struct userfaultfd_ctx *ctx,
1212 struct userfaultfd_wake_range *range)
1213{
1214 spin_lock_irq(lock: &ctx->fault_pending_wqh.lock);
1215 /* wake all in the range and autoremove */
1216 if (waitqueue_active(wq_head: &ctx->fault_pending_wqh))
1217 __wake_up_locked_key(wq_head: &ctx->fault_pending_wqh, TASK_NORMAL,
1218 key: range);
1219 if (waitqueue_active(wq_head: &ctx->fault_wqh))
1220 __wake_up(wq_head: &ctx->fault_wqh, TASK_NORMAL, nr: 1, key: range);
1221 spin_unlock_irq(lock: &ctx->fault_pending_wqh.lock);
1222}
1223
1224static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1225 struct userfaultfd_wake_range *range)
1226{
1227 unsigned seq;
1228 bool need_wakeup;
1229
1230 /*
1231 * To be sure waitqueue_active() is not reordered by the CPU
1232 * before the pagetable update, use an explicit SMP memory
1233 * barrier here. PT lock release or mmap_read_unlock(mm) still
1234 * have release semantics that can allow the
1235 * waitqueue_active() to be reordered before the pte update.
1236 */
1237 smp_mb();
1238
1239 /*
1240 * Use waitqueue_active because it's very frequent to
1241 * change the address space atomically even if there are no
1242 * userfaults yet. So we take the spinlock only when we're
1243 * sure we've userfaults to wake.
1244 */
1245 do {
1246 seq = read_seqcount_begin(&ctx->refile_seq);
1247 need_wakeup = waitqueue_active(wq_head: &ctx->fault_pending_wqh) ||
1248 waitqueue_active(wq_head: &ctx->fault_wqh);
1249 cond_resched();
1250 } while (read_seqcount_retry(&ctx->refile_seq, seq));
1251 if (need_wakeup)
1252 __wake_userfault(ctx, range);
1253}
1254
1255static __always_inline int validate_unaligned_range(
1256 struct mm_struct *mm, __u64 start, __u64 len)
1257{
1258 __u64 task_size = mm->task_size;
1259
1260 if (len & ~PAGE_MASK)
1261 return -EINVAL;
1262 if (!len)
1263 return -EINVAL;
1264 if (start < mmap_min_addr)
1265 return -EINVAL;
1266 if (start >= task_size)
1267 return -EINVAL;
1268 if (len > task_size - start)
1269 return -EINVAL;
1270 if (start + len <= start)
1271 return -EINVAL;
1272 return 0;
1273}
1274
1275static __always_inline int validate_range(struct mm_struct *mm,
1276 __u64 start, __u64 len)
1277{
1278 if (start & ~PAGE_MASK)
1279 return -EINVAL;
1280
1281 return validate_unaligned_range(mm, start, len);
1282}
1283
1284static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1285 unsigned long arg)
1286{
1287 struct mm_struct *mm = ctx->mm;
1288 struct vm_area_struct *vma, *prev, *cur;
1289 int ret;
1290 struct uffdio_register uffdio_register;
1291 struct uffdio_register __user *user_uffdio_register;
1292 unsigned long vm_flags, new_flags;
1293 bool found;
1294 bool basic_ioctls;
1295 unsigned long start, end, vma_end;
1296 struct vma_iterator vmi;
1297 bool wp_async = userfaultfd_wp_async_ctx(ctx);
1298
1299 user_uffdio_register = (struct uffdio_register __user *) arg;
1300
1301 ret = -EFAULT;
1302 if (copy_from_user(to: &uffdio_register, from: user_uffdio_register,
1303 n: sizeof(uffdio_register)-sizeof(__u64)))
1304 goto out;
1305
1306 ret = -EINVAL;
1307 if (!uffdio_register.mode)
1308 goto out;
1309 if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1310 goto out;
1311 vm_flags = 0;
1312 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1313 vm_flags |= VM_UFFD_MISSING;
1314 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1315#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1316 goto out;
1317#endif
1318 vm_flags |= VM_UFFD_WP;
1319 }
1320 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1321#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1322 goto out;
1323#endif
1324 vm_flags |= VM_UFFD_MINOR;
1325 }
1326
1327 ret = validate_range(mm, start: uffdio_register.range.start,
1328 len: uffdio_register.range.len);
1329 if (ret)
1330 goto out;
1331
1332 start = uffdio_register.range.start;
1333 end = start + uffdio_register.range.len;
1334
1335 ret = -ENOMEM;
1336 if (!mmget_not_zero(mm))
1337 goto out;
1338
1339 ret = -EINVAL;
1340 mmap_write_lock(mm);
1341 vma_iter_init(vmi: &vmi, mm, addr: start);
1342 vma = vma_find(vmi: &vmi, max: end);
1343 if (!vma)
1344 goto out_unlock;
1345
1346 /*
1347 * If the first vma contains huge pages, make sure start address
1348 * is aligned to huge page size.
1349 */
1350 if (is_vm_hugetlb_page(vma)) {
1351 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1352
1353 if (start & (vma_hpagesize - 1))
1354 goto out_unlock;
1355 }
1356
1357 /*
1358 * Search for not compatible vmas.
1359 */
1360 found = false;
1361 basic_ioctls = false;
1362 cur = vma;
1363 do {
1364 cond_resched();
1365
1366 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1367 !!(cur->vm_flags & __VM_UFFD_FLAGS));
1368
1369 /* check not compatible vmas */
1370 ret = -EINVAL;
1371 if (!vma_can_userfault(vma: cur, vm_flags, wp_async))
1372 goto out_unlock;
1373
1374 /*
1375 * UFFDIO_COPY will fill file holes even without
1376 * PROT_WRITE. This check enforces that if this is a
1377 * MAP_SHARED, the process has write permission to the backing
1378 * file. If VM_MAYWRITE is set it also enforces that on a
1379 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1380 * F_WRITE_SEAL can be taken until the vma is destroyed.
1381 */
1382 ret = -EPERM;
1383 if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1384 goto out_unlock;
1385
1386 /*
1387 * If this vma contains ending address, and huge pages
1388 * check alignment.
1389 */
1390 if (is_vm_hugetlb_page(vma: cur) && end <= cur->vm_end &&
1391 end > cur->vm_start) {
1392 unsigned long vma_hpagesize = vma_kernel_pagesize(vma: cur);
1393
1394 ret = -EINVAL;
1395
1396 if (end & (vma_hpagesize - 1))
1397 goto out_unlock;
1398 }
1399 if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1400 goto out_unlock;
1401
1402 /*
1403 * Check that this vma isn't already owned by a
1404 * different userfaultfd. We can't allow more than one
1405 * userfaultfd to own a single vma simultaneously or we
1406 * wouldn't know which one to deliver the userfaults to.
1407 */
1408 ret = -EBUSY;
1409 if (cur->vm_userfaultfd_ctx.ctx &&
1410 cur->vm_userfaultfd_ctx.ctx != ctx)
1411 goto out_unlock;
1412
1413 /*
1414 * Note vmas containing huge pages
1415 */
1416 if (is_vm_hugetlb_page(vma: cur))
1417 basic_ioctls = true;
1418
1419 found = true;
1420 } for_each_vma_range(vmi, cur, end);
1421 BUG_ON(!found);
1422
1423 vma_iter_set(vmi: &vmi, addr: start);
1424 prev = vma_prev(vmi: &vmi);
1425 if (vma->vm_start < start)
1426 prev = vma;
1427
1428 ret = 0;
1429 for_each_vma_range(vmi, vma, end) {
1430 cond_resched();
1431
1432 BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
1433 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1434 vma->vm_userfaultfd_ctx.ctx != ctx);
1435 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1436
1437 /*
1438 * Nothing to do: this vma is already registered into this
1439 * userfaultfd and with the right tracking mode too.
1440 */
1441 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1442 (vma->vm_flags & vm_flags) == vm_flags)
1443 goto skip;
1444
1445 if (vma->vm_start > start)
1446 start = vma->vm_start;
1447 vma_end = min(end, vma->vm_end);
1448
1449 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
1450 vma = vma_modify_flags_uffd(vmi: &vmi, prev, vma, start, end: vma_end,
1451 new_flags,
1452 new_ctx: (struct vm_userfaultfd_ctx){ctx});
1453 if (IS_ERR(ptr: vma)) {
1454 ret = PTR_ERR(ptr: vma);
1455 break;
1456 }
1457
1458 /*
1459 * In the vma_merge() successful mprotect-like case 8:
1460 * the next vma was merged into the current one and
1461 * the current one has not been updated yet.
1462 */
1463 vma_start_write(vma);
1464 userfaultfd_set_vm_flags(vma, flags: new_flags);
1465 vma->vm_userfaultfd_ctx.ctx = ctx;
1466
1467 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1468 hugetlb_unshare_all_pmds(vma);
1469
1470 skip:
1471 prev = vma;
1472 start = vma->vm_end;
1473 }
1474
1475out_unlock:
1476 mmap_write_unlock(mm);
1477 mmput(mm);
1478 if (!ret) {
1479 __u64 ioctls_out;
1480
1481 ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1482 UFFD_API_RANGE_IOCTLS;
1483
1484 /*
1485 * Declare the WP ioctl only if the WP mode is
1486 * specified and all checks passed with the range
1487 */
1488 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1489 ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1490
1491 /* CONTINUE ioctl is only supported for MINOR ranges. */
1492 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1493 ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1494
1495 /*
1496 * Now that we scanned all vmas we can already tell
1497 * userland which ioctls methods are guaranteed to
1498 * succeed on this range.
1499 */
1500 if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1501 ret = -EFAULT;
1502 }
1503out:
1504 return ret;
1505}
1506
1507static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1508 unsigned long arg)
1509{
1510 struct mm_struct *mm = ctx->mm;
1511 struct vm_area_struct *vma, *prev, *cur;
1512 int ret;
1513 struct uffdio_range uffdio_unregister;
1514 unsigned long new_flags;
1515 bool found;
1516 unsigned long start, end, vma_end;
1517 const void __user *buf = (void __user *)arg;
1518 struct vma_iterator vmi;
1519 bool wp_async = userfaultfd_wp_async_ctx(ctx);
1520
1521 ret = -EFAULT;
1522 if (copy_from_user(to: &uffdio_unregister, from: buf, n: sizeof(uffdio_unregister)))
1523 goto out;
1524
1525 ret = validate_range(mm, start: uffdio_unregister.start,
1526 len: uffdio_unregister.len);
1527 if (ret)
1528 goto out;
1529
1530 start = uffdio_unregister.start;
1531 end = start + uffdio_unregister.len;
1532
1533 ret = -ENOMEM;
1534 if (!mmget_not_zero(mm))
1535 goto out;
1536
1537 mmap_write_lock(mm);
1538 ret = -EINVAL;
1539 vma_iter_init(vmi: &vmi, mm, addr: start);
1540 vma = vma_find(vmi: &vmi, max: end);
1541 if (!vma)
1542 goto out_unlock;
1543
1544 /*
1545 * If the first vma contains huge pages, make sure start address
1546 * is aligned to huge page size.
1547 */
1548 if (is_vm_hugetlb_page(vma)) {
1549 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1550
1551 if (start & (vma_hpagesize - 1))
1552 goto out_unlock;
1553 }
1554
1555 /*
1556 * Search for not compatible vmas.
1557 */
1558 found = false;
1559 cur = vma;
1560 do {
1561 cond_resched();
1562
1563 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1564 !!(cur->vm_flags & __VM_UFFD_FLAGS));
1565
1566 /*
1567 * Check not compatible vmas, not strictly required
1568 * here as not compatible vmas cannot have an
1569 * userfaultfd_ctx registered on them, but this
1570 * provides for more strict behavior to notice
1571 * unregistration errors.
1572 */
1573 if (!vma_can_userfault(vma: cur, vm_flags: cur->vm_flags, wp_async))
1574 goto out_unlock;
1575
1576 found = true;
1577 } for_each_vma_range(vmi, cur, end);
1578 BUG_ON(!found);
1579
1580 vma_iter_set(vmi: &vmi, addr: start);
1581 prev = vma_prev(vmi: &vmi);
1582 if (vma->vm_start < start)
1583 prev = vma;
1584
1585 ret = 0;
1586 for_each_vma_range(vmi, vma, end) {
1587 cond_resched();
1588
1589 BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
1590
1591 /*
1592 * Nothing to do: this vma is already registered into this
1593 * userfaultfd and with the right tracking mode too.
1594 */
1595 if (!vma->vm_userfaultfd_ctx.ctx)
1596 goto skip;
1597
1598 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1599
1600 if (vma->vm_start > start)
1601 start = vma->vm_start;
1602 vma_end = min(end, vma->vm_end);
1603
1604 if (userfaultfd_missing(vma)) {
1605 /*
1606 * Wake any concurrent pending userfault while
1607 * we unregister, so they will not hang
1608 * permanently and it avoids userland to call
1609 * UFFDIO_WAKE explicitly.
1610 */
1611 struct userfaultfd_wake_range range;
1612 range.start = start;
1613 range.len = vma_end - start;
1614 wake_userfault(ctx: vma->vm_userfaultfd_ctx.ctx, range: &range);
1615 }
1616
1617 /* Reset ptes for the whole vma range if wr-protected */
1618 if (userfaultfd_wp(vma))
1619 uffd_wp_range(vma, start, len: vma_end - start, enable_wp: false);
1620
1621 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1622 vma = vma_modify_flags_uffd(vmi: &vmi, prev, vma, start, end: vma_end,
1623 new_flags, NULL_VM_UFFD_CTX);
1624 if (IS_ERR(ptr: vma)) {
1625 ret = PTR_ERR(ptr: vma);
1626 break;
1627 }
1628
1629 /*
1630 * In the vma_merge() successful mprotect-like case 8:
1631 * the next vma was merged into the current one and
1632 * the current one has not been updated yet.
1633 */
1634 vma_start_write(vma);
1635 userfaultfd_set_vm_flags(vma, flags: new_flags);
1636 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1637
1638 skip:
1639 prev = vma;
1640 start = vma->vm_end;
1641 }
1642
1643out_unlock:
1644 mmap_write_unlock(mm);
1645 mmput(mm);
1646out:
1647 return ret;
1648}
1649
1650/*
1651 * userfaultfd_wake may be used in combination with the
1652 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1653 */
1654static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1655 unsigned long arg)
1656{
1657 int ret;
1658 struct uffdio_range uffdio_wake;
1659 struct userfaultfd_wake_range range;
1660 const void __user *buf = (void __user *)arg;
1661
1662 ret = -EFAULT;
1663 if (copy_from_user(to: &uffdio_wake, from: buf, n: sizeof(uffdio_wake)))
1664 goto out;
1665
1666 ret = validate_range(mm: ctx->mm, start: uffdio_wake.start, len: uffdio_wake.len);
1667 if (ret)
1668 goto out;
1669
1670 range.start = uffdio_wake.start;
1671 range.len = uffdio_wake.len;
1672
1673 /*
1674 * len == 0 means wake all and we don't want to wake all here,
1675 * so check it again to be sure.
1676 */
1677 VM_BUG_ON(!range.len);
1678
1679 wake_userfault(ctx, range: &range);
1680 ret = 0;
1681
1682out:
1683 return ret;
1684}
1685
1686static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1687 unsigned long arg)
1688{
1689 __s64 ret;
1690 struct uffdio_copy uffdio_copy;
1691 struct uffdio_copy __user *user_uffdio_copy;
1692 struct userfaultfd_wake_range range;
1693 uffd_flags_t flags = 0;
1694
1695 user_uffdio_copy = (struct uffdio_copy __user *) arg;
1696
1697 ret = -EAGAIN;
1698 if (atomic_read(v: &ctx->mmap_changing))
1699 goto out;
1700
1701 ret = -EFAULT;
1702 if (copy_from_user(to: &uffdio_copy, from: user_uffdio_copy,
1703 /* don't copy "copy" last field */
1704 n: sizeof(uffdio_copy)-sizeof(__s64)))
1705 goto out;
1706
1707 ret = validate_unaligned_range(mm: ctx->mm, start: uffdio_copy.src,
1708 len: uffdio_copy.len);
1709 if (ret)
1710 goto out;
1711 ret = validate_range(mm: ctx->mm, start: uffdio_copy.dst, len: uffdio_copy.len);
1712 if (ret)
1713 goto out;
1714
1715 ret = -EINVAL;
1716 if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1717 goto out;
1718 if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1719 flags |= MFILL_ATOMIC_WP;
1720 if (mmget_not_zero(mm: ctx->mm)) {
1721 ret = mfill_atomic_copy(ctx, dst_start: uffdio_copy.dst, src_start: uffdio_copy.src,
1722 len: uffdio_copy.len, flags);
1723 mmput(ctx->mm);
1724 } else {
1725 return -ESRCH;
1726 }
1727 if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1728 return -EFAULT;
1729 if (ret < 0)
1730 goto out;
1731 BUG_ON(!ret);
1732 /* len == 0 would wake all */
1733 range.len = ret;
1734 if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1735 range.start = uffdio_copy.dst;
1736 wake_userfault(ctx, range: &range);
1737 }
1738 ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1739out:
1740 return ret;
1741}
1742
1743static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1744 unsigned long arg)
1745{
1746 __s64 ret;
1747 struct uffdio_zeropage uffdio_zeropage;
1748 struct uffdio_zeropage __user *user_uffdio_zeropage;
1749 struct userfaultfd_wake_range range;
1750
1751 user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1752
1753 ret = -EAGAIN;
1754 if (atomic_read(v: &ctx->mmap_changing))
1755 goto out;
1756
1757 ret = -EFAULT;
1758 if (copy_from_user(to: &uffdio_zeropage, from: user_uffdio_zeropage,
1759 /* don't copy "zeropage" last field */
1760 n: sizeof(uffdio_zeropage)-sizeof(__s64)))
1761 goto out;
1762
1763 ret = validate_range(mm: ctx->mm, start: uffdio_zeropage.range.start,
1764 len: uffdio_zeropage.range.len);
1765 if (ret)
1766 goto out;
1767 ret = -EINVAL;
1768 if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1769 goto out;
1770
1771 if (mmget_not_zero(mm: ctx->mm)) {
1772 ret = mfill_atomic_zeropage(ctx, dst_start: uffdio_zeropage.range.start,
1773 len: uffdio_zeropage.range.len);
1774 mmput(ctx->mm);
1775 } else {
1776 return -ESRCH;
1777 }
1778 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1779 return -EFAULT;
1780 if (ret < 0)
1781 goto out;
1782 /* len == 0 would wake all */
1783 BUG_ON(!ret);
1784 range.len = ret;
1785 if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1786 range.start = uffdio_zeropage.range.start;
1787 wake_userfault(ctx, range: &range);
1788 }
1789 ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1790out:
1791 return ret;
1792}
1793
1794static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1795 unsigned long arg)
1796{
1797 int ret;
1798 struct uffdio_writeprotect uffdio_wp;
1799 struct uffdio_writeprotect __user *user_uffdio_wp;
1800 struct userfaultfd_wake_range range;
1801 bool mode_wp, mode_dontwake;
1802
1803 if (atomic_read(v: &ctx->mmap_changing))
1804 return -EAGAIN;
1805
1806 user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1807
1808 if (copy_from_user(to: &uffdio_wp, from: user_uffdio_wp,
1809 n: sizeof(struct uffdio_writeprotect)))
1810 return -EFAULT;
1811
1812 ret = validate_range(mm: ctx->mm, start: uffdio_wp.range.start,
1813 len: uffdio_wp.range.len);
1814 if (ret)
1815 return ret;
1816
1817 if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1818 UFFDIO_WRITEPROTECT_MODE_WP))
1819 return -EINVAL;
1820
1821 mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1822 mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1823
1824 if (mode_wp && mode_dontwake)
1825 return -EINVAL;
1826
1827 if (mmget_not_zero(mm: ctx->mm)) {
1828 ret = mwriteprotect_range(ctx, start: uffdio_wp.range.start,
1829 len: uffdio_wp.range.len, enable_wp: mode_wp);
1830 mmput(ctx->mm);
1831 } else {
1832 return -ESRCH;
1833 }
1834
1835 if (ret)
1836 return ret;
1837
1838 if (!mode_wp && !mode_dontwake) {
1839 range.start = uffdio_wp.range.start;
1840 range.len = uffdio_wp.range.len;
1841 wake_userfault(ctx, range: &range);
1842 }
1843 return ret;
1844}
1845
1846static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1847{
1848 __s64 ret;
1849 struct uffdio_continue uffdio_continue;
1850 struct uffdio_continue __user *user_uffdio_continue;
1851 struct userfaultfd_wake_range range;
1852 uffd_flags_t flags = 0;
1853
1854 user_uffdio_continue = (struct uffdio_continue __user *)arg;
1855
1856 ret = -EAGAIN;
1857 if (atomic_read(v: &ctx->mmap_changing))
1858 goto out;
1859
1860 ret = -EFAULT;
1861 if (copy_from_user(to: &uffdio_continue, from: user_uffdio_continue,
1862 /* don't copy the output fields */
1863 n: sizeof(uffdio_continue) - (sizeof(__s64))))
1864 goto out;
1865
1866 ret = validate_range(mm: ctx->mm, start: uffdio_continue.range.start,
1867 len: uffdio_continue.range.len);
1868 if (ret)
1869 goto out;
1870
1871 ret = -EINVAL;
1872 if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1873 UFFDIO_CONTINUE_MODE_WP))
1874 goto out;
1875 if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1876 flags |= MFILL_ATOMIC_WP;
1877
1878 if (mmget_not_zero(mm: ctx->mm)) {
1879 ret = mfill_atomic_continue(ctx, dst_start: uffdio_continue.range.start,
1880 len: uffdio_continue.range.len, flags);
1881 mmput(ctx->mm);
1882 } else {
1883 return -ESRCH;
1884 }
1885
1886 if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1887 return -EFAULT;
1888 if (ret < 0)
1889 goto out;
1890
1891 /* len == 0 would wake all */
1892 BUG_ON(!ret);
1893 range.len = ret;
1894 if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1895 range.start = uffdio_continue.range.start;
1896 wake_userfault(ctx, range: &range);
1897 }
1898 ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1899
1900out:
1901 return ret;
1902}

static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_poison uffdio_poison;
	struct uffdio_poison __user *user_uffdio_poison;
	struct userfaultfd_wake_range range;

	user_uffdio_poison = (struct uffdio_poison __user *)arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
			   /* don't copy the output fields */
			   sizeof(uffdio_poison) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, uffdio_poison.range.start,
			     uffdio_poison.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
					  uffdio_poison.range.len, 0);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
		range.start = uffdio_poison.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
	return ret;
}
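
/*
 * An illustrative userspace sketch of UFFDIO_POISON as handled above,
 * assuming <linux/userfaultfd.h>. Poisoning the range makes later
 * accesses report a memory error to the accessing task instead of
 * raising another userfault event; the helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static long poison_range(int uffd, unsigned long addr,
 *				 unsigned long len)
 *	{
 *		struct uffdio_poison poison = {
 *			.range = { .start = addr, .len = len },
 *			.mode = 0,
 *		};
 *
 *		if (ioctl(uffd, UFFDIO_POISON, &poison))
 *			return -1;
 *		return poison.updated;	// bytes handled, written back above
 *	}
 */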

bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
	return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide. Set
	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
	 */
	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

static int userfaultfd_move(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_move uffdio_move;
	struct uffdio_move __user *user_uffdio_move;
	struct userfaultfd_wake_range range;
	struct mm_struct *mm = ctx->mm;

	user_uffdio_move = (struct uffdio_move __user *) arg;

	if (atomic_read(&ctx->mmap_changing))
		return -EAGAIN;

	if (copy_from_user(&uffdio_move, user_uffdio_move,
			   /* don't copy the output field "move" */
			   sizeof(uffdio_move) - sizeof(__s64)))
		return -EFAULT;

	/* Do not allow cross-mm moves. */
	if (mm != current->mm)
		return -EINVAL;

	ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
	if (ret)
		return ret;

	ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
	if (ret)
		return ret;

	if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES |
				 UFFDIO_MOVE_MODE_DONTWAKE))
		return -EINVAL;

	if (mmget_not_zero(mm)) {
		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
				 uffdio_move.len, uffdio_move.mode);
		mmput(mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_move->move)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	VM_WARN_ON(!ret);
	range.len = ret;
	if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
		range.start = uffdio_move.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_move.len ? 0 : -EAGAIN;

out:
	return ret;
}
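
/*
 * An illustrative userspace sketch of UFFDIO_MOVE as handled above,
 * assuming <linux/userfaultfd.h>. Both src and dst must be in the
 * caller's own mm (cross-mm moves are rejected above); the helper name
 * is invented and error handling is omitted.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static long move_range(int uffd, unsigned long src,
 *			       unsigned long dst, unsigned long len)
 *	{
 *		struct uffdio_move mv = {
 *			.src = src,
 *			.dst = dst,
 *			.len = len,
 *			.mode = 0,	// or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES
 *		};
 *
 *		if (ioctl(uffd, UFFDIO_MOVE, &mv))
 *			return -1;
 *		return mv.move;		// bytes moved, written back above
 *	}
 */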

/*
 * Userland asks for a certain API version, and we return which feature
 * bits and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if the version is unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	unsigned int ctx_features;
	int ret;
	__u64 features;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	ret = -EINVAL;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
		goto err_out;
	ret = -EPERM;
	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
		goto err_out;

	/* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
	if (features & UFFD_FEATURE_WP_ASYNC)
		features |= UFFD_FEATURE_WP_UNPOPULATED;

	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
	uffdio_api.features &=
		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;

	/* only enable the requested features for this uffd context */
	ctx_features = uffd_ctx_features(features);
	ret = -EINVAL;
	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
		goto err_out;

	ret = 0;
out:
	return ret;
err_out:
	memset(&uffdio_api, 0, sizeof(uffdio_api));
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		ret = -EFAULT;
	goto out;
}
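
/*
 * A hedged sketch of the userspace side of the UFFDIO_API handshake
 * implemented above (UAPI from <linux/userfaultfd.h>). The handshake
 * must complete before any other ioctl is accepted; see the
 * userfaultfd_is_initialized() check in userfaultfd_ioctl() below. The
 * helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static int uffd_handshake(int uffd, __u64 wanted_features)
 *	{
 *		struct uffdio_api api = {
 *			.api = UFFD_API,
 *			// Request only the features we intend to use; on
 *			// return api.features reports everything this
 *			// kernel supports.
 *			.features = wanted_features,
 *		};
 *
 *		return ioctl(uffd, UFFDIO_API, &api);
 *	}
 */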

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
		return -EINVAL;

	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	case UFFDIO_MOVE:
		ret = userfaultfd_move(ctx, arg);
		break;
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	case UFFDIO_CONTINUE:
		ret = userfaultfd_continue(ctx, arg);
		break;
	case UFFDIO_POISON:
		ret = userfaultfd_poison(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols are added in the future, they will all be
	 * shown here, separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo = userfaultfd_show_fdinfo,
#endif
	.release = userfaultfd_release,
	.poll = userfaultfd_poll,
	.read = userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.llseek = noop_llseek,
};

static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

static int new_userfaultfd(int flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency. */
	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	refcount_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->released = false;
	init_rwsem(&ctx->map_changing_lock);
	atomic_set(&ctx->mmap_changing, 0);
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	mmgrab(ctx->mm);

	/* Create a new inode so that the LSM can block the creation. */
	fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
	if (fd < 0) {
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}

static inline bool userfaultfd_syscall_allowed(int flags)
{
	/* Userspace-only page faults are always allowed */
	if (flags & UFFD_USER_MODE_ONLY)
		return true;

	/*
	 * The user is requesting a userfaultfd which can handle kernel faults.
	 * Privileged users are always allowed to do this.
	 */
	if (capable(CAP_SYS_PTRACE))
		return true;

	/* Otherwise, access to kernel fault handling is sysctl controlled. */
	return sysctl_unprivileged_userfaultfd;
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	if (!userfaultfd_syscall_allowed(flags))
		return -EPERM;

	return new_userfaultfd(flags);
}
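
/*
 * An illustrative way to reach the syscall above from userspace. C
 * libraries commonly do not provide a userfaultfd() wrapper, so callers
 * go through syscall(2); UFFD_USER_MODE_ONLY keeps the descriptor usable
 * without CAP_SYS_PTRACE or the vm.unprivileged_userfaultfd sysctl, per
 * the policy above. The helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *
 *	static int open_uffd(void)
 *	{
 *		return syscall(__NR_userfaultfd,
 *			       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 *	}
 */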

static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
	if (cmd != USERFAULTFD_IOC_NEW)
		return -EINVAL;

	return new_userfaultfd(flags);
}
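
/*
 * A minimal sketch of the /dev/userfaultfd path served by
 * userfaultfd_dev_ioctl() above. Access here is governed by file
 * permissions on the device node rather than by
 * userfaultfd_syscall_allowed(); the open flags below are an assumption
 * and the helper name is invented.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int open_uffd_via_dev(void)
 *	{
 *		int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
 *		int uffd;
 *
 *		if (dev < 0)
 *			return -1;
 *		uffd = ioctl(dev, USERFAULTFD_IOC_NEW,
 *			     O_CLOEXEC | O_NONBLOCK);
 *		close(dev);
 *		return uffd;
 *	}
 */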

static const struct file_operations userfaultfd_dev_fops = {
	.unlocked_ioctl = userfaultfd_dev_ioctl,
	.compat_ioctl = userfaultfd_dev_ioctl,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static struct miscdevice userfaultfd_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "userfaultfd",
	.fops = &userfaultfd_dev_fops
};

static int __init userfaultfd_init(void)
{
	int ret;

	ret = misc_register(&userfaultfd_misc);
	if (ret)
		return ret;

	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
	register_sysctl_init("vm", vm_userfaultfd_table);
#endif
	return 0;
}
__initcall(userfaultfd_init);