1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Fast Userspace Mutexes (which I call "Futexes!"). |
4 | * (C) Rusty Russell, IBM 2002 |
5 | * |
6 | * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar |
7 | * (C) Copyright 2003 Red Hat Inc, All Rights Reserved |
8 | * |
9 | * Removed page pinning, fix privately mapped COW pages and other cleanups |
10 | * (C) Copyright 2003, 2004 Jamie Lokier |
11 | * |
12 | * Robust futex support started by Ingo Molnar |
13 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
14 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
15 | * |
16 | * PI-futex support started by Ingo Molnar and Thomas Gleixner |
17 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
18 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> |
19 | * |
20 | * PRIVATE futexes by Eric Dumazet |
21 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> |
22 | * |
23 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> |
24 | * Copyright (C) IBM Corporation, 2009 |
25 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. |
26 | * |
27 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
28 | * enough at me, Linus for the original (flawed) idea, Matthew |
29 | * Kirkwood for proof-of-concept implementation. |
30 | * |
31 | * "The futexes are also cursed." |
32 | * "But they come in a choice of three flavours!" |
33 | */ |
34 | #include <linux/compat.h> |
35 | #include <linux/jhash.h> |
36 | #include <linux/pagemap.h> |
37 | #include <linux/plist.h> |
38 | #include <linux/memblock.h> |
39 | #include <linux/fault-inject.h> |
40 | #include <linux/slab.h> |
41 | |
42 | #include "futex.h" |
43 | #include "../locking/rtmutex_common.h" |
44 | |
45 | /* |
46 | * The base of the bucket array and its size are always used together |
47 | * (after initialization only in futex_hash()), so ensure that they |
48 | * reside in the same cacheline. |
49 | */ |
50 | static struct { |
51 | struct futex_hash_bucket *queues; |
52 | unsigned long hashsize; |
53 | } __futex_data __read_mostly __aligned(2*sizeof(long)); |
54 | #define futex_queues (__futex_data.queues) |
55 | #define futex_hashsize (__futex_data.hashsize) |
56 | |
57 | |
58 | /* |
59 | * Fault injections for futexes. |
60 | */ |
61 | #ifdef CONFIG_FAIL_FUTEX |
62 | |
63 | static struct { |
64 | struct fault_attr attr; |
65 | |
66 | bool ignore_private; |
67 | } fail_futex = { |
68 | .attr = FAULT_ATTR_INITIALIZER, |
69 | .ignore_private = false, |
70 | }; |
71 | |
72 | static int __init setup_fail_futex(char *str) |
73 | { |
	return setup_fault_attr(&fail_futex.attr, str);
75 | } |
__setup("fail_futex=", setup_fail_futex);
77 | |
78 | bool should_fail_futex(bool fshared) |
79 | { |
80 | if (fail_futex.ignore_private && !fshared) |
81 | return false; |
82 | |
	return should_fail(&fail_futex.attr, 1);
84 | } |
85 | |
86 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
87 | |
88 | static int __init fail_futex_debugfs(void) |
89 | { |
90 | umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
91 | struct dentry *dir; |
92 | |
	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
100 | return 0; |
101 | } |
102 | |
103 | late_initcall(fail_futex_debugfs); |
104 | |
105 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
106 | |
107 | #endif /* CONFIG_FAIL_FUTEX */ |
108 | |
109 | /** |
110 | * futex_hash - Return the hash bucket in the global hash |
111 | * @key: Pointer to the futex key for which the hash is calculated |
112 | * |
113 | * We hash on the keys returned from get_futex_key (see below) and return the |
114 | * corresponding hash bucket in the global hash. |
115 | */ |
116 | struct futex_hash_bucket *futex_hash(union futex_key *key) |
117 | { |
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);
120 | |
121 | return &futex_queues[hash & (futex_hashsize - 1)]; |
122 | } |
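
/*
 * Illustrative note (derived from the code above, not part of the algorithm):
 * futex_hashsize is always a power of two, so "hash & (futex_hashsize - 1)"
 * is a cheap modulo. For example, with 2048 buckets a hash value of 0x12345
 * selects bucket 0x345.
 */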
123 | |
124 | |
125 | /** |
126 | * futex_setup_timer - set up the sleeping hrtimer. |
127 | * @time: ptr to the given timeout value |
128 | * @timeout: the hrtimer_sleeper structure to be set up |
129 | * @flags: futex flags |
130 | * @range_ns: optional range in ns |
131 | * |
132 | * Return: Initialized hrtimer_sleeper structure or NULL if no timeout |
133 | * value given |
134 | */ |
135 | struct hrtimer_sleeper * |
136 | futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, |
137 | int flags, u64 range_ns) |
138 | { |
139 | if (!time) |
140 | return NULL; |
141 | |
	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
145 | /* |
146 | * If range_ns is 0, calling hrtimer_set_expires_range_ns() is |
147 | * effectively the same as calling hrtimer_set_expires(). |
148 | */ |
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
150 | |
151 | return timeout; |
152 | } |
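
/*
 * Usage sketch (illustrative only, modelled on the futex wait paths; the
 * exact call sites live in the other kernel/futex/ files):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	if (to)
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *
 *	... wait, get woken or time out ...
 *
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */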
153 | |
154 | /* |
155 | * Generate a machine wide unique identifier for this inode. |
156 | * |
157 | * This relies on u64 not wrapping in the life-time of the machine; which with |
158 | * 1ns resolution means almost 585 years. |
159 | * |
160 | * This further relies on the fact that a well formed program will not unmap |
161 | * the file while it has a (shared) futex waiting on it. This mapping will have |
162 | * a file reference which pins the mount and inode. |
163 | * |
164 | * If for some reason an inode gets evicted and read back in again, it will get |
165 | * a new sequence number and will _NOT_ match, even though it is the exact same |
166 | * file. |
167 | * |
 * It is important that futex_match() never has a false positive, especially
 * for PI futexes, where that could corrupt the state. The above argues that
 * false negatives are only possible for malformed programs.
171 | */ |
172 | static u64 get_inode_sequence_number(struct inode *inode) |
173 | { |
174 | static atomic64_t i_seq; |
175 | u64 old; |
176 | |
177 | /* Does the inode already have a sequence number? */ |
	old = atomic64_read(&inode->i_sequence);
179 | if (likely(old)) |
180 | return old; |
181 | |
182 | for (;;) { |
		u64 new = atomic64_add_return(1, &i_seq);
184 | if (WARN_ON_ONCE(!new)) |
185 | continue; |
186 | |
		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
188 | if (old) |
189 | return old; |
190 | return new; |
191 | } |
192 | } |
193 | |
194 | /** |
195 | * get_futex_key() - Get parameters which are the keys for a futex |
196 | * @uaddr: virtual address of the futex |
197 | * @flags: FLAGS_* |
198 | * @key: address where result is stored. |
199 | * @rw: mapping needs to be read/write (values: FUTEX_READ, |
200 | * FUTEX_WRITE) |
201 | * |
202 | * Return: a negative error code or 0 |
203 | * |
204 | * The key words are stored in @key on success. |
205 | * |
206 | * For shared mappings (when @fshared), the key is: |
207 | * |
208 | * ( inode->i_sequence, page->index, offset_within_page ) |
209 | * |
210 | * [ also see get_inode_sequence_number() ] |
211 | * |
212 | * For private mappings (or when !@fshared), the key is: |
213 | * |
214 | * ( current->mm, address, 0 ) |
215 | * |
216 | * This allows (cross process, where applicable) identification of the futex |
217 | * without keeping the page pinned for the duration of the FUTEX_WAIT. |
218 | * |
219 | * lock_page() might sleep, the caller should not hold a spinlock. |
220 | */ |
221 | int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, |
222 | enum futex_access rw) |
223 | { |
224 | unsigned long address = (unsigned long)uaddr; |
225 | struct mm_struct *mm = current->mm; |
226 | struct page *page; |
227 | struct folio *folio; |
228 | struct address_space *mapping; |
229 | int err, ro = 0; |
230 | bool fshared; |
231 | |
232 | fshared = flags & FLAGS_SHARED; |
233 | |
234 | /* |
235 | * The futex address must be "naturally" aligned. |
236 | */ |
237 | key->both.offset = address % PAGE_SIZE; |
238 | if (unlikely((address % sizeof(u32)) != 0)) |
239 | return -EINVAL; |
240 | address -= key->both.offset; |
241 | |
242 | if (unlikely(!access_ok(uaddr, sizeof(u32)))) |
243 | return -EFAULT; |
244 | |
245 | if (unlikely(should_fail_futex(fshared))) |
246 | return -EFAULT; |
247 | |
248 | /* |
249 | * PROCESS_PRIVATE futexes are fast. |
	 * As the mm cannot disappear under us and the 'key' only needs the
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: we do have to check that 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
254 | */ |
255 | if (!fshared) { |
256 | /* |
257 | * On no-MMU, shared futexes are treated as private, therefore |
258 | * we must not include the current process in the key. Since |
259 | * there is only one address space, the address is a unique key |
260 | * on its own. |
261 | */ |
262 | if (IS_ENABLED(CONFIG_MMU)) |
263 | key->private.mm = mm; |
264 | else |
265 | key->private.mm = NULL; |
266 | |
267 | key->private.address = address; |
268 | return 0; |
269 | } |
270 | |
271 | again: |
272 | /* Ignore any VERIFY_READ mapping (futex common case) */ |
273 | if (unlikely(should_fail_futex(true))) |
274 | return -EFAULT; |
275 | |
	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
277 | /* |
278 | * If write access is not required (eg. FUTEX_WAIT), try |
279 | * and get read-only access. |
280 | */ |
281 | if (err == -EFAULT && rw == FUTEX_READ) { |
		err = get_user_pages_fast(address, 1, 0, &page);
283 | ro = 1; |
284 | } |
285 | if (err < 0) |
286 | return err; |
287 | else |
288 | err = 0; |
289 | |
290 | /* |
291 | * The treatment of mapping from this point on is critical. The folio |
292 | * lock protects many things but in this context the folio lock |
293 | * stabilizes mapping, prevents inode freeing in the shared |
294 | * file-backed region case and guards against movement to swap cache. |
295 | * |
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and the folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the folio lock will be acquired only if it is unavoidable.
300 | * |
301 | * Mapping checks require the folio so it is looked up now. For |
302 | * anonymous pages, it does not matter if the folio is split |
303 | * in the future as the key is based on the address. For |
304 | * filesystem-backed pages, the precise page is required as the |
305 | * index of the page determines the key. |
306 | */ |
307 | folio = page_folio(page); |
308 | mapping = READ_ONCE(folio->mapping); |
309 | |
310 | /* |
311 | * If folio->mapping is NULL, then it cannot be an anonymous |
312 | * page; but it might be the ZERO_PAGE or in the gate area or |
313 | * in a special mapping (all cases which we are happy to fail); |
314 | * or it may have been a good file page when get_user_pages_fast |
315 | * found it, but truncated or holepunched or subjected to |
316 | * invalidate_complete_page2 before we got the folio lock (also |
317 | * cases which we are happy to fail). And we hold a reference, |
318 | * so refcount care in invalidate_inode_page's remove_mapping |
319 | * prevents drop_caches from setting mapping to NULL beneath us. |
320 | * |
321 | * The case we do have to guard against is when memory pressure made |
322 | * shmem_writepage move it from filecache to swapcache beneath us: |
323 | * an unlikely race, but we do need to retry for folio->mapping. |
324 | */ |
325 | if (unlikely(!mapping)) { |
326 | int shmem_swizzled; |
327 | |
328 | /* |
329 | * Folio lock is required to identify which special case above |
330 | * applies. If this is really a shmem page then the folio lock |
331 | * will prevent unexpected transitions. |
332 | */ |
333 | folio_lock(folio); |
334 | shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; |
335 | folio_unlock(folio); |
336 | folio_put(folio); |
337 | |
338 | if (shmem_swizzled) |
339 | goto again; |
340 | |
341 | return -EFAULT; |
342 | } |
343 | |
344 | /* |
345 | * Private mappings are handled in a simple way. |
346 | * |
347 | * If the futex key is stored in anonymous memory, then the associated |
348 | * object is the mm which is implicitly pinned by the calling process. |
349 | * |
350 | * NOTE: When userspace waits on a MAP_SHARED mapping, even if |
351 | * it's a read-only handle, it's expected that futexes attach to |
352 | * the object not the particular process. |
353 | */ |
354 | if (folio_test_anon(folio)) { |
355 | /* |
356 | * A RO anonymous page will never change and thus doesn't make |
357 | * sense for futex operations. |
358 | */ |
359 | if (unlikely(should_fail_futex(true)) || ro) { |
360 | err = -EFAULT; |
361 | goto out; |
362 | } |
363 | |
364 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
365 | key->private.mm = mm; |
366 | key->private.address = address; |
367 | |
368 | } else { |
369 | struct inode *inode; |
370 | |
371 | /* |
372 | * The associated futex object in this case is the inode and |
373 | * the folio->mapping must be traversed. Ordinarily this should |
374 | * be stabilised under folio lock but it's not strictly |
375 | * necessary in this case as we just want to pin the inode, not |
376 | * update i_pages or anything like that. |
377 | * |
378 | * The RCU read lock is taken as the inode is finally freed |
379 | * under RCU. If the mapping still matches expectations then the |
380 | * mapping->host can be safely accessed as being a valid inode. |
381 | */ |
382 | rcu_read_lock(); |
383 | |
384 | if (READ_ONCE(folio->mapping) != mapping) { |
385 | rcu_read_unlock(); |
386 | folio_put(folio); |
387 | |
388 | goto again; |
389 | } |
390 | |
391 | inode = READ_ONCE(mapping->host); |
392 | if (!inode) { |
393 | rcu_read_unlock(); |
394 | folio_put(folio); |
395 | |
396 | goto again; |
397 | } |
398 | |
399 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
400 | key->shared.i_seq = get_inode_sequence_number(inode); |
401 | key->shared.pgoff = folio->index + folio_page_idx(folio, page); |
402 | rcu_read_unlock(); |
403 | } |
404 | |
405 | out: |
406 | folio_put(folio); |
407 | return err; |
408 | } |
409 | |
410 | /** |
411 | * fault_in_user_writeable() - Fault in user address and verify RW access |
412 | * @uaddr: pointer to faulting user space address |
413 | * |
414 | * Slow path to fixup the fault we just took in the atomic write |
415 | * access to @uaddr. |
416 | * |
417 | * We have no generic implementation of a non-destructive write to the |
418 | * user address. We know that we faulted in the atomic pagefault |
419 | * disabled section so we can as well avoid the #PF overhead by |
420 | * calling get_user_pages() right away. |
421 | */ |
422 | int fault_in_user_writeable(u32 __user *uaddr) |
423 | { |
424 | struct mm_struct *mm = current->mm; |
425 | int ret; |
426 | |
427 | mmap_read_lock(mm); |
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
430 | mmap_read_unlock(mm); |
431 | |
432 | return ret < 0 ? ret : 0; |
433 | } |
434 | |
435 | /** |
436 | * futex_top_waiter() - Return the highest priority waiter on a futex |
437 | * @hb: the hash bucket the futex_q's reside in |
438 | * @key: the futex key (to distinguish it from other futex futex_q's) |
439 | * |
440 | * Must be called with the hb lock held. |
441 | */ |
442 | struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) |
443 | { |
444 | struct futex_q *this; |
445 | |
446 | plist_for_each_entry(this, &hb->chain, list) { |
		if (futex_match(&this->key, key))
448 | return this; |
449 | } |
450 | return NULL; |
451 | } |
452 | |
453 | int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) |
454 | { |
455 | int ret; |
456 | |
457 | pagefault_disable(); |
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
459 | pagefault_enable(); |
460 | |
461 | return ret; |
462 | } |
463 | |
464 | int futex_get_value_locked(u32 *dest, u32 __user *from) |
465 | { |
466 | int ret; |
467 | |
468 | pagefault_disable(); |
469 | ret = __get_user(*dest, from); |
470 | pagefault_enable(); |
471 | |
472 | return ret ? -EFAULT : 0; |
473 | } |
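
/*
 * Both helpers above run with pagefaults disabled so they can be called while
 * holding a hash bucket lock; a fault is reported as an error instead of
 * being handled. Callers are expected to follow the usual retry pattern
 * (sketch only, the real loops live in the wait/lock paths):
 *
 *	ret = futex_get_value_locked(&uval, uaddr);
 *	if (ret) {
 *		futex_q_unlock(hb);
 *		if (get_user(uval, uaddr))	// fault the page in
 *			return -EFAULT;
 *		goto retry;			// re-lookup and re-check
 *	}
 */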
474 | |
475 | /** |
476 | * wait_for_owner_exiting - Block until the owner has exited |
477 | * @ret: owner's current futex lock status |
478 | * @exiting: Pointer to the exiting task |
479 | * |
480 | * Caller must hold a refcount on @exiting. |
481 | */ |
482 | void wait_for_owner_exiting(int ret, struct task_struct *exiting) |
483 | { |
484 | if (ret != -EBUSY) { |
485 | WARN_ON_ONCE(exiting); |
486 | return; |
487 | } |
488 | |
489 | if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) |
490 | return; |
491 | |
492 | mutex_lock(&exiting->futex_exit_mutex); |
493 | /* |
494 | * No point in doing state checking here. If the waiter got here |
495 | * while the task was in exec()->exec_futex_release() then it can |
496 | * have any FUTEX_STATE_* value when the waiter has acquired the |
497 | * mutex. OK, if running, EXITING or DEAD if it reached exit() |
498 | * already. Highly unlikely and not a problem. Just one more round |
499 | * through the futex maze. |
500 | */ |
	mutex_unlock(&exiting->futex_exit_mutex);
502 | |
	put_task_struct(exiting);
504 | } |
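
/*
 * Sketch of the waiter side of this handshake (illustrative; the real code is
 * in the PI lock path): when the owner is found in FUTEX_STATE_EXITING the
 * attach code returns -EBUSY and hands back a reference on the exiting task,
 * roughly:
 *
 *	ret = futex_lock_pi_atomic(uaddr, hb, &q->key, &q->pi_state,
 *				   current, &exiting, 0);
 *	if (ret == -EBUSY) {
 *		futex_q_unlock(hb);
 *		wait_for_owner_exiting(ret, exiting);
 *		goto retry;
 *	}
 *
 * Blocking on futex_exit_mutex ensures the retry only happens after the
 * exiting owner has finished its robust/PI cleanup.
 */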
505 | |
506 | /** |
507 | * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket |
508 | * @q: The futex_q to unqueue |
509 | * |
510 | * The q->lock_ptr must not be NULL and must be held by the caller. |
511 | */ |
512 | void __futex_unqueue(struct futex_q *q) |
513 | { |
514 | struct futex_hash_bucket *hb; |
515 | |
516 | if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) |
517 | return; |
518 | lockdep_assert_held(q->lock_ptr); |
519 | |
520 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); |
	plist_del(&q->list, &hb->chain);
522 | futex_hb_waiters_dec(hb); |
523 | } |
524 | |
525 | /* The key must be already stored in q->key. */ |
526 | struct futex_hash_bucket *futex_q_lock(struct futex_q *q) |
527 | __acquires(&hb->lock) |
528 | { |
529 | struct futex_hash_bucket *hb; |
530 | |
	hb = futex_hash(&q->key);
532 | |
533 | /* |
534 | * Increment the counter before taking the lock so that |
535 | * a potential waker won't miss a to-be-slept task that is |
536 | * waiting for the spinlock. This is safe as all futex_q_lock() |
537 | * users end up calling futex_queue(). Similarly, for housekeeping, |
538 | * decrement the counter at futex_q_unlock() when some error has |
539 | * occurred and we don't end up adding the task to the list. |
540 | */ |
541 | futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ |
542 | |
543 | q->lock_ptr = &hb->lock; |
544 | |
	spin_lock(&hb->lock);
546 | return hb; |
547 | } |
548 | |
549 | void futex_q_unlock(struct futex_hash_bucket *hb) |
550 | __releases(&hb->lock) |
551 | { |
	spin_unlock(&hb->lock);
553 | futex_hb_waiters_dec(hb); |
554 | } |
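
/*
 * Note on the counter handling above (illustrative): the increment in
 * futex_q_lock() pairs with the wake side, which may bail out early without
 * taking the bucket lock at all, e.g.:
 *
 *	hb = futex_hash(&key);
 *	if (!futex_hb_waiters_pending(hb))
 *		return ret;		// no waiters, skip the lock
 *	spin_lock(&hb->lock);
 *
 * That is why the count must be visible before the waiter can block, and why
 * futex_q_unlock() drops it again on the paths which never queue.
 */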
555 | |
556 | void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) |
557 | { |
558 | int prio; |
559 | |
560 | /* |
561 | * The priority used to register this element is |
562 | * - either the real thread-priority for the real-time threads |
563 | * (i.e. threads with a priority lower than MAX_RT_PRIO) |
564 | * - or MAX_RT_PRIO for non-RT threads. |
565 | * Thus, all RT-threads are woken first in priority order, and |
566 | * the others are woken last, in FIFO order. |
567 | */ |
568 | prio = min(current->normal_prio, MAX_RT_PRIO); |
569 | |
	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
572 | q->task = current; |
573 | } |
574 | |
575 | /** |
576 | * futex_unqueue() - Remove the futex_q from its futex_hash_bucket |
577 | * @q: The futex_q to unqueue |
578 | * |
579 | * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must |
580 | * be paired with exactly one earlier call to futex_queue(). |
581 | * |
582 | * Return: |
 * - 1 - if the futex_q was still queued (and we unqueued it);
584 | * - 0 - if the futex_q was already removed by the waking thread |
585 | */ |
586 | int futex_unqueue(struct futex_q *q) |
587 | { |
588 | spinlock_t *lock_ptr; |
589 | int ret = 0; |
590 | |
591 | /* In the common case we don't take the spinlock, which is nice. */ |
592 | retry: |
593 | /* |
594 | * q->lock_ptr can change between this read and the following spin_lock. |
595 | * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and |
596 | * optimizing lock_ptr out of the logic below. |
597 | */ |
598 | lock_ptr = READ_ONCE(q->lock_ptr); |
599 | if (lock_ptr != NULL) { |
		spin_lock(lock_ptr);
601 | /* |
602 | * q->lock_ptr can change between reading it and |
603 | * spin_lock(), causing us to take the wrong lock. This |
604 | * corrects the race condition. |
605 | * |
606 | * Reasoning goes like this: if we have the wrong lock, |
607 | * q->lock_ptr must have changed (maybe several times) |
608 | * between reading it and the spin_lock(). It can |
609 | * change again after the spin_lock() but only if it was |
610 | * already changed before the spin_lock(). It cannot, |
611 | * however, change back to the original value. Therefore |
612 | * we can detect whether we acquired the correct lock. |
613 | */ |
614 | if (unlikely(lock_ptr != q->lock_ptr)) { |
			spin_unlock(lock_ptr);
616 | goto retry; |
617 | } |
618 | __futex_unqueue(q); |
619 | |
620 | BUG_ON(q->pi_state); |
621 | |
		spin_unlock(lock_ptr);
623 | ret = 1; |
624 | } |
625 | |
626 | return ret; |
627 | } |
628 | |
629 | /* |
630 | * PI futexes can not be requeued and must remove themselves from the hash |
631 | * bucket. The hash bucket lock (i.e. lock_ptr) is held. |
632 | */ |
633 | void futex_unqueue_pi(struct futex_q *q) |
634 | { |
635 | /* |
636 | * If the lock was not acquired (due to timeout or signal) then the |
637 | * rt_waiter is removed before futex_q is. If this is observed by |
638 | * an unlocker after dropping the rtmutex wait lock and before |
639 | * acquiring the hash bucket lock, then the unlocker dequeues the |
640 | * futex_q from the hash bucket list to guarantee consistent state |
641 | * vs. userspace. Therefore the dequeue here must be conditional. |
642 | */ |
	if (!plist_node_empty(&q->list))
644 | __futex_unqueue(q); |
645 | |
646 | BUG_ON(!q->pi_state); |
	put_pi_state(q->pi_state);
648 | q->pi_state = NULL; |
649 | } |
650 | |
651 | /* Constants for the pending_op argument of handle_futex_death */ |
652 | #define HANDLE_DEATH_PENDING true |
653 | #define HANDLE_DEATH_LIST false |
654 | |
655 | /* |
656 | * Process a futex-list entry, check whether it's owned by the |
657 | * dying task, and do notification if so: |
658 | */ |
659 | static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, |
660 | bool pi, bool pending_op) |
661 | { |
662 | u32 uval, nval, mval; |
663 | pid_t owner; |
664 | int err; |
665 | |
666 | /* Futex address must be 32bit aligned */ |
667 | if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) |
668 | return -1; |
669 | |
670 | retry: |
671 | if (get_user(uval, uaddr)) |
672 | return -1; |
673 | |
674 | /* |
675 | * Special case for regular (non PI) futexes. The unlock path in |
676 | * user space has two race scenarios: |
677 | * |
678 | * 1. The unlock path releases the user space futex value and |
679 | * before it can execute the futex() syscall to wake up |
680 | * waiters it is killed. |
681 | * |
682 | * 2. A woken up waiter is killed before it can acquire the |
683 | * futex in user space. |
684 | * |
685 | * In the second case, the wake up notification could be generated |
686 | * by the unlock path in user space after setting the futex value |
687 | * to zero or by the kernel after setting the OWNER_DIED bit below. |
688 | * |
689 | * In both cases the TID validation below prevents a wakeup of |
690 | * potential waiters which can cause these waiters to block |
691 | * forever. |
692 | * |
693 | * In both cases the following conditions are met: |
694 | * |
695 | * 1) task->robust_list->list_op_pending != NULL |
696 | * @pending_op == true |
697 | * 2) The owner part of user space futex value == 0 |
698 | * 3) Regular futex: @pi == false |
699 | * |
700 | * If these conditions are met, it is safe to attempt waking up a |
701 | * potential waiter without touching the user space futex value and |
702 | * trying to set the OWNER_DIED bit. If the futex value is zero, |
703 | * the rest of the user space mutex state is consistent, so a woken |
704 | * waiter will just take over the uncontended futex. Setting the |
705 | * OWNER_DIED bit would create inconsistent state and malfunction |
706 | * of the user space owner died handling. Otherwise, the OWNER_DIED |
707 | * bit is already set, and the woken waiter is expected to deal with |
708 | * this. |
709 | */ |
710 | owner = uval & FUTEX_TID_MASK; |
711 | |
712 | if (pending_op && !pi && !owner) { |
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
714 | FUTEX_BITSET_MATCH_ANY); |
715 | return 0; |
716 | } |
717 | |
	if (owner != task_pid_vnr(curr))
719 | return 0; |
720 | |
721 | /* |
722 | * Ok, this dying thread is truly holding a futex |
723 | * of interest. Set the OWNER_DIED bit atomically |
724 | * via cmpxchg, and if the value had FUTEX_WAITERS |
725 | * set, wake up a waiter (if any). (We have to do a |
726 | * futex_wake() even if OWNER_DIED is already set - |
727 | * to handle the rare but possible case of recursive |
728 | * thread-death.) The rest of the cleanup is done in |
729 | * userspace. |
730 | */ |
731 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
732 | |
733 | /* |
734 | * We are not holding a lock here, but we want to have |
735 | * the pagefault_disable/enable() protection because |
736 | * we want to handle the fault gracefully. If the |
737 | * access fails we try to fault in the futex with R/W |
738 | * verification via get_user_pages. get_user() above |
739 | * does not guarantee R/W access. If that fails we |
740 | * give up and leave the futex locked. |
741 | */ |
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
743 | switch (err) { |
744 | case -EFAULT: |
745 | if (fault_in_user_writeable(uaddr)) |
746 | return -1; |
747 | goto retry; |
748 | |
749 | case -EAGAIN: |
750 | cond_resched(); |
751 | goto retry; |
752 | |
753 | default: |
754 | WARN_ON_ONCE(1); |
755 | return err; |
756 | } |
757 | } |
758 | |
759 | if (nval != uval) |
760 | goto retry; |
761 | |
762 | /* |
763 | * Wake robust non-PI futexes here. The wakeup of |
764 | * PI futexes happens in exit_pi_state(): |
765 | */ |
766 | if (!pi && (uval & FUTEX_WAITERS)) { |
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
768 | FUTEX_BITSET_MATCH_ANY); |
769 | } |
770 | |
771 | return 0; |
772 | } |
773 | |
774 | /* |
775 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: |
776 | */ |
777 | static inline int fetch_robust_entry(struct robust_list __user **entry, |
778 | struct robust_list __user * __user *head, |
779 | unsigned int *pi) |
780 | { |
781 | unsigned long uentry; |
782 | |
783 | if (get_user(uentry, (unsigned long __user *)head)) |
784 | return -EFAULT; |
785 | |
786 | *entry = (void __user *)(uentry & ~1UL); |
787 | *pi = uentry & 1; |
788 | |
789 | return 0; |
790 | } |
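
/*
 * For reference, a sketch of the user space side of the robust list ABI that
 * the walk below consumes (illustrative; 'struct my_mutex' is hypothetical,
 * the authoritative layout is struct robust_list_head in the futex uapi
 * header):
 *
 *	struct robust_list_head head = {
 *		.list		 = { .next = &head.list },   // empty circular list
 *		.futex_offset	 = offsetof(struct my_mutex, futex_word) -
 *				   offsetof(struct my_mutex, list),
 *		.list_op_pending = NULL,
 *	};
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 * Each held lock links its embedded 'list' entry into head.list and the futex
 * word lives at (entry address + futex_offset). Bit 0 of a list pointer marks
 * the entry as a PI futex, which is what fetch_robust_entry() decodes.
 */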
791 | |
792 | /* |
793 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
794 | * and mark any locks found there dead, and notify any waiters. |
795 | * |
796 | * We silently return on any sign of list-walking problem. |
797 | */ |
798 | static void exit_robust_list(struct task_struct *curr) |
799 | { |
800 | struct robust_list_head __user *head = curr->robust_list; |
801 | struct robust_list __user *entry, *next_entry, *pending; |
802 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
803 | unsigned int next_pi; |
804 | unsigned long futex_offset; |
805 | int rc; |
806 | |
807 | /* |
808 | * Fetch the list head (which was registered earlier, via |
809 | * sys_set_robust_list()): |
810 | */ |
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
812 | return; |
813 | /* |
814 | * Fetch the relative futex offset: |
815 | */ |
816 | if (get_user(futex_offset, &head->futex_offset)) |
817 | return; |
818 | /* |
819 | * Fetch any possibly pending lock-add first, and handle it |
820 | * if it exists: |
821 | */ |
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
823 | return; |
824 | |
825 | next_entry = NULL; /* avoid warning with gcc */ |
826 | while (entry != &head->list) { |
827 | /* |
828 | * Fetch the next entry in the list before calling |
829 | * handle_futex_death: |
830 | */ |
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
832 | /* |
833 | * A pending lock might already be on the list, so |
834 | * don't process it twice: |
835 | */ |
836 | if (entry != pending) { |
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
839 | return; |
840 | } |
841 | if (rc) |
842 | return; |
843 | entry = next_entry; |
844 | pi = next_pi; |
845 | /* |
846 | * Avoid excessively long or circular lists: |
847 | */ |
848 | if (!--limit) |
849 | break; |
850 | |
851 | cond_resched(); |
852 | } |
853 | |
854 | if (pending) { |
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
857 | } |
858 | } |
859 | |
860 | #ifdef CONFIG_COMPAT |
861 | static void __user *futex_uaddr(struct robust_list __user *entry, |
862 | compat_long_t futex_offset) |
863 | { |
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);
866 | |
867 | return uaddr; |
868 | } |
869 | |
870 | /* |
871 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: |
872 | */ |
873 | static inline int |
874 | compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, |
875 | compat_uptr_t __user *head, unsigned int *pi) |
876 | { |
877 | if (get_user(*uentry, head)) |
878 | return -EFAULT; |
879 | |
	*entry = compat_ptr((*uentry) & ~1);
881 | *pi = (unsigned int)(*uentry) & 1; |
882 | |
883 | return 0; |
884 | } |
885 | |
886 | /* |
887 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
888 | * and mark any locks found there dead, and notify any waiters. |
889 | * |
890 | * We silently return on any sign of list-walking problem. |
891 | */ |
892 | static void compat_exit_robust_list(struct task_struct *curr) |
893 | { |
894 | struct compat_robust_list_head __user *head = curr->compat_robust_list; |
895 | struct robust_list __user *entry, *next_entry, *pending; |
896 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
897 | unsigned int next_pi; |
898 | compat_uptr_t uentry, next_uentry, upending; |
899 | compat_long_t futex_offset; |
900 | int rc; |
901 | |
902 | /* |
903 | * Fetch the list head (which was registered earlier, via |
904 | * sys_set_robust_list()): |
905 | */ |
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
907 | return; |
908 | /* |
909 | * Fetch the relative futex offset: |
910 | */ |
911 | if (get_user(futex_offset, &head->futex_offset)) |
912 | return; |
913 | /* |
914 | * Fetch any possibly pending lock-add first, and handle it |
915 | * if it exists: |
916 | */ |
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
919 | return; |
920 | |
921 | next_entry = NULL; /* avoid warning with gcc */ |
922 | while (entry != (struct robust_list __user *) &head->list) { |
923 | /* |
924 | * Fetch the next entry in the list before calling |
925 | * handle_futex_death: |
926 | */ |
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
929 | /* |
930 | * A pending lock might already be on the list, so |
		 * don't process it twice:
932 | */ |
933 | if (entry != pending) { |
934 | void __user *uaddr = futex_uaddr(entry, futex_offset); |
935 | |
936 | if (handle_futex_death(uaddr, curr, pi, |
937 | HANDLE_DEATH_LIST)) |
938 | return; |
939 | } |
940 | if (rc) |
941 | return; |
942 | uentry = next_uentry; |
943 | entry = next_entry; |
944 | pi = next_pi; |
945 | /* |
946 | * Avoid excessively long or circular lists: |
947 | */ |
948 | if (!--limit) |
949 | break; |
950 | |
951 | cond_resched(); |
952 | } |
953 | if (pending) { |
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
957 | } |
958 | } |
959 | #endif |
960 | |
961 | #ifdef CONFIG_FUTEX_PI |
962 | |
963 | /* |
964 | * This task is holding PI mutexes at exit time => bad. |
965 | * Kernel cleans up PI-state, but userspace is likely hosed. |
966 | * (Robust-futex cleanup is separate and might save the day for userspace.) |
967 | */ |
968 | static void exit_pi_state_list(struct task_struct *curr) |
969 | { |
970 | struct list_head *next, *head = &curr->pi_state_list; |
971 | struct futex_pi_state *pi_state; |
972 | struct futex_hash_bucket *hb; |
973 | union futex_key key = FUTEX_KEY_INIT; |
974 | |
975 | /* |
976 | * We are a ZOMBIE and nobody can enqueue itself on |
977 | * pi_state_list anymore, but we have to be careful |
978 | * versus waiters unqueueing themselves: |
979 | */ |
980 | raw_spin_lock_irq(&curr->pi_lock); |
981 | while (!list_empty(head)) { |
982 | next = head->next; |
983 | pi_state = list_entry(next, struct futex_pi_state, list); |
984 | key = pi_state->key; |
		hb = futex_hash(&key);
986 | |
987 | /* |
988 | * We can race against put_pi_state() removing itself from the |
989 | * list (a waiter going away). put_pi_state() will first |
990 | * decrement the reference count and then modify the list, so |
		 * it's possible to see the list entry but fail this reference
992 | * acquire. |
993 | * |
994 | * In that case; drop the locks to let put_pi_state() make |
995 | * progress and retry the loop. |
996 | */ |
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
998 | raw_spin_unlock_irq(&curr->pi_lock); |
999 | cpu_relax(); |
1000 | raw_spin_lock_irq(&curr->pi_lock); |
1001 | continue; |
1002 | } |
1003 | raw_spin_unlock_irq(&curr->pi_lock); |
1004 | |
		spin_lock(&hb->lock);
1006 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); |
1007 | raw_spin_lock(&curr->pi_lock); |
1008 | /* |
1009 | * We dropped the pi-lock, so re-check whether this |
1010 | * task still owns the PI-state: |
1011 | */ |
1012 | if (head->next != next) { |
1013 | /* retain curr->pi_lock for the loop invariant */ |
1014 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
			spin_unlock(&hb->lock);
1016 | put_pi_state(pi_state); |
1017 | continue; |
1018 | } |
1019 | |
1020 | WARN_ON(pi_state->owner != curr); |
1021 | WARN_ON(list_empty(&pi_state->list)); |
		list_del_init(&pi_state->list);
1023 | pi_state->owner = NULL; |
1024 | |
1025 | raw_spin_unlock(&curr->pi_lock); |
1026 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); |
		spin_unlock(&hb->lock);
1028 | |
		rt_mutex_futex_unlock(&pi_state->pi_mutex);
1030 | put_pi_state(pi_state); |
1031 | |
1032 | raw_spin_lock_irq(&curr->pi_lock); |
1033 | } |
1034 | raw_spin_unlock_irq(&curr->pi_lock); |
1035 | } |
1036 | #else |
1037 | static inline void exit_pi_state_list(struct task_struct *curr) { } |
1038 | #endif |
1039 | |
1040 | static void futex_cleanup(struct task_struct *tsk) |
1041 | { |
1042 | if (unlikely(tsk->robust_list)) { |
		exit_robust_list(tsk);
1044 | tsk->robust_list = NULL; |
1045 | } |
1046 | |
1047 | #ifdef CONFIG_COMPAT |
1048 | if (unlikely(tsk->compat_robust_list)) { |
		compat_exit_robust_list(tsk);
1050 | tsk->compat_robust_list = NULL; |
1051 | } |
1052 | #endif |
1053 | |
1054 | if (unlikely(!list_empty(&tsk->pi_state_list))) |
		exit_pi_state_list(tsk);
1056 | } |
1057 | |
1058 | /** |
1059 | * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD |
1060 | * @tsk: task to set the state on |
1061 | * |
1062 | * Set the futex exit state of the task lockless. The futex waiter code |
1063 | * observes that state when a task is exiting and loops until the task has |
1064 | * actually finished the futex cleanup. The worst case for this is that the |
1065 | * waiter runs through the wait loop until the state becomes visible. |
1066 | * |
1067 | * This is called from the recursive fault handling path in make_task_dead(). |
1068 | * |
1069 | * This is best effort. Either the futex exit code has run already or |
1070 | * not. If the OWNER_DIED bit has been set on the futex then the waiter can |
1071 | * take it over. If not, the problem is pushed back to user space. If the |
1072 | * futex exit code did not run yet, then an already queued waiter might |
1073 | * block forever, but there is nothing which can be done about that. |
1074 | */ |
1075 | void futex_exit_recursive(struct task_struct *tsk) |
1076 | { |
1077 | /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ |
1078 | if (tsk->futex_state == FUTEX_STATE_EXITING) |
		mutex_unlock(&tsk->futex_exit_mutex);
1080 | tsk->futex_state = FUTEX_STATE_DEAD; |
1081 | } |
1082 | |
1083 | static void futex_cleanup_begin(struct task_struct *tsk) |
1084 | { |
1085 | /* |
1086 | * Prevent various race issues against a concurrent incoming waiter |
1087 | * including live locks by forcing the waiter to block on |
1088 | * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in |
1089 | * attach_to_pi_owner(). |
1090 | */ |
1091 | mutex_lock(&tsk->futex_exit_mutex); |
1092 | |
1093 | /* |
1094 | * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. |
1095 | * |
1096 | * This ensures that all subsequent checks of tsk->futex_state in |
1097 | * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with |
1098 | * tsk->pi_lock held. |
1099 | * |
1100 | * It guarantees also that a pi_state which was queued right before |
1101 | * the state change under tsk->pi_lock by a concurrent waiter must |
1102 | * be observed in exit_pi_state_list(). |
1103 | */ |
1104 | raw_spin_lock_irq(&tsk->pi_lock); |
1105 | tsk->futex_state = FUTEX_STATE_EXITING; |
1106 | raw_spin_unlock_irq(&tsk->pi_lock); |
1107 | } |
1108 | |
1109 | static void futex_cleanup_end(struct task_struct *tsk, int state) |
1110 | { |
1111 | /* |
1112 | * Lockless store. The only side effect is that an observer might |
1113 | * take another loop until it becomes visible. |
1114 | */ |
1115 | tsk->futex_state = state; |
1116 | /* |
1117 | * Drop the exit protection. This unblocks waiters which observed |
1118 | * FUTEX_STATE_EXITING to reevaluate the state. |
1119 | */ |
	mutex_unlock(&tsk->futex_exit_mutex);
1121 | } |
1122 | |
1123 | void futex_exec_release(struct task_struct *tsk) |
1124 | { |
1125 | /* |
1126 | * The state handling is done for consistency, but in the case of |
1127 | * exec() there is no way to prevent further damage as the PID stays |
1128 | * the same. But for the unlikely and arguably buggy case that a |
	 * futex is held on exec(), this provides as much state
	 * consistency protection as is possible.
1131 | */ |
1132 | futex_cleanup_begin(tsk); |
1133 | futex_cleanup(tsk); |
1134 | /* |
1135 | * Reset the state to FUTEX_STATE_OK. The task is alive and about |
	 * to exec a new binary.
1137 | */ |
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
1139 | } |
1140 | |
1141 | void futex_exit_release(struct task_struct *tsk) |
1142 | { |
1143 | futex_cleanup_begin(tsk); |
1144 | futex_cleanup(tsk); |
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
1146 | } |
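
/*
 * Both helpers above are expected to be invoked from the generic task
 * teardown paths (sketch; the exact call sites live outside this file, in the
 * exec/exit code):
 *
 *	exec:	exec_mm_release()  ->  futex_exec_release(tsk)
 *	exit:	exit_mm_release()  ->  futex_exit_release(tsk)
 *
 * with futex_exit_recursive() used from make_task_dead() when a task dies
 * while it is already exiting.
 */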
1147 | |
1148 | static int __init futex_init(void) |
1149 | { |
1150 | unsigned int futex_shift; |
1151 | unsigned long i; |
1152 | |
1153 | #if CONFIG_BASE_SMALL |
1154 | futex_hashsize = 16; |
1155 | #else |
1156 | futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); |
1157 | #endif |
1158 | |
	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0, 0,
					       &futex_shift, NULL,
					       futex_hashsize, futex_hashsize);
1163 | futex_hashsize = 1UL << futex_shift; |
1164 | |
1165 | for (i = 0; i < futex_hashsize; i++) { |
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
1168 | spin_lock_init(&futex_queues[i].lock); |
1169 | } |
1170 | |
1171 | return 0; |
1172 | } |
1173 | core_initcall(futex_init); |
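
/*
 * Sizing example (illustrative): on a machine with 64 possible CPUs the
 * default table is roundup_pow_of_two(256 * 64) = 16384 buckets, so
 * futex_hash() masks the jhash2() value with 0x3fff; CONFIG_BASE_SMALL
 * systems get a fixed 16 buckets instead.
 */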
1174 | |