1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Fast Userspace Mutexes (which I call "Futexes!"). |
4 | * (C) Rusty Russell, IBM 2002 |
5 | * |
6 | * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar |
7 | * (C) Copyright 2003 Red Hat Inc, All Rights Reserved |
8 | * |
9 | * Removed page pinning, fix privately mapped COW pages and other cleanups |
10 | * (C) Copyright 2003, 2004 Jamie Lokier |
11 | * |
12 | * Robust futex support started by Ingo Molnar |
13 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
14 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
15 | * |
16 | * PI-futex support started by Ingo Molnar and Thomas Gleixner |
17 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
18 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> |
19 | * |
20 | * PRIVATE futexes by Eric Dumazet |
21 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> |
22 | * |
23 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> |
24 | * Copyright (C) IBM Corporation, 2009 |
25 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. |
26 | * |
27 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
28 | * enough at me, Linus for the original (flawed) idea, Matthew |
29 | * Kirkwood for proof-of-concept implementation. |
30 | * |
31 | * "The futexes are also cursed." |
32 | * "But they come in a choice of three flavours!" |
33 | */ |
34 | #include <linux/compat.h> |
35 | #include <linux/jhash.h> |
36 | #include <linux/pagemap.h> |
37 | #include <linux/plist.h> |
38 | #include <linux/memblock.h> |
39 | #include <linux/fault-inject.h> |
40 | #include <linux/slab.h> |
41 | |
42 | #include "futex.h" |
43 | #include "../locking/rtmutex_common.h" |
44 | |
45 | /* |
46 | * The base of the bucket array and its size are always used together |
47 | * (after initialization only in futex_hash()), so ensure that they |
48 | * reside in the same cacheline. |
49 | */ |
50 | static struct { |
51 | struct futex_hash_bucket *queues; |
52 | unsigned long hashsize; |
53 | } __futex_data __read_mostly __aligned(2*sizeof(long)); |
54 | #define futex_queues (__futex_data.queues) |
55 | #define futex_hashsize (__futex_data.hashsize) |
56 | |
57 | |
58 | /* |
59 | * Fault injections for futexes. |
60 | */ |
61 | #ifdef CONFIG_FAIL_FUTEX |
62 | |
63 | static struct { |
64 | struct fault_attr attr; |
65 | |
66 | bool ignore_private; |
67 | } fail_futex = { |
68 | .attr = FAULT_ATTR_INITIALIZER, |
69 | .ignore_private = false, |
70 | }; |
71 | |
72 | static int __init setup_fail_futex(char *str) |
73 | { |
	return setup_fault_attr(&fail_futex.attr, str);
75 | } |
__setup("fail_futex=", setup_fail_futex);
77 | |
78 | bool should_fail_futex(bool fshared) |
79 | { |
80 | if (fail_futex.ignore_private && !fshared) |
81 | return false; |
82 | |
	return should_fail(&fail_futex.attr, 1);
84 | } |
85 | |
86 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
87 | |
88 | static int __init fail_futex_debugfs(void) |
89 | { |
90 | umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
91 | struct dentry *dir; |
92 | |
	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
100 | return 0; |
101 | } |
102 | |
103 | late_initcall(fail_futex_debugfs); |
104 | |
105 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
106 | |
107 | #endif /* CONFIG_FAIL_FUTEX */ |
108 | |
109 | /** |
110 | * futex_hash - Return the hash bucket in the global hash |
111 | * @key: Pointer to the futex key for which the hash is calculated |
112 | * |
113 | * We hash on the keys returned from get_futex_key (see below) and return the |
114 | * corresponding hash bucket in the global hash. |
115 | */ |
116 | struct futex_hash_bucket *futex_hash(union futex_key *key) |
117 | { |
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);
120 | |
121 | return &futex_queues[hash & (futex_hashsize - 1)]; |
122 | } |
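
/*
 * Illustrative note (derived from the code above, not part of the algorithm):
 * futex_hashsize is always a power of two, so "hash & (futex_hashsize - 1)"
 * is a cheap modulo. For example, with 2048 buckets a hash value of 0x12345
 * selects bucket 0x345.
 */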
123 | |
124 | |
125 | /** |
126 | * futex_setup_timer - set up the sleeping hrtimer. |
127 | * @time: ptr to the given timeout value |
128 | * @timeout: the hrtimer_sleeper structure to be set up |
129 | * @flags: futex flags |
130 | * @range_ns: optional range in ns |
131 | * |
132 | * Return: Initialized hrtimer_sleeper structure or NULL if no timeout |
133 | * value given |
134 | */ |
135 | struct hrtimer_sleeper * |
136 | futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, |
137 | int flags, u64 range_ns) |
138 | { |
139 | if (!time) |
140 | return NULL; |
141 | |
	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
145 | /* |
146 | * If range_ns is 0, calling hrtimer_set_expires_range_ns() is |
147 | * effectively the same as calling hrtimer_set_expires(). |
148 | */ |
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
150 | |
151 | return timeout; |
152 | } |
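
/*
 * Usage sketch (illustrative only, modelled on the futex wait paths; the
 * exact call sites live in the other kernel/futex/ files):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	if (to)
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *
 *	... wait, get woken or time out ...
 *
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */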
153 | |
154 | /* |
155 | * Generate a machine wide unique identifier for this inode. |
156 | * |
157 | * This relies on u64 not wrapping in the life-time of the machine; which with |
158 | * 1ns resolution means almost 585 years. |
159 | * |
160 | * This further relies on the fact that a well formed program will not unmap |
161 | * the file while it has a (shared) futex waiting on it. This mapping will have |
162 | * a file reference which pins the mount and inode. |
163 | * |
164 | * If for some reason an inode gets evicted and read back in again, it will get |
165 | * a new sequence number and will _NOT_ match, even though it is the exact same |
166 | * file. |
167 | * |
 * It is important that futex_match() never has a false positive, especially
 * for PI futexes, where that could corrupt the state. The above argues that
 * false negatives are only possible for malformed programs.
171 | */ |
172 | static u64 get_inode_sequence_number(struct inode *inode) |
173 | { |
174 | static atomic64_t i_seq; |
175 | u64 old; |
176 | |
177 | /* Does the inode already have a sequence number? */ |
	old = atomic64_read(&inode->i_sequence);
179 | if (likely(old)) |
180 | return old; |
181 | |
182 | for (;;) { |
		u64 new = atomic64_add_return(1, &i_seq);
184 | if (WARN_ON_ONCE(!new)) |
185 | continue; |
186 | |
		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
188 | if (old) |
189 | return old; |
190 | return new; |
191 | } |
192 | } |
193 | |
194 | /** |
195 | * get_futex_key() - Get parameters which are the keys for a futex |
196 | * @uaddr: virtual address of the futex |
197 | * @flags: FLAGS_* |
198 | * @key: address where result is stored. |
199 | * @rw: mapping needs to be read/write (values: FUTEX_READ, |
200 | * FUTEX_WRITE) |
201 | * |
202 | * Return: a negative error code or 0 |
203 | * |
204 | * The key words are stored in @key on success. |
205 | * |
206 | * For shared mappings (when @fshared), the key is: |
207 | * |
208 | * ( inode->i_sequence, page->index, offset_within_page ) |
209 | * |
210 | * [ also see get_inode_sequence_number() ] |
211 | * |
212 | * For private mappings (or when !@fshared), the key is: |
213 | * |
214 | * ( current->mm, address, 0 ) |
215 | * |
216 | * This allows (cross process, where applicable) identification of the futex |
217 | * without keeping the page pinned for the duration of the FUTEX_WAIT. |
218 | * |
219 | * lock_page() might sleep, the caller should not hold a spinlock. |
220 | */ |
221 | int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, |
222 | enum futex_access rw) |
223 | { |
224 | unsigned long address = (unsigned long)uaddr; |
225 | struct mm_struct *mm = current->mm; |
226 | struct page *page; |
227 | struct folio *folio; |
228 | struct address_space *mapping; |
229 | int err, ro = 0; |
230 | bool fshared; |
231 | |
232 | fshared = flags & FLAGS_SHARED; |
233 | |
234 | /* |
235 | * The futex address must be "naturally" aligned. |
236 | */ |
237 | key->both.offset = address % PAGE_SIZE; |
238 | if (unlikely((address % sizeof(u32)) != 0)) |
239 | return -EINVAL; |
240 | address -= key->both.offset; |
241 | |
242 | if (unlikely(!access_ok(uaddr, sizeof(u32)))) |
243 | return -EFAULT; |
244 | |
245 | if (unlikely(should_fail_futex(fshared))) |
246 | return -EFAULT; |
247 | |
248 | /* |
249 | * PROCESS_PRIVATE futexes are fast. |
	 * As the mm cannot disappear under us and the 'key' only needs the
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: we do have to check that 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
254 | */ |
255 | if (!fshared) { |
256 | /* |
257 | * On no-MMU, shared futexes are treated as private, therefore |
258 | * we must not include the current process in the key. Since |
259 | * there is only one address space, the address is a unique key |
260 | * on its own. |
261 | */ |
262 | if (IS_ENABLED(CONFIG_MMU)) |
263 | key->private.mm = mm; |
264 | else |
265 | key->private.mm = NULL; |
266 | |
267 | key->private.address = address; |
268 | return 0; |
269 | } |
270 | |
271 | again: |
272 | /* Ignore any VERIFY_READ mapping (futex common case) */ |
273 | if (unlikely(should_fail_futex(true))) |
274 | return -EFAULT; |
275 | |
	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
277 | /* |
278 | * If write access is not required (eg. FUTEX_WAIT), try |
279 | * and get read-only access. |
280 | */ |
281 | if (err == -EFAULT && rw == FUTEX_READ) { |
		err = get_user_pages_fast(address, 1, 0, &page);
283 | ro = 1; |
284 | } |
285 | if (err < 0) |
286 | return err; |
287 | else |
288 | err = 0; |
289 | |
290 | /* |
291 | * The treatment of mapping from this point on is critical. The folio |
292 | * lock protects many things but in this context the folio lock |
293 | * stabilizes mapping, prevents inode freeing in the shared |
294 | * file-backed region case and guards against movement to swap cache. |
295 | * |
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and the folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the folio lock will be acquired only if it is unavoidable.
300 | * |
301 | * Mapping checks require the folio so it is looked up now. For |
302 | * anonymous pages, it does not matter if the folio is split |
303 | * in the future as the key is based on the address. For |
304 | * filesystem-backed pages, the precise page is required as the |
305 | * index of the page determines the key. |
306 | */ |
307 | folio = page_folio(page); |
308 | mapping = READ_ONCE(folio->mapping); |
309 | |
310 | /* |
311 | * If folio->mapping is NULL, then it cannot be an anonymous |
312 | * page; but it might be the ZERO_PAGE or in the gate area or |
313 | * in a special mapping (all cases which we are happy to fail); |
314 | * or it may have been a good file page when get_user_pages_fast |
315 | * found it, but truncated or holepunched or subjected to |
316 | * invalidate_complete_page2 before we got the folio lock (also |
317 | * cases which we are happy to fail). And we hold a reference, |
318 | * so refcount care in invalidate_inode_page's remove_mapping |
319 | * prevents drop_caches from setting mapping to NULL beneath us. |
320 | * |
321 | * The case we do have to guard against is when memory pressure made |
322 | * shmem_writepage move it from filecache to swapcache beneath us: |
323 | * an unlikely race, but we do need to retry for folio->mapping. |
324 | */ |
325 | if (unlikely(!mapping)) { |
326 | int shmem_swizzled; |
327 | |
328 | /* |
329 | * Folio lock is required to identify which special case above |
330 | * applies. If this is really a shmem page then the folio lock |
331 | * will prevent unexpected transitions. |
332 | */ |
333 | folio_lock(folio); |
334 | shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; |
335 | folio_unlock(folio); |
336 | folio_put(folio); |
337 | |
338 | if (shmem_swizzled) |
339 | goto again; |
340 | |
341 | return -EFAULT; |
342 | } |
343 | |
344 | /* |
345 | * Private mappings are handled in a simple way. |
346 | * |
347 | * If the futex key is stored in anonymous memory, then the associated |
348 | * object is the mm which is implicitly pinned by the calling process. |
349 | * |
350 | * NOTE: When userspace waits on a MAP_SHARED mapping, even if |
351 | * it's a read-only handle, it's expected that futexes attach to |
352 | * the object not the particular process. |
353 | */ |
354 | if (folio_test_anon(folio)) { |
355 | /* |
356 | * A RO anonymous page will never change and thus doesn't make |
357 | * sense for futex operations. |
358 | */ |
359 | if (unlikely(should_fail_futex(true)) || ro) { |
360 | err = -EFAULT; |
361 | goto out; |
362 | } |
363 | |
364 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
365 | key->private.mm = mm; |
366 | key->private.address = address; |
367 | |
368 | } else { |
369 | struct inode *inode; |
370 | |
371 | /* |
372 | * The associated futex object in this case is the inode and |
373 | * the folio->mapping must be traversed. Ordinarily this should |
374 | * be stabilised under folio lock but it's not strictly |
375 | * necessary in this case as we just want to pin the inode, not |
376 | * update i_pages or anything like that. |
377 | * |
378 | * The RCU read lock is taken as the inode is finally freed |
379 | * under RCU. If the mapping still matches expectations then the |
380 | * mapping->host can be safely accessed as being a valid inode. |
381 | */ |
382 | rcu_read_lock(); |
383 | |
384 | if (READ_ONCE(folio->mapping) != mapping) { |
385 | rcu_read_unlock(); |
386 | folio_put(folio); |
387 | |
388 | goto again; |
389 | } |
390 | |
391 | inode = READ_ONCE(mapping->host); |
392 | if (!inode) { |
393 | rcu_read_unlock(); |
394 | folio_put(folio); |
395 | |
396 | goto again; |
397 | } |
398 | |
399 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
400 | key->shared.i_seq = get_inode_sequence_number(inode); |
401 | key->shared.pgoff = folio->index + folio_page_idx(folio, page); |
402 | rcu_read_unlock(); |
403 | } |
404 | |
405 | out: |
406 | folio_put(folio); |
407 | return err; |
408 | } |
409 | |
410 | /** |
411 | * fault_in_user_writeable() - Fault in user address and verify RW access |
412 | * @uaddr: pointer to faulting user space address |
413 | * |
414 | * Slow path to fixup the fault we just took in the atomic write |
415 | * access to @uaddr. |
416 | * |
417 | * We have no generic implementation of a non-destructive write to the |
418 | * user address. We know that we faulted in the atomic pagefault |
419 | * disabled section so we can as well avoid the #PF overhead by |
420 | * calling get_user_pages() right away. |
421 | */ |
422 | int fault_in_user_writeable(u32 __user *uaddr) |
423 | { |
424 | struct mm_struct *mm = current->mm; |
425 | int ret; |
426 | |
427 | mmap_read_lock(mm); |
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
430 | mmap_read_unlock(mm); |
431 | |
432 | return ret < 0 ? ret : 0; |
433 | } |
434 | |
435 | /** |
436 | * futex_top_waiter() - Return the highest priority waiter on a futex |
437 | * @hb: the hash bucket the futex_q's reside in |
438 | * @key: the futex key (to distinguish it from other futex futex_q's) |
439 | * |
440 | * Must be called with the hb lock held. |
441 | */ |
442 | struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) |
443 | { |
444 | struct futex_q *this; |
445 | |
446 | plist_for_each_entry(this, &hb->chain, list) { |
		if (futex_match(&this->key, key))
448 | return this; |
449 | } |
450 | return NULL; |
451 | } |
452 | |
453 | int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) |
454 | { |
455 | int ret; |
456 | |
457 | pagefault_disable(); |
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
459 | pagefault_enable(); |
460 | |
461 | return ret; |
462 | } |
463 | |
464 | int futex_get_value_locked(u32 *dest, u32 __user *from) |
465 | { |
466 | int ret; |
467 | |
468 | pagefault_disable(); |
469 | ret = __get_user(*dest, from); |
470 | pagefault_enable(); |
471 | |
472 | return ret ? -EFAULT : 0; |
473 | } |
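
/*
 * Both helpers above run with pagefaults disabled so they can be called while
 * holding a hash bucket lock; a fault is reported as an error instead of
 * being handled. Callers are expected to follow the usual retry pattern
 * (sketch only, the real loops live in the wait/lock paths):
 *
 *	ret = futex_get_value_locked(&uval, uaddr);
 *	if (ret) {
 *		futex_q_unlock(hb);
 *		if (get_user(uval, uaddr))	// fault the page in
 *			return -EFAULT;
 *		goto retry;			// re-lookup and re-check
 *	}
 */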
474 | |
475 | /** |
476 | * wait_for_owner_exiting - Block until the owner has exited |
477 | * @ret: owner's current futex lock status |
478 | * @exiting: Pointer to the exiting task |
479 | * |
480 | * Caller must hold a refcount on @exiting. |
481 | */ |
482 | void wait_for_owner_exiting(int ret, struct task_struct *exiting) |
483 | { |
484 | if (ret != -EBUSY) { |
485 | WARN_ON_ONCE(exiting); |
486 | return; |
487 | } |
488 | |
489 | if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) |
490 | return; |
491 | |
492 | mutex_lock(&exiting->futex_exit_mutex); |
493 | /* |
494 | * No point in doing state checking here. If the waiter got here |
495 | * while the task was in exec()->exec_futex_release() then it can |
496 | * have any FUTEX_STATE_* value when the waiter has acquired the |
497 | * mutex. OK, if running, EXITING or DEAD if it reached exit() |
498 | * already. Highly unlikely and not a problem. Just one more round |
499 | * through the futex maze. |
500 | */ |
	mutex_unlock(&exiting->futex_exit_mutex);
502 | |
	put_task_struct(exiting);
504 | } |
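
/*
 * Sketch of the waiter side of this handshake (illustrative; the real code is
 * in the PI lock path): when the owner is found in FUTEX_STATE_EXITING the
 * attach code returns -EBUSY and hands back a reference on the exiting task,
 * roughly:
 *
 *	ret = futex_lock_pi_atomic(uaddr, hb, &q->key, &q->pi_state,
 *				   current, &exiting, 0);
 *	if (ret == -EBUSY) {
 *		futex_q_unlock(hb);
 *		wait_for_owner_exiting(ret, exiting);
 *		goto retry;
 *	}
 *
 * Blocking on futex_exit_mutex ensures the retry only happens after the
 * exiting owner has finished its robust/PI cleanup.
 */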
505 | |
506 | /** |
507 | * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket |
508 | * @q: The futex_q to unqueue |
509 | * |
510 | * The q->lock_ptr must not be NULL and must be held by the caller. |
511 | */ |
512 | void __futex_unqueue(struct futex_q *q) |
513 | { |
514 | struct futex_hash_bucket *hb; |
515 | |
516 | if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) |
517 | return; |
518 | lockdep_assert_held(q->lock_ptr); |
519 | |
520 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); |
	plist_del(&q->list, &hb->chain);
522 | futex_hb_waiters_dec(hb); |
523 | } |
524 | |
525 | /* The key must be already stored in q->key. */ |
526 | struct futex_hash_bucket *futex_q_lock(struct futex_q *q) |
527 | __acquires(&hb->lock) |
528 | { |
529 | struct futex_hash_bucket *hb; |
530 | |
	hb = futex_hash(&q->key);
532 | |
533 | /* |
534 | * Increment the counter before taking the lock so that |
535 | * a potential waker won't miss a to-be-slept task that is |
536 | * waiting for the spinlock. This is safe as all futex_q_lock() |
537 | * users end up calling futex_queue(). Similarly, for housekeeping, |
538 | * decrement the counter at futex_q_unlock() when some error has |
539 | * occurred and we don't end up adding the task to the list. |
540 | */ |
541 | futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ |
542 | |
543 | q->lock_ptr = &hb->lock; |
544 | |
	spin_lock(&hb->lock);
546 | return hb; |
547 | } |
548 | |
549 | void futex_q_unlock(struct futex_hash_bucket *hb) |
550 | __releases(&hb->lock) |
551 | { |
	spin_unlock(&hb->lock);
553 | futex_hb_waiters_dec(hb); |
554 | } |
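
/*
 * Note on the counter handling above (illustrative): the increment in
 * futex_q_lock() pairs with the wake side, which may bail out early without
 * taking the bucket lock at all, e.g.:
 *
 *	hb = futex_hash(&key);
 *	if (!futex_hb_waiters_pending(hb))
 *		return ret;		// no waiters, skip the lock
 *	spin_lock(&hb->lock);
 *
 * That is why the count must be visible before the waiter can block, and why
 * futex_q_unlock() drops it again on the paths which never queue.
 */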
555 | |
556 | void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) |
557 | { |
558 | int prio; |
559 | |
560 | /* |
561 | * The priority used to register this element is |
562 | * - either the real thread-priority for the real-time threads |
563 | * (i.e. threads with a priority lower than MAX_RT_PRIO) |
564 | * - or MAX_RT_PRIO for non-RT threads. |
565 | * Thus, all RT-threads are woken first in priority order, and |
566 | * the others are woken last, in FIFO order. |
567 | */ |
568 | prio = min(current->normal_prio, MAX_RT_PRIO); |
569 | |
	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
572 | q->task = current; |
573 | } |
574 | |
575 | /** |
576 | * futex_unqueue() - Remove the futex_q from its futex_hash_bucket |
577 | * @q: The futex_q to unqueue |
578 | * |
579 | * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must |
580 | * be paired with exactly one earlier call to futex_queue(). |
581 | * |
582 | * Return: |
 * - 1 - if the futex_q was still queued (and we unqueued it);
584 | * - 0 - if the futex_q was already removed by the waking thread |
585 | */ |
586 | int futex_unqueue(struct futex_q *q) |
587 | { |
588 | spinlock_t *lock_ptr; |
589 | int ret = 0; |
590 | |
591 | /* In the common case we don't take the spinlock, which is nice. */ |
592 | retry: |
593 | /* |
594 | * q->lock_ptr can change between this read and the following spin_lock. |
595 | * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and |
596 | * optimizing lock_ptr out of the logic below. |
597 | */ |
598 | lock_ptr = READ_ONCE(q->lock_ptr); |
599 | if (lock_ptr != NULL) { |
		spin_lock(lock_ptr);
601 | /* |
602 | * q->lock_ptr can change between reading it and |
603 | * spin_lock(), causing us to take the wrong lock. This |
604 | * corrects the race condition. |
605 | * |
606 | * Reasoning goes like this: if we have the wrong lock, |
607 | * q->lock_ptr must have changed (maybe several times) |
608 | * between reading it and the spin_lock(). It can |
609 | * change again after the spin_lock() but only if it was |
610 | * already changed before the spin_lock(). It cannot, |
611 | * however, change back to the original value. Therefore |
612 | * we can detect whether we acquired the correct lock. |
613 | */ |
614 | if (unlikely(lock_ptr != q->lock_ptr)) { |
			spin_unlock(lock_ptr);
616 | goto retry; |
617 | } |
618 | __futex_unqueue(q); |
619 | |
620 | BUG_ON(q->pi_state); |
621 | |
		spin_unlock(lock_ptr);
623 | ret = 1; |
624 | } |
625 | |
626 | return ret; |
627 | } |
628 | |
629 | /* |
630 | * PI futexes can not be requeued and must remove themselves from the hash |
631 | * bucket. The hash bucket lock (i.e. lock_ptr) is held. |
632 | */ |
633 | void futex_unqueue_pi(struct futex_q *q) |
634 | { |
635 | /* |
636 | * If the lock was not acquired (due to timeout or signal) then the |
637 | * rt_waiter is removed before futex_q is. If this is observed by |
638 | * an unlocker after dropping the rtmutex wait lock and before |
639 | * acquiring the hash bucket lock, then the unlocker dequeues the |
640 | * futex_q from the hash bucket list to guarantee consistent state |
641 | * vs. userspace. Therefore the dequeue here must be conditional. |
642 | */ |
	if (!plist_node_empty(&q->list))
644 | __futex_unqueue(q); |
645 | |
646 | BUG_ON(!q->pi_state); |
	put_pi_state(q->pi_state);
648 | q->pi_state = NULL; |
649 | } |
650 | |
651 | /* Constants for the pending_op argument of handle_futex_death */ |
652 | #define HANDLE_DEATH_PENDING true |
653 | #define HANDLE_DEATH_LIST false |
654 | |
655 | /* |
656 | * Process a futex-list entry, check whether it's owned by the |
657 | * dying task, and do notification if so: |
658 | */ |
659 | static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, |
660 | bool pi, bool pending_op) |
661 | { |
662 | u32 uval, nval, mval; |
663 | pid_t owner; |
664 | int err; |
665 | |
666 | /* Futex address must be 32bit aligned */ |
667 | if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) |
668 | return -1; |
669 | |
670 | retry: |
671 | if (get_user(uval, uaddr)) |
672 | return -1; |
673 | |
674 | /* |
675 | * Special case for regular (non PI) futexes. The unlock path in |
676 | * user space has two race scenarios: |
677 | * |
678 | * 1. The unlock path releases the user space futex value and |
679 | * before it can execute the futex() syscall to wake up |
680 | * waiters it is killed. |
681 | * |
682 | * 2. A woken up waiter is killed before it can acquire the |
683 | * futex in user space. |
684 | * |
685 | * In the second case, the wake up notification could be generated |
686 | * by the unlock path in user space after setting the futex value |
687 | * to zero or by the kernel after setting the OWNER_DIED bit below. |
688 | * |
689 | * In both cases the TID validation below prevents a wakeup of |
690 | * potential waiters which can cause these waiters to block |
691 | * forever. |
692 | * |
693 | * In both cases the following conditions are met: |
694 | * |
695 | * 1) task->robust_list->list_op_pending != NULL |
696 | * @pending_op == true |
697 | * 2) The owner part of user space futex value == 0 |
698 | * 3) Regular futex: @pi == false |
699 | * |
700 | * If these conditions are met, it is safe to attempt waking up a |
701 | * potential waiter without touching the user space futex value and |
702 | * trying to set the OWNER_DIED bit. If the futex value is zero, |
703 | * the rest of the user space mutex state is consistent, so a woken |
704 | * waiter will just take over the uncontended futex. Setting the |
705 | * OWNER_DIED bit would create inconsistent state and malfunction |
706 | * of the user space owner died handling. Otherwise, the OWNER_DIED |
707 | * bit is already set, and the woken waiter is expected to deal with |
708 | * this. |
709 | */ |
710 | owner = uval & FUTEX_TID_MASK; |
711 | |
712 | if (pending_op && !pi && !owner) { |
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
714 | FUTEX_BITSET_MATCH_ANY); |
715 | return 0; |
716 | } |
717 | |
	if (owner != task_pid_vnr(curr))
719 | return 0; |
720 | |
721 | /* |
722 | * Ok, this dying thread is truly holding a futex |
723 | * of interest. Set the OWNER_DIED bit atomically |
724 | * via cmpxchg, and if the value had FUTEX_WAITERS |
725 | * set, wake up a waiter (if any). (We have to do a |
726 | * futex_wake() even if OWNER_DIED is already set - |
727 | * to handle the rare but possible case of recursive |
728 | * thread-death.) The rest of the cleanup is done in |
729 | * userspace. |
730 | */ |
731 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
732 | |
733 | /* |
734 | * We are not holding a lock here, but we want to have |
735 | * the pagefault_disable/enable() protection because |
736 | * we want to handle the fault gracefully. If the |
737 | * access fails we try to fault in the futex with R/W |
738 | * verification via get_user_pages. get_user() above |
739 | * does not guarantee R/W access. If that fails we |
740 | * give up and leave the futex locked. |
741 | */ |
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
743 | switch (err) { |
744 | case -EFAULT: |
745 | if (fault_in_user_writeable(uaddr)) |
746 | return -1; |
747 | goto retry; |
748 | |
749 | case -EAGAIN: |
750 | cond_resched(); |
751 | goto retry; |
752 | |
753 | default: |
754 | WARN_ON_ONCE(1); |
755 | return err; |
756 | } |
757 | } |
758 | |
759 | if (nval != uval) |
760 | goto retry; |
761 | |
762 | /* |
763 | * Wake robust non-PI futexes here. The wakeup of |
764 | * PI futexes happens in exit_pi_state(): |
765 | */ |
766 | if (!pi && (uval & FUTEX_WAITERS)) { |
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
768 | FUTEX_BITSET_MATCH_ANY); |
769 | } |
770 | |
771 | return 0; |
772 | } |
773 | |
774 | /* |
775 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: |
776 | */ |
777 | static inline int fetch_robust_entry(struct robust_list __user **entry, |
778 | struct robust_list __user * __user *head, |
779 | unsigned int *pi) |
780 | { |
781 | unsigned long uentry; |
782 | |
783 | if (get_user(uentry, (unsigned long __user *)head)) |
784 | return -EFAULT; |
785 | |
786 | *entry = (void __user *)(uentry & ~1UL); |
787 | *pi = uentry & 1; |
788 | |
789 | return 0; |
790 | } |
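
/*
 * For reference, a sketch of the user space side of the robust list ABI that
 * the walk below consumes (illustrative; 'struct my_mutex' is hypothetical,
 * the authoritative layout is struct robust_list_head in the futex uapi
 * header):
 *
 *	struct robust_list_head head = {
 *		.list		 = { .next = &head.list },   // empty circular list
 *		.futex_offset	 = offsetof(struct my_mutex, futex_word) -
 *				   offsetof(struct my_mutex, list),
 *		.list_op_pending = NULL,
 *	};
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 * Each held lock links its embedded 'list' entry into head.list and the futex
 * word lives at (entry address + futex_offset). Bit 0 of a list pointer marks
 * the entry as a PI futex, which is what fetch_robust_entry() decodes.
 */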
791 | |
792 | /* |
793 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
794 | * and mark any locks found there dead, and notify any waiters. |
795 | * |
796 | * We silently return on any sign of list-walking problem. |
797 | */ |
798 | static void exit_robust_list(struct task_struct *curr) |
799 | { |
800 | struct robust_list_head __user *head = curr->robust_list; |
801 | struct robust_list __user *entry, *next_entry, *pending; |
802 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
803 | unsigned int next_pi; |
804 | unsigned long futex_offset; |
805 | int rc; |
806 | |
807 | /* |
808 | * Fetch the list head (which was registered earlier, via |
809 | * sys_set_robust_list()): |
810 | */ |
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
812 | return; |
813 | /* |
814 | * Fetch the relative futex offset: |
815 | */ |
816 | if (get_user(futex_offset, &head->futex_offset)) |
817 | return; |
818 | /* |
819 | * Fetch any possibly pending lock-add first, and handle it |
820 | * if it exists: |
821 | */ |
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
823 | return; |
824 | |
825 | next_entry = NULL; /* avoid warning with gcc */ |
826 | while (entry != &head->list) { |
827 | /* |
828 | * Fetch the next entry in the list before calling |
829 | * handle_futex_death: |
830 | */ |
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
832 | /* |
833 | * A pending lock might already be on the list, so |
834 | * don't process it twice: |
835 | */ |
836 | if (entry != pending) { |
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
839 | return; |
840 | } |
841 | if (rc) |
842 | return; |
843 | entry = next_entry; |
844 | pi = next_pi; |
845 | /* |
846 | * Avoid excessively long or circular lists: |
847 | */ |
848 | if (!--limit) |
849 | break; |
850 | |
851 | cond_resched(); |
852 | } |
853 | |
854 | if (pending) { |
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
857 | } |
858 | } |
859 | |
860 | #ifdef CONFIG_COMPAT |
861 | static void __user *futex_uaddr(struct robust_list __user *entry, |
862 | compat_long_t futex_offset) |
863 | { |
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);
866 | |
867 | return uaddr; |
868 | } |
869 | |
870 | /* |
871 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: |
872 | */ |
873 | static inline int |
874 | compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, |
875 | compat_uptr_t __user *head, unsigned int *pi) |
876 | { |
877 | if (get_user(*uentry, head)) |
878 | return -EFAULT; |
879 | |
	*entry = compat_ptr((*uentry) & ~1);
881 | *pi = (unsigned int)(*uentry) & 1; |
882 | |
883 | return 0; |
884 | } |
885 | |
886 | /* |
887 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
888 | * and mark any locks found there dead, and notify any waiters. |
889 | * |
890 | * We silently return on any sign of list-walking problem. |
891 | */ |
892 | static void compat_exit_robust_list(struct task_struct *curr) |
893 | { |
894 | struct compat_robust_list_head __user *head = curr->compat_robust_list; |
895 | struct robust_list __user *entry, *next_entry, *pending; |
896 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
897 | unsigned int next_pi; |
898 | compat_uptr_t uentry, next_uentry, upending; |
899 | compat_long_t futex_offset; |
900 | int rc; |
901 | |
902 | /* |
903 | * Fetch the list head (which was registered earlier, via |
904 | * sys_set_robust_list()): |
905 | */ |
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
907 | return; |
908 | /* |
909 | * Fetch the relative futex offset: |
910 | */ |
911 | if (get_user(futex_offset, &head->futex_offset)) |
912 | return; |
913 | /* |
914 | * Fetch any possibly pending lock-add first, and handle it |
915 | * if it exists: |
916 | */ |
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
919 | return; |
920 | |
921 | next_entry = NULL; /* avoid warning with gcc */ |
922 | while (entry != (struct robust_list __user *) &head->list) { |
923 | /* |
924 | * Fetch the next entry in the list before calling |
925 | * handle_futex_death: |
926 | */ |
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
929 | /* |
930 | * A pending lock might already be on the list, so |
		 * don't process it twice:
932 | */ |
933 | if (entry != pending) { |
934 | void __user *uaddr = futex_uaddr(entry, futex_offset); |
935 | |
936 | if (handle_futex_death(uaddr, curr, pi, |
937 | HANDLE_DEATH_LIST)) |
938 | return; |
939 | } |
940 | if (rc) |
941 | return; |
942 | uentry = next_uentry; |
943 | entry = next_entry; |
944 | pi = next_pi; |
945 | /* |
946 | * Avoid excessively long or circular lists: |
947 | */ |
948 | if (!--limit) |
949 | break; |
950 | |
951 | cond_resched(); |
952 | } |
953 | if (pending) { |
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
957 | } |
958 | } |
959 | #endif |
960 | |
961 | #ifdef CONFIG_FUTEX_PI |
962 | |
963 | /* |
964 | * This task is holding PI mutexes at exit time => bad. |
965 | * Kernel cleans up PI-state, but userspace is likely hosed. |
966 | * (Robust-futex cleanup is separate and might save the day for userspace.) |
967 | */ |
968 | static void exit_pi_state_list(struct task_struct *curr) |
969 | { |
970 | struct list_head *next, *head = &curr->pi_state_list; |
971 | struct futex_pi_state *pi_state; |
972 | struct futex_hash_bucket *hb; |
973 | union futex_key key = FUTEX_KEY_INIT; |
974 | |
975 | /* |
976 | * We are a ZOMBIE and nobody can enqueue itself on |
977 | * pi_state_list anymore, but we have to be careful |
978 | * versus waiters unqueueing themselves: |
979 | */ |
980 | raw_spin_lock_irq(&curr->pi_lock); |
981 | while (!list_empty(head)) { |
982 | next = head->next; |
983 | pi_state = list_entry(next, struct futex_pi_state, list); |
984 | key = pi_state->key; |
		hb = futex_hash(&key);
986 | |
987 | /* |
988 | * We can race against put_pi_state() removing itself from the |
989 | * list (a waiter going away). put_pi_state() will first |
990 | * decrement the reference count and then modify the list, so |
		 * it's possible to see the list entry but fail this reference
992 | * acquire. |
993 | * |
994 | * In that case; drop the locks to let put_pi_state() make |
995 | * progress and retry the loop. |
996 | */ |
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
998 | raw_spin_unlock_irq(&curr->pi_lock); |
999 | cpu_relax(); |
1000 | raw_spin_lock_irq(&curr->pi_lock); |
1001 | continue; |
1002 | } |
1003 | raw_spin_unlock_irq(&curr->pi_lock); |
1004 | |
		spin_lock(&hb->lock);
1006 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); |
1007 | raw_spin_lock(&curr->pi_lock); |
1008 | /* |
1009 | * We dropped the pi-lock, so re-check whether this |
1010 | * task still owns the PI-state: |
1011 | */ |
1012 | if (head->next != next) { |
1013 | /* retain curr->pi_lock for the loop invariant */ |
1014 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
			spin_unlock(&hb->lock);
1016 | put_pi_state(pi_state); |
1017 | continue; |
1018 | } |
1019 | |
1020 | WARN_ON(pi_state->owner != curr); |
1021 | WARN_ON(list_empty(&pi_state->list)); |
		list_del_init(&pi_state->list);
1023 | pi_state->owner = NULL; |
1024 | |
1025 | raw_spin_unlock(&curr->pi_lock); |
1026 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); |
		spin_unlock(&hb->lock);
1028 | |
		rt_mutex_futex_unlock(&pi_state->pi_mutex);
1030 | put_pi_state(pi_state); |
1031 | |
1032 | raw_spin_lock_irq(&curr->pi_lock); |
1033 | } |
1034 | raw_spin_unlock_irq(&curr->pi_lock); |
1035 | } |
1036 | #else |
1037 | static inline void exit_pi_state_list(struct task_struct *curr) { } |
1038 | #endif |
1039 | |
1040 | static void futex_cleanup(struct task_struct *tsk) |
1041 | { |
1042 | if (unlikely(tsk->robust_list)) { |
		exit_robust_list(tsk);
1044 | tsk->robust_list = NULL; |
1045 | } |
1046 | |
1047 | #ifdef CONFIG_COMPAT |
1048 | if (unlikely(tsk->compat_robust_list)) { |
		compat_exit_robust_list(tsk);
1050 | tsk->compat_robust_list = NULL; |
1051 | } |
1052 | #endif |
1053 | |
1054 | if (unlikely(!list_empty(&tsk->pi_state_list))) |
		exit_pi_state_list(tsk);
1056 | } |
1057 | |
1058 | /** |
1059 | * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD |
1060 | * @tsk: task to set the state on |
1061 | * |
1062 | * Set the futex exit state of the task lockless. The futex waiter code |
1063 | * observes that state when a task is exiting and loops until the task has |
1064 | * actually finished the futex cleanup. The worst case for this is that the |
1065 | * waiter runs through the wait loop until the state becomes visible. |
1066 | * |
1067 | * This is called from the recursive fault handling path in make_task_dead(). |
1068 | * |
1069 | * This is best effort. Either the futex exit code has run already or |
1070 | * not. If the OWNER_DIED bit has been set on the futex then the waiter can |
1071 | * take it over. If not, the problem is pushed back to user space. If the |
1072 | * futex exit code did not run yet, then an already queued waiter might |
1073 | * block forever, but there is nothing which can be done about that. |
1074 | */ |
1075 | void futex_exit_recursive(struct task_struct *tsk) |
1076 | { |
1077 | /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ |
1078 | if (tsk->futex_state == FUTEX_STATE_EXITING) |
		mutex_unlock(&tsk->futex_exit_mutex);
1080 | tsk->futex_state = FUTEX_STATE_DEAD; |
1081 | } |
1082 | |
1083 | static void futex_cleanup_begin(struct task_struct *tsk) |
1084 | { |
1085 | /* |
1086 | * Prevent various race issues against a concurrent incoming waiter |
1087 | * including live locks by forcing the waiter to block on |
1088 | * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in |
1089 | * attach_to_pi_owner(). |
1090 | */ |
1091 | mutex_lock(&tsk->futex_exit_mutex); |
1092 | |
1093 | /* |
1094 | * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. |
1095 | * |
1096 | * This ensures that all subsequent checks of tsk->futex_state in |
1097 | * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with |
1098 | * tsk->pi_lock held. |
1099 | * |
1100 | * It guarantees also that a pi_state which was queued right before |
1101 | * the state change under tsk->pi_lock by a concurrent waiter must |
1102 | * be observed in exit_pi_state_list(). |
1103 | */ |
1104 | raw_spin_lock_irq(&tsk->pi_lock); |
1105 | tsk->futex_state = FUTEX_STATE_EXITING; |
1106 | raw_spin_unlock_irq(&tsk->pi_lock); |
1107 | } |
1108 | |
1109 | static void futex_cleanup_end(struct task_struct *tsk, int state) |
1110 | { |
1111 | /* |
1112 | * Lockless store. The only side effect is that an observer might |
1113 | * take another loop until it becomes visible. |
1114 | */ |
1115 | tsk->futex_state = state; |
1116 | /* |
1117 | * Drop the exit protection. This unblocks waiters which observed |
1118 | * FUTEX_STATE_EXITING to reevaluate the state. |
1119 | */ |
	mutex_unlock(&tsk->futex_exit_mutex);
1121 | } |
1122 | |
1123 | void futex_exec_release(struct task_struct *tsk) |
1124 | { |
1125 | /* |
1126 | * The state handling is done for consistency, but in the case of |
1127 | * exec() there is no way to prevent further damage as the PID stays |
1128 | * the same. But for the unlikely and arguably buggy case that a |
	 * futex is held on exec(), this provides as much state
	 * consistency protection as is possible.
1131 | */ |
1132 | futex_cleanup_begin(tsk); |
1133 | futex_cleanup(tsk); |
1134 | /* |
1135 | * Reset the state to FUTEX_STATE_OK. The task is alive and about |
	 * to exec a new binary.
1137 | */ |
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
1139 | } |
1140 | |
1141 | void futex_exit_release(struct task_struct *tsk) |
1142 | { |
1143 | futex_cleanup_begin(tsk); |
1144 | futex_cleanup(tsk); |
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
1146 | } |
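
/*
 * Both helpers above are expected to be invoked from the generic task
 * teardown paths (sketch; the exact call sites live outside this file, in the
 * exec/exit code):
 *
 *	exec:	exec_mm_release()  ->  futex_exec_release(tsk)
 *	exit:	exit_mm_release()  ->  futex_exit_release(tsk)
 *
 * with futex_exit_recursive() used from make_task_dead() when a task dies
 * while it is already exiting.
 */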
1147 | |
1148 | static int __init futex_init(void) |
1149 | { |
1150 | unsigned int futex_shift; |
1151 | unsigned long i; |
1152 | |
1153 | #if CONFIG_BASE_SMALL |
1154 | futex_hashsize = 16; |
1155 | #else |
1156 | futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); |
1157 | #endif |
1158 | |
	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0, 0,
					       &futex_shift, NULL,
					       futex_hashsize, futex_hashsize);
1163 | futex_hashsize = 1UL << futex_shift; |
1164 | |
1165 | for (i = 0; i < futex_hashsize; i++) { |
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
1168 | spin_lock_init(&futex_queues[i].lock); |
1169 | } |
1170 | |
1171 | return 0; |
1172 | } |
1173 | core_initcall(futex_init); |
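
/*
 * Sizing example (illustrative): on a machine with 64 possible CPUs the
 * default table is roundup_pow_of_two(256 * 64) = 16384 buckets, so
 * futex_hash() masks the jhash2() value with 0x3fff; CONFIG_BASE_SMALL
 * systems get a fixed 16 buckets instead.
 */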
1174 | |