rwsem.c source code [linux/kernel/locking/rwsem.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/* kernel/rwsem.c: R/W semaphores, public implementation
3	*
4	* Written by David Howells (dhowells@redhat.com).
5	* Derived from asm-i386/semaphore.h
6	*
7	* Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8	* and Michel Lespinasse <walken@google.com>
9	*
10	* Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11	* and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12	*
13	* Rwsem count bit fields re-definition and rwsem rearchitecture by
14	* Waiman Long <longman@redhat.com> and
15	* Peter Zijlstra <peterz@infradead.org>.
16	*/
17
18	#include <linux/types.h>
19	#include <linux/kernel.h>
20	#include <linux/sched.h>
21	#include <linux/sched/rt.h>
22	#include <linux/sched/task.h>
23	#include <linux/sched/debug.h>
24	#include <linux/sched/wake_q.h>
25	#include <linux/sched/signal.h>
26	#include <linux/sched/clock.h>
27	#include <linux/export.h>
28	#include <linux/rwsem.h>
29	#include <linux/atomic.h>
30	#include <trace/events/lock.h>
31
32	#ifndef CONFIG_PREEMPT_RT
33	#include "lock_events.h"
34
/*
 * The least significant 2 bits of the owner value has the following
 * meanings when set.
 *  - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
 *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
 *
 * When the rwsem is reader-owned and a spinning writer has timed out,
 * the nonspinnable bit will be set to disable optimistic spinning.
 *
 * When a writer acquires a rwsem, it puts its task_struct pointer
 * into the owner field. It is cleared after an unlock.
 *
 * When a reader acquires a rwsem, it will also put its task_struct
 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
 * On unlock, the owner field will largely be left untouched. So
 * for a free or reader-owned rwsem, the owner value may contain
 * information about the last reader that acquires the rwsem.
 *
 * That information may be helpful in debugging cases where the system
 * seems to hang on a reader owned rwsem especially if only one reader
 * is involved. Ideally we would like to track all the readers that own
 * a rwsem, but the overhead is simply too big.
 *
 * A fast path reader optimistic lock stealing is supported when the rwsem
 * is previously owned by a writer and the following conditions are met:
 *  - rwsem is not currently writer owned
 *  - the handoff isn't set.
 */
#define RWSEM_READER_OWNED	(1UL << 0)
#define RWSEM_NONSPINNABLE	(1UL << 1)
/* Mask covering both flag bits embedded in the owner word. */
#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
66
/*
 * DEBUG_RWSEMS_WARN_ON(c, sem) - debug-only sanity check on an rwsem.
 *
 * With CONFIG_DEBUG_RWSEMS, warn once when condition @c is true (and
 * debug_locks_silent is not set), dumping the count, magic, owner, current
 * task and wait-list state of @sem, then call debug_locks_off().
 * Without CONFIG_DEBUG_RWSEMS the macro expands to nothing, so @c is not
 * evaluated.
 */
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
	if (!debug_locks_silent &&				\
	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
		#c, atomic_long_read(&(sem)->count),		\
		(unsigned long) (sem)->magic,			\
		atomic_long_read(&(sem)->owner), (long)current,	\
		list_empty(&(sem)->wait_list) ? "" : "not "))	\
			debug_locks_off();			\
	} while (0)
#else
# define DEBUG_RWSEMS_WARN_ON(c, sem)
#endif
80
/*
 * On 64-bit architectures, the bit definitions of the count are:
 *
 * Bit  0    - writer locked bit
 * Bit  1    - waiters present bit
 * Bit  2    - lock handoff bit
 * Bits 3-7  - reserved
 * Bits 8-62 - 55-bit reader count
 * Bit  63   - read fail bit
 *
 * On 32-bit architectures, the bit definitions of the count are:
 *
 * Bit  0    - writer locked bit
 * Bit  1    - waiters present bit
 * Bit  2    - lock handoff bit
 * Bits 3-7  - reserved
 * Bits 8-30 - 23-bit reader count
 * Bit  31   - read fail bit
 *
 * It is not likely that the most significant bit (read fail bit) will ever
 * be set. This guard bit is still checked anyway in the down_read() fastpath
 * just in case we need to use up more of the reader bits for other purpose
 * in the future.
 *
 * atomic_long_fetch_add() is used to obtain reader lock, whereas
 * atomic_long_cmpxchg() will be used to obtain writer lock.
 *
 * There are three places where the lock handoff bit may be set or cleared.
 * 1) rwsem_mark_wake() for readers		-- set, clear
 * 2) rwsem_try_write_lock() for writers	-- set, clear
 * 3) rwsem_del_waiter()			-- clear
 *
 * For all the above cases, wait_lock will be held. A writer must also
 * be the first one in the wait_list to be eligible for setting the handoff
 * bit. So concurrent setting/clearing of handoff bit is not possible.
 */
#define RWSEM_WRITER_LOCKED	(1UL << 0)
#define RWSEM_FLAG_WAITERS	(1UL << 1)
#define RWSEM_FLAG_HANDOFF	(1UL << 2)
#define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))

#define RWSEM_READER_SHIFT	8
#define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)
#define RWSEM_READER_MASK	(~(RWSEM_READER_BIAS - 1))
#define RWSEM_WRITER_MASK	RWSEM_WRITER_LOCKED
#define RWSEM_LOCK_MASK		(RWSEM_WRITER_MASK|RWSEM_READER_MASK)
/* Any of these bits set in the count makes the reader fast path fail. */
#define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
				 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
129
130	/*
131	* All writes to owner are protected by WRITE_ONCE() to make sure that
132	* store tearing can't happen as optimistic spinners may read and use
133	* the owner value concurrently without lock. Read from owner, however,
134	* may not need READ_ONCE() as long as the pointer value is only used
135	* for comparison and isn't being dereferenced.
136	*
137	* Both rwsem_{set,clear}_owner() functions should be in the same
138	* preempt disable section as the atomic op that changes sem->count.
139	*/
140	static inline void rwsem_set_owner(struct rw_semaphore *sem)
141	{
142	lockdep_assert_preemption_disabled();
143	atomic_long_set(v: &sem->owner, i: (long)current);
144	}
145
146	static inline void rwsem_clear_owner(struct rw_semaphore *sem)
147	{
148	lockdep_assert_preemption_disabled();
149	atomic_long_set(v: &sem->owner, i: `0`);
150	}
151
152	/*
153	* Test the flags in the owner field.
154	*/
155	static inline bool rwsem_test_oflags(struct rw_semaphore sem, long* flags)
156	{
157	return atomic_long_read(v: &sem->owner) & flags;
158	}
159
160	/*
161	* The task_struct pointer of the last owning reader will be left in
162	* the owner field.
163	*
164	* Note that the owner value just indicates the task has owned the rwsem
165	* previously, it may not be the real owner or one of the real owners
166	* anymore when that field is examined, so take it with a grain of salt.
167	*
168	* The reader non-spinnable bit is preserved.
169	*/
170	static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
171	struct task_struct *owner)
172	{
173	unsigned long val = (unsigned long)owner \| RWSEM_READER_OWNED \|
174	(atomic_long_read(v: &sem->owner) & RWSEM_NONSPINNABLE);
175
176	atomic_long_set(v: &sem->owner, i: val);
177	}
178
179	static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
180	{
181	__rwsem_set_reader_owned(sem, current);
182	}
183
184	/*
185	* Return true if the rwsem is owned by a reader.
186	*/
187	static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
188	{
189	#ifdef CONFIG_DEBUG_RWSEMS
190	/*
191	* Check the count to see if it is write-locked.
192	*/
193	long count = atomic_long_read(v: &sem->count);
194
195	if (count & RWSEM_WRITER_MASK)
196	return false;
197	#endif
198	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
199	}
200
#ifdef CONFIG_DEBUG_RWSEMS
/*
 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
 * is a task pointer in owner of a reader-owned rwsem, it will be the
 * real owner or one of the real owners. The only exception is when the
 * unlock is done by up_read_non_owner().
 *
 * If the task pointer stored in sem->owner is the current task, strip it
 * while keeping the flag bits; retry as long as the cmpxchg fails and the
 * stored task pointer is still current.
 */
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
	unsigned long val = atomic_long_read(&sem->owner);

	while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
		/* On failure, val is refreshed with the current owner word. */
		if (atomic_long_try_cmpxchg(&sem->owner, &val,
					    val & RWSEM_OWNER_FLAGS_MASK))
			return;
	}
}
#else
/* Without CONFIG_DEBUG_RWSEMS the reader task pointer is left in place. */
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
}
#endif
223
224	/*
225	* Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
226	* remains set. Otherwise, the operation will be aborted.
227	*/
228	static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
229	{
230	unsigned long owner = atomic_long_read(v: &sem->owner);
231
232	do {
233	if (!(owner & RWSEM_READER_OWNED))
234	break;
235	if (owner & RWSEM_NONSPINNABLE)
236	break;
237	} while (!atomic_long_try_cmpxchg(v: &sem->owner, old: &owner,
238	new: owner \| RWSEM_NONSPINNABLE));
239	}
240
241	static inline bool rwsem_read_trylock(struct rw_semaphore sem, long* *cntp)
242	{
243	*cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, v: &sem->count);
244
245	if (WARN_ON_ONCE(*cntp < `0`))
246	rwsem_set_nonspinnable(sem);
247
248	if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
249	rwsem_set_reader_owned(sem);
250	return true;
251	}
252
253	return false;
254	}
255
256	static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
257	{
258	long tmp = RWSEM_UNLOCKED_VALUE;
259
260	if (atomic_long_try_cmpxchg_acquire(v: &sem->count, old: &tmp, RWSEM_WRITER_LOCKED)) {
261	rwsem_set_owner(sem);
262	return true;
263	}
264
265	return false;
266	}
267
268	/*
269	* Return just the real task structure pointer of the owner
270	*/
271	static inline struct task_struct rwsem_owner(struct* rw_semaphore *sem)
272	{
273	return (struct task_struct *)
274	(atomic_long_read(v: &sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
275	}
276
277	/*
278	* Return the real task structure pointer of the owner and the embedded
279	* flags in the owner. pflags must be non-NULL.
280	*/
281	static inline struct task_struct *
282	rwsem_owner_flags(struct rw_semaphore sem, unsigned* long *pflags)
283	{
284	unsigned long owner = atomic_long_read(v: &sem->owner);
285
286	*pflags = owner & RWSEM_OWNER_FLAGS_MASK;
287	return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
288	}
289
290	/*
291	* Guide to the rw_semaphore's count field.
292	*
293	* When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
294	* by a writer.
295	*
296	* The lock is owned by readers when
297	* (1) the RWSEM_WRITER_LOCKED isn't set in count,
298	* (2) some of the reader bits are set in count, and
299	* (3) the owner field has RWSEM_READER_OWNED bit set.
300	*
301	* Having some reader bits set is not enough to guarantee a readers owned
302	* lock as the readers may be in the process of backing out from the count
303	* and a writer has just released the lock. So another writer may steal
304	* the lock immediately after that.
305	*/
306
307	/*
308	* Initialize an rwsem:
309	*/
310	void __init_rwsem(struct rw_semaphore sem, const* char *name,
311	struct lock_class_key *key)
312	{
313	#ifdef CONFIG_DEBUG_LOCK_ALLOC
314	/*
315	* Make sure we are not reinitializing a held semaphore:
316	*/
317	debug_check_no_locks_freed(from: (void )sem, len: sizeof(sem));
318	lockdep_init_map_wait(lock: &sem->dep_map, name, key, subclass: `0`, inner: LD_WAIT_SLEEP);
319	#endif
320	#ifdef CONFIG_DEBUG_RWSEMS
321	sem->magic = sem;
322	#endif
323	atomic_long_set(v: &sem->count, RWSEM_UNLOCKED_VALUE);
324	raw_spin_lock_init(&sem->wait_lock);
325	INIT_LIST_HEAD(list: &sem->wait_list);
326	atomic_long_set(v: &sem->owner, i: `0L`);
327	#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
328	osq_lock_init(lock: &sem->osq);
329	#endif
330	}
331	EXPORT_SYMBOL(__init_rwsem);
332
/* What kind of lock a queued waiter is waiting for. */
enum rwsem_waiter_type {
	RWSEM_WAITING_FOR_WRITE,
	RWSEM_WAITING_FOR_READ
};
337
338	struct rwsem_waiter {
339	struct list_head list;
340	struct task_struct *task;
341	enum rwsem_waiter_type type;
342	unsigned long timeout;
343	bool handoff_set;
344	};
/* First entry of sem->wait_list; only meaningful when the list is not empty. */
#define rwsem_first_waiter(sem) \
	list_first_entry(&(sem)->wait_list, struct rwsem_waiter, list)
347
/* Selects which waiters rwsem_mark_wake() is allowed to wake. */
enum rwsem_wake_type {
	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */
	RWSEM_WAKE_READERS,	/* Wake readers only */
	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
};
353
/*
 * The typical HZ value is either 250 or 1000. So set the minimum waiting
 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
 * queue before initiating the handoff protocol.
 */
#define RWSEM_WAIT_TIMEOUT	DIV_ROUND_UP(HZ, 250)
360
/*
 * Magic number to batch-wakeup waiting readers, even when writers are
 * also present in the queue. This both limits the amount of work the
 * waking thread must do and also prevents any potential counter overflow,
 * however unlikely.
 */
#define MAX_READERS_WAKEUP	0x100
368
369	static inline void
370	rwsem_add_waiter(struct rw_semaphore sem, struct* rwsem_waiter *waiter)
371	{
372	lockdep_assert_held(&sem->wait_lock);
373	list_add_tail(new: &waiter->list, head: &sem->wait_list);
374	/ caller will set RWSEM_FLAG_WAITERS /
375	}
376
377	/*
378	* Remove a waiter from the wait_list and clear flags.
379	*
380	* Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
381	* this function. Modify with care.
382	*
383	* Return: true if wait_list isn't empty and false otherwise
384	*/
385	static inline bool
386	rwsem_del_waiter(struct rw_semaphore sem, struct* rwsem_waiter *waiter)
387	{
388	lockdep_assert_held(&sem->wait_lock);
389	list_del(entry: &waiter->list);
390	if (likely(!list_empty(&sem->wait_list)))
391	return true;
392
393	atomic_long_andnot(RWSEM_FLAG_HANDOFF \| RWSEM_FLAG_WAITERS, v: &sem->count);
394	return false;
395	}
396
397	/*
398	* handle the lock release when processes blocked on it that can now run
399	* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
400	* have been set.
401	* - there must be someone on the queue
402	* - the wait_lock must be held by the caller
403	* - tasks are marked for wakeup, the caller must later invoke wake_up_q()
404	* to actually wakeup the blocked task(s) and drop the reference count,
405	* preferably when the wait_lock is released
406	* - woken process blocks are discarded from the list after having task zeroed
407	* - writers are only marked woken if downgrading is false
408	*
409	* Implies rwsem_del_waiter() for all woken readers.
410	*/
411	static void rwsem_mark_wake(struct rw_semaphore *sem,
412	enum rwsem_wake_type wake_type,
413	struct wake_q_head *wake_q)
414	{
415	struct rwsem_waiter waiter, tmp;
416	long oldcount, woken = `0`, adjustment = `0`;
417	struct list_head wlist;
418
419	lockdep_assert_held(&sem->wait_lock);
420
421	/*
422	* Take a peek at the queue head waiter such that we can determine
423	* the wakeup(s) to perform.
424	*/
425	waiter = rwsem_first_waiter(sem);
426
427	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
428	if (wake_type == RWSEM_WAKE_ANY) {
429	/*
430	* Mark writer at the front of the queue for wakeup.
431	* Until the task is actually later awoken later by
432	* the caller, other writers are able to steal it.
433	* Readers, on the other hand, will block as they
434	* will notice the queued writer.
435	*/
436	wake_q_add(head: wake_q, task: waiter->task);
437	lockevent_inc(rwsem_wake_writer);
438	}
439
440	return;
441	}
442
443	/*
444	* No reader wakeup if there are too many of them already.
445	*/
446	if (unlikely(atomic_long_read(&sem->count) < `0`))
447	return;
448
449	/*
450	* Writers might steal the lock before we grant it to the next reader.
451	* We prefer to do the first reader grant before counting readers
452	* so we can bail out early if a writer stole the lock.
453	*/
454	if (wake_type != RWSEM_WAKE_READ_OWNED) {
455	struct task_struct *owner;
456
457	adjustment = RWSEM_READER_BIAS;
458	oldcount = atomic_long_fetch_add(i: adjustment, v: &sem->count);
459	if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
460	/*
461	* When we've been waiting "too" long (for writers
462	* to give up the lock), request a HANDOFF to
463	* force the issue.
464	*/
465	if (time_after(jiffies, waiter->timeout)) {
466	if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
467	adjustment -= RWSEM_FLAG_HANDOFF;
468	lockevent_inc(rwsem_rlock_handoff);
469	}
470	waiter->handoff_set = true;
471	}
472
473	atomic_long_add(i: -adjustment, v: &sem->count);
474	return;
475	}
476	/*
477	* Set it to reader-owned to give spinners an early
478	* indication that readers now have the lock.
479	* The reader nonspinnable bit seen at slowpath entry of
480	* the reader is copied over.
481	*/
482	owner = waiter->task;
483	__rwsem_set_reader_owned(sem, owner);
484	}
485
486	/*
487	* Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
488	* queue. We know that the woken will be at least 1 as we accounted
489	* for above. Note we increment the 'active part' of the count by the
490	* number of readers before waking any processes up.
491	*
492	* This is an adaptation of the phase-fair R/W locks where at the
493	* reader phase (first waiter is a reader), all readers are eligible
494	* to acquire the lock at the same time irrespective of their order
495	* in the queue. The writers acquire the lock according to their
496	* order in the queue.
497	*
498	* We have to do wakeup in 2 passes to prevent the possibility that
499	* the reader count may be decremented before it is incremented. It
500	* is because the to-be-woken waiter may not have slept yet. So it
501	* may see waiter->task got cleared, finish its critical section and
502	* do an unlock before the reader count increment.
503	*
504	* 1) Collect the read-waiters in a separate list, count them and
505	* fully increment the reader count in rwsem.
506	* 2) For each waiters in the new list, clear waiter->task and
507	* put them into wake_q to be woken up later.
508	*/
509	INIT_LIST_HEAD(list: &wlist);
510	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
511	if (waiter->type == RWSEM_WAITING_FOR_WRITE)
512	continue;
513
514	woken++;
515	list_move_tail(list: &waiter->list, head: &wlist);
516
517	/*
518	* Limit # of readers that can be woken up per wakeup call.
519	*/
520	if (unlikely(woken >= MAX_READERS_WAKEUP))
521	break;
522	}
523
524	adjustment = woken * RWSEM_READER_BIAS - adjustment;
525	lockevent_cond_inc(rwsem_wake_reader, woken);
526
527	oldcount = atomic_long_read(v: &sem->count);
528	if (list_empty(head: &sem->wait_list)) {
529	/*
530	* Combined with list_move_tail() above, this implies
531	* rwsem_del_waiter().
532	*/
533	adjustment -= RWSEM_FLAG_WAITERS;
534	if (oldcount & RWSEM_FLAG_HANDOFF)
535	adjustment -= RWSEM_FLAG_HANDOFF;
536	} else if (woken) {
537	/*
538	* When we've woken a reader, we no longer need to force
539	* writers to give up the lock and we can clear HANDOFF.
540	*/
541	if (oldcount & RWSEM_FLAG_HANDOFF)
542	adjustment -= RWSEM_FLAG_HANDOFF;
543	}
544
545	if (adjustment)
546	atomic_long_add(i: adjustment, v: &sem->count);
547
548	/ 2nd pass /
549	list_for_each_entry_safe(waiter, tmp, &wlist, list) {
550	struct task_struct *tsk;
551
552	tsk = waiter->task;
553	get_task_struct(t: tsk);
554
555	/*
556	* Ensure calling get_task_struct() before setting the reader
557	* waiter to nil such that rwsem_down_read_slowpath() cannot
558	* race with do_exit() by always holding a reference count
559	* to the task to wakeup.
560	*/
561	smp_store_release(&waiter->task, NULL);
562	/*
563	* Ensure issuing the wakeup (either by us or someone else)
564	* after setting the reader waiter to nil.
565	*/
566	wake_q_add_safe(head: wake_q, task: tsk);
567	}
568	}
569
570	/*
571	* Remove a waiter and try to wake up other waiters in the wait queue
572	* This function is called from the out_nolock path of both the reader and
573	* writer slowpaths with wait_lock held. It releases the wait_lock and
574	* optionally wake up waiters before it returns.
575	*/
576	static inline void
577	rwsem_del_wake_waiter(struct rw_semaphore sem, struct* rwsem_waiter *waiter,
578	struct wake_q_head *wake_q)
579	__releases(&sem->wait_lock)
580	{
581	bool first = rwsem_first_waiter(sem) == waiter;
582
583	wake_q_init(head: wake_q);
584
585	/*
586	* If the wait_list isn't empty and the waiter to be deleted is
587	* the first waiter, we wake up the remaining waiters as they may
588	* be eligible to acquire or spin on the lock.
589	*/
590	if (rwsem_del_waiter(sem, waiter) && first)
591	rwsem_mark_wake(sem, wake_type: RWSEM_WAKE_ANY, wake_q);
592	raw_spin_unlock_irq(&sem->wait_lock);
593	if (!wake_q_empty(head: wake_q))
594	wake_up_q(head: wake_q);
595	}
596
597	/*
598	* This function must be called with the sem->wait_lock held to prevent
599	* race conditions between checking the rwsem wait list and setting the
600	* sem->count accordingly.
601	*
602	* Implies rwsem_del_waiter() on success.
603	*/
604	static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
605	struct rwsem_waiter *waiter)
606	{
607	struct rwsem_waiter *first = rwsem_first_waiter(sem);
608	long count, new;
609
610	lockdep_assert_held(&sem->wait_lock);
611
612	count = atomic_long_read(v: &sem->count);
613	do {
614	bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
615
616	if (has_handoff) {
617	/*
618	* Honor handoff bit and yield only when the first
619	* waiter is the one that set it. Otherwisee, we
620	* still try to acquire the rwsem.
621	*/
622	if (first->handoff_set && (waiter != first))
623	return false;
624	}
625
626	new = count;
627
628	if (count & RWSEM_LOCK_MASK) {
629	/*
630	* A waiter (first or not) can set the handoff bit
631	* if it is an RT task or wait in the wait queue
632	* for too long.
633	*/
634	if (has_handoff \|\| (!rt_task(p: waiter->task) &&
635	!time_after(jiffies, waiter->timeout)))
636	return false;
637
638	new \|= RWSEM_FLAG_HANDOFF;
639	} else {
640	new \|= RWSEM_WRITER_LOCKED;
641	new &= ~RWSEM_FLAG_HANDOFF;
642
643	if (list_is_singular(head: &sem->wait_list))
644	new &= ~RWSEM_FLAG_WAITERS;
645	}
646	} while (!atomic_long_try_cmpxchg_acquire(v: &sem->count, old: &count, new));
647
648	/*
649	* We have either acquired the lock with handoff bit cleared or set
650	* the handoff bit. Only the first waiter can have its handoff_set
651	* set here to enable optimistic spinning in slowpath loop.
652	*/
653	if (new & RWSEM_FLAG_HANDOFF) {
654	first->handoff_set = true;
655	lockevent_inc(rwsem_wlock_handoff);
656	return false;
657	}
658
659	/*
660	* Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
661	* success.
662	*/
663	list_del(entry: &waiter->list);
664	rwsem_set_owner(sem);
665	return true;
666	}
667
/*
 * The rwsem_spin_on_owner() function returns the following 4 values
 * depending on the lock owner state.
 *   OWNER_NULL  : owner is currently NULL
 *   OWNER_WRITER: when owner changes and is a writer
 *   OWNER_READER: when owner changes and the new owner may be a reader.
 *   OWNER_NONSPINNABLE:
 *		   when optimistic spinning has to stop because either the
 *		   owner stops running, is unknown, or its timeslice has
 *		   been used up.
 *
 * Each state is a distinct bit so that several states can be tested with
 * a single mask.
 */
enum owner_state {
	OWNER_NULL		= 1 << 0,
	OWNER_WRITER		= 1 << 1,
	OWNER_READER		= 1 << 2,
	OWNER_NONSPINNABLE	= 1 << 3,
};
685
686	#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
687	/*
688	* Try to acquire write lock before the writer has been put on wait queue.
689	*/
690	static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
691	{
692	long count = atomic_long_read(v: &sem->count);
693
694	while (!(count & (RWSEM_LOCK_MASK\|RWSEM_FLAG_HANDOFF))) {
695	if (atomic_long_try_cmpxchg_acquire(v: &sem->count, old: &count,
696	new: count \| RWSEM_WRITER_LOCKED)) {
697	rwsem_set_owner(sem);
698	lockevent_inc(rwsem_opt_lock);
699	return true;
700	}
701	}
702	return false;
703	}
704
705	static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
706	{
707	struct task_struct *owner;
708	unsigned long flags;
709	bool ret = true;
710
711	if (need_resched()) {
712	lockevent_inc(rwsem_opt_fail);
713	return false;
714	}
715
716	/*
717	* Disable preemption is equal to the RCU read-side crital section,
718	* thus the task_strcut structure won't go away.
719	*/
720	owner = rwsem_owner_flags(sem, pflags: &flags);
721	/*
722	* Don't check the read-owner as the entry may be stale.
723	*/
724	if ((flags & RWSEM_NONSPINNABLE) \|\|
725	(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
726	ret = false;
727
728	lockevent_cond_inc(rwsem_opt_fail, !ret);
729	return ret;
730	}
731
/* Owner states in which optimistic spinning may continue. */
#define OWNER_SPINNABLE		(OWNER_NULL | OWNER_WRITER | OWNER_READER)
733
734	static inline enum owner_state
735	rwsem_owner_state(struct task_struct owner, unsigned* long flags)
736	{
737	if (flags & RWSEM_NONSPINNABLE)
738	return OWNER_NONSPINNABLE;
739
740	if (flags & RWSEM_READER_OWNED)
741	return OWNER_READER;
742
743	return owner ? OWNER_WRITER : OWNER_NULL;
744	}
745
746	static noinline enum owner_state
747	rwsem_spin_on_owner(struct rw_semaphore *sem)
748	{
749	struct task_struct new, owner;
750	unsigned long flags, new_flags;
751	enum owner_state state;
752
753	lockdep_assert_preemption_disabled();
754
755	owner = rwsem_owner_flags(sem, pflags: &flags);
756	state = rwsem_owner_state(owner, flags);
757	if (state != OWNER_WRITER)
758	return state;
759
760	for (;;) {
761	/*
762	* When a waiting writer set the handoff flag, it may spin
763	* on the owner as well. Once that writer acquires the lock,
764	* we can spin on it. So we don't need to quit even when the
765	* handoff bit is set.
766	*/
767	new = rwsem_owner_flags(sem, pflags: &new_flags);
768	if ((new != owner) \|\| (new_flags != flags)) {
769	state = rwsem_owner_state(owner: new, flags: new_flags);
770	break;
771	}
772
773	/*
774	* Ensure we emit the owner->on_cpu, dereference _after_
775	* checking sem->owner still matches owner, if that fails,
776	* owner might point to free()d memory, if it still matches,
777	* our spinning context already disabled preemption which is
778	* equal to RCU read-side crital section ensures the memory
779	* stays valid.
780	*/
781	barrier();
782
783	if (need_resched() \|\| !owner_on_cpu(owner)) {
784	state = OWNER_NONSPINNABLE;
785	break;
786	}
787
788	cpu_relax();
789	}
790
791	return state;
792	}
793
794	/*
795	* Calculate reader-owned rwsem spinning threshold for writer
796	*
797	* The more readers own the rwsem, the longer it will take for them to
798	* wind down and free the rwsem. So the empirical formula used to
799	* determine the actual spinning time limit here is:
800	*
801	* Spinning threshold = (10 + nr_readers/2)us
802	*
803	* The limit is capped to a maximum of 25us (30 readers). This is just
804	* a heuristic and is subjected to change in the future.
805	*/
806	static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
807	{
808	long count = atomic_long_read(v: &sem->count);
809	int readers = count >> RWSEM_READER_SHIFT;
810	u64 delta;
811
812	if (readers > `30`)
813	readers = `30`;
814	delta = (`20` + readers) * NSEC_PER_USEC / `2`;
815
816	return sched_clock() + delta;
817	}
818
819	static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
820	{
821	bool taken = false;
822	int prev_owner_state = OWNER_NULL;
823	int loop = `0`;
824	u64 rspin_threshold = `0`;
825
826	/ sem->wait_lock should not be held when doing optimistic spinning /
827	if (!osq_lock(lock: &sem->osq))
828	goto done;
829
830	/*
831	* Optimistically spin on the owner field and attempt to acquire the
832	* lock whenever the owner changes. Spinning will be stopped when:
833	* 1) the owning writer isn't running; or
834	* 2) readers own the lock and spinning time has exceeded limit.
835	*/
836	for (;;) {
837	enum owner_state owner_state;
838
839	owner_state = rwsem_spin_on_owner(sem);
840	if (!(owner_state & OWNER_SPINNABLE))
841	break;
842
843	/*
844	* Try to acquire the lock
845	*/
846	taken = rwsem_try_write_lock_unqueued(sem);
847
848	if (taken)
849	break;
850
851	/*
852	* Time-based reader-owned rwsem optimistic spinning
853	*/
854	if (owner_state == OWNER_READER) {
855	/*
856	* Re-initialize rspin_threshold every time when
857	* the owner state changes from non-reader to reader.
858	* This allows a writer to steal the lock in between
859	* 2 reader phases and have the threshold reset at
860	* the beginning of the 2nd reader phase.
861	*/
862	if (prev_owner_state != OWNER_READER) {
863	if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
864	break;
865	rspin_threshold = rwsem_rspin_threshold(sem);
866	loop = `0`;
867	}
868
869	/*
870	* Check time threshold once every 16 iterations to
871	* avoid calling sched_clock() too frequently so
872	* as to reduce the average latency between the times
873	* when the lock becomes free and when the spinner
874	* is ready to do a trylock.
875	*/
876	else if (!(++loop & `0xf`) && (sched_clock() > rspin_threshold)) {
877	rwsem_set_nonspinnable(sem);
878	lockevent_inc(rwsem_opt_nospin);
879	break;
880	}
881	}
882
883	/*
884	* An RT task cannot do optimistic spinning if it cannot
885	* be sure the lock holder is running or live-lock may
886	* happen if the current task and the lock holder happen
887	* to run in the same CPU. However, aborting optimistic
888	* spinning while a NULL owner is detected may miss some
889	* opportunity where spinning can continue without causing
890	* problem.
891	*
892	* There are 2 possible cases where an RT task may be able
893	* to continue spinning.
894	*
895	* 1) The lock owner is in the process of releasing the
896	* lock, sem->owner is cleared but the lock has not
897	* been released yet.
898	* 2) The lock was free and owner cleared, but another
899	* task just comes in and acquire the lock before
900	* we try to get it. The new owner may be a spinnable
901	* writer.
902	*
903	* To take advantage of two scenarios listed above, the RT
904	* task is made to retry one more time to see if it can
905	* acquire the lock or continue spinning on the new owning
906	* writer. Of course, if the time lag is long enough or the
907	* new owner is not a writer or spinnable, the RT task will
908	* quit spinning.
909	*
910	* If the owner is a writer, the need_resched() check is
911	* done inside rwsem_spin_on_owner(). If the owner is not
912	* a writer, need_resched() check needs to be done here.
913	*/
914	if (owner_state != OWNER_WRITER) {
915	if (need_resched())
916	break;
917	if (rt_task(current) &&
918	(prev_owner_state != OWNER_WRITER))
919	break;
920	}
921	prev_owner_state = owner_state;
922
923	/*
924	* The cpu_relax() call is a compiler barrier which forces
925	* everything in this loop to be re-loaded. We don't need
926	* memory barriers as we'll eventually observe the right
927	* values at the cost of a few extra spins.
928	*/
929	cpu_relax();
930	}
931	osq_unlock(lock: &sem->osq);
932	done:
933	lockevent_cond_inc(rwsem_opt_fail, !taken);
934	return taken;
935	}
936
937	/*
938	* Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
939	* only be called when the reader count reaches 0.
940	*/
941	static inline void clear_nonspinnable(struct rw_semaphore *sem)
942	{
943	if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
944	atomic_long_andnot(RWSEM_NONSPINNABLE, v: &sem->owner);
945	}
946
947	#else
948	static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
949	{
950	return false;
951	}
952
953	static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
954	{
955	return false;
956	}
957
958	static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
959
960	static inline enum owner_state
961	rwsem_spin_on_owner(struct rw_semaphore *sem)
962	{
963	return OWNER_NONSPINNABLE;
964	}
965	#endif
966
967	/*
968	* Prepare to wake up waiter(s) in the wait queue by putting them into the
969	* given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
970	* reader-owned, wake up read lock waiters in queue front or wake up any
971	* front waiter otherwise.
972
973	* This is being called from both reader and writer slow paths.
974	*/
975	static inline void rwsem_cond_wake_waiter(struct rw_semaphore sem, long* count,
976	struct wake_q_head *wake_q)
977	{
978	enum rwsem_wake_type wake_type;
979
980	if (count & RWSEM_WRITER_MASK)
981	return;
982
983	if (count & RWSEM_READER_MASK) {
984	wake_type = RWSEM_WAKE_READERS;
985	} else {
986	wake_type = RWSEM_WAKE_ANY;
987	clear_nonspinnable(sem);
988	}
989	rwsem_mark_wake(sem, wake_type, wake_q);
990	}
991
992	/*
993	* Wait for the read lock to be granted
994	*/
995	static struct rw_semaphore __sched *
996	rwsem_down_read_slowpath(struct rw_semaphore sem, long* count, unsigned int state)
997	{
998	long adjustment = -RWSEM_READER_BIAS;
999	long rcnt = (count >> RWSEM_READER_SHIFT);
1000	struct rwsem_waiter waiter;
1001	DEFINE_WAKE_Q(wake_q);
1002
1003	/*
1004	* To prevent a constant stream of readers from starving a sleeping
1005	* waiter, don't attempt optimistic lock stealing if the lock is
1006	* currently owned by readers.
1007	*/
1008	if ((atomic_long_read(v: &sem->owner) & RWSEM_READER_OWNED) &&
1009	(rcnt > `1`) && !(count & RWSEM_WRITER_LOCKED))
1010	goto queue;
1011
1012	/*
1013	* Reader optimistic lock stealing.
1014	*/
1015	if (!(count & (RWSEM_WRITER_LOCKED \| RWSEM_FLAG_HANDOFF))) {
1016	rwsem_set_reader_owned(sem);
1017	lockevent_inc(rwsem_rlock_steal);
1018
1019	/*
1020	* Wake up other readers in the wait queue if it is
1021	* the first reader.
1022	*/
1023	if ((rcnt == `1`) && (count & RWSEM_FLAG_WAITERS)) {
1024	raw_spin_lock_irq(&sem->wait_lock);
1025	if (!list_empty(head: &sem->wait_list))
1026	rwsem_mark_wake(sem, wake_type: RWSEM_WAKE_READ_OWNED,
1027	wake_q: &wake_q);
1028	raw_spin_unlock_irq(&sem->wait_lock);
1029	wake_up_q(head: &wake_q);
1030	}
1031	return sem;
1032	}
1033
1034	queue:
1035	waiter.task = current;
1036	waiter.type = RWSEM_WAITING_FOR_READ;
1037	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1038	waiter.handoff_set = false;
1039
1040	raw_spin_lock_irq(&sem->wait_lock);
1041	if (list_empty(head: &sem->wait_list)) {
1042	/*
1043	* In case the wait queue is empty and the lock isn't owned
1044	* by a writer, this reader can exit the slowpath and return
1045	* immediately as its RWSEM_READER_BIAS has already been set
1046	* in the count.
1047	*/
1048	if (!(atomic_long_read(v: &sem->count) & RWSEM_WRITER_MASK)) {
1049	/ Provide lock ACQUIRE /
1050	smp_acquire__after_ctrl_dep();
1051	raw_spin_unlock_irq(&sem->wait_lock);
1052	rwsem_set_reader_owned(sem);
1053	lockevent_inc(rwsem_rlock_fast);
1054	return sem;
1055	}
1056	adjustment += RWSEM_FLAG_WAITERS;
1057	}
1058	rwsem_add_waiter(sem, waiter: &waiter);
1059
1060	/ we're now waiting on the lock, but no longer actively locking /
1061	count = atomic_long_add_return(i: adjustment, v: &sem->count);
1062
1063	rwsem_cond_wake_waiter(sem, count, wake_q: &wake_q);
1064	raw_spin_unlock_irq(&sem->wait_lock);
1065
1066	if (!wake_q_empty(head: &wake_q))
1067	wake_up_q(head: &wake_q);
1068
1069	trace_contention_begin(lock: sem, LCB_F_READ);
1070
1071	/ wait to be given the lock /
1072	for (;;) {
1073	set_current_state(state);
1074	if (!smp_load_acquire(&waiter.task)) {
1075	/ Matches rwsem_mark_wake()'s smp_store_release(). /
1076	break;
1077	}
1078	if (signal_pending_state(state, current)) {
1079	raw_spin_lock_irq(&sem->wait_lock);
1080	if (waiter.task)
1081	goto out_nolock;
1082	raw_spin_unlock_irq(&sem->wait_lock);
1083	/ Ordered by sem->wait_lock against rwsem_mark_wake(). /
1084	break;
1085	}
1086	schedule_preempt_disabled();
1087	lockevent_inc(rwsem_sleep_reader);
1088	}
1089
1090	__set_current_state(TASK_RUNNING);
1091	lockevent_inc(rwsem_rlock);
1092	trace_contention_end(lock: sem, ret: `0`);
1093	return sem;
1094
1095	out_nolock:
1096	rwsem_del_wake_waiter(sem, waiter: &waiter, wake_q: &wake_q);
1097	__set_current_state(TASK_RUNNING);
1098	lockevent_inc(rwsem_rlock_fail);
1099	trace_contention_end(lock: sem, ret: -EINTR);
1100	return ERR_PTR(error: -EINTR);
1101	}
1102
1103	/*
1104	* Wait until we successfully acquire the write lock
1105	*/
1106	static struct rw_semaphore __sched *
1107	rwsem_down_write_slowpath(struct rw_semaphore sem, int* state)
1108	{
1109	struct rwsem_waiter waiter;
1110	DEFINE_WAKE_Q(wake_q);
1111
1112	/ do optimistic spinning and steal lock if possible /
1113	if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
1114	/ rwsem_optimistic_spin() implies ACQUIRE on success /
1115	return sem;
1116	}
1117
1118	/*
1119	* Optimistic spinning failed, proceed to the slowpath
1120	* and block until we can acquire the sem.
1121	*/
1122	waiter.task = current;
1123	waiter.type = RWSEM_WAITING_FOR_WRITE;
1124	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1125	waiter.handoff_set = false;
1126
1127	raw_spin_lock_irq(&sem->wait_lock);
1128	rwsem_add_waiter(sem, waiter: &waiter);
1129
1130	/ we're now waiting on the lock /
1131	if (rwsem_first_waiter(sem) != &waiter) {
1132	rwsem_cond_wake_waiter(sem, count: atomic_long_read(v: &sem->count),
1133	wake_q: &wake_q);
1134	if (!wake_q_empty(head: &wake_q)) {
1135	/*
1136	* We want to minimize wait_lock hold time especially
1137	* when a large number of readers are to be woken up.
1138	*/
1139	raw_spin_unlock_irq(&sem->wait_lock);
1140	wake_up_q(head: &wake_q);
1141	raw_spin_lock_irq(&sem->wait_lock);
1142	}
1143	} else {
1144	atomic_long_or(RWSEM_FLAG_WAITERS, v: &sem->count);
1145	}
1146
1147	/ wait until we successfully acquire the lock /
1148	set_current_state(state);
1149	trace_contention_begin(lock: sem, LCB_F_WRITE);
1150
1151	for (;;) {
1152	if (rwsem_try_write_lock(sem, waiter: &waiter)) {
1153	/ rwsem_try_write_lock() implies ACQUIRE on success /
1154	break;
1155	}
1156
1157	raw_spin_unlock_irq(&sem->wait_lock);
1158
1159	if (signal_pending_state(state, current))
1160	goto out_nolock;
1161
1162	/*
1163	* After setting the handoff bit and failing to acquire
1164	* the lock, attempt to spin on owner to accelerate lock
1165	* transfer. If the previous owner is a on-cpu writer and it
1166	* has just released the lock, OWNER_NULL will be returned.
1167	* In this case, we attempt to acquire the lock again
1168	* without sleeping.
1169	*/
1170	if (waiter.handoff_set) {
1171	enum owner_state owner_state;
1172
1173	owner_state = rwsem_spin_on_owner(sem);
1174	if (owner_state == OWNER_NULL)
1175	goto trylock_again;
1176	}
1177
1178	schedule_preempt_disabled();
1179	lockevent_inc(rwsem_sleep_writer);
1180	set_current_state(state);
1181	trylock_again:
1182	raw_spin_lock_irq(&sem->wait_lock);
1183	}
1184	__set_current_state(TASK_RUNNING);
1185	raw_spin_unlock_irq(&sem->wait_lock);
1186	lockevent_inc(rwsem_wlock);
1187	trace_contention_end(lock: sem, ret: `0`);
1188	return sem;
1189
1190	out_nolock:
1191	__set_current_state(TASK_RUNNING);
1192	raw_spin_lock_irq(&sem->wait_lock);
1193	rwsem_del_wake_waiter(sem, waiter: &waiter, wake_q: &wake_q);
1194	lockevent_inc(rwsem_wlock_fail);
1195	trace_contention_end(lock: sem, ret: -EINTR);
1196	return ERR_PTR(error: -EINTR);
1197	}
1198
1199	/*
1200	* handle waking up a waiter on the semaphore
1201	* - up_read/up_write has decremented the active part of count if we come here
1202	*/
1203	static struct rw_semaphore rwsem_wake(struct* rw_semaphore *sem)
1204	{
1205	unsigned long flags;
1206	DEFINE_WAKE_Q(wake_q);
1207
1208	raw_spin_lock_irqsave(&sem->wait_lock, flags);
1209
1210	if (!list_empty(head: &sem->wait_list))
1211	rwsem_mark_wake(sem, wake_type: RWSEM_WAKE_ANY, wake_q: &wake_q);
1212
1213	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1214	wake_up_q(head: &wake_q);
1215
1216	return sem;
1217	}
1218
1219	/*
1220	* downgrade a write lock into a read lock
1221	* - caller incremented waiting part of count and discovered it still negative
1222	* - just wake up any readers at the front of the queue
1223	*/
1224	static struct rw_semaphore rwsem_downgrade_wake(struct* rw_semaphore *sem)
1225	{
1226	unsigned long flags;
1227	DEFINE_WAKE_Q(wake_q);
1228
1229	raw_spin_lock_irqsave(&sem->wait_lock, flags);
1230
1231	if (!list_empty(head: &sem->wait_list))
1232	rwsem_mark_wake(sem, wake_type: RWSEM_WAKE_READ_OWNED, wake_q: &wake_q);
1233
1234	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1235	wake_up_q(head: &wake_q);
1236
1237	return sem;
1238	}
1239
1240	/*
1241	* lock for reading
1242	*/
1243	static __always_inline int __down_read_common(struct rw_semaphore sem, int* state)
1244	{
1245	int ret = `0`;
1246	long count;
1247
1248	preempt_disable();
1249	if (!rwsem_read_trylock(sem, cntp: &count)) {
1250	if (IS_ERR(ptr: rwsem_down_read_slowpath(sem, count, state))) {
1251	ret = -EINTR;
1252	goto out;
1253	}
1254	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1255	}
1256	out:
1257	preempt_enable();
1258	return ret;
1259	}
1260
1261	static __always_inline void __down_read(struct rw_semaphore *sem)
1262	{
1263	__down_read_common(sem, TASK_UNINTERRUPTIBLE);
1264	}
1265
1266	static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
1267	{
1268	return __down_read_common(sem, TASK_INTERRUPTIBLE);
1269	}
1270
1271	static __always_inline int __down_read_killable(struct rw_semaphore *sem)
1272	{
1273	return __down_read_common(sem, TASK_KILLABLE);
1274	}
1275
1276	static inline int __down_read_trylock(struct rw_semaphore *sem)
1277	{
1278	int ret = `0`;
1279	long tmp;
1280
1281	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1282
1283	preempt_disable();
1284	tmp = atomic_long_read(v: &sem->count);
1285	while (!(tmp & RWSEM_READ_FAILED_MASK)) {
1286	if (atomic_long_try_cmpxchg_acquire(v: &sem->count, old: &tmp,
1287	new: tmp + RWSEM_READER_BIAS)) {
1288	rwsem_set_reader_owned(sem);
1289	ret = `1`;
1290	break;
1291	}
1292	}
1293	preempt_enable();
1294	return ret;
1295	}
1296
1297	/*
1298	* lock for writing
1299	*/
1300	static inline int __down_write_common(struct rw_semaphore sem, int* state)
1301	{
1302	int ret = `0`;
1303
1304	preempt_disable();
1305	if (unlikely(!rwsem_write_trylock(sem))) {
1306	if (IS_ERR(ptr: rwsem_down_write_slowpath(sem, state)))
1307	ret = -EINTR;
1308	}
1309	preempt_enable();
1310	return ret;
1311	}
1312
1313	static inline void __down_write(struct rw_semaphore *sem)
1314	{
1315	__down_write_common(sem, TASK_UNINTERRUPTIBLE);
1316	}
1317
1318	static inline int __down_write_killable(struct rw_semaphore *sem)
1319	{
1320	return __down_write_common(sem, TASK_KILLABLE);
1321	}
1322
1323	static inline int __down_write_trylock(struct rw_semaphore *sem)
1324	{
1325	int ret;
1326
1327	preempt_disable();
1328	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1329	ret = rwsem_write_trylock(sem);
1330	preempt_enable();
1331
1332	return ret;
1333	}
1334
1335	/*
1336	* unlock after reading
1337	*/
1338	static inline void __up_read(struct rw_semaphore *sem)
1339	{
1340	long tmp;
1341
1342	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1343	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1344
1345	preempt_disable();
1346	rwsem_clear_reader_owned(sem);
1347	tmp = atomic_long_add_return_release(i: -RWSEM_READER_BIAS, v: &sem->count);
1348	DEBUG_RWSEMS_WARN_ON(tmp < `0`, sem);
1349	if (unlikely((tmp & (RWSEM_LOCK_MASK\|RWSEM_FLAG_WAITERS)) ==
1350	RWSEM_FLAG_WAITERS)) {
1351	clear_nonspinnable(sem);
1352	rwsem_wake(sem);
1353	}
1354	preempt_enable();
1355	}
1356
1357	/*
1358	* unlock after writing
1359	*/
1360	static inline void __up_write(struct rw_semaphore *sem)
1361	{
1362	long tmp;
1363
1364	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1365	/*
1366	* sem->owner may differ from current if the ownership is transferred
1367	* to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
1368	*/
1369	DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
1370	!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
1371
1372	preempt_disable();
1373	rwsem_clear_owner(sem);
1374	tmp = atomic_long_fetch_add_release(i: -RWSEM_WRITER_LOCKED, v: &sem->count);
1375	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
1376	rwsem_wake(sem);
1377	preempt_enable();
1378	}
1379
1380	/*
1381	* downgrade write lock to read lock
1382	*/
1383	static inline void __downgrade_write(struct rw_semaphore *sem)
1384	{
1385	long tmp;
1386
1387	/*
1388	* When downgrading from exclusive to shared ownership,
1389	* anything inside the write-locked region cannot leak
1390	* into the read side. In contrast, anything in the
1391	* read-locked region is ok to be re-ordered into the
1392	* write side. As such, rely on RELEASE semantics.
1393	*/
1394	DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
1395	preempt_disable();
1396	tmp = atomic_long_fetch_add_release(
1397	i: -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, v: &sem->count);
1398	rwsem_set_reader_owned(sem);
1399	if (tmp & RWSEM_FLAG_WAITERS)
1400	rwsem_downgrade_wake(sem);
1401	preempt_enable();
1402	}
1403
1404	#else /* !CONFIG_PREEMPT_RT */
1405
1406	#define RT_MUTEX_BUILD_MUTEX
1407	#include "rtmutex.c"
1408
1409	#define rwbase_set_and_save_current_state(state) \
1410	set_current_state(state)
1411
1412	#define rwbase_restore_current_state() \
1413	__set_current_state(TASK_RUNNING)
1414
1415	#define rwbase_rtmutex_lock_state(rtm, state) \
1416	__rt_mutex_lock(rtm, state)
1417
1418	#define rwbase_rtmutex_slowlock_locked(rtm, state) \
1419	__rt_mutex_slowlock_locked(rtm, NULL, state)
1420
1421	#define rwbase_rtmutex_unlock(rtm) \
1422	__rt_mutex_unlock(rtm)
1423
1424	#define rwbase_rtmutex_trylock(rtm) \
1425	__rt_mutex_trylock(rtm)
1426
1427	#define rwbase_signal_pending_state(state, current) \
1428	signal_pending_state(state, current)
1429
1430	#define rwbase_pre_schedule() \
1431	rt_mutex_pre_schedule()
1432
1433	#define rwbase_schedule() \
1434	rt_mutex_schedule()
1435
1436	#define rwbase_post_schedule() \
1437	rt_mutex_post_schedule()
1438
1439	#include "rwbase_rt.c"
1440
1441	void __init_rwsem(struct rw_semaphore sem, const* char *name,
1442	struct lock_class_key *key)
1443	{
1444	init_rwbase_rt(&(sem)->rwbase);
1445
1446	#ifdef CONFIG_DEBUG_LOCK_ALLOC
1447	debug_check_no_locks_freed((void )sem, sizeof(sem));
1448	lockdep_init_map_wait(&sem->dep_map, name, key, `0`, LD_WAIT_SLEEP);
1449	#endif
1450	}
1451	EXPORT_SYMBOL(__init_rwsem);
1452
1453	static inline void __down_read(struct rw_semaphore *sem)
1454	{
1455	rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1456	}
1457
1458	static inline int __down_read_interruptible(struct rw_semaphore *sem)
1459	{
1460	return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
1461	}
1462
1463	static inline int __down_read_killable(struct rw_semaphore *sem)
1464	{
1465	return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
1466	}
1467
1468	static inline int __down_read_trylock(struct rw_semaphore *sem)
1469	{
1470	return rwbase_read_trylock(&sem->rwbase);
1471	}
1472
1473	static inline void __up_read(struct rw_semaphore *sem)
1474	{
1475	rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
1476	}
1477
1478	static inline void __sched __down_write(struct rw_semaphore *sem)
1479	{
1480	rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1481	}
1482
1483	static inline int __sched __down_write_killable(struct rw_semaphore *sem)
1484	{
1485	return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
1486	}
1487
1488	static inline int __down_write_trylock(struct rw_semaphore *sem)
1489	{
1490	return rwbase_write_trylock(&sem->rwbase);
1491	}
1492
1493	static inline void __up_write(struct rw_semaphore *sem)
1494	{
1495	rwbase_write_unlock(&sem->rwbase);
1496	}
1497
1498	static inline void __downgrade_write(struct rw_semaphore *sem)
1499	{
1500	rwbase_write_downgrade(&sem->rwbase);
1501	}
1502
/* Debug stubs for the common API */
#define DEBUG_RWSEMS_WARN_ON(c, sem)
1505
/* PREEMPT_RT stub: intentionally empty, no owner is recorded here. */
static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
					    struct task_struct *owner)
{
}
1510
1511	static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
1512	{
1513	int count = atomic_read(&sem->rwbase.readers);
1514
1515	return count < `0` && count != READER_BIAS;
1516	}
1517
1518	#endif /* CONFIG_PREEMPT_RT */
1519
1520	/*
1521	* lock for reading
1522	*/
1523	void __sched down_read(struct rw_semaphore *sem)
1524	{
1525	might_sleep();
1526	rwsem_acquire_read(&sem->dep_map, `0`, `0`, _RET_IP_);
1527
1528	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1529	}
1530	EXPORT_SYMBOL(down_read);
1531
1532	int __sched down_read_interruptible(struct rw_semaphore *sem)
1533	{
1534	might_sleep();
1535	rwsem_acquire_read(&sem->dep_map, `0`, `0`, _RET_IP_);
1536
1537	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
1538	rwsem_release(&sem->dep_map, _RET_IP_);
1539	return -EINTR;
1540	}
1541
1542	return `0`;
1543	}
1544	EXPORT_SYMBOL(down_read_interruptible);
1545
1546	int __sched down_read_killable(struct rw_semaphore *sem)
1547	{
1548	might_sleep();
1549	rwsem_acquire_read(&sem->dep_map, `0`, `0`, _RET_IP_);
1550
1551	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1552	rwsem_release(&sem->dep_map, _RET_IP_);
1553	return -EINTR;
1554	}
1555
1556	return `0`;
1557	}
1558	EXPORT_SYMBOL(down_read_killable);
1559
1560	/*
1561	* trylock for reading -- returns 1 if successful, 0 if contention
1562	*/
1563	int down_read_trylock(struct rw_semaphore *sem)
1564	{
1565	int ret = __down_read_trylock(sem);
1566
1567	if (ret == `1`)
1568	rwsem_acquire_read(&sem->dep_map, `0`, `1`, _RET_IP_);
1569	return ret;
1570	}
1571	EXPORT_SYMBOL(down_read_trylock);
1572
1573	/*
1574	* lock for writing
1575	*/
1576	void __sched down_write(struct rw_semaphore *sem)
1577	{
1578	might_sleep();
1579	rwsem_acquire(&sem->dep_map, `0`, `0`, _RET_IP_);
1580	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1581	}
1582	EXPORT_SYMBOL(down_write);
1583
1584	/*
1585	* lock for writing
1586	*/
1587	int __sched down_write_killable(struct rw_semaphore *sem)
1588	{
1589	might_sleep();
1590	rwsem_acquire(&sem->dep_map, `0`, `0`, _RET_IP_);
1591
1592	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1593	__down_write_killable)) {
1594	rwsem_release(&sem->dep_map, _RET_IP_);
1595	return -EINTR;
1596	}
1597
1598	return `0`;
1599	}
1600	EXPORT_SYMBOL(down_write_killable);
1601
1602	/*
1603	* trylock for writing -- returns 1 if successful, 0 if contention
1604	*/
1605	int down_write_trylock(struct rw_semaphore *sem)
1606	{
1607	int ret = __down_write_trylock(sem);
1608
1609	if (ret == `1`)
1610	rwsem_acquire(&sem->dep_map, `0`, `1`, _RET_IP_);
1611
1612	return ret;
1613	}
1614	EXPORT_SYMBOL(down_write_trylock);
1615
1616	/*
1617	* release a read lock
1618	*/
1619	void up_read(struct rw_semaphore *sem)
1620	{
1621	rwsem_release(&sem->dep_map, _RET_IP_);
1622	__up_read(sem);
1623	}
1624	EXPORT_SYMBOL(up_read);
1625
1626	/*
1627	* release a write lock
1628	*/
1629	void up_write(struct rw_semaphore *sem)
1630	{
1631	rwsem_release(&sem->dep_map, _RET_IP_);
1632	__up_write(sem);
1633	}
1634	EXPORT_SYMBOL(up_write);
1635
1636	/*
1637	* downgrade write lock to read lock
1638	*/
1639	void downgrade_write(struct rw_semaphore *sem)
1640	{
1641	lock_downgrade(lock: &sem->dep_map, _RET_IP_);
1642	__downgrade_write(sem);
1643	}
1644	EXPORT_SYMBOL(downgrade_write);
1645
1646	#ifdef CONFIG_DEBUG_LOCK_ALLOC
1647
1648	void down_read_nested(struct rw_semaphore sem, int* subclass)
1649	{
1650	might_sleep();
1651	rwsem_acquire_read(&sem->dep_map, subclass, `0`, _RET_IP_);
1652	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1653	}
1654	EXPORT_SYMBOL(down_read_nested);
1655
1656	int down_read_killable_nested(struct rw_semaphore sem, int* subclass)
1657	{
1658	might_sleep();
1659	rwsem_acquire_read(&sem->dep_map, subclass, `0`, _RET_IP_);
1660
1661	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1662	rwsem_release(&sem->dep_map, _RET_IP_);
1663	return -EINTR;
1664	}
1665
1666	return `0`;
1667	}
1668	EXPORT_SYMBOL(down_read_killable_nested);
1669
1670	void _down_write_nest_lock(struct rw_semaphore sem, struct* lockdep_map *nest)
1671	{
1672	might_sleep();
1673	rwsem_acquire_nest(&sem->dep_map, `0`, `0`, nest, _RET_IP_);
1674	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1675	}
1676	EXPORT_SYMBOL(_down_write_nest_lock);
1677
1678	void down_read_non_owner(struct rw_semaphore *sem)
1679	{
1680	might_sleep();
1681	__down_read(sem);
1682	/*
1683	* The owner value for a reader-owned lock is mostly for debugging
1684	* purpose only and is not critical to the correct functioning of
1685	* rwsem. So it is perfectly fine to set it in a preempt-enabled
1686	* context here.
1687	*/
1688	__rwsem_set_reader_owned(sem, NULL);
1689	}
1690	EXPORT_SYMBOL(down_read_non_owner);
1691
1692	void down_write_nested(struct rw_semaphore sem, int* subclass)
1693	{
1694	might_sleep();
1695	rwsem_acquire(&sem->dep_map, subclass, `0`, _RET_IP_);
1696	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1697	}
1698	EXPORT_SYMBOL(down_write_nested);
1699
1700	int __sched down_write_killable_nested(struct rw_semaphore sem, int* subclass)
1701	{
1702	might_sleep();
1703	rwsem_acquire(&sem->dep_map, subclass, `0`, _RET_IP_);
1704
1705	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1706	__down_write_killable)) {
1707	rwsem_release(&sem->dep_map, _RET_IP_);
1708	return -EINTR;
1709	}
1710
1711	return `0`;
1712	}
1713	EXPORT_SYMBOL(down_write_killable_nested);
1714
/*
 * release a read lock taken with down_read_non_owner()
 *
 * No lockdep release is done here; the unlock goes straight to __up_read().
 */
void up_read_non_owner(struct rw_semaphore *sem)
{
	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
	__up_read(sem);
}
EXPORT_SYMBOL(up_read_non_owner);
1721
1722	#endif
1723

source code of linux/kernel/locking/rwsem.c