qspinlock.c source code [linux/kernel/locking/qspinlock.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Queued spinlock
4	*
5	* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
6	* (C) Copyright 2013-2014,2018 Red Hat, Inc.
7	* (C) Copyright 2015 Intel Corp.
8	* (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
9	*
10	* Authors: Waiman Long <longman@redhat.com>
11	* Peter Zijlstra <peterz@infradead.org>
12	*/
13
14	#ifndef _GEN_PV_LOCK_SLOWPATH
15
16	#include <linux/smp.h>
17	#include <linux/bug.h>
18	#include <linux/cpumask.h>
19	#include <linux/percpu.h>
20	#include <linux/hardirq.h>
21	#include <linux/mutex.h>
22	#include <linux/prefetch.h>
23	#include <asm/byteorder.h>
24	#include <asm/qspinlock.h>
25	#include <trace/events/lock.h>
26
27	/*
28	* Include queued spinlock statistics code
29	*/
30	#include "qspinlock_stat.h"
31
32	/*
33	* The basic principle of a queue-based spinlock can best be understood
34	* by studying a classic queue-based spinlock implementation called the
35	* MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
36	* Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
37	* Scott") is available at
38	*
39	* https://bugzilla.kernel.org/show_bug.cgi?id=206115
40	*
41	* This queued spinlock implementation is based on the MCS lock, however to
42	* make it fit the 4 bytes we assume spinlock_t to be, and preserve its
43	* existing API, we must modify it somehow.
44	*
45	* In particular; where the traditional MCS lock consists of a tail pointer
46	* (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
47	* unlock the next pending (next->locked), we compress both these: {tail,
48	* next->locked} into a single u32 value.
49	*
50	* Since a spinlock disables recursion of its own context and there is a limit
51	* to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
52	* are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
53	* we can encode the tail by combining the 2-bit nesting level with the cpu
54	* number. With one byte for the lock value and 3 bytes for the tail, only a
55	* 32-bit word is now needed. Even though we only need 1 bit for the lock,
56	* we extend it to a full byte to achieve better performance for architectures
57	* that support atomic byte write.
58	*
59	* We also change the first spinner to spin on the lock bit instead of its
60	* node; whereby avoiding the need to carry a node from lock to unlock, and
61	* preserving existing lock API. This also makes the unlock code simpler and
62	* faster.
63	*
64	* N.B. The current implementation only supports architectures that allow
65	* atomic operations on smaller 8-bit and 16-bit data types.
66	*
67	*/
68
69	#include "mcs_spinlock.h"
70	#define MAX_NODES 4
71
72	/*
73	* On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
74	* size and four of them will fit nicely in one 64-byte cacheline. For
75	* pvqspinlock, however, we need more space for extra data. To accommodate
76	* that, we insert two more long words to pad it up to 32 bytes. IOW, only
77	* two of them can fit in a cacheline in this case. That is OK as it is rare
78	* to have more than 2 levels of slowpath nesting in actual use. We don't
79	* want to penalize pvqspinlocks to optimize for a rare case in native
80	* qspinlocks.
81	*/
82	struct qnode {
83	struct mcs_spinlock mcs;
84	#ifdef CONFIG_PARAVIRT_SPINLOCKS
85	long reserved[`2`];
86	#endif
87	};
88
89	/*
90	* The pending bit spinning loop count.
91	* This heuristic is used to limit the number of lockword accesses
92	* made by atomic_cond_read_relaxed when waiting for the lock to
93	* transition out of the "== _Q_PENDING_VAL" state. We don't spin
94	* indefinitely because there's no guarantee that we'll make forward
95	* progress.
96	*/
97	#ifndef _Q_PENDING_LOOPS
98	#define _Q_PENDING_LOOPS 1
99	#endif
100
101	/*
102	* Per-CPU queue node structures; we can never have more than 4 nested
103	* contexts: task, softirq, hardirq, nmi.
104	*
105	* Exactly fits one 64-byte cacheline on a 64-bit architecture.
106	*
107	* PV doubles the storage and uses the second cacheline for PV state.
108	*/
109	static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
110
111	/*
112	* We must be able to distinguish between no-tail and the tail at 0:0,
113	* therefore increment the cpu number by one.
114	*/
115
116	static inline __pure u32 encode_tail(int cpu, int idx)
117	{
118	u32 tail;
119
120	tail = (cpu + `1`) << _Q_TAIL_CPU_OFFSET;
121	tail \|= idx << _Q_TAIL_IDX_OFFSET; / assume < 4 /
122
123	return tail;
124	}
125
126	static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
127	{
128	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - `1`;
129	int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
130
131	return per_cpu_ptr(&qnodes[idx].mcs, cpu);
132	}
133
134	static inline __pure
135	struct mcs_spinlock grab_mcs_node(struct* mcs_spinlock base, int* idx)
136	{
137	return &((struct qnode *)base + idx)->mcs;
138	}
139
140	#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK \| _Q_PENDING_MASK)
141
142	#if _Q_PENDING_BITS == 8
143	/**
144	* clear_pending - clear the pending bit.
145	* @lock: Pointer to queued spinlock structure
146	*
147	* ,1, -> ,0,
148	*/
149	static __always_inline void clear_pending(struct qspinlock *lock)
150	{
151	WRITE_ONCE(lock->pending, `0`);
152	}
153
154	/**
155	* clear_pending_set_locked - take ownership and clear the pending bit.
156	* @lock: Pointer to queued spinlock structure
157	*
158	* ,1,0 -> ,0,1
159	*
160	* Lock stealing is not allowed if this function is used.
161	*/
162	static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
163	{
164	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
165	}
166
167	/*
168	* xchg_tail - Put in the new queue tail code word & retrieve previous one
169	* @lock : Pointer to queued spinlock structure
170	* @tail : The new queue tail code word
171	* Return: The previous queue tail code word
172	*
173	* xchg(lock, tail), which heads an address dependency
174	*
175	* p,, -> n,, ; prev = xchg(lock, node)
176	*/
177	static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
178	{
179	/*
180	* We can use relaxed semantics since the caller ensures that the
181	* MCS node is properly initialized before updating the tail.
182	*/
183	return (u32)xchg_relaxed(&lock->tail,
184	tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
185	}
186
187	#else /* _Q_PENDING_BITS == 8 */
188
189	/**
190	* clear_pending - clear the pending bit.
191	* @lock: Pointer to queued spinlock structure
192	*
193	* ,1, -> ,0,
194	*/
195	static __always_inline void clear_pending(struct qspinlock *lock)
196	{
197	atomic_andnot(_Q_PENDING_VAL, &lock->val);
198	}
199
200	/**
201	* clear_pending_set_locked - take ownership and clear the pending bit.
202	* @lock: Pointer to queued spinlock structure
203	*
204	* ,1,0 -> ,0,1
205	*/
206	static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
207	{
208	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
209	}
210
211	/**
212	* xchg_tail - Put in the new queue tail code word & retrieve previous one
213	* @lock : Pointer to queued spinlock structure
214	* @tail : The new queue tail code word
215	* Return: The previous queue tail code word
216	*
217	* xchg(lock, tail)
218	*
219	* p,, -> n,, ; prev = xchg(lock, node)
220	*/
221	static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
222	{
223	u32 old, new, val = atomic_read(&lock->val);
224
225	for (;;) {
226	new = (val & _Q_LOCKED_PENDING_MASK) \| tail;
227	/*
228	* We can use relaxed semantics since the caller ensures that
229	* the MCS node is properly initialized before updating the
230	* tail.
231	*/
232	old = atomic_cmpxchg_relaxed(&lock->val, val, new);
233	if (old == val)
234	break;
235
236	val = old;
237	}
238	return old;
239	}
240	#endif /* _Q_PENDING_BITS == 8 */
241
242	/**
243	* queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
244	* @lock : Pointer to queued spinlock structure
245	* Return: The previous lock value
246	*
247	* ,,* -> ,1,
248	*/
249	#ifndef queued_fetch_set_pending_acquire
250	static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
251	{
252	return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
253	}
254	#endif
255
256	/**
257	* set_locked - Set the lock bit and own the lock
258	* @lock: Pointer to queued spinlock structure
259	*
260	* ,,0 -> *,0,1
261	*/
262	static __always_inline void set_locked(struct qspinlock *lock)
263	{
264	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
265	}
266
267
268	/*
269	* Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
270	* all the PV callbacks.
271	*/
272
273	static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
274	static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
275	struct mcs_spinlock *prev) { }
276	static __always_inline void __pv_kick_node(struct qspinlock *lock,
277	struct mcs_spinlock *node) { }
278	static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
279	struct mcs_spinlock *node)
280	{ return `0`; }
281
282	#define pv_enabled() false
283
284	#define pv_init_node __pv_init_node
285	#define pv_wait_node __pv_wait_node
286	#define pv_kick_node __pv_kick_node
287	#define pv_wait_head_or_lock __pv_wait_head_or_lock
288
289	#ifdef CONFIG_PARAVIRT_SPINLOCKS
290	#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
291	#endif
292
293	#endif /* _GEN_PV_LOCK_SLOWPATH */
294
295	/**
296	* queued_spin_lock_slowpath - acquire the queued spinlock
297	* @lock: Pointer to queued spinlock structure
298	* @val: Current value of the queued spinlock 32-bit word
299	*
300	* (queue tail, pending bit, lock value)
301	*
302	* fast : slow : unlock
303	* : :
304	* uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (,,0)
305	* : \| ^--------.------. / :
306	* : v \ \ \| :
307	* pending : (0,1,1) +--> (0,1,0) \ \| :
308	* : \| ^--' \| \| :
309	* : v \| \| :
310	* uncontended : (n,x,y) +--> (n,0,0) --' \| :
311	* queue : \| ^--' \| :
312	* : v \| :
313	* contended : (,x,y) +--> (,0,0) ---> (*,0,1) -' :
314	* queue : ^--' :
315	*/
316	void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
317	{
318	struct mcs_spinlock prev, next, *node;
319	u32 old, tail;
320	int idx;
321
322	BUILD_BUG_ON(CONFIG_NR_CPUS >= (`1U` << _Q_TAIL_CPU_BITS));
323
324	if (pv_enabled())
325	goto pv_queue;
326
327	if (virt_spin_lock(lock))
328	return;
329
330	/*
331	* Wait for in-progress pending->locked hand-overs with a bounded
332	* number of spins so that we guarantee forward progress.
333	*
334	* 0,1,0 -> 0,0,1
335	*/
336	if (val == _Q_PENDING_VAL) {
337	int cnt = _Q_PENDING_LOOPS;
338	val = atomic_cond_read_relaxed(&lock->val,
339	(VAL != _Q_PENDING_VAL) \|\| !cnt--);
340	}
341
342	/*
343	* If we observe any contention; queue.
344	*/
345	if (val & ~_Q_LOCKED_MASK)
346	goto queue;
347
348	/*
349	* trylock \|\| pending
350	*
351	* 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
352	*/
353	val = queued_fetch_set_pending_acquire(lock);
354
355	/*
356	* If we observe contention, there is a concurrent locker.
357	*
358	* Undo and queue; our setting of PENDING might have made the
359	* n,0,0 -> 0,0,0 transition fail and it will now be waiting
360	* on @next to become !NULL.
361	*/
362	if (unlikely(val & ~_Q_LOCKED_MASK)) {
363
364	/ Undo PENDING if we set it. /
365	if (!(val & _Q_PENDING_MASK))
366	clear_pending(lock);
367
368	goto queue;
369	}
370
371	/*
372	* We're pending, wait for the owner to go away.
373	*
374	* 0,1,1 -> *,1,0
375	*
376	* this wait loop must be a load-acquire such that we match the
377	* store-release that clears the locked bit and create lock
378	* sequentiality; this is because not all
379	* clear_pending_set_locked() implementations imply full
380	* barriers.
381	*/
382	if (val & _Q_LOCKED_MASK)
383	smp_cond_load_acquire(&lock->locked, !VAL);
384
385	/*
386	* take ownership and clear the pending bit.
387	*
388	* 0,1,0 -> 0,0,1
389	*/
390	clear_pending_set_locked(lock);
391	lockevent_inc(lock_pending);
392	return;
393
394	/*
395	* End of pending bit optimistic spinning and beginning of MCS
396	* queuing.
397	*/
398	queue:
399	lockevent_inc(lock_slowpath);
400	pv_queue:
401	node = this_cpu_ptr(&qnodes[`0`].mcs);
402	idx = node->count++;
403	tail = encode_tail(smp_processor_id(), idx);
404
405	trace_contention_begin(lock, LCB_F_SPIN);
406
407	/*
408	* 4 nodes are allocated based on the assumption that there will
409	* not be nested NMIs taking spinlocks. That may not be true in
410	* some architectures even though the chance of needing more than
411	* 4 nodes will still be extremely unlikely. When that happens,
412	* we fall back to spinning on the lock directly without using
413	* any MCS node. This is not the most elegant solution, but is
414	* simple enough.
415	*/
416	if (unlikely(idx >= MAX_NODES)) {
417	lockevent_inc(lock_no_node);
418	while (!queued_spin_trylock(lock))
419	cpu_relax();
420	goto release;
421	}
422
423	node = grab_mcs_node(base: node, idx);
424
425	/*
426	* Keep counts of non-zero index values:
427	*/
428	lockevent_cond_inc(lock_use_node2 + idx - `1`, idx);
429
430	/*
431	* Ensure that we increment the head node->count before initialising
432	* the actual node. If the compiler is kind enough to reorder these
433	* stores, then an IRQ could overwrite our assignments.
434	*/
435	barrier();
436
437	node->locked = `0`;
438	node->next = NULL;
439	pv_init_node(node);
440
441	/*
442	* We touched a (possibly) cold cacheline in the per-cpu queue node;
443	* attempt the trylock once more in the hope someone let go while we
444	* weren't watching.
445	*/
446	if (queued_spin_trylock(lock))
447	goto release;
448
449	/*
450	* Ensure that the initialisation of @node is complete before we
451	* publish the updated tail via xchg_tail() and potentially link
452	* @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
453	*/
454	smp_wmb();
455
456	/*
457	* Publish the updated tail.
458	* We have already touched the queueing cacheline; don't bother with
459	* pending stuff.
460	*
461	* p,, -> n,,
462	*/
463	old = xchg_tail(lock, tail);
464	next = NULL;
465
466	/*
467	* if there was a previous node; link it and wait until reaching the
468	* head of the waitqueue.
469	*/
470	if (old & _Q_TAIL_MASK) {
471	prev = decode_tail(tail: old);
472
473	/ Link @node into the waitqueue. /
474	WRITE_ONCE(prev->next, node);
475
476	pv_wait_node(node, prev);
477	arch_mcs_spin_lock_contended(&node->locked);
478
479	/*
480	* While waiting for the MCS lock, the next pointer may have
481	* been set by another lock waiter. We optimistically load
482	* the next pointer & prefetch the cacheline for writing
483	* to reduce latency in the upcoming MCS unlock operation.
484	*/
485	next = READ_ONCE(node->next);
486	if (next)
487	prefetchw(x: next);
488	}
489
490	/*
491	* we're at the head of the waitqueue, wait for the owner & pending to
492	* go away.
493	*
494	* ,x,y -> ,0,0
495	*
496	* this wait loop must use a load-acquire such that we match the
497	* store-release that clears the locked bit and create lock
498	* sequentiality; this is because the set_locked() function below
499	* does not imply a full barrier.
500	*
501	* The PV pv_wait_head_or_lock function, if active, will acquire
502	* the lock and return a non-zero value. So we have to skip the
503	* atomic_cond_read_acquire() call. As the next PV queue head hasn't
504	* been designated yet, there is no way for the locked value to become
505	* _Q_SLOW_VAL. So both the set_locked() and the
506	* atomic_cmpxchg_relaxed() calls will be safe.
507	*
508	* If PV isn't active, 0 will be returned instead.
509	*
510	*/
511	if ((val = pv_wait_head_or_lock(lock, node)))
512	goto locked;
513
514	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
515
516	locked:
517	/*
518	* claim the lock:
519	*
520	* n,0,0 -> 0,0,1 : lock, uncontended
521	* ,,0 -> ,,1 : lock, contended
522	*
523	* If the queue head is the only one in the queue (lock value == tail)
524	* and nobody is pending, clear the tail code and grab the lock.
525	* Otherwise, we only need to grab the lock.
526	*/
527
528	/*
529	* In the PV case we might already have _Q_LOCKED_VAL set, because
530	* of lock stealing; therefore we must also allow:
531	*
532	* n,0,1 -> 0,0,1
533	*
534	* Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
535	* above wait condition, therefore any concurrent setting of
536	* PENDING will make the uncontended transition fail.
537	*/
538	if ((val & _Q_TAIL_MASK) == tail) {
539	if (atomic_try_cmpxchg_relaxed(v: &lock->val, old: &val, _Q_LOCKED_VAL))
540	goto release; / No contention /
541	}
542
543	/*
544	* Either somebody is queued behind us or _Q_PENDING_VAL got set
545	* which will then detect the remaining tail and queue behind us
546	* ensuring we'll see a @next.
547	*/
548	set_locked(lock);
549
550	/*
551	* contended path; wait for next if not observed yet, release.
552	*/
553	if (!next)
554	next = smp_cond_load_relaxed(&node->next, (VAL));
555
556	arch_mcs_spin_unlock_contended(&next->locked);
557	pv_kick_node(lock, node: next);
558
559	release:
560	trace_contention_end(lock, ret: `0`);
561
562	/*
563	* release the node
564	*/
565	__this_cpu_dec(qnodes[`0`].mcs.count);
566	}
567	EXPORT_SYMBOL(queued_spin_lock_slowpath);
568
569	/*
570	* Generate the paravirt code for queued_spin_unlock_slowpath().
571	*/
572	#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
573	#define _GEN_PV_LOCK_SLOWPATH
574
575	#undef pv_enabled
576	#define pv_enabled() true
577
578	#undef pv_init_node
579	#undef pv_wait_node
580	#undef pv_kick_node
581	#undef pv_wait_head_or_lock
582
583	#undef queued_spin_lock_slowpath
584	#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
585
586	#include "qspinlock_paravirt.h"
587	#include "qspinlock.c"
588
589	bool nopvspin __initdata;
590	static __init int parse_nopvspin(char *arg)
591	{
592	nopvspin = true;
593	return `0`;
594	}
595	early_param("nopvspin", parse_nopvspin);
596	#endif
597

source code of linux/kernel/locking/qspinlock.c