1 | /* SPDX-License-Identifier: GPL-2.0+ */ |
2 | /* |
3 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) |
4 | * Internal non-public definitions that provide either classic |
5 | * or preemptible semantics. |
6 | * |
7 | * Copyright Red Hat, 2009 |
8 | * Copyright IBM Corporation, 2009 |
9 | * |
10 | * Author: Ingo Molnar <mingo@elte.hu> |
11 | * Paul E. McKenney <paulmck@linux.ibm.com> |
12 | */ |
13 | |
14 | #include "../locking/rtmutex_common.h" |
15 | |
16 | static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) |
17 | { |
18 | /* |
19 | * In order to read the offloaded state of an rdp in a safe |
	 * and stable way, and to prevent its value from being changed
21 | * under us, we must either hold the barrier mutex, the cpu |
22 | * hotplug lock (read or write) or the nocb lock. Local |
23 | * non-preemptible reads are also safe. NOCB kthreads and |
24 | * timers have their own means of synchronization against the |
25 | * offloaded state updaters. |
26 | */ |
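	/*
	 * For example (editorial note, not exhaustive): holding
	 * rcu_state.barrier_mutex as rcu_barrier() does, or running
	 * with preemption disabled on rdp's own CPU, satisfies the
	 * assertion below.
	 */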
27 | RCU_LOCKDEP_WARN( |
28 | !(lockdep_is_held(&rcu_state.barrier_mutex) || |
29 | (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || |
30 | rcu_lockdep_is_held_nocb(rdp) || |
31 | (rdp == this_cpu_ptr(&rcu_data) && |
32 | !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || |
33 | rcu_current_is_nocb_kthread(rdp)), |
34 | "Unsafe read of RCU_NOCB offloaded state" |
35 | ); |
36 | |
	return rcu_segcblist_is_offloaded(&rdp->cblist);
38 | } |
39 | |
40 | /* |
41 | * Check the RCU kernel configuration parameters and print informative |
42 | * messages about anything out of the ordinary. |
43 | */ |
44 | static void __init rcu_bootup_announce_oddness(void) |
45 | { |
46 | if (IS_ENABLED(CONFIG_RCU_TRACE)) |
47 | pr_info("\tRCU event tracing is enabled.\n" ); |
48 | if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || |
49 | (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) |
50 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n" , |
51 | RCU_FANOUT); |
52 | if (rcu_fanout_exact) |
53 | pr_info("\tHierarchical RCU autobalancing is disabled.\n" ); |
54 | if (IS_ENABLED(CONFIG_PROVE_RCU)) |
55 | pr_info("\tRCU lockdep checking is enabled.\n" ); |
56 | if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) |
57 | pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n" ); |
58 | if (RCU_NUM_LVLS >= 4) |
59 | pr_info("\tFour(or more)-level hierarchy is enabled.\n" ); |
60 | if (RCU_FANOUT_LEAF != 16) |
61 | pr_info("\tBuild-time adjustment of leaf fanout to %d.\n" , |
62 | RCU_FANOUT_LEAF); |
63 | if (rcu_fanout_leaf != RCU_FANOUT_LEAF) |
64 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n" , |
65 | rcu_fanout_leaf); |
66 | if (nr_cpu_ids != NR_CPUS) |
67 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n" , NR_CPUS, nr_cpu_ids); |
68 | #ifdef CONFIG_RCU_BOOST |
69 | pr_info("\tRCU priority boosting: priority %d delay %d ms.\n" , |
70 | kthread_prio, CONFIG_RCU_BOOST_DELAY); |
71 | #endif |
72 | if (blimit != DEFAULT_RCU_BLIMIT) |
73 | pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n" , blimit); |
74 | if (qhimark != DEFAULT_RCU_QHIMARK) |
75 | pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n" , qhimark); |
76 | if (qlowmark != DEFAULT_RCU_QLOMARK) |
77 | pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n" , qlowmark); |
78 | if (qovld != DEFAULT_RCU_QOVLD) |
79 | pr_info("\tBoot-time adjustment of callback overload level to %ld.\n" , qovld); |
80 | if (jiffies_till_first_fqs != ULONG_MAX) |
81 | pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n" , jiffies_till_first_fqs); |
82 | if (jiffies_till_next_fqs != ULONG_MAX) |
83 | pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n" , jiffies_till_next_fqs); |
84 | if (jiffies_till_sched_qs != ULONG_MAX) |
85 | pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n" , jiffies_till_sched_qs); |
86 | if (rcu_kick_kthreads) |
87 | pr_info("\tKick kthreads if too-long grace period.\n" ); |
88 | if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) |
89 | pr_info("\tRCU callback double-/use-after-free debug is enabled.\n" ); |
90 | if (gp_preinit_delay) |
91 | pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n" , gp_preinit_delay); |
92 | if (gp_init_delay) |
93 | pr_info("\tRCU debug GP init slowdown %d jiffies.\n" , gp_init_delay); |
94 | if (gp_cleanup_delay) |
95 | pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n" , gp_cleanup_delay); |
96 | if (!use_softirq) |
97 | pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n" ); |
98 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) |
99 | pr_info("\tRCU debug extended QS entry/exit.\n" ); |
100 | rcupdate_announce_bootup_oddness(); |
101 | } |
102 | |
103 | #ifdef CONFIG_PREEMPT_RCU |
104 | |
105 | static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake); |
106 | static void rcu_read_unlock_special(struct task_struct *t); |
107 | |
108 | /* |
109 | * Tell them what RCU they are running. |
110 | */ |
111 | static void __init rcu_bootup_announce(void) |
112 | { |
113 | pr_info("Preemptible hierarchical RCU implementation.\n" ); |
114 | rcu_bootup_announce_oddness(); |
115 | } |
116 | |
117 | /* Flags for rcu_preempt_ctxt_queue() decision table. */ |
118 | #define RCU_GP_TASKS 0x8 |
119 | #define RCU_EXP_TASKS 0x4 |
120 | #define RCU_GP_BLKD 0x2 |
121 | #define RCU_EXP_BLKD 0x1 |
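
/*
 * Worked example (editorial): blkd_state == RCU_GP_TASKS + RCU_EXP_BLKD
 * (0x9) means that the normal grace period already has blocked tasks
 * queued and that the newly preempted task blocks the expedited grace
 * period, so rcu_preempt_ctxt_queue() adds it at the tail of ->blkd_tasks.
 */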
122 | |
123 | /* |
124 | * Queues a task preempted within an RCU-preempt read-side critical |
125 | * section into the appropriate location within the ->blkd_tasks list, |
126 | * depending on the states of any ongoing normal and expedited grace |
127 | * periods. The ->gp_tasks pointer indicates which element the normal |
128 | * grace period is waiting on (NULL if none), and the ->exp_tasks pointer |
129 | * indicates which element the expedited grace period is waiting on (again, |
130 | * NULL if none). If a grace period is waiting on a given element in the |
131 | * ->blkd_tasks list, it also waits on all subsequent elements. Thus, |
132 | * adding a task to the tail of the list blocks any grace period that is |
133 | * already waiting on one of the elements. In contrast, adding a task |
134 | * to the head of the list won't block any grace period that is already |
135 | * waiting on one of the elements. |
136 | * |
137 | * This queuing is imprecise, and can sometimes make an ongoing grace |
138 | * period wait for a task that is not strictly speaking blocking it. |
139 | * Given the choice, we needlessly block a normal grace period rather than |
140 | * blocking an expedited grace period. |
141 | * |
142 | * Note that an endless sequence of expedited grace periods still cannot |
143 | * indefinitely postpone a normal grace period. Eventually, all of the |
144 | * fixed number of preempted tasks blocking the normal grace period that are |
145 | * not also blocking the expedited grace period will resume and complete |
146 | * their RCU read-side critical sections. At that point, the ->gp_tasks |
147 | * pointer will equal the ->exp_tasks pointer, at which point the end of |
148 | * the corresponding expedited grace period will also be the end of the |
149 | * normal grace period. |
150 | */ |
151 | static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) |
	__releases(rnp->lock) /* But leaves interrupts disabled. */
153 | { |
154 | int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + |
155 | (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + |
156 | (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) + |
157 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); |
158 | struct task_struct *t = current; |
159 | |
160 | raw_lockdep_assert_held_rcu_node(rnp); |
161 | WARN_ON_ONCE(rdp->mynode != rnp); |
162 | WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); |
163 | /* RCU better not be waiting on newly onlined CPUs! */ |
164 | WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask & |
165 | rdp->grpmask); |
166 | |
167 | /* |
168 | * Decide where to queue the newly blocked task. In theory, |
169 | * this could be an if-statement. In practice, when I tried |
170 | * that, it was quite messy. |
171 | */ |
172 | switch (blkd_state) { |
173 | case 0: |
174 | case RCU_EXP_TASKS: |
175 | case RCU_EXP_TASKS + RCU_GP_BLKD: |
176 | case RCU_GP_TASKS: |
177 | case RCU_GP_TASKS + RCU_EXP_TASKS: |
178 | |
179 | /* |
180 | * Blocking neither GP, or first task blocking the normal |
181 | * GP but not blocking the already-waiting expedited GP. |
182 | * Queue at the head of the list to avoid unnecessarily |
183 | * blocking the already-waiting GPs. |
184 | */ |
		list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
186 | break; |
187 | |
188 | case RCU_EXP_BLKD: |
189 | case RCU_GP_BLKD: |
190 | case RCU_GP_BLKD + RCU_EXP_BLKD: |
191 | case RCU_GP_TASKS + RCU_EXP_BLKD: |
192 | case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: |
193 | case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: |
194 | |
195 | /* |
196 | * First task arriving that blocks either GP, or first task |
197 | * arriving that blocks the expedited GP (with the normal |
198 | * GP already waiting), or a task arriving that blocks |
199 | * both GPs with both GPs already waiting. Queue at the |
200 | * tail of the list to avoid any GP waiting on any of the |
201 | * already queued tasks that are not blocking it. |
202 | */ |
		list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
204 | break; |
205 | |
206 | case RCU_EXP_TASKS + RCU_EXP_BLKD: |
207 | case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: |
208 | case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: |
209 | |
210 | /* |
211 | * Second or subsequent task blocking the expedited GP. |
212 | * The task either does not block the normal GP, or is the |
213 | * first task blocking the normal GP. Queue just after |
214 | * the first task blocking the expedited GP. |
215 | */ |
		list_add(&t->rcu_node_entry, rnp->exp_tasks);
217 | break; |
218 | |
219 | case RCU_GP_TASKS + RCU_GP_BLKD: |
220 | case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: |
221 | |
222 | /* |
223 | * Second or subsequent task blocking the normal GP. |
224 | * The task does not block the expedited GP. Queue just |
225 | * after the first task blocking the normal GP. |
226 | */ |
		list_add(&t->rcu_node_entry, rnp->gp_tasks);
228 | break; |
229 | |
230 | default: |
231 | |
232 | /* Yet another exercise in excessive paranoia. */ |
233 | WARN_ON_ONCE(1); |
234 | break; |
235 | } |
236 | |
237 | /* |
238 | * We have now queued the task. If it was the first one to |
239 | * block either grace period, update the ->gp_tasks and/or |
240 | * ->exp_tasks pointers, respectively, to reference the newly |
241 | * blocked tasks. |
242 | */ |
243 | if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { |
244 | WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry); |
245 | WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); |
246 | } |
247 | if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) |
248 | WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); |
249 | WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != |
250 | !(rnp->qsmask & rdp->grpmask)); |
251 | WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != |
252 | !(rnp->expmask & rdp->grpmask)); |
253 | raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ |
254 | |
255 | /* |
256 | * Report the quiescent state for the expedited GP. This expedited |
257 | * GP should not be able to end until we report, so there should be |
258 | * no need to check for a subsequent expedited GP. (Though we are |
259 | * still in a quiescent state in any case.) |
260 | * |
261 | * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change. |
262 | */ |
263 | if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) |
264 | rcu_report_exp_rdp(rdp); |
265 | else |
266 | WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); |
267 | } |
268 | |
269 | /* |
270 | * Record a preemptible-RCU quiescent state for the specified CPU. |
271 | * Note that this does not necessarily mean that the task currently running |
272 | * on the CPU is in a quiescent state: Instead, it means that the current |
273 | * grace period need not wait on any RCU read-side critical section that |
274 | * starts later on this CPU. It also means that if the current task is |
275 | * in an RCU read-side critical section, it has already added itself to |
276 | * some leaf rcu_node structure's ->blkd_tasks list. In addition to the |
277 | * current task, there might be any number of other tasks blocked while |
278 | * in an RCU read-side critical section. |
279 | * |
280 | * Unlike non-preemptible-RCU, quiescent state reports for expedited |
281 | * grace periods are handled separately via deferred quiescent states |
282 | * and context switch events. |
283 | * |
284 | * Callers to this function must disable preemption. |
285 | */ |
286 | static void rcu_qs(void) |
287 | { |
288 | RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n" ); |
289 | if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) { |
290 | trace_rcu_grace_period(TPS("rcu_preempt" ), |
291 | __this_cpu_read(rcu_data.gp_seq), |
292 | TPS("cpuqs" )); |
293 | __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); |
294 | barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ |
295 | WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); |
296 | } |
297 | } |
298 | |
299 | /* |
300 | * We have entered the scheduler, and the current task might soon be |
301 | * context-switched away from. If this task is in an RCU read-side |
302 | * critical section, we will no longer be able to rely on the CPU to |
303 | * record that fact, so we enqueue the task on the blkd_tasks list. |
304 | * The task will dequeue itself when it exits the outermost enclosing |
305 | * RCU read-side critical section. Therefore, the current grace period |
306 | * cannot be permitted to complete until the blkd_tasks list entries |
307 | * predating the current grace period drain, in other words, until |
308 | * rnp->gp_tasks becomes NULL. |
309 | * |
310 | * Caller must disable interrupts. |
311 | */ |
312 | void rcu_note_context_switch(bool preempt) |
313 | { |
314 | struct task_struct *t = current; |
315 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); |
316 | struct rcu_node *rnp; |
317 | |
318 | trace_rcu_utilization(TPS("Start context switch" )); |
319 | lockdep_assert_irqs_disabled(); |
320 | WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!" ); |
321 | if (rcu_preempt_depth() > 0 && |
322 | !t->rcu_read_unlock_special.b.blocked) { |
323 | |
324 | /* Possibly blocking in an RCU read-side critical section. */ |
325 | rnp = rdp->mynode; |
326 | raw_spin_lock_rcu_node(rnp); |
327 | t->rcu_read_unlock_special.b.blocked = true; |
328 | t->rcu_blocked_node = rnp; |
329 | |
330 | /* |
331 | * Verify the CPU's sanity, trace the preemption, and |
332 | * then queue the task as required based on the states |
333 | * of any ongoing and expedited grace periods. |
334 | */ |
335 | WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp)); |
336 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
		trace_rcu_preempt_task(rcu_state.name,
				       t->pid,
				       (rnp->qsmask & rdp->grpmask)
				       ? rnp->gp_seq
				       : rcu_seq_snap(&rnp->gp_seq));
342 | rcu_preempt_ctxt_queue(rnp, rdp); |
343 | } else { |
344 | rcu_preempt_deferred_qs(t); |
345 | } |
346 | |
347 | /* |
348 | * Either we were not in an RCU read-side critical section to |
349 | * begin with, or we have now recorded that critical section |
350 | * globally. Either way, we can now note a quiescent state |
351 | * for this CPU. Again, if we were in an RCU read-side critical |
352 | * section, and if that critical section was blocking the current |
353 | * grace period, then the fact that the task has been enqueued |
354 | * means that we continue to block the current grace period. |
355 | */ |
356 | rcu_qs(); |
357 | if (rdp->cpu_no_qs.b.exp) |
358 | rcu_report_exp_rdp(rdp); |
359 | rcu_tasks_qs(current, preempt); |
360 | trace_rcu_utilization(TPS("End context switch" )); |
361 | } |
362 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
363 | |
364 | /* |
365 | * Check for preempted RCU readers blocking the current grace period |
366 | * for the specified rcu_node structure. If the caller needs a reliable |
367 | * answer, it must hold the rcu_node's ->lock. |
368 | */ |
369 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
370 | { |
371 | return READ_ONCE(rnp->gp_tasks) != NULL; |
372 | } |
373 | |
374 | /* limit value for ->rcu_read_lock_nesting. */ |
375 | #define RCU_NEST_PMAX (INT_MAX / 2) |
376 | |
377 | static void rcu_preempt_read_enter(void) |
378 | { |
379 | WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1); |
380 | } |
381 | |
382 | static int rcu_preempt_read_exit(void) |
383 | { |
384 | int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1; |
385 | |
386 | WRITE_ONCE(current->rcu_read_lock_nesting, ret); |
387 | return ret; |
388 | } |
389 | |
390 | static void rcu_preempt_depth_set(int val) |
391 | { |
392 | WRITE_ONCE(current->rcu_read_lock_nesting, val); |
393 | } |
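
/*
 * Editorial note (assumption, not from the original comments): these
 * helpers touch only current->rcu_read_lock_nesting, but they use
 * READ_ONCE()/WRITE_ONCE() so that lockless readers such as
 * rcu_preempt_depth() invoked from interrupt context see untorn values.
 */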
394 | |
395 | /* |
396 | * Preemptible RCU implementation for rcu_read_lock(). |
397 | * Just increment ->rcu_read_lock_nesting, shared state will be updated |
398 | * if we block. |
399 | */ |
400 | void __rcu_read_lock(void) |
401 | { |
402 | rcu_preempt_read_enter(); |
403 | if (IS_ENABLED(CONFIG_PROVE_LOCKING)) |
404 | WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); |
405 | if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) |
406 | WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); |
407 | barrier(); /* critical section after entry code. */ |
408 | } |
409 | EXPORT_SYMBOL_GPL(__rcu_read_lock); |
410 | |
411 | /* |
412 | * Preemptible RCU implementation for rcu_read_unlock(). |
413 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost |
414 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then |
415 | * invoke rcu_read_unlock_special() to clean up after a context switch |
416 | * in an RCU read-side critical section and other special cases. |
417 | */ |
418 | void __rcu_read_unlock(void) |
419 | { |
420 | struct task_struct *t = current; |
421 | |
422 | barrier(); // critical section before exit code. |
423 | if (rcu_preempt_read_exit() == 0) { |
424 | barrier(); // critical-section exit before .s check. |
425 | if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) |
426 | rcu_read_unlock_special(t); |
427 | } |
428 | if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { |
429 | int rrln = rcu_preempt_depth(); |
430 | |
431 | WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); |
432 | } |
433 | } |
434 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
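
/*
 * Minimal usage sketch (illustrative only; the "gp" pointer and consumer
 * are hypothetical): in CONFIG_PREEMPT_RCU kernels, rcu_read_lock() and
 * rcu_read_unlock() map onto __rcu_read_lock()/__rcu_read_unlock() above.
 *
 *	rcu_read_lock();
 *	p = rcu_dereference(gp);
 *	if (p)
 *		do_something_with(p);	// hypothetical consumer
 *	rcu_read_unlock();
 */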
435 | |
436 | /* |
437 | * Advance a ->blkd_tasks-list pointer to the next entry, instead |
438 | * returning NULL if at the end of the list. |
439 | */ |
440 | static struct list_head *rcu_next_node_entry(struct task_struct *t, |
441 | struct rcu_node *rnp) |
442 | { |
443 | struct list_head *np; |
444 | |
445 | np = t->rcu_node_entry.next; |
446 | if (np == &rnp->blkd_tasks) |
447 | np = NULL; |
448 | return np; |
449 | } |
450 | |
451 | /* |
452 | * Return true if the specified rcu_node structure has tasks that were |
453 | * preempted within an RCU read-side critical section. |
454 | */ |
455 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp) |
456 | { |
	return !list_empty(&rnp->blkd_tasks);
458 | } |
459 | |
460 | /* |
461 | * Report deferred quiescent states. The deferral time can |
462 | * be quite short, for example, in the case of the call from |
463 | * rcu_read_unlock_special(). |
464 | */ |
465 | static notrace void |
466 | rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) |
467 | { |
468 | bool empty_exp; |
469 | bool empty_norm; |
470 | bool empty_exp_now; |
471 | struct list_head *np; |
472 | bool drop_boost_mutex = false; |
473 | struct rcu_data *rdp; |
474 | struct rcu_node *rnp; |
475 | union rcu_special special; |
476 | |
477 | /* |
478 | * If RCU core is waiting for this CPU to exit its critical section, |
479 | * report the fact that it has exited. Because irqs are disabled, |
480 | * t->rcu_read_unlock_special cannot change. |
481 | */ |
482 | special = t->rcu_read_unlock_special; |
483 | rdp = this_cpu_ptr(&rcu_data); |
484 | if (!special.s && !rdp->cpu_no_qs.b.exp) { |
485 | local_irq_restore(flags); |
486 | return; |
487 | } |
488 | t->rcu_read_unlock_special.s = 0; |
489 | if (special.b.need_qs) { |
490 | if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { |
491 | rdp->cpu_no_qs.b.norm = false; |
492 | rcu_report_qs_rdp(rdp); |
493 | udelay(rcu_unlock_delay); |
494 | } else { |
495 | rcu_qs(); |
496 | } |
497 | } |
498 | |
499 | /* |
500 | * Respond to a request by an expedited grace period for a |
501 | * quiescent state from this CPU. Note that requests from |
502 | * tasks are handled when removing the task from the |
503 | * blocked-tasks list below. |
504 | */ |
505 | if (rdp->cpu_no_qs.b.exp) |
506 | rcu_report_exp_rdp(rdp); |
507 | |
508 | /* Clean up if blocked during RCU read-side critical section. */ |
509 | if (special.b.blocked) { |
510 | |
511 | /* |
512 | * Remove this task from the list it blocked on. The task |
513 | * now remains queued on the rcu_node corresponding to the |
514 | * CPU it first blocked on, so there is no longer any need |
515 | * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. |
516 | */ |
517 | rnp = t->rcu_blocked_node; |
518 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
519 | WARN_ON_ONCE(rnp != t->rcu_blocked_node); |
520 | WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); |
521 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); |
522 | WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && |
523 | (!empty_norm || rnp->qsmask)); |
524 | empty_exp = sync_rcu_exp_done(rnp); |
525 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
526 | np = rcu_next_node_entry(t, rnp); |
		list_del_init(&t->rcu_node_entry);
528 | t->rcu_blocked_node = NULL; |
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
						rnp->gp_seq, t->pid);
531 | if (&t->rcu_node_entry == rnp->gp_tasks) |
532 | WRITE_ONCE(rnp->gp_tasks, np); |
533 | if (&t->rcu_node_entry == rnp->exp_tasks) |
534 | WRITE_ONCE(rnp->exp_tasks, np); |
535 | if (IS_ENABLED(CONFIG_RCU_BOOST)) { |
536 | /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ |
			drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
538 | if (&t->rcu_node_entry == rnp->boost_tasks) |
539 | WRITE_ONCE(rnp->boost_tasks, np); |
540 | } |
541 | |
542 | /* |
543 | * If this was the last task on the current list, and if |
544 | * we aren't waiting on any CPUs, report the quiescent state. |
545 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
546 | * so we must take a snapshot of the expedited state. |
547 | */ |
548 | empty_exp_now = sync_rcu_exp_done(rnp); |
549 | if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { |
			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
							 rnp->gp_seq,
							 0, rnp->qsmask,
							 rnp->level,
							 rnp->grplo,
							 rnp->grphi,
							 !!rnp->gp_tasks);
557 | rcu_report_unblock_qs_rnp(rnp, flags); |
558 | } else { |
559 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
560 | } |
561 | |
562 | /* |
563 | * If this was the last task on the expedited lists, |
564 | * then we need to report up the rcu_node hierarchy. |
565 | */ |
566 | if (!empty_exp && empty_exp_now) |
			rcu_report_exp_rnp(rnp, true);
568 | |
569 | /* Unboost if we were boosted. */ |
570 | if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) |
			rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
572 | } else { |
573 | local_irq_restore(flags); |
574 | } |
575 | } |
576 | |
577 | /* |
578 | * Is a deferred quiescent-state pending, and are we also not in |
579 | * an RCU read-side critical section? It is the caller's responsibility |
580 | * to ensure it is otherwise safe to report any deferred quiescent |
581 | * states. The reason for this is that it is safe to report a |
582 | * quiescent state during context switch even though preemption |
583 | * is disabled. This function cannot be expected to understand these |
584 | * nuances, so the caller must handle them. |
585 | */ |
586 | static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) |
587 | { |
588 | return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || |
589 | READ_ONCE(t->rcu_read_unlock_special.s)) && |
590 | rcu_preempt_depth() == 0; |
591 | } |
592 | |
593 | /* |
594 | * Report a deferred quiescent state if needed and safe to do so. |
595 | * As with rcu_preempt_need_deferred_qs(), "safe" involves only |
596 | * not being in an RCU read-side critical section. The caller must |
597 | * evaluate safety in terms of interrupt, softirq, and preemption |
598 | * disabling. |
599 | */ |
600 | notrace void rcu_preempt_deferred_qs(struct task_struct *t) |
601 | { |
602 | unsigned long flags; |
603 | |
604 | if (!rcu_preempt_need_deferred_qs(t)) |
605 | return; |
606 | local_irq_save(flags); |
607 | rcu_preempt_deferred_qs_irqrestore(t, flags); |
608 | } |
609 | |
610 | /* |
611 | * Minimal handler to give the scheduler a chance to re-evaluate. |
612 | */ |
613 | static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) |
614 | { |
615 | struct rcu_data *rdp; |
616 | |
617 | rdp = container_of(iwp, struct rcu_data, defer_qs_iw); |
618 | rdp->defer_qs_iw_pending = false; |
619 | } |
620 | |
621 | /* |
622 | * Handle special cases during rcu_read_unlock(), such as needing to |
623 | * notify RCU core processing or task having blocked during the RCU |
624 | * read-side critical section. |
625 | */ |
626 | static void rcu_read_unlock_special(struct task_struct *t) |
627 | { |
628 | unsigned long flags; |
629 | bool irqs_were_disabled; |
630 | bool preempt_bh_were_disabled = |
631 | !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); |
632 | |
633 | /* NMI handlers cannot block and cannot safely manipulate state. */ |
634 | if (in_nmi()) |
635 | return; |
636 | |
637 | local_irq_save(flags); |
638 | irqs_were_disabled = irqs_disabled_flags(flags); |
639 | if (preempt_bh_were_disabled || irqs_were_disabled) { |
640 | bool expboost; // Expedited GP in flight or possible boosting. |
641 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); |
642 | struct rcu_node *rnp = rdp->mynode; |
643 | |
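		/*
		 * Editorial summary of the expression below: expedite the
		 * deferred quiescent state when an expedited GP is waiting
		 * on tasks queued on our rcu_node, when this CPU still
		 * blocks an expedited GP, when strict grace periods are
		 * configured, or when priority deboosting might be needed
		 * (CONFIG_RCU_BOOST with interrupts disabled).
		 */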
644 | expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || |
645 | (rdp->grpmask & READ_ONCE(rnp->expmask)) || |
646 | (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && |
647 | ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || |
648 | (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && |
649 | t->rcu_blocked_node); |
650 | // Need to defer quiescent state until everything is enabled. |
651 | if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { |
652 | // Using softirq, safe to awaken, and either the |
653 | // wakeup is free or there is either an expedited |
654 | // GP in flight or a potential need to deboost. |
			raise_softirq_irqoff(RCU_SOFTIRQ);
656 | } else { |
657 | // Enabling BH or preempt does reschedule, so... |
658 | // Also if no expediting and no possible deboosting, |
659 | // slow is OK. Plus nohz_full CPUs eventually get |
660 | // tick enabled. |
661 | set_tsk_need_resched(current); |
662 | set_preempt_need_resched(); |
663 | if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && |
			    expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
665 | // Get scheduler to re-evaluate and call hooks. |
666 | // If !IRQ_WORK, FQS scan will eventually IPI. |
667 | if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && |
668 | IS_ENABLED(CONFIG_PREEMPT_RT)) |
669 | rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( |
670 | rcu_preempt_deferred_qs_handler); |
671 | else |
					init_irq_work(&rdp->defer_qs_iw,
						      rcu_preempt_deferred_qs_handler);
674 | rdp->defer_qs_iw_pending = true; |
				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
676 | } |
677 | } |
678 | local_irq_restore(flags); |
679 | return; |
680 | } |
681 | rcu_preempt_deferred_qs_irqrestore(t, flags); |
682 | } |
683 | |
684 | /* |
685 | * Check that the list of blocked tasks for the newly completed grace |
686 | * period is in fact empty. It is a serious bug to complete a grace |
687 | * period that still has RCU readers blocked! This function must be |
688 | * invoked -before- updating this rnp's ->gp_seq. |
689 | * |
690 | * Also, if there are blocked tasks on the list, they automatically |
691 | * block the newly created grace period, so set up ->gp_tasks accordingly. |
692 | */ |
693 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
694 | { |
695 | struct task_struct *t; |
696 | |
697 | RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n" ); |
698 | raw_lockdep_assert_held_rcu_node(rnp); |
699 | if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) |
		dump_blkd_tasks(rnp, 10);
701 | if (rcu_preempt_has_tasks(rnp) && |
702 | (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { |
703 | WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next); |
704 | t = container_of(rnp->gp_tasks, struct task_struct, |
705 | rcu_node_entry); |
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
						rnp->gp_seq, t->pid);
708 | } |
709 | WARN_ON_ONCE(rnp->qsmask); |
710 | } |
711 | |
712 | /* |
713 | * Check for a quiescent state from the current CPU, including voluntary |
714 | * context switches for Tasks RCU. When a task blocks, the task is |
715 | * recorded in the corresponding CPU's rcu_node structure, which is checked |
716 | * elsewhere, hence this function need only check for quiescent states |
717 | * related to the current CPU, not to those related to tasks. |
718 | */ |
719 | static void rcu_flavor_sched_clock_irq(int user) |
720 | { |
721 | struct task_struct *t = current; |
722 | |
723 | lockdep_assert_irqs_disabled(); |
724 | if (rcu_preempt_depth() > 0 || |
725 | (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { |
726 | /* No QS, force context switch if deferred. */ |
727 | if (rcu_preempt_need_deferred_qs(t)) { |
728 | set_tsk_need_resched(t); |
729 | set_preempt_need_resched(); |
730 | } |
731 | } else if (rcu_preempt_need_deferred_qs(t)) { |
732 | rcu_preempt_deferred_qs(t); /* Report deferred QS. */ |
733 | return; |
734 | } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { |
735 | rcu_qs(); /* Report immediate QS. */ |
736 | return; |
737 | } |
738 | |
739 | /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ |
740 | if (rcu_preempt_depth() > 0 && |
741 | __this_cpu_read(rcu_data.core_needs_qs) && |
742 | __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && |
743 | !t->rcu_read_unlock_special.b.need_qs && |
744 | time_after(jiffies, rcu_state.gp_start + HZ)) |
745 | t->rcu_read_unlock_special.b.need_qs = true; |
746 | } |
747 | |
748 | /* |
749 | * Check for a task exiting while in a preemptible-RCU read-side |
750 | * critical section, clean up if so. No need to issue warnings, as |
751 | * debug_check_no_locks_held() already does this if lockdep is enabled. |
752 | * Besides, if this function does anything other than just immediately |
753 | * return, there was a bug of some sort. Spewing warnings from this |
 * function is as likely as not to simply obscure important prior warnings.
755 | */ |
756 | void exit_rcu(void) |
757 | { |
758 | struct task_struct *t = current; |
759 | |
	if (unlikely(!list_empty(&current->rcu_node_entry))) {
		rcu_preempt_depth_set(1);
762 | barrier(); |
763 | WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); |
764 | } else if (unlikely(rcu_preempt_depth())) { |
		rcu_preempt_depth_set(1);
766 | } else { |
767 | return; |
768 | } |
769 | __rcu_read_unlock(); |
770 | rcu_preempt_deferred_qs(current); |
771 | } |
772 | |
773 | /* |
774 | * Dump the blocked-tasks state, but limit the list dump to the |
775 | * specified number of elements. |
776 | */ |
777 | static void |
778 | dump_blkd_tasks(struct rcu_node *rnp, int ncheck) |
779 | { |
780 | int cpu; |
781 | int i; |
782 | struct list_head *lhp; |
783 | struct rcu_data *rdp; |
784 | struct rcu_node *rnp1; |
785 | |
786 | raw_lockdep_assert_held_rcu_node(rnp); |
787 | pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n" , |
788 | __func__, rnp->grplo, rnp->grphi, rnp->level, |
789 | (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs); |
790 | for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) |
791 | pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n" , |
792 | __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); |
793 | pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n" , |
794 | __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), |
795 | READ_ONCE(rnp->exp_tasks)); |
796 | pr_info("%s: ->blkd_tasks" , __func__); |
797 | i = 0; |
798 | list_for_each(lhp, &rnp->blkd_tasks) { |
799 | pr_cont(" %p" , lhp); |
800 | if (++i >= ncheck) |
801 | break; |
802 | } |
803 | pr_cont("\n" ); |
804 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { |
805 | rdp = per_cpu_ptr(&rcu_data, cpu); |
806 | pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n" , |
807 | cpu, ".o" [rcu_rdp_cpu_online(rdp)], |
808 | (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, |
809 | (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); |
810 | } |
811 | } |
812 | |
813 | #else /* #ifdef CONFIG_PREEMPT_RCU */ |
814 | |
815 | /* |
816 | * If strict grace periods are enabled, and if the calling |
817 | * __rcu_read_unlock() marks the beginning of a quiescent state, immediately |
818 | * report that quiescent state and, if requested, spin for a bit. |
819 | */ |
820 | void rcu_read_unlock_strict(void) |
821 | { |
822 | struct rcu_data *rdp; |
823 | |
824 | if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) |
825 | return; |
826 | rdp = this_cpu_ptr(&rcu_data); |
827 | rdp->cpu_no_qs.b.norm = false; |
828 | rcu_report_qs_rdp(rdp); |
829 | udelay(rcu_unlock_delay); |
830 | } |
831 | EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); |
832 | |
833 | /* |
834 | * Tell them what RCU they are running. |
835 | */ |
836 | static void __init rcu_bootup_announce(void) |
837 | { |
838 | pr_info("Hierarchical RCU implementation.\n" ); |
839 | rcu_bootup_announce_oddness(); |
840 | } |
841 | |
842 | /* |
843 | * Note a quiescent state for PREEMPTION=n. Because we do not need to know |
844 | * how many quiescent states passed, just if there was at least one since |
845 | * the start of the grace period, this just sets a flag. The caller must |
846 | * have disabled preemption. |
847 | */ |
848 | static void rcu_qs(void) |
849 | { |
850 | RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!" ); |
851 | if (!__this_cpu_read(rcu_data.cpu_no_qs.s)) |
852 | return; |
853 | trace_rcu_grace_period(TPS("rcu_sched" ), |
854 | __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs" )); |
855 | __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); |
856 | if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) |
857 | rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); |
858 | } |
859 | |
860 | /* |
861 | * Register an urgently needed quiescent state. If there is an |
862 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight |
863 | * dyntick-idle quiescent state visible to other CPUs, which will in |
864 | * some cases serve for expedited as well as normal grace periods. |
865 | * Either way, register a lightweight quiescent state. |
866 | */ |
867 | void rcu_all_qs(void) |
868 | { |
869 | unsigned long flags; |
870 | |
871 | if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) |
872 | return; |
873 | preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels |
874 | /* Load rcu_urgent_qs before other flags. */ |
875 | if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { |
876 | preempt_enable(); |
877 | return; |
878 | } |
879 | this_cpu_write(rcu_data.rcu_urgent_qs, false); |
880 | if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { |
881 | local_irq_save(flags); |
882 | rcu_momentary_dyntick_idle(); |
883 | local_irq_restore(flags); |
884 | } |
885 | rcu_qs(); |
886 | preempt_enable(); |
887 | } |
888 | EXPORT_SYMBOL_GPL(rcu_all_qs); |
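
/*
 * Editorial note: in kernels without preemptible RCU, rcu_all_qs() is
 * invoked from voluntary-rescheduling paths such as __cond_resched(),
 * giving long-running kernel code a cheap way to report quiescent states
 * when RCU urgently needs one.
 */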
889 | |
890 | /* |
891 | * Note a PREEMPTION=n context switch. The caller must have disabled interrupts. |
892 | */ |
893 | void rcu_note_context_switch(bool preempt) |
894 | { |
895 | trace_rcu_utilization(TPS("Start context switch" )); |
896 | rcu_qs(); |
897 | /* Load rcu_urgent_qs before other flags. */ |
898 | if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) |
899 | goto out; |
900 | this_cpu_write(rcu_data.rcu_urgent_qs, false); |
901 | if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) |
902 | rcu_momentary_dyntick_idle(); |
903 | out: |
904 | rcu_tasks_qs(current, preempt); |
905 | trace_rcu_utilization(TPS("End context switch" )); |
906 | } |
907 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
908 | |
909 | /* |
910 | * Because preemptible RCU does not exist, there are never any preempted |
911 | * RCU readers. |
912 | */ |
913 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
914 | { |
915 | return 0; |
916 | } |
917 | |
918 | /* |
919 | * Because there is no preemptible RCU, there can be no readers blocked. |
920 | */ |
921 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp) |
922 | { |
923 | return false; |
924 | } |
925 | |
926 | /* |
927 | * Because there is no preemptible RCU, there can be no deferred quiescent |
928 | * states. |
929 | */ |
930 | static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) |
931 | { |
932 | return false; |
933 | } |
934 | |
935 | // Except that we do need to respond to a request by an expedited |
936 | // grace period for a quiescent state from this CPU. Note that in |
937 | // non-preemptible kernels, there can be no context switches within RCU |
938 | // read-side critical sections, which in turn means that the leaf rcu_node |
// structure's blocked-tasks list is always empty.  There is therefore no need to
940 | // actually check it. Instead, a quiescent state from this CPU suffices, |
941 | // and this function is only called from such a quiescent state. |
942 | notrace void rcu_preempt_deferred_qs(struct task_struct *t) |
943 | { |
944 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); |
945 | |
946 | if (READ_ONCE(rdp->cpu_no_qs.b.exp)) |
947 | rcu_report_exp_rdp(rdp); |
948 | } |
949 | |
950 | /* |
951 | * Because there is no preemptible RCU, there can be no readers blocked, |
952 | * so there is no need to check for blocked tasks. So check only for |
953 | * bogus qsmask values. |
954 | */ |
955 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
956 | { |
957 | WARN_ON_ONCE(rnp->qsmask); |
958 | } |
959 | |
960 | /* |
961 | * Check to see if this CPU is in a non-context-switch quiescent state, |
962 | * namely user mode and idle loop. |
963 | */ |
964 | static void rcu_flavor_sched_clock_irq(int user) |
965 | { |
966 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
967 | |
968 | /* |
969 | * Get here if this CPU took its interrupt from user |
970 | * mode or from the idle loop, and if this is not a |
971 | * nested interrupt. In this case, the CPU is in |
972 | * a quiescent state, so note it. |
973 | * |
974 | * No memory barrier is required here because rcu_qs() |
975 | * references only CPU-local variables that other CPUs |
976 | * neither access nor modify, at least not while the |
977 | * corresponding CPU is online. |
978 | */ |
979 | rcu_qs(); |
980 | } |
981 | } |
982 | |
983 | /* |
984 | * Because preemptible RCU does not exist, tasks cannot possibly exit |
985 | * while in preemptible RCU read-side critical sections. |
986 | */ |
987 | void exit_rcu(void) |
988 | { |
989 | } |
990 | |
991 | /* |
992 | * Dump the guaranteed-empty blocked-tasks state. Trust but verify. |
993 | */ |
994 | static void |
995 | dump_blkd_tasks(struct rcu_node *rnp, int ncheck) |
996 | { |
997 | WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); |
998 | } |
999 | |
1000 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
1001 | |
1002 | /* |
1003 | * If boosting, set rcuc kthreads to realtime priority. |
1004 | */ |
1005 | static void rcu_cpu_kthread_setup(unsigned int cpu) |
1006 | { |
1007 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); |
1008 | #ifdef CONFIG_RCU_BOOST |
1009 | struct sched_param sp; |
1010 | |
1011 | sp.sched_priority = kthread_prio; |
1012 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
1013 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
1014 | |
1015 | WRITE_ONCE(rdp->rcuc_activity, jiffies); |
1016 | } |
1017 | |
1018 | static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp) |
1019 | { |
1020 | #ifdef CONFIG_RCU_NOCB_CPU |
1021 | return rdp->nocb_cb_kthread == current; |
1022 | #else |
1023 | return false; |
1024 | #endif |
1025 | } |
1026 | |
1027 | /* |
1028 | * Is the current CPU running the RCU-callbacks kthread? |
1029 | * Caller must have preemption disabled. |
1030 | */ |
1031 | static bool rcu_is_callbacks_kthread(struct rcu_data *rdp) |
1032 | { |
1033 | return rdp->rcu_cpu_kthread_task == current || |
1034 | rcu_is_callbacks_nocb_kthread(rdp); |
1035 | } |
1036 | |
1037 | #ifdef CONFIG_RCU_BOOST |
1038 | |
1039 | /* |
1040 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks |
1041 | * or ->boost_tasks, advancing the pointer to the next task in the |
1042 | * ->blkd_tasks list. |
1043 | * |
1044 | * Note that irqs must be enabled: boosting the task can block. |
1045 | * Returns 1 if there are more tasks needing to be boosted. |
1046 | */ |
1047 | static int rcu_boost(struct rcu_node *rnp) |
1048 | { |
1049 | unsigned long flags; |
1050 | struct task_struct *t; |
1051 | struct list_head *tb; |
1052 | |
1053 | if (READ_ONCE(rnp->exp_tasks) == NULL && |
1054 | READ_ONCE(rnp->boost_tasks) == NULL) |
1055 | return 0; /* Nothing left to boost. */ |
1056 | |
1057 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
1058 | |
1059 | /* |
1060 | * Recheck under the lock: all tasks in need of boosting |
1061 | * might exit their RCU read-side critical sections on their own. |
1062 | */ |
1063 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { |
1064 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1065 | return 0; |
1066 | } |
1067 | |
1068 | /* |
1069 | * Preferentially boost tasks blocking expedited grace periods. |
1070 | * This cannot starve the normal grace periods because a second |
1071 | * expedited grace period must boost all blocked tasks, including |
1072 | * those blocking the pre-existing normal grace period. |
1073 | */ |
1074 | if (rnp->exp_tasks != NULL) |
1075 | tb = rnp->exp_tasks; |
1076 | else |
1077 | tb = rnp->boost_tasks; |
1078 | |
1079 | /* |
1080 | * We boost task t by manufacturing an rt_mutex that appears to |
1081 | * be held by task t. We leave a pointer to that rt_mutex where |
1082 | * task t can find it, and task t will release the mutex when it |
1083 | * exits its outermost RCU read-side critical section. Then |
1084 | * simply acquiring this artificial rt_mutex will boost task |
1085 | * t's priority. (Thanks to tglx for suggesting this approach!) |
1086 | * |
1087 | * Note that task t must acquire rnp->lock to remove itself from |
1088 | * the ->blkd_tasks list, which it will do from exit() if from |
1089 | * nowhere else. We therefore are guaranteed that task t will |
1090 | * stay around at least until we drop rnp->lock. Note that |
1091 | * rnp->lock also resolves races between our priority boosting |
1092 | * and task t's exiting its outermost RCU read-side critical |
1093 | * section. |
1094 | */ |
1095 | t = container_of(tb, struct task_struct, rcu_node_entry); |
	rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
1097 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1098 | /* Lock only for side effect: boosts task t's priority. */ |
1099 | rt_mutex_lock(&rnp->boost_mtx); |
	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
1101 | rnp->n_boosts++; |
1102 | |
1103 | return READ_ONCE(rnp->exp_tasks) != NULL || |
1104 | READ_ONCE(rnp->boost_tasks) != NULL; |
1105 | } |
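
/*
 * Editorial summary: the net effect of rcu_boost() is that the chosen
 * reader inherits this kthread's SCHED_FIFO priority via ->boost_mtx
 * until it exits its outermost RCU read-side critical section, at which
 * point it releases the proxy-locked rt_mutex in
 * rcu_preempt_deferred_qs_irqrestore().
 */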
1106 | |
1107 | /* |
1108 | * Priority-boosting kthread, one per leaf rcu_node. |
1109 | */ |
1110 | static int rcu_boost_kthread(void *arg) |
1111 | { |
1112 | struct rcu_node *rnp = (struct rcu_node *)arg; |
1113 | int spincnt = 0; |
1114 | int more2boost; |
1115 | |
1116 | trace_rcu_utilization(TPS("Start boost kthread@init" )); |
1117 | for (;;) { |
1118 | WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); |
1119 | trace_rcu_utilization(TPS("End boost kthread@rcu_wait" )); |
1120 | rcu_wait(READ_ONCE(rnp->boost_tasks) || |
1121 | READ_ONCE(rnp->exp_tasks)); |
1122 | trace_rcu_utilization(TPS("Start boost kthread@rcu_wait" )); |
1123 | WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); |
1124 | more2boost = rcu_boost(rnp); |
1125 | if (more2boost) |
1126 | spincnt++; |
1127 | else |
1128 | spincnt = 0; |
1129 | if (spincnt > 10) { |
1130 | WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); |
1131 | trace_rcu_utilization(TPS("End boost kthread@rcu_yield" )); |
			schedule_timeout_idle(2);
1133 | trace_rcu_utilization(TPS("Start boost kthread@rcu_yield" )); |
1134 | spincnt = 0; |
1135 | } |
1136 | } |
1137 | /* NOTREACHED */ |
1138 | trace_rcu_utilization(TPS("End boost kthread@notreached" )); |
1139 | return 0; |
1140 | } |
1141 | |
1142 | /* |
1143 | * Check to see if it is time to start boosting RCU readers that are |
1144 | * blocking the current grace period, and, if so, tell the per-rcu_node |
1145 | * kthread to start boosting them. If there is an expedited grace |
1146 | * period in progress, it is always time to boost. |
1147 | * |
1148 | * The caller must hold rnp->lock, which this function releases. |
1149 | * The ->boost_kthread_task is immortal, so we don't need to worry |
1150 | * about it going away. |
1151 | */ |
1152 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
1153 | __releases(rnp->lock) |
1154 | { |
1155 | raw_lockdep_assert_held_rcu_node(rnp); |
1156 | if (!rnp->boost_kthread_task || |
1157 | (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { |
1158 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1159 | return; |
1160 | } |
1161 | if (rnp->exp_tasks != NULL || |
1162 | (rnp->gp_tasks != NULL && |
1163 | rnp->boost_tasks == NULL && |
1164 | rnp->qsmask == 0 && |
1165 | (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld || |
1166 | IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) { |
1167 | if (rnp->exp_tasks == NULL) |
1168 | WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); |
1169 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
		rcu_wake_cond(rnp->boost_kthread_task,
1171 | READ_ONCE(rnp->boost_kthread_status)); |
1172 | } else { |
1173 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1174 | } |
1175 | } |
1176 | |
1177 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
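
/*
 * Worked example (editorial): with the default CONFIG_RCU_BOOST_DELAY=500
 * and HZ=1000, RCU_BOOST_DELAY_JIFFIES = DIV_ROUND_UP(500 * 1000, 1000) =
 * 500 jiffies, so readers blocking a normal grace period are not boosted
 * until roughly half a second after that grace period starts.
 */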
1178 | |
1179 | /* |
1180 | * Do priority-boost accounting for the start of a new grace period. |
1181 | */ |
1182 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) |
1183 | { |
1184 | rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; |
1185 | } |
1186 | |
1187 | /* |
1188 | * Create an RCU-boost kthread for the specified node if one does not |
1189 | * already exist. We only create this kthread for preemptible RCU. |
1190 | */ |
1191 | static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) |
1192 | { |
1193 | unsigned long flags; |
1194 | int rnp_index = rnp - rcu_get_root(); |
1195 | struct sched_param sp; |
1196 | struct task_struct *t; |
1197 | |
1198 | mutex_lock(&rnp->boost_kthread_mutex); |
1199 | if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) |
1200 | goto out; |
1201 | |
1202 | t = kthread_create(rcu_boost_kthread, (void *)rnp, |
1203 | "rcub/%d" , rnp_index); |
1204 | if (WARN_ON_ONCE(IS_ERR(t))) |
1205 | goto out; |
1206 | |
1207 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
1208 | rnp->boost_kthread_task = t; |
1209 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1210 | sp.sched_priority = kthread_prio; |
1211 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); |
	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1213 | |
1214 | out: |
	mutex_unlock(&rnp->boost_kthread_mutex);
1216 | } |
1217 | |
1218 | /* |
1219 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are |
1220 | * served by the rcu_node in question. The CPU hotplug lock is still |
1221 | * held, so the value of rnp->qsmaskinit will be stable. |
1222 | * |
1223 | * We don't include outgoingcpu in the affinity set, use -1 if there is |
1224 | * no outgoing CPU. If there are no CPUs left in the affinity set, |
1225 | * this function allows the kthread to execute on any CPU. |
1226 | * |
1227 | * Any future concurrent calls are serialized via ->boost_kthread_mutex. |
1228 | */ |
1229 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
1230 | { |
1231 | struct task_struct *t = rnp->boost_kthread_task; |
1232 | unsigned long mask; |
1233 | cpumask_var_t cm; |
1234 | int cpu; |
1235 | |
1236 | if (!t) |
1237 | return; |
	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1239 | return; |
1240 | mutex_lock(&rnp->boost_kthread_mutex); |
1241 | mask = rcu_rnp_online_cpus(rnp); |
1242 | for_each_leaf_node_possible_cpu(rnp, cpu) |
1243 | if ((mask & leaf_node_cpu_bit(rnp, cpu)) && |
1244 | cpu != outgoingcpu) |
			cpumask_set_cpu(cpu, cm);
	cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
	if (cpumask_empty(cm)) {
		cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
		if (outgoingcpu >= 0)
			cpumask_clear_cpu(outgoingcpu, cm);
	}
	set_cpus_allowed_ptr(t, cm);
	mutex_unlock(&rnp->boost_kthread_mutex);
	free_cpumask_var(cm);
1255 | } |
1256 | |
1257 | #else /* #ifdef CONFIG_RCU_BOOST */ |
1258 | |
1259 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
1260 | __releases(rnp->lock) |
1261 | { |
1262 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1263 | } |
1264 | |
1265 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) |
1266 | { |
1267 | } |
1268 | |
1269 | static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) |
1270 | { |
1271 | } |
1272 | |
1273 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
1274 | { |
1275 | } |
1276 | |
1277 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ |
1278 | |
1279 | /* |
1280 | * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the |
1281 | * grace-period kthread will do force_quiescent_state() processing? |
1282 | * The idea is to avoid waking up RCU core processing on such a |
1283 | * CPU unless the grace period has extended for too long. |
1284 | * |
1285 | * This code relies on the fact that all NO_HZ_FULL CPUs are also |
1286 | * RCU_NOCB_CPU CPUs. |
1287 | */ |
1288 | static bool rcu_nohz_full_cpu(void) |
1289 | { |
1290 | #ifdef CONFIG_NO_HZ_FULL |
1291 | if (tick_nohz_full_cpu(smp_processor_id()) && |
1292 | (!rcu_gp_in_progress() || |
1293 | time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) |
1294 | return true; |
1295 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
1296 | return false; |
1297 | } |
1298 | |
1299 | /* |
1300 | * Bind the RCU grace-period kthreads to the housekeeping CPU. |
1301 | */ |
1302 | static void rcu_bind_gp_kthread(void) |
1303 | { |
1304 | if (!tick_nohz_full_enabled()) |
1305 | return; |
	housekeeping_affine(current, HK_TYPE_RCU);
1307 | } |
1308 | |