tree.c source code [linux/kernel/rcu/tree.c]

// SPDX-License-Identifier: GPL-2.0+
/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 *
 * Copyright IBM Corporation, 2008
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *	    Manfred Spraul <manfred@colorfullife.com>
 *	    Paul E. McKenney <paulmck@linux.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *	Documentation/RCU
 */
17
18	#define pr_fmt(fmt) "rcu: " fmt
19
20	#include <linux/types.h>
21	#include <linux/kernel.h>
22	#include <linux/init.h>
23	#include <linux/spinlock.h>
24	#include <linux/smp.h>
25	#include <linux/rcupdate_wait.h>
26	#include <linux/interrupt.h>
27	#include <linux/sched.h>
28	#include <linux/sched/debug.h>
29	#include <linux/nmi.h>
30	#include <linux/atomic.h>
31	#include <linux/bitops.h>
32	#include <linux/export.h>
33	#include <linux/completion.h>
34	#include <linux/kmemleak.h>
35	#include <linux/moduleparam.h>
36	#include <linux/panic.h>
37	#include <linux/panic_notifier.h>
38	#include <linux/percpu.h>
39	#include <linux/notifier.h>
40	#include <linux/cpu.h>
41	#include <linux/mutex.h>
42	#include <linux/time.h>
43	#include <linux/kernel_stat.h>
44	#include <linux/wait.h>
45	#include <linux/kthread.h>
46	#include <uapi/linux/sched/types.h>
47	#include <linux/prefetch.h>
48	#include <linux/delay.h>
49	#include <linux/random.h>
50	#include <linux/trace_events.h>
51	#include <linux/suspend.h>
52	#include <linux/ftrace.h>
53	#include <linux/tick.h>
54	#include <linux/sysrq.h>
55	#include <linux/kprobes.h>
56	#include <linux/gfp.h>
57	#include <linux/oom.h>
58	#include <linux/smpboot.h>
59	#include <linux/jiffies.h>
60	#include <linux/slab.h>
61	#include <linux/sched/isolation.h>
62	#include <linux/sched/clock.h>
63	#include <linux/vmalloc.h>
64	#include <linux/mm.h>
65	#include <linux/kasan.h>
66	#include <linux/context_tracking.h>
67	#include "../time/tick-internal.h"
68
69	#include "tree.h"
70	#include "rcu.h"
71
/*
 * Redefine MODULE_PARAM_PREFIX as "rcutree." for the module_param()
 * declarations in this file, dropping any earlier definition first.
 */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."
76
77	/ Data structures. /
78
79	static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
80	.gpwrap = true,
81	#ifdef CONFIG_RCU_NOCB_CPU
82	.cblist.flags = SEGCBLIST_RCU_CORE,
83	#endif
84	};
85	static struct rcu_state rcu_state = {
86	.level = { &rcu_state.node[`0`] },
87	.gp_state = RCU_GP_IDLE,
88	.gp_seq = (`0UL` - `300UL`) << RCU_SEQ_CTR_SHIFT,
89	.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
90	.barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
91	.name = RCU_NAME,
92	.abbr = RCU_ABBR,
93	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
94	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
95	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
96	};
97
98	/ Dump rcu_node combining tree at boot to verify correct setup. /
99	static bool dump_tree;
100	module_param(dump_tree, bool, `0444`);
101	/ By default, use RCU_SOFTIRQ instead of rcuc kthreads. /
102	static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
103	#ifndef CONFIG_PREEMPT_RT
104	module_param(use_softirq, bool, `0444`);
105	#endif
106	/ Control rcu_node-tree auto-balancing at boot time. /
107	static bool rcu_fanout_exact;
108	module_param(rcu_fanout_exact, bool, `0444`);
109	/ Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. /
110	static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
111	module_param(rcu_fanout_leaf, int, `0444`);
112	int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
113	/ Number of rcu_nodes at specified level. /
114	int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
115	int rcu_num_nodes __read_mostly = NUM_RCU_NODES; / Total # rcu_nodes in use. /
116
117	/*
118	* The rcu_scheduler_active variable is initialized to the value
119	* RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
120	* first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
121	* RCU can assume that there is but one task, allowing RCU to (for example)
122	* optimize synchronize_rcu() to a simple barrier(). When this variable
123	* is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
124	* to detect real grace periods. This variable is also used to suppress
125	* boot-time false positives from lockdep-RCU error checking. Finally, it
126	* transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
127	* is fully initialized, including all of its kthreads having been spawned.
128	*/
129	int rcu_scheduler_active __read_mostly;
130	EXPORT_SYMBOL_GPL(rcu_scheduler_active);
131
132	/*
133	* The rcu_scheduler_fully_active variable transitions from zero to one
134	* during the early_initcall() processing, which is after the scheduler
135	* is capable of creating new tasks. So RCU processing (for example,
136	* creating tasks for RCU priority boosting) must be delayed until after
137	* rcu_scheduler_fully_active transitions from zero to one. We also
138	* currently delay invocation of any RCU callbacks until after this point.
139	*
140	* It might later prove better for people registering RCU callbacks during
141	* early boot to take responsibility for these callbacks, but one step at
142	* a time.
143	*/
144	static int rcu_scheduler_fully_active __read_mostly;
145
/* Forward declarations for functions used before their definitions. */
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
			      unsigned long gps, unsigned long flags);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
static bool rcu_init_invoked(void);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
158
159	/*
160	* rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
161	* real-time priority(enabling/disabling) is controlled by
162	* the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
163	*/
164	static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? `1` : `0`;
165	module_param(kthread_prio, int, `0444`);
166
167	/ Delay in jiffies for grace-period initialization delays, debug only. /
168
169	static int gp_preinit_delay;
170	module_param(gp_preinit_delay, int, `0444`);
171	static int gp_init_delay;
172	module_param(gp_init_delay, int, `0444`);
173	static int gp_cleanup_delay;
174	module_param(gp_cleanup_delay, int, `0444`);
175
176	// Add delay to rcu_read_unlock() for strict grace periods.
177	static int rcu_unlock_delay;
178	#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
179	module_param(rcu_unlock_delay, int, `0444`);
180	#endif
181
182	/*
183	* This rcu parameter is runtime-read-only. It reflects
184	* a minimum allowed number of objects which can be cached
185	* per-CPU. Object size is equal to one page. This value
186	* can be changed at boot time.
187	*/
188	static int rcu_min_cached_objs = `5`;
189	module_param(rcu_min_cached_objs, int, `0444`);
190
191	// A page shrinker can ask for pages to be freed to make them
192	// available for other parts of the system. This usually happens
193	// under low memory conditions, and in that case we should also
194	// defer page-cache filling for a short time period.
195	//
196	// The default value is 5 seconds, which is long enough to reduce
197	// interference with the shrinker while it asks other systems to
198	// drain their caches.
199	static int rcu_delay_page_cache_fill_msec = `5000`;
200	module_param(rcu_delay_page_cache_fill_msec, int, `0444`);
201
202	/ Retrieve RCU kthreads priority for rcutorture /
203	int rcu_get_gp_kthreads_prio(void)
204	{
205	return kthread_prio;
206	}
207	EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
208
/*
 * Number of grace periods between delays, normalized by the duration of
 * the delay.  The longer the delay, the more the grace periods between
 * each delay.  The reason for this normalization is that it means that,
 * for non-zero delays, the overall slowdown of grace periods is constant
 * regardless of the duration of the delay.  This arrangement balances
 * the need for long delays to increase some race probabilities with the
 * need for fast grace periods to increase other race probabilities.
 */
#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays for debugging. */
219
220	/*
221	* Return true if an RCU grace period is in progress. The READ_ONCE()s
222	* permit this function to be invoked without holding the root rcu_node
223	* structure's ->lock, but of course results can be subject to change.
224	*/
225	static int rcu_gp_in_progress(void)
226	{
227	return rcu_seq_state(s: rcu_seq_current(sp: &rcu_state.gp_seq));
228	}
229
230	/*
231	* Return the number of callbacks queued on the specified CPU.
232	* Handles both the nocbs and normal cases.
233	*/
234	static long rcu_get_n_cbs_cpu(int cpu)
235	{
236	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
237
238	if (rcu_segcblist_is_enabled(rsclp: &rdp->cblist))
239	return rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
240	return `0`;
241	}
242
243	void rcu_softirq_qs(void)
244	{
245	rcu_qs();
246	rcu_preempt_deferred_qs(current);
247	rcu_tasks_qs(current, false);
248	}
249
250	/*
251	* Reset the current CPU's ->dynticks counter to indicate that the
252	* newly onlined CPU is no longer in an extended quiescent state.
253	* This will either leave the counter unchanged, or increment it
254	* to the next non-quiescent value.
255	*
256	* The non-atomic test/increment sequence works because the upper bits
257	* of the ->dynticks counter are manipulated only by the corresponding CPU,
258	* or when the corresponding CPU is offline.
259	*/
260	static void rcu_dynticks_eqs_online(void)
261	{
262	if (ct_dynticks() & RCU_DYNTICKS_IDX)
263	return;
264	ct_state_inc(RCU_DYNTICKS_IDX);
265	}
266
/*
 * Snapshot the ->dynticks counter with full ordering so as to allow
 * stable comparison of this counter with past and future snapshots.
 */
static int rcu_dynticks_snap(int cpu)
{
	smp_mb();  // Fundamental RCU ordering guarantee.
	return ct_dynticks_cpu_acquire(cpu);
}
276
277	/*
278	* Return true if the snapshot returned from rcu_dynticks_snap()
279	* indicates that RCU is in an extended quiescent state.
280	*/
281	static bool rcu_dynticks_in_eqs(int snap)
282	{
283	return !(snap & RCU_DYNTICKS_IDX);
284	}
285
286	/*
287	* Return true if the CPU corresponding to the specified rcu_data
288	* structure has spent some time in an extended quiescent state since
289	* rcu_dynticks_snap() returned the specified snapshot.
290	*/
291	static bool rcu_dynticks_in_eqs_since(struct rcu_data rdp, int* snap)
292	{
293	return snap != rcu_dynticks_snap(cpu: rdp->cpu);
294	}
295
296	/*
297	* Return true if the referenced integer is zero while the specified
298	* CPU remains within a single extended quiescent state.
299	*/
300	bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
301	{
302	int snap;
303
304	// If not quiescent, force back to earlier extended quiescent state.
305	snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
306	smp_rmb(); // Order ->dynticks and vp reads.*
307	if (READ_ONCE(*vp))
308	return false; // Non-zero, so report failure;
309	smp_rmb(); // Order vp read and ->dynticks re-read.*
310
311	// If still in the same extended quiescent state, we are good!
312	return snap == ct_dynticks_cpu(cpu);
313	}
314
315	/*
316	* Let the RCU core know that this CPU has gone through the scheduler,
317	* which is a quiescent state. This is called when the need for a
318	* quiescent state is urgent, so we burn an atomic operation and full
319	* memory barriers to let the RCU core know about it, regardless of what
320	* this CPU might (or might not) do in the near future.
321	*
322	* We inform the RCU core by emulating a zero-duration dyntick-idle period.
323	*
324	* The caller must have disabled interrupts and must not be idle.
325	*/
326	notrace void rcu_momentary_dyntick_idle(void)
327	{
328	int seq;
329
330	raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
331	seq = ct_state_inc(incby: `2` * RCU_DYNTICKS_IDX);
332	/ It is illegal to call this from idle state. /
333	WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
334	rcu_preempt_deferred_qs(current);
335	}
336	EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
337
338	/**
339	* rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
340	*
341	* If the current CPU is idle and running at a first-level (not nested)
342	* interrupt, or directly, from idle, return true.
343	*
344	* The caller must have at least disabled IRQs.
345	*/
346	static int rcu_is_cpu_rrupt_from_idle(void)
347	{
348	long nesting;
349
350	/*
351	* Usually called from the tick; but also used from smp_function_call()
352	* for expedited grace periods. This latter can result in running from
353	* the idle task, instead of an actual IPI.
354	*/
355	lockdep_assert_irqs_disabled();
356
357	/ Check for counter underflows /
358	RCU_LOCKDEP_WARN(ct_dynticks_nesting() < `0`,
359	"RCU dynticks_nesting counter underflow!");
360	RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= `0`,
361	"RCU dynticks_nmi_nesting counter underflow/zero!");
362
363	/ Are we at first interrupt nesting level? /
364	nesting = ct_dynticks_nmi_nesting();
365	if (nesting > `1`)
366	return false;
367
368	/*
369	* If we're not in an interrupt, we must be in the idle task!
370	*/
371	WARN_ON_ONCE(!nesting && !is_idle_task(current));
372
373	/ Does CPU appear to be idle from an RCU standpoint? /
374	return ct_dynticks_nesting() == `0`;
375	}
376
377	#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
378	// Maximum callbacks per rcu_do_batch ...
379	#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
380	static long blimit = DEFAULT_RCU_BLIMIT;
381	#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
382	static long qhimark = DEFAULT_RCU_QHIMARK;
383	#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
384	static long qlowmark = DEFAULT_RCU_QLOMARK;
385	#define DEFAULT_RCU_QOVLD_MULT 2
386	#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
387	static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
388	static long qovld_calc = -`1`; // No pre-initialization lock acquisitions!
389
390	module_param(blimit, long, `0444`);
391	module_param(qhimark, long, `0444`);
392	module_param(qlowmark, long, `0444`);
393	module_param(qovld, long, `0444`);
394
395	static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? `0` : ULONG_MAX;
396	static ulong jiffies_till_next_fqs = ULONG_MAX;
397	static bool rcu_kick_kthreads;
398	static int rcu_divisor = `7`;
399	module_param(rcu_divisor, int, `0644`);
400
401	/ Force an exit from rcu_do_batch() after 3 milliseconds. /
402	static long rcu_resched_ns = `3` * NSEC_PER_MSEC;
403	module_param(rcu_resched_ns, long, `0644`);
404
405	/*
406	* How long the grace period must be before we start recruiting
407	* quiescent-state help from rcu_note_context_switch().
408	*/
409	static ulong jiffies_till_sched_qs = ULONG_MAX;
410	module_param(jiffies_till_sched_qs, ulong, `0444`);
411	static ulong jiffies_to_sched_qs; / See adjust_jiffies_till_sched_qs(). /
412	module_param(jiffies_to_sched_qs, ulong, `0444`); / Display only! /
413
414	/*
415	* Make sure that we give the grace-period kthread time to detect any
416	* idle CPUs before taking active measures to force quiescent states.
417	* However, don't go below 100 milliseconds, adjusted upwards for really
418	* large systems.
419	*/
420	static void adjust_jiffies_till_sched_qs(void)
421	{
422	unsigned long j;
423
424	/ If jiffies_till_sched_qs was specified, respect the request. /
425	if (jiffies_till_sched_qs != ULONG_MAX) {
426	WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
427	return;
428	}
429	/ Otherwise, set to third fqs scan, but bound below on large system. /
430	j = READ_ONCE(jiffies_till_first_fqs) +
431	`2` * READ_ONCE(jiffies_till_next_fqs);
432	if (j < HZ / `10` + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
433	j = HZ / `10` + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
434	pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
435	WRITE_ONCE(jiffies_to_sched_qs, j);
436	}
437
438	static int param_set_first_fqs_jiffies(const char val, const* struct kernel_param *kp)
439	{
440	ulong j;
441	int ret = kstrtoul(s: val, base: `0`, res: &j);
442
443	if (!ret) {
444	WRITE_ONCE((ulong )kp->arg, (j > HZ) ? HZ : j);
445	adjust_jiffies_till_sched_qs();
446	}
447	return ret;
448	}
449
450	static int param_set_next_fqs_jiffies(const char val, const* struct kernel_param *kp)
451	{
452	ulong j;
453	int ret = kstrtoul(s: val, base: `0`, res: &j);
454
455	if (!ret) {
456	WRITE_ONCE((ulong )kp->arg, (j > HZ) ? HZ : (j ?: `1`));
457	adjust_jiffies_till_sched_qs();
458	}
459	return ret;
460	}
461
462	static const struct kernel_param_ops first_fqs_jiffies_ops = {
463	.set = param_set_first_fqs_jiffies,
464	.get = param_get_ulong,
465	};
466
467	static const struct kernel_param_ops next_fqs_jiffies_ops = {
468	.set = param_set_next_fqs_jiffies,
469	.get = param_get_ulong,
470	};
471
472	module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, `0644`);
473	module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, `0644`);
474	module_param(rcu_kick_kthreads, bool, `0644`);
475
/* Forward declarations for functions used before their definitions. */
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
static int rcu_pending(int user);
478
479	/*
480	* Return the number of RCU GPs completed thus far for debug & stats.
481	*/
482	unsigned long rcu_get_gp_seq(void)
483	{
484	return READ_ONCE(rcu_state.gp_seq);
485	}
486	EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
487
488	/*
489	* Return the number of RCU expedited batches completed thus far for
490	* debug & stats. Odd numbers mean that a batch is in progress, even
491	* numbers mean idle. The value returned will thus be roughly double
492	* the cumulative batches since boot.
493	*/
494	unsigned long rcu_exp_batches_completed(void)
495	{
496	return rcu_state.expedited_sequence;
497	}
498	EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
499
500	/*
501	* Return the root node of the rcu_state structure.
502	*/
503	static struct rcu_node rcu_get_root(void*)
504	{
505	return &rcu_state.node[`0`];
506	}
507
508	/*
509	* Send along grace-period-related data for rcutorture diagnostics.
510	*/
511	void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
512	unsigned long *gp_seq)
513	{
514	switch (test_type) {
515	case RCU_FLAVOR:
516	*flags = READ_ONCE(rcu_state.gp_flags);
517	*gp_seq = rcu_seq_current(sp: &rcu_state.gp_seq);
518	break;
519	default:
520	break;
521	}
522	}
523	EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
524
#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
 * An empty function that will trigger a reschedule on
 * IRQ tail once IRQs get re-enabled on userspace/guest resume.
 */
static void late_wakeup_func(struct irq_work *work)
{
}

static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
	IRQ_WORK_INIT(late_wakeup_func);

/*
 * If either:
 *
 * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
 * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
 *
 * In these cases the late RCU wake ups aren't supported in the resched loops and our
 * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
 * get re-enabled again.
 */
noinstr void rcu_irq_work_resched(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
		return;

	if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
		return;

	instrumentation_begin();
	if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
		irq_work_queue(this_cpu_ptr(&late_wakeup_work));
	}
	instrumentation_end();
}
#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
564
#ifdef CONFIG_PROVE_RCU
/**
 * rcu_irq_exit_check_preempt - Validate that scheduling is possible
 *
 * Asserts that interrupts are disabled, then warns if the dynticks
 * nesting counter is not positive, if the NMI nesting counter is not
 * DYNTICK_IRQ_NONIDLE, or if the current CPU is in an extended
 * quiescent state.
 */
void rcu_irq_exit_check_preempt(void)
{
	lockdep_assert_irqs_disabled();

	RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
			 "RCU dynticks_nesting counter underflow/zero!");
	RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
			 DYNTICK_IRQ_NONIDLE,
			 "Bad RCU  dynticks_nmi_nesting counter\n");
	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
			 "RCU in extended quiescent state!");
}
#endif /* #ifdef CONFIG_PROVE_RCU */
582
#ifdef CONFIG_NO_HZ_FULL
/**
 * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
 *
 * The scheduler tick is not normally enabled when CPUs enter the kernel
 * from nohz_full userspace execution.  After all, nohz_full userspace
 * execution is an RCU quiescent state and the time executing in the kernel
 * is quite short.  Except of course when it isn't.  And it is not hard to
 * cause a large system to spend tens of seconds or even minutes looping
 * in the kernel, which can cause a number of problems, include RCU CPU
 * stall warnings.
 *
 * Therefore, if a nohz_full CPU fails to report a quiescent state
 * in a timely manner, the RCU grace-period kthread sets that CPU's
 * ->rcu_urgent_qs flag with the expectation that the next interrupt or
 * exception will invoke this function, which will turn on the scheduler
 * tick, which will enable RCU to detect that CPU's quiescent states,
 * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
 * The tick will be disabled once a quiescent state is reported for
 * this CPU.
 *
 * Of course, in carefully tuned systems, there might never be an
 * interrupt or exception.  In that case, the RCU grace-period kthread
 * will eventually cause one to happen.  However, in less carefully
 * controlled environments, this function allows RCU to get what it
 * needs without creating otherwise useless interruptions.
 */
void __rcu_irq_enter_check_tick(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	// If we're here from NMI there's nothing to do.
	if (in_nmi())
		return;

	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
			 "Illegal rcu_irq_enter_check_tick() from extended quiescent state");

	if (!tick_nohz_full_cpu(rdp->cpu) ||
	    !READ_ONCE(rdp->rcu_urgent_qs) ||
	    READ_ONCE(rdp->rcu_forced_tick)) {
		// RCU doesn't need nohz_full help from this CPU, or it is
		// already getting that help.
		return;
	}

	// We get here only when not in an extended quiescent state and
	// from interrupts (as opposed to NMIs).  Therefore, (1) RCU is
	// already watching and (2) The fact that we are in an interrupt
	// handler and that the rcu_node lock is an irq-disabled lock
	// prevents self-deadlock.  So we can safely recheck under the lock.
	// Note that the nohz_full state currently cannot change.
	raw_spin_lock_rcu_node(rdp->mynode);
	if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
		// A nohz_full CPU is in the kernel and RCU needs a
		// quiescent state.  Turn on the tick!
		WRITE_ONCE(rdp->rcu_forced_tick, true);
		tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
	}
	raw_spin_unlock_rcu_node(rdp->mynode);
}
NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
646
647	/*
648	* Check to see if any future non-offloaded RCU-related work will need
649	* to be done by the current CPU, even if none need be done immediately,
650	* returning 1 if so. This function is part of the RCU implementation;
651	* it is -not- an exported member of the RCU API. This is used by
652	* the idle-entry code to figure out whether it is safe to disable the
653	* scheduler-clock interrupt.
654	*
655	* Just check whether or not this CPU has non-offloaded RCU callbacks
656	* queued.
657	*/
658	int rcu_needs_cpu(void)
659	{
660	return !rcu_segcblist_empty(rsclp: &this_cpu_ptr(&rcu_data)->cblist) &&
661	!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
662	}
663
664	/*
665	* If any sort of urgency was applied to the current CPU (for example,
666	* the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
667	* to get to a quiescent state, disable it.
668	*/
669	static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
670	{
671	raw_lockdep_assert_held_rcu_node(rdp->mynode);
672	WRITE_ONCE(rdp->rcu_urgent_qs, false);
673	WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
674	if (tick_nohz_full_cpu(cpu: rdp->cpu) && rdp->rcu_forced_tick) {
675	tick_dep_clear_cpu(cpu: rdp->cpu, bit: TICK_DEP_BIT_RCU);
676	WRITE_ONCE(rdp->rcu_forced_tick, false);
677	}
678	}
679
680	/**
681	* rcu_is_watching - RCU read-side critical sections permitted on current CPU?
682	*
683	* Return @true if RCU is watching the running CPU and @false otherwise.
684	* An @true return means that this CPU can safely enter RCU read-side
685	* critical sections.
686	*
687	* Although calls to rcu_is_watching() from most parts of the kernel
688	* will return @true, there are important exceptions. For example, if the
689	* current CPU is deep within its idle loop, in kernel entry/exit code,
690	* or offline, rcu_is_watching() will return @false.
691	*
692	* Make notrace because it can be called by the internal functions of
693	* ftrace, and making this notrace removes unnecessary recursion calls.
694	*/
695	notrace bool rcu_is_watching(void)
696	{
697	bool ret;
698
699	preempt_disable_notrace();
700	ret = !rcu_dynticks_curr_cpu_in_eqs();
701	preempt_enable_notrace();
702	return ret;
703	}
704	EXPORT_SYMBOL_GPL(rcu_is_watching);
705
706	/*
707	* If a holdout task is actually running, request an urgent quiescent
708	* state from its CPU. This is unsynchronized, so migrations can cause
709	* the request to go to the wrong CPU. Which is OK, all that will happen
710	* is that the CPU's next context switch will be a bit slower and next
711	* time around this task will generate another request.
712	*/
713	void rcu_request_urgent_qs_task(struct task_struct *t)
714	{
715	int cpu;
716
717	barrier();
718	cpu = task_cpu(p: t);
719	if (!task_curr(p: t))
720	return; / This task is not running on that CPU. /
721	smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
722	}
723
724	/*
725	* When trying to report a quiescent state on behalf of some other CPU,
726	* it is our responsibility to check for and handle potential overflow
727	* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
728	* After all, the CPU might be in deep idle state, and thus executing no
729	* code whatsoever.
730	*/
731	static void rcu_gpnum_ovf(struct rcu_node rnp, struct* rcu_data *rdp)
732	{
733	raw_lockdep_assert_held_rcu_node(rnp);
734	if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / `4`,
735	rnp->gp_seq))
736	WRITE_ONCE(rdp->gpwrap, true);
737	if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / `4`, rnp->gp_seq))
738	rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / `4`;
739	}
740
741	/*
742	* Snapshot the specified CPU's dynticks counter so that we can later
743	* credit them with an implicit quiescent state. Return 1 if this CPU
744	* is in dynticks idle mode, which is an extended quiescent state.
745	*/
746	static int dyntick_save_progress_counter(struct rcu_data *rdp)
747	{
748	rdp->dynticks_snap = rcu_dynticks_snap(cpu: rdp->cpu);
749	if (rcu_dynticks_in_eqs(snap: rdp->dynticks_snap)) {
750	trace_rcu_fqs(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, cpu: rdp->cpu, TPS("dti"));
751	rcu_gpnum_ovf(rnp: rdp->mynode, rdp);
752	return `1`;
753	}
754	return `0`;
755	}
756
757	/*
758	* Returns positive if the specified CPU has passed through a quiescent state
759	* by virtue of being in or having passed through an dynticks idle state since
760	* the last call to dyntick_save_progress_counter() for this same CPU, or by
761	* virtue of having been offline.
762	*
763	* Returns negative if the specified CPU needs a force resched.
764	*
765	* Returns zero otherwise.
766	*/
767	static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
768	{
769	unsigned long jtsq;
770	int ret = `0`;
771	struct rcu_node *rnp = rdp->mynode;
772
773	/*
774	* If the CPU passed through or entered a dynticks idle phase with
775	* no active irq/NMI handlers, then we can safely pretend that the CPU
776	* already acknowledged the request to pass through a quiescent
777	* state. Either way, that CPU cannot possibly be in an RCU
778	* read-side critical section that started before the beginning
779	* of the current RCU grace period.
780	*/
781	if (rcu_dynticks_in_eqs_since(rdp, snap: rdp->dynticks_snap)) {
782	trace_rcu_fqs(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, cpu: rdp->cpu, TPS("dti"));
783	rcu_gpnum_ovf(rnp, rdp);
784	return `1`;
785	}
786
787	/*
788	* Complain if a CPU that is considered to be offline from RCU's
789	* perspective has not yet reported a quiescent state. After all,
790	* the offline CPU should have reported a quiescent state during
791	* the CPU-offline process, or, failing that, by rcu_gp_init()
792	* if it ran concurrently with either the CPU going offline or the
793	* last task on a leaf rcu_node structure exiting its RCU read-side
794	* critical section while all CPUs corresponding to that structure
795	* are offline. This added warning detects bugs in any of these
796	* code paths.
797	*
798	* The rcu_node structure's ->lock is held here, which excludes
799	* the relevant portions the CPU-hotplug code, the grace-period
800	* initialization code, and the rcu_read_unlock() code paths.
801	*
802	* For more detail, please refer to the "Hotplug CPU" section
803	* of RCU's Requirements documentation.
804	*/
805	if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
806	struct rcu_node *rnp1;
807
808	pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
809	__func__, rnp->grplo, rnp->grphi, rnp->level,
810	(long)rnp->gp_seq, (long)rnp->completedqs);
811	for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
812	pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
813	__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
814	pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
815	__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
816	(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
817	(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
818	return `1`; / Break things loose after complaining. /
819	}
820
821	/*
822	* A CPU running for an extended time within the kernel can
823	* delay RCU grace periods: (1) At age jiffies_to_sched_qs,
824	* set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
825	* both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the
826	* unsynchronized assignments to the per-CPU rcu_need_heavy_qs
827	* variable are safe because the assignments are repeated if this
828	* CPU failed to pass through a quiescent state. This code
829	* also checks .jiffies_resched in case jiffies_to_sched_qs
830	* is set way high.
831	*/
832	jtsq = READ_ONCE(jiffies_to_sched_qs);
833	if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
834	(time_after(jiffies, rcu_state.gp_start + jtsq * `2`) \|\|
835	time_after(jiffies, rcu_state.jiffies_resched) \|\|
836	rcu_state.cbovld)) {
837	WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
838	/ Store rcu_need_heavy_qs before rcu_urgent_qs. /
839	smp_store_release(&rdp->rcu_urgent_qs, true);
840	} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
841	WRITE_ONCE(rdp->rcu_urgent_qs, true);
842	}
843
844	/*
845	* NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
846	* The above code handles this, but only for straight cond_resched().
847	* And some in-kernel loops check need_resched() before calling
848	* cond_resched(), which defeats the above code for CPUs that are
849	* running in-kernel with scheduling-clock interrupts disabled.
850	* So hit them over the head with the resched_cpu() hammer!
851	*/
852	if (tick_nohz_full_cpu(cpu: rdp->cpu) &&
853	(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * `3`) \|\|
854	rcu_state.cbovld)) {
855	WRITE_ONCE(rdp->rcu_urgent_qs, true);
856	WRITE_ONCE(rdp->last_fqs_resched, jiffies);
857	ret = -`1`;
858	}
859
860	/*
861	* If more than halfway to RCU CPU stall-warning time, invoke
862	* resched_cpu() more frequently to try to loosen things up a bit.
863	* Also check to see if the CPU is getting hammered with interrupts,
864	* but only once per grace period, just to keep the IPIs down to
865	* a dull roar.
866	*/
867	if (time_after(jiffies, rcu_state.jiffies_resched)) {
868	if (time_after(jiffies,
869	READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
870	WRITE_ONCE(rdp->last_fqs_resched, jiffies);
871	ret = -`1`;
872	}
873	if (IS_ENABLED(CONFIG_IRQ_WORK) &&
874	!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
875	(rnp->ffmask & rdp->grpmask)) {
876	rdp->rcu_iw_pending = true;
877	rdp->rcu_iw_gp_seq = rnp->gp_seq;
878	irq_work_queue_on(work: &rdp->rcu_iw, cpu: rdp->cpu);
879	}
880
881	if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
882	int cpu = rdp->cpu;
883	struct rcu_snap_record *rsrp;
884	struct kernel_cpustat *kcsp;
885
886	kcsp = &kcpustat_cpu(cpu);
887
888	rsrp = &rdp->snap_record;
889	rsrp->cputime_irq = kcpustat_field(kcpustat: kcsp, usage: CPUTIME_IRQ, cpu);
890	rsrp->cputime_softirq = kcpustat_field(kcpustat: kcsp, usage: CPUTIME_SOFTIRQ, cpu);
891	rsrp->cputime_system = kcpustat_field(kcpustat: kcsp, usage: CPUTIME_SYSTEM, cpu);
892	rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu: rdp->cpu);
893	rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu: rdp->cpu);
894	rsrp->nr_csw = nr_context_switches_cpu(cpu: rdp->cpu);
895	rsrp->jiffies = jiffies;
896	rsrp->gp_seq = rdp->gp_seq;
897	}
898	}
899
900	return ret;
901	}
902
903	/ Trace-event wrapper function for trace_rcu_future_grace_period. /
904	static void trace_rcu_this_gp(struct rcu_node rnp, struct* rcu_data *rdp,
905	unsigned long gp_seq_req, const char *s)
906	{
907	trace_rcu_future_grace_period(rcuname: rcu_state.name, READ_ONCE(rnp->gp_seq),
908	gp_seq_req, level: rnp->level,
909	grplo: rnp->grplo, grphi: rnp->grphi, gpevent: s);
910	}
911
912	/*
913	* rcu_start_this_gp - Request the start of a particular grace period
914	* @rnp_start: The leaf node of the CPU from which to start.
915	* @rdp: The rcu_data corresponding to the CPU from which to start.
916	* @gp_seq_req: The gp_seq of the grace period to start.
917	*
918	* Start the specified grace period, as needed to handle newly arrived
919	* callbacks. The required future grace periods are recorded in each
920	* rcu_node structure's ->gp_seq_needed field. Returns true if there
921	* is reason to awaken the grace-period kthread.
922	*
923	* The caller must hold the specified rcu_node structure's ->lock, which
924	* is why the caller is responsible for waking the grace-period kthread.
925	*
926	* Returns true if the GP thread needs to be awakened else false.
927	*/
928	static bool rcu_start_this_gp(struct rcu_node rnp_start, struct* rcu_data *rdp,
929	unsigned long gp_seq_req)
930	{
931	bool ret = false;
932	struct rcu_node *rnp;
933
934	/*
935	* Use funnel locking to either acquire the root rcu_node
936	* structure's lock or bail out if the need for this grace period
937	* has already been recorded -- or if that grace period has in
938	* fact already started. If there is already a grace period in
939	* progress in a non-leaf node, no recording is needed because the
940	* end of the grace period will scan the leaf rcu_node structures.
941	* Note that rnp_start->lock must not be released.
942	*/
943	raw_lockdep_assert_held_rcu_node(rnp_start);
944	trace_rcu_this_gp(rnp: rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
945	for (rnp = rnp_start; `1`; rnp = rnp->parent) {
946	if (rnp != rnp_start)
947	raw_spin_lock_rcu_node(rnp);
948	if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) \|\|
949	rcu_seq_started(sp: &rnp->gp_seq, s: gp_seq_req) \|\|
950	(rnp != rnp_start &&
951	rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq)))) {
952	trace_rcu_this_gp(rnp, rdp, gp_seq_req,
953	TPS("Prestarted"));
954	goto unlock_out;
955	}
956	WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
957	if (rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq))) {
958	/*
959	* We just marked the leaf or internal node, and a
960	* grace period is in progress, which means that
961	* rcu_gp_cleanup() will see the marking. Bail to
962	* reduce contention.
963	*/
964	trace_rcu_this_gp(rnp: rnp_start, rdp, gp_seq_req,
965	TPS("Startedleaf"));
966	goto unlock_out;
967	}
968	if (rnp != rnp_start && rnp->parent != NULL)
969	raw_spin_unlock_rcu_node(rnp);
970	if (!rnp->parent)
971	break; / At root, and perhaps also leaf. /
972	}
973
974	/ If GP already in progress, just leave, otherwise start one. /
975	if (rcu_gp_in_progress()) {
976	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
977	goto unlock_out;
978	}
979	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
980	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags \| RCU_GP_FLAG_INIT);
981	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
982	if (!READ_ONCE(rcu_state.gp_kthread)) {
983	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
984	goto unlock_out;
985	}
986	trace_rcu_grace_period(rcuname: rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
987	ret = true; / Caller must wake GP kthread. /
988	unlock_out:
989	/ Push furthest requested GP to leaf node and rcu_data structure. /
990	if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
991	WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
992	WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
993	}
994	if (rnp != rnp_start)
995	raw_spin_unlock_rcu_node(rnp);
996	return ret;
997	}
998
999	/*
1000	* Clean up any old requests for the just-ended grace period. Also return
1001	* whether any additional grace periods have been requested.
1002	*/
1003	static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
1004	{
1005	bool needmore;
1006	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1007
1008	needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
1009	if (!needmore)
1010	rnp->gp_seq_needed = rnp->gp_seq; / Avoid counter wrap. /
1011	trace_rcu_this_gp(rnp, rdp, gp_seq_req: rnp->gp_seq,
1012	s: needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1013	return needmore;
1014	}
1015
1016	/*
1017	* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
1018	* interrupt or softirq handler, in which case we just might immediately
1019	* sleep upon return, resulting in a grace-period hang), and don't bother
1020	* awakening when there is nothing for the grace-period kthread to do
1021	* (as in several CPUs raced to awaken, we lost), and finally don't try
1022	* to awaken a kthread that has not yet been created. If all those checks
1023	* are passed, track some debug information and awaken.
1024	*
1025	* So why do the self-wakeup when in an interrupt or softirq handler
1026	* in the grace-period kthread's context? Because the kthread might have
1027	* been interrupted just as it was going to sleep, and just after the final
1028	* pre-sleep check of the awaken condition. In this case, a wakeup really
1029	* is required, and is therefore supplied.
1030	*/
1031	static void rcu_gp_kthread_wake(void)
1032	{
1033	struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
1034
1035	if ((current == t && !in_hardirq() && !in_serving_softirq()) \|\|
1036	!READ_ONCE(rcu_state.gp_flags) \|\| !t)
1037	return;
1038	WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
1039	WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
1040	swake_up_one(q: &rcu_state.gp_wq);
1041	}
1042
1043	/*
1044	* If there is room, assign a ->gp_seq number to any callbacks on this
1045	* CPU that have not already been assigned. Also accelerate any callbacks
1046	* that were previously assigned a ->gp_seq number that has since proven
1047	* to be too conservative, which can happen if callbacks get assigned a
1048	* ->gp_seq number while RCU is idle, but with reference to a non-root
1049	* rcu_node structure. This function is idempotent, so it does not hurt
1050	* to call it repeatedly. Returns an flag saying that we should awaken
1051	* the RCU grace-period kthread.
1052	*
1053	* The caller must hold rnp->lock with interrupts disabled.
1054	*/
1055	static bool rcu_accelerate_cbs(struct rcu_node rnp, struct* rcu_data *rdp)
1056	{
1057	unsigned long gp_seq_req;
1058	bool ret = false;
1059
1060	rcu_lockdep_assert_cblist_protected(rdp);
1061	raw_lockdep_assert_held_rcu_node(rnp);
1062
1063	/ If no pending (not yet ready to invoke) callbacks, nothing to do. /
1064	if (!rcu_segcblist_pend_cbs(rsclp: &rdp->cblist))
1065	return false;
1066
1067	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCbPreAcc"));
1068
1069	/*
1070	* Callbacks are often registered with incomplete grace-period
1071	* information. Something about the fact that getting exact
1072	* information requires acquiring a global lock... RCU therefore
1073	* makes a conservative estimate of the grace period number at which
1074	* a given callback will become ready to invoke. The following
1075	* code checks this estimate and improves it when possible, thus
1076	* accelerating callback invocation to an earlier grace-period
1077	* number.
1078	*/
1079	gp_seq_req = rcu_seq_snap(sp: &rcu_state.gp_seq);
1080	if (rcu_segcblist_accelerate(rsclp: &rdp->cblist, seq: gp_seq_req))
1081	ret = rcu_start_this_gp(rnp_start: rnp, rdp, gp_seq_req);
1082
1083	/ Trace depending on how much we were able to accelerate. /
1084	if (rcu_segcblist_restempty(rsclp: &rdp->cblist, RCU_WAIT_TAIL))
1085	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: gp_seq_req, TPS("AccWaitCB"));
1086	else
1087	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: gp_seq_req, TPS("AccReadyCB"));
1088
1089	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCbPostAcc"));
1090
1091	return ret;
1092	}
1093
1094	/*
1095	* Similar to rcu_accelerate_cbs(), but does not require that the leaf
1096	* rcu_node structure's ->lock be held. It consults the cached value
1097	* of ->gp_seq_needed in the rcu_data structure, and if that indicates
1098	* that a new grace-period request be made, invokes rcu_accelerate_cbs()
1099	* while holding the leaf rcu_node structure's ->lock.
1100	*/
1101	static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
1102	struct rcu_data *rdp)
1103	{
1104	unsigned long c;
1105	bool needwake;
1106
1107	rcu_lockdep_assert_cblist_protected(rdp);
1108	c = rcu_seq_snap(sp: &rcu_state.gp_seq);
1109	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
1110	/ Old request still live, so mark recent callbacks. /
1111	(void)rcu_segcblist_accelerate(rsclp: &rdp->cblist, seq: c);
1112	return;
1113	}
1114	raw_spin_lock_rcu_node(rnp); / irqs already disabled. /
1115	needwake = rcu_accelerate_cbs(rnp, rdp);
1116	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
1117	if (needwake)
1118	rcu_gp_kthread_wake();
1119	}
1120
1121	/*
1122	* Move any callbacks whose grace period has completed to the
1123	* RCU_DONE_TAIL sublist, then compact the remaining sublists and
1124	* assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
1125	* sublist. This function is idempotent, so it does not hurt to
1126	* invoke it repeatedly. As long as it is not invoked -too- often...
1127	* Returns true if the RCU grace-period kthread needs to be awakened.
1128	*
1129	* The caller must hold rnp->lock with interrupts disabled.
1130	*/
1131	static bool rcu_advance_cbs(struct rcu_node rnp, struct* rcu_data *rdp)
1132	{
1133	rcu_lockdep_assert_cblist_protected(rdp);
1134	raw_lockdep_assert_held_rcu_node(rnp);
1135
1136	/ If no pending (not yet ready to invoke) callbacks, nothing to do. /
1137	if (!rcu_segcblist_pend_cbs(rsclp: &rdp->cblist))
1138	return false;
1139
1140	/*
1141	* Find all callbacks whose ->gp_seq numbers indicate that they
1142	* are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1143	*/
1144	rcu_segcblist_advance(rsclp: &rdp->cblist, seq: rnp->gp_seq);
1145
1146	/ Classify any remaining callbacks. /
1147	return rcu_accelerate_cbs(rnp, rdp);
1148	}
1149
1150	/*
1151	* Move and classify callbacks, but only if doing so won't require
1152	* that the RCU grace-period kthread be awakened.
1153	*/
1154	static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
1155	struct rcu_data *rdp)
1156	{
1157	rcu_lockdep_assert_cblist_protected(rdp);
1158	if (!rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq)) \|\| !raw_spin_trylock_rcu_node(rnp))
1159	return;
1160	// The grace period cannot end while we hold the rcu_node lock.
1161	if (rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq)))
1162	WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
1163	raw_spin_unlock_rcu_node(rnp);
1164	}
1165
1166	/*
1167	* In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
1168	* quiescent state. This is intended to be invoked when the CPU notices
1169	* a new grace period.
1170	*/
1171	static void rcu_strict_gp_check_qs(void)
1172	{
1173	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
1174	rcu_read_lock();
1175	rcu_read_unlock();
1176	}
1177	}
1178
1179	/*
1180	* Update CPU-local rcu_data state to record the beginnings and ends of
1181	* grace periods. The caller must hold the ->lock of the leaf rcu_node
1182	* structure corresponding to the current CPU, and must have irqs disabled.
1183	* Returns true if the grace-period kthread needs to be awakened.
1184	*/
1185	static bool __note_gp_changes(struct rcu_node rnp, struct* rcu_data *rdp)
1186	{
1187	bool ret = false;
1188	bool need_qs;
1189	const bool offloaded = rcu_rdp_is_offloaded(rdp);
1190
1191	raw_lockdep_assert_held_rcu_node(rnp);
1192
1193	if (rdp->gp_seq == rnp->gp_seq)
1194	return false; / Nothing to do. /
1195
1196	/ Handle the ends of any preceding grace periods first. /
1197	if (rcu_seq_completed_gp(old: rdp->gp_seq, new: rnp->gp_seq) \|\|
1198	unlikely(READ_ONCE(rdp->gpwrap))) {
1199	if (!offloaded)
1200	ret = rcu_advance_cbs(rnp, rdp); / Advance CBs. /
1201	rdp->core_needs_qs = false;
1202	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, TPS("cpuend"));
1203	} else {
1204	if (!offloaded)
1205	ret = rcu_accelerate_cbs(rnp, rdp); / Recent CBs. /
1206	if (rdp->core_needs_qs)
1207	rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
1208	}
1209
1210	/ Now handle the beginnings of any new-to-this-CPU grace periods. /
1211	if (rcu_seq_new_gp(old: rdp->gp_seq, new: rnp->gp_seq) \|\|
1212	unlikely(READ_ONCE(rdp->gpwrap))) {
1213	/*
1214	* If the current grace period is waiting for this CPU,
1215	* set up to detect a quiescent state, otherwise don't
1216	* go looking for one.
1217	*/
1218	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rnp->gp_seq, TPS("cpustart"));
1219	need_qs = !!(rnp->qsmask & rdp->grpmask);
1220	rdp->cpu_no_qs.b.norm = need_qs;
1221	rdp->core_needs_qs = need_qs;
1222	zero_cpu_stall_ticks(rdp);
1223	}
1224	rdp->gp_seq = rnp->gp_seq; / Remember new grace-period state. /
1225	if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) \|\| rdp->gpwrap)
1226	WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
1227	if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
1228	WRITE_ONCE(rdp->last_sched_clock, jiffies);
1229	WRITE_ONCE(rdp->gpwrap, false);
1230	rcu_gpnum_ovf(rnp, rdp);
1231	return ret;
1232	}
1233
1234	static void note_gp_changes(struct rcu_data *rdp)
1235	{
1236	unsigned long flags;
1237	bool needwake;
1238	struct rcu_node *rnp;
1239
1240	local_irq_save(flags);
1241	rnp = rdp->mynode;
1242	if ((rdp->gp_seq == rcu_seq_current(sp: &rnp->gp_seq) &&
1243	!unlikely(READ_ONCE(rdp->gpwrap))) \|\| / w/out lock. /
1244	!raw_spin_trylock_rcu_node(rnp)) { / irqs already off, so later. /
1245	local_irq_restore(flags);
1246	return;
1247	}
1248	needwake = __note_gp_changes(rnp, rdp);
1249	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1250	rcu_strict_gp_check_qs();
1251	if (needwake)
1252	rcu_gp_kthread_wake();
1253	}
1254
1255	static atomic_t *rcu_gp_slow_suppress;
1256
1257	/ Register a counter to suppress debugging grace-period delays. /
1258	void rcu_gp_slow_register(atomic_t *rgssp)
1259	{
1260	WARN_ON_ONCE(rcu_gp_slow_suppress);
1261
1262	WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
1263	}
1264	EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
1265
1266	/ Unregister a counter, with NULL for not caring which. /
1267	void rcu_gp_slow_unregister(atomic_t *rgssp)
1268	{
1269	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
1270
1271	WRITE_ONCE(rcu_gp_slow_suppress, NULL);
1272	}
1273	EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
1274
1275	static bool rcu_gp_slow_is_suppressed(void)
1276	{
1277	atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
1278
1279	return rgssp && atomic_read(v: rgssp);
1280	}
1281
1282	static void rcu_gp_slow(int delay)
1283	{
1284	if (!rcu_gp_slow_is_suppressed() && delay > `0` &&
1285	!(rcu_seq_ctr(s: rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
1286	schedule_timeout_idle(timeout: delay);
1287	}
1288
1289	static unsigned long sleep_duration;
1290
1291	/ Allow rcutorture to stall the grace-period kthread. /
1292	void rcu_gp_set_torture_wait(int duration)
1293	{
1294	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > `0`)
1295	WRITE_ONCE(sleep_duration, duration);
1296	}
1297	EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);
1298
1299	/ Actually implement the aforementioned wait. /
1300	static void rcu_gp_torture_wait(void)
1301	{
1302	unsigned long duration;
1303
1304	if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
1305	return;
1306	duration = xchg(&sleep_duration, `0UL`);
1307	if (duration > `0`) {
1308	pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
1309	schedule_timeout_idle(timeout: duration);
1310	pr_alert("%s: Wait complete\n", __func__);
1311	}
1312	}
1313
/*
 * Handler for on_each_cpu() to invoke the target CPU's RCU core
 * processing.  The @unused argument is ignored.
 */
static void rcu_strict_gp_boundary(void *unused)
{
	invoke_rcu_core();
}
1322
1323	// Make the polled API aware of the beginning of a grace period.
1324	static void rcu_poll_gp_seq_start(unsigned long *snap)
1325	{
1326	struct rcu_node *rnp = rcu_get_root();
1327
1328	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1329	raw_lockdep_assert_held_rcu_node(rnp);
1330
1331	// If RCU was idle, note beginning of GP.
1332	if (!rcu_seq_state(s: rcu_state.gp_seq_polled))
1333	rcu_seq_start(sp: &rcu_state.gp_seq_polled);
1334
1335	// Either way, record current state.
1336	*snap = rcu_state.gp_seq_polled;
1337	}
1338
1339	// Make the polled API aware of the end of a grace period.
1340	static void rcu_poll_gp_seq_end(unsigned long *snap)
1341	{
1342	struct rcu_node *rnp = rcu_get_root();
1343
1344	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1345	raw_lockdep_assert_held_rcu_node(rnp);
1346
1347	// If the previously noted GP is still in effect, record the
1348	// end of that GP. Either way, zero counter to avoid counter-wrap
1349	// problems.
1350	if (snap && snap == rcu_state.gp_seq_polled) {
1351	rcu_seq_end(sp: &rcu_state.gp_seq_polled);
1352	rcu_state.gp_seq_polled_snap = `0`;
1353	rcu_state.gp_seq_polled_exp_snap = `0`;
1354	} else {
1355	*snap = `0`;
1356	}
1357	}
1358
1359	// Make the polled API aware of the beginning of a grace period, but
1360	// where caller does not hold the root rcu_node structure's lock.
1361	static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
1362	{
1363	unsigned long flags;
1364	struct rcu_node *rnp = rcu_get_root();
1365
1366	if (rcu_init_invoked()) {
1367	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1368	lockdep_assert_irqs_enabled();
1369	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1370	}
1371	rcu_poll_gp_seq_start(snap);
1372	if (rcu_init_invoked())
1373	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1374	}
1375
1376	// Make the polled API aware of the end of a grace period, but where
1377	// caller does not hold the root rcu_node structure's lock.
1378	static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
1379	{
1380	unsigned long flags;
1381	struct rcu_node *rnp = rcu_get_root();
1382
1383	if (rcu_init_invoked()) {
1384	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1385	lockdep_assert_irqs_enabled();
1386	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1387	}
1388	rcu_poll_gp_seq_end(snap);
1389	if (rcu_init_invoked())
1390	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1391	}
1392
1393	/*
1394	* Initialize a new grace period. Return false if no grace period required.
1395	*/
1396	static noinline_for_stack bool rcu_gp_init(void)
1397	{
1398	unsigned long flags;
1399	unsigned long oldmask;
1400	unsigned long mask;
1401	struct rcu_data *rdp;
1402	struct rcu_node *rnp = rcu_get_root();
1403
1404	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1405	raw_spin_lock_irq_rcu_node(rnp);
1406	if (!READ_ONCE(rcu_state.gp_flags)) {
1407	/ Spurious wakeup, tell caller to go back to sleep. /
1408	raw_spin_unlock_irq_rcu_node(rnp);
1409	return false;
1410	}
1411	WRITE_ONCE(rcu_state.gp_flags, `0`); / Clear all flags: New GP. /
1412
1413	if (WARN_ON_ONCE(rcu_gp_in_progress())) {
1414	/*
1415	* Grace period already in progress, don't start another.
1416	* Not supposed to be able to happen.
1417	*/
1418	raw_spin_unlock_irq_rcu_node(rnp);
1419	return false;
1420	}
1421
1422	/ Advance to a new grace period and initialize state. /
1423	record_gp_stall_check_time();
1424	/ Record GP times before starting GP, hence rcu_seq_start(). /
1425	rcu_seq_start(sp: &rcu_state.gp_seq);
1426	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
1427	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq, TPS("start"));
1428	rcu_poll_gp_seq_start(snap: &rcu_state.gp_seq_polled_snap);
1429	raw_spin_unlock_irq_rcu_node(rnp);
1430
1431	/*
1432	* Apply per-leaf buffered online and offline operations to
1433	* the rcu_node tree. Note that this new grace period need not
1434	* wait for subsequent online CPUs, and that RCU hooks in the CPU
1435	* offlining path, when combined with checks in this function,
1436	* will handle CPUs that are currently going offline or that will
1437	* go offline later. Please also refer to "Hotplug CPU" section
1438	* of RCU's Requirements documentation.
1439	*/
1440	WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
1441	/ Exclude CPU hotplug operations. /
1442	rcu_for_each_leaf_node(rnp) {
1443	local_irq_save(flags);
1444	arch_spin_lock(&rcu_state.ofl_lock);
1445	raw_spin_lock_rcu_node(rnp);
1446	if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1447	!rnp->wait_blkd_tasks) {
1448	/ Nothing to do on this leaf rcu_node structure. /
1449	raw_spin_unlock_rcu_node(rnp);
1450	arch_spin_unlock(&rcu_state.ofl_lock);
1451	local_irq_restore(flags);
1452	continue;
1453	}
1454
1455	/ Record old state, apply changes to ->qsmaskinit field. /
1456	oldmask = rnp->qsmaskinit;
1457	rnp->qsmaskinit = rnp->qsmaskinitnext;
1458
1459	/ If zero-ness of ->qsmaskinit changed, propagate up tree. /
1460	if (!oldmask != !rnp->qsmaskinit) {
1461	if (!oldmask) { / First online CPU for rcu_node. /
1462	if (!rnp->wait_blkd_tasks) / Ever offline? /
1463	rcu_init_new_rnp(rnp_leaf: rnp);
1464	} else if (rcu_preempt_has_tasks(rnp)) {
1465	rnp->wait_blkd_tasks = true; / blocked tasks /
1466	} else { / Last offline CPU and can propagate. /
1467	rcu_cleanup_dead_rnp(rnp_leaf: rnp);
1468	}
1469	}
1470
1471	/*
1472	* If all waited-on tasks from prior grace period are
1473	* done, and if all this rcu_node structure's CPUs are
1474	* still offline, propagate up the rcu_node tree and
1475	* clear ->wait_blkd_tasks. Otherwise, if one of this
1476	* rcu_node structure's CPUs has since come back online,
1477	* simply clear ->wait_blkd_tasks.
1478	*/
1479	if (rnp->wait_blkd_tasks &&
1480	(!rcu_preempt_has_tasks(rnp) \|\| rnp->qsmaskinit)) {
1481	rnp->wait_blkd_tasks = false;
1482	if (!rnp->qsmaskinit)
1483	rcu_cleanup_dead_rnp(rnp_leaf: rnp);
1484	}
1485
1486	raw_spin_unlock_rcu_node(rnp);
1487	arch_spin_unlock(&rcu_state.ofl_lock);
1488	local_irq_restore(flags);
1489	}
1490	rcu_gp_slow(delay: gp_preinit_delay); / Races with CPU hotplug. /
1491
1492	/*
1493	* Set the quiescent-state-needed bits in all the rcu_node
1494	* structures for all currently online CPUs in breadth-first
1495	* order, starting from the root rcu_node structure, relying on the
1496	* layout of the tree within the rcu_state.node[] array. Note that
1497	* other CPUs will access only the leaves of the hierarchy, thus
1498	* seeing that no grace period is in progress, at least until the
1499	* corresponding leaf node has been initialized.
1500	*
1501	* The grace period cannot complete until the initialization
1502	* process finishes, because this kthread handles both.
1503	*/
1504	WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
1505	rcu_for_each_node_breadth_first(rnp) {
1506	rcu_gp_slow(delay: gp_init_delay);
1507	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1508	rdp = this_cpu_ptr(&rcu_data);
1509	rcu_preempt_check_blocked_tasks(rnp);
1510	rnp->qsmask = rnp->qsmaskinit;
1511	WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
1512	if (rnp == rdp->mynode)
1513	(void)__note_gp_changes(rnp, rdp);
1514	rcu_preempt_boost_start_gp(rnp);
1515	trace_rcu_grace_period_init(rcuname: rcu_state.name, gp_seq: rnp->gp_seq,
1516	level: rnp->level, grplo: rnp->grplo,
1517	grphi: rnp->grphi, qsmask: rnp->qsmask);
1518	/ Quiescent states for tasks on any now-offline CPUs. /
1519	mask = rnp->qsmask & ~rnp->qsmaskinitnext;
1520	rnp->rcu_gp_init_mask = mask;
1521	if ((mask \|\| rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
1522	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
1523	else
1524	raw_spin_unlock_irq_rcu_node(rnp);
1525	cond_resched_tasks_rcu_qs();
1526	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1527	}
1528
1529	// If strict, make all CPUs aware of new grace period.
1530	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
1531	on_each_cpu(func: rcu_strict_gp_boundary, NULL, wait: `0`);
1532
1533	return true;
1534	}
1535
1536	/*
1537	* Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
1538	* time.
1539	*/
1540	static bool rcu_gp_fqs_check_wake(int *gfp)
1541	{
1542	struct rcu_node *rnp = rcu_get_root();
1543
1544	// If under overload conditions, force an immediate FQS scan.
1545	if (*gfp & RCU_GP_FLAG_OVLD)
1546	return true;
1547
1548	// Someone like call_rcu() requested a force-quiescent-state scan.
1549	*gfp = READ_ONCE(rcu_state.gp_flags);
1550	if (*gfp & RCU_GP_FLAG_FQS)
1551	return true;
1552
1553	// The current grace period has completed.
1554	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
1555	return true;
1556
1557	return false;
1558	}
1559
1560	/*
1561	* Do one round of quiescent-state forcing.
1562	*/
1563	static void rcu_gp_fqs(bool first_time)
1564	{
1565	int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
1566	struct rcu_node *rnp = rcu_get_root();
1567
1568	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1569	WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + `1`);
1570
1571	WARN_ON_ONCE(nr_fqs > `3`);
1572	/ Only countdown nr_fqs for stall purposes if jiffies moves. /
1573	if (nr_fqs) {
1574	if (nr_fqs == `1`) {
1575	WRITE_ONCE(rcu_state.jiffies_stall,
1576	jiffies + rcu_jiffies_till_stall_check());
1577	}
1578	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
1579	}
1580
1581	if (first_time) {
1582	/ Collect dyntick-idle snapshots. /
1583	force_qs_rnp(f: dyntick_save_progress_counter);
1584	} else {
1585	/ Handle dyntick-idle and offline CPUs. /
1586	force_qs_rnp(f: rcu_implicit_dynticks_qs);
1587	}
1588	/ Clear flag to prevent immediate re-entry. /
1589	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
1590	raw_spin_lock_irq_rcu_node(rnp);
1591	WRITE_ONCE(rcu_state.gp_flags,
1592	READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
1593	raw_spin_unlock_irq_rcu_node(rnp);
1594	}
1595	}
1596
1597	/*
1598	* Loop doing repeated quiescent-state forcing until the grace period ends.
1599	*/
1600	static noinline_for_stack void rcu_gp_fqs_loop(void)
1601	{
1602	bool first_gp_fqs = true;
1603	int gf = `0`;
1604	unsigned long j;
1605	int ret;
1606	struct rcu_node *rnp = rcu_get_root();
1607
1608	j = READ_ONCE(jiffies_till_first_fqs);
1609	if (rcu_state.cbovld)
1610	gf = RCU_GP_FLAG_OVLD;
1611	ret = `0`;
1612	for (;;) {
1613	if (rcu_state.cbovld) {
1614	j = (j + `2`) / `3`;
1615	if (j <= `0`)
1616	j = `1`;
1617	}
1618	if (!ret \|\| time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
1619	WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
1620	/*
1621	* jiffies_force_qs before RCU_GP_WAIT_FQS state
1622	* update; required for stall checks.
1623	*/
1624	smp_wmb();
1625	WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
1626	jiffies + (j ? `3` * j : `2`));
1627	}
1628	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1629	TPS("fqswait"));
1630	WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
1631	(void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
1632	rcu_gp_fqs_check_wake(&gf), j);
1633	rcu_gp_torture_wait();
1634	WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
1635	/ Locking provides needed memory barriers. /
1636	/*
1637	* Exit the loop if the root rcu_node structure indicates that the grace period
1638	* has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check
1639	* is required only for single-node rcu_node trees because readers blocking
1640	* the current grace period are queued only on leaf rcu_node structures.
1641	* For multi-node trees, checking the root node's ->qsmask suffices, because a
1642	* given root node's ->qsmask bit is cleared only when all CPUs and tasks from
1643	* the corresponding leaf nodes have passed through their quiescent state.
1644	*/
1645	if (!READ_ONCE(rnp->qsmask) &&
1646	!rcu_preempt_blocked_readers_cgp(rnp))
1647	break;
1648	/ If time for quiescent-state forcing, do it. /
1649	if (!time_after(rcu_state.jiffies_force_qs, jiffies) \|\|
1650	(gf & (RCU_GP_FLAG_FQS \| RCU_GP_FLAG_OVLD))) {
1651	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1652	TPS("fqsstart"));
1653	rcu_gp_fqs(first_time: first_gp_fqs);
1654	gf = `0`;
1655	if (first_gp_fqs) {
1656	first_gp_fqs = false;
1657	gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : `0`;
1658	}
1659	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1660	TPS("fqsend"));
1661	cond_resched_tasks_rcu_qs();
1662	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1663	ret = `0`; / Force full wait till next FQS. /
1664	j = READ_ONCE(jiffies_till_next_fqs);
1665	} else {
1666	/ Deal with stray signal. /
1667	cond_resched_tasks_rcu_qs();
1668	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1669	WARN_ON(signal_pending(current));
1670	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1671	TPS("fqswaitsig"));
1672	ret = `1`; / Keep old FQS timing. /
1673	j = jiffies;
1674	if (time_after(jiffies, rcu_state.jiffies_force_qs))
1675	j = `1`;
1676	else
1677	j = rcu_state.jiffies_force_qs - j;
1678	gf = `0`;
1679	}
1680	}
1681	}
1682
1683	/*
1684	* Clean up after the old grace period.
1685	*/
1686	static noinline void rcu_gp_cleanup(void)
1687	{
1688	int cpu;
1689	bool needgp = false;
1690	unsigned long gp_duration;
1691	unsigned long new_gp_seq;
1692	bool offloaded;
1693	struct rcu_data *rdp;
1694	struct rcu_node *rnp = rcu_get_root();
1695	struct swait_queue_head *sq;
1696
1697	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1698	raw_spin_lock_irq_rcu_node(rnp);
1699	rcu_state.gp_end = jiffies;
1700	gp_duration = rcu_state.gp_end - rcu_state.gp_start;
1701	if (gp_duration > rcu_state.gp_max)
1702	rcu_state.gp_max = gp_duration;
1703
1704	/*
1705	* We know the grace period is complete, but to everyone else
1706	* it appears to still be ongoing. But it is also the case
1707	* that to everyone else it looks like there is nothing that
1708	* they can do to advance the grace period. It is therefore
1709	* safe for us to drop the lock in order to mark the grace
1710	* period as completed in all of the rcu_node structures.
1711	*/
1712	rcu_poll_gp_seq_end(snap: &rcu_state.gp_seq_polled_snap);
1713	raw_spin_unlock_irq_rcu_node(rnp);
1714
1715	/*
1716	* Propagate new ->gp_seq value to rcu_node structures so that
1717	* other CPUs don't have to wait until the start of the next grace
1718	* period to process their callbacks. This also avoids some nasty
1719	* RCU grace-period initialization races by forcing the end of
1720	* the current grace period to be completely recorded in all of
1721	* the rcu_node structures before the beginning of the next grace
1722	* period is recorded in any of the rcu_node structures.
1723	*/
1724	new_gp_seq = rcu_state.gp_seq;
1725	rcu_seq_end(sp: &new_gp_seq);
1726	rcu_for_each_node_breadth_first(rnp) {
1727	raw_spin_lock_irq_rcu_node(rnp);
1728	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
1729	dump_blkd_tasks(rnp, ncheck: `10`);
1730	WARN_ON_ONCE(rnp->qsmask);
1731	WRITE_ONCE(rnp->gp_seq, new_gp_seq);
1732	if (!rnp->parent)
1733	smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
1734	rdp = this_cpu_ptr(&rcu_data);
1735	if (rnp == rdp->mynode)
1736	needgp = __note_gp_changes(rnp, rdp) \|\| needgp;
1737	/ smp_mb() provided by prior unlock-lock pair. /
1738	needgp = rcu_future_gp_cleanup(rnp) \|\| needgp;
1739	// Reset overload indication for CPUs no longer overloaded
1740	if (rcu_is_leaf_node(rnp))
1741	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
1742	rdp = per_cpu_ptr(&rcu_data, cpu);
1743	check_cb_ovld_locked(rdp, rnp);
1744	}
1745	sq = rcu_nocb_gp_get(rnp);
1746	raw_spin_unlock_irq_rcu_node(rnp);
1747	rcu_nocb_gp_cleanup(sq);
1748	cond_resched_tasks_rcu_qs();
1749	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1750	rcu_gp_slow(delay: gp_cleanup_delay);
1751	}
1752	rnp = rcu_get_root();
1753	raw_spin_lock_irq_rcu_node(rnp); / GP before ->gp_seq update. /
1754
1755	/ Declare grace period done, trace first to use old GP number. /
1756	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq, TPS("end"));
1757	rcu_seq_end(sp: &rcu_state.gp_seq);
1758	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
1759	WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
1760	/ Check for GP requests since above loop. /
1761	rdp = this_cpu_ptr(&rcu_data);
1762	if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
1763	trace_rcu_this_gp(rnp, rdp, gp_seq_req: rnp->gp_seq_needed,
1764	TPS("CleanupMore"));
1765	needgp = true;
1766	}
1767	/ Advance CBs to reduce false positives below. /
1768	offloaded = rcu_rdp_is_offloaded(rdp);
1769	if ((offloaded \|\| !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
1770
1771	// We get here if a grace period was needed (“needgp”)
1772	// and the above call to rcu_accelerate_cbs() did not set
1773	// the RCU_GP_FLAG_INIT bit in ->gp_state (which records
1774	// the need for another grace period). The purpose
1775	// of the “offloaded” check is to avoid invoking
1776	// rcu_accelerate_cbs() on an offloaded CPU because we do not
1777	// hold the ->nocb_lock needed to safely access an offloaded
1778	// ->cblist. We do not want to acquire that lock because
1779	// it can be heavily contended during callback floods.
1780
1781	WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
1782	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
1783	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq, TPS("newreq"));
1784	} else {
1785
1786	// We get here either if there is no need for an
1787	// additional grace period or if rcu_accelerate_cbs() has
1788	// already set the RCU_GP_FLAG_INIT bit in ->gp_flags.
1789	// So all we need to do is to clear all of the other
1790	// ->gp_flags bits.
1791
1792	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
1793	}
1794	raw_spin_unlock_irq_rcu_node(rnp);
1795
1796	// If strict, make all CPUs aware of the end of the old grace period.
1797	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
1798	on_each_cpu(func: rcu_strict_gp_boundary, NULL, wait: `0`);
1799	}
1800
1801	/*
1802	* Body of kthread that handles grace periods.
1803	*/
1804	static int __noreturn rcu_gp_kthread(void *unused)
1805	{
1806	rcu_bind_gp_kthread();
1807	for (;;) {
1808
1809	/ Handle grace-period start. /
1810	for (;;) {
1811	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1812	TPS("reqwait"));
1813	WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
1814	swait_event_idle_exclusive(rcu_state.gp_wq,
1815	READ_ONCE(rcu_state.gp_flags) &
1816	RCU_GP_FLAG_INIT);
1817	rcu_gp_torture_wait();
1818	WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
1819	/ Locking provides needed memory barrier. /
1820	if (rcu_gp_init())
1821	break;
1822	cond_resched_tasks_rcu_qs();
1823	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1824	WARN_ON(signal_pending(current));
1825	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1826	TPS("reqwaitsig"));
1827	}
1828
1829	/ Handle quiescent-state forcing. /
1830	rcu_gp_fqs_loop();
1831
1832	/ Handle grace-period end. /
1833	WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
1834	rcu_gp_cleanup();
1835	WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
1836	}
1837	}
1838
1839	/*
1840	* Report a full set of quiescent states to the rcu_state data structure.
1841	* Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
1842	* another grace period is required. Whether we wake the grace-period
1843	* kthread or it awakens itself for the next round of quiescent-state
1844	* forcing, that kthread will clean up after the just-completed grace
1845	* period. Note that the caller must hold rnp->lock, which is released
1846	* before return.
1847	*/
1848	static void rcu_report_qs_rsp(unsigned long flags)
1849	__releases(rcu_get_root()->lock)
1850	{
1851	raw_lockdep_assert_held_rcu_node(rcu_get_root());
1852	WARN_ON_ONCE(!rcu_gp_in_progress());
1853	WRITE_ONCE(rcu_state.gp_flags,
1854	READ_ONCE(rcu_state.gp_flags) \| RCU_GP_FLAG_FQS);
1855	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
1856	rcu_gp_kthread_wake();
1857	}
1858
1859	/*
1860	* Similar to rcu_report_qs_rdp(), for which it is a helper function.
1861	* Allows quiescent states for a group of CPUs to be reported at one go
1862	* to the specified rcu_node structure, though all the CPUs in the group
1863	* must be represented by the same rcu_node structure (which need not be a
1864	* leaf rcu_node structure, though it often will be). The gps parameter
1865	* is the grace-period snapshot, which means that the quiescent states
1866	* are valid only if rnp->gp_seq is equal to gps. That structure's lock
1867	* must be held upon entry, and it is released before return.
1868	*
1869	* As a special case, if mask is zero, the bit-already-cleared check is
1870	* disabled. This allows propagating quiescent state due to resumed tasks
1871	* during grace-period initialization.
1872	*/
1873	static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
1874	unsigned long gps, unsigned long flags)
1875	__releases(rnp->lock)
1876	{
1877	unsigned long oldmask = `0`;
1878	struct rcu_node *rnp_c;
1879
1880	raw_lockdep_assert_held_rcu_node(rnp);
1881
1882	/ Walk up the rcu_node hierarchy. /
1883	for (;;) {
1884	if ((!(rnp->qsmask & mask) && mask) \|\| rnp->gp_seq != gps) {
1885
1886	/*
1887	* Our bit has already been cleared, or the
1888	* relevant grace period is already over, so done.
1889	*/
1890	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1891	return;
1892	}
1893	WARN_ON_ONCE(oldmask); / Any child must be all zeroed! /
1894	WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
1895	rcu_preempt_blocked_readers_cgp(rnp));
1896	WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
1897	trace_rcu_quiescent_state_report(rcuname: rcu_state.name, gp_seq: rnp->gp_seq,
1898	mask, qsmask: rnp->qsmask, level: rnp->level,
1899	grplo: rnp->grplo, grphi: rnp->grphi,
1900	gp_tasks: !!rnp->gp_tasks);
1901	if (rnp->qsmask != `0` \|\| rcu_preempt_blocked_readers_cgp(rnp)) {
1902
1903	/ Other bits still set at this level, so done. /
1904	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1905	return;
1906	}
1907	rnp->completedqs = rnp->gp_seq;
1908	mask = rnp->grpmask;
1909	if (rnp->parent == NULL) {
1910
1911	/ No more levels. Exit loop holding root lock. /
1912
1913	break;
1914	}
1915	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1916	rnp_c = rnp;
1917	rnp = rnp->parent;
1918	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1919	oldmask = READ_ONCE(rnp_c->qsmask);
1920	}
1921
1922	/*
1923	* Get here if we are the last CPU to pass through a quiescent
1924	* state for this grace period. Invoke rcu_report_qs_rsp()
1925	* to clean up and start the next grace period if one is needed.
1926	*/
1927	rcu_report_qs_rsp(flags); / releases rnp->lock. /
1928	}
1929
1930	/*
1931	* Record a quiescent state for all tasks that were previously queued
1932	* on the specified rcu_node structure and that were blocking the current
1933	* RCU grace period. The caller must hold the corresponding rnp->lock with
1934	* irqs disabled, and this lock is released upon return, but irqs remain
1935	* disabled.
1936	*/
1937	static void __maybe_unused
1938	rcu_report_unblock_qs_rnp(struct rcu_node rnp, unsigned* long flags)
1939	__releases(rnp->lock)
1940	{
1941	unsigned long gps;
1942	unsigned long mask;
1943	struct rcu_node *rnp_p;
1944
1945	raw_lockdep_assert_held_rcu_node(rnp);
1946	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) \|\|
1947	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) \|\|
1948	rnp->qsmask != `0`) {
1949	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1950	return; / Still need more quiescent states! /
1951	}
1952
1953	rnp->completedqs = rnp->gp_seq;
1954	rnp_p = rnp->parent;
1955	if (rnp_p == NULL) {
1956	/*
1957	* Only one rcu_node structure in the tree, so don't
1958	* try to report up to its nonexistent parent!
1959	*/
1960	rcu_report_qs_rsp(flags);
1961	return;
1962	}
1963
1964	/ Report up the rest of the hierarchy, tracking current ->gp_seq. /
1965	gps = rnp->gp_seq;
1966	mask = rnp->grpmask;
1967	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
1968	raw_spin_lock_rcu_node(rnp_p); / irqs already disabled. /
1969	rcu_report_qs_rnp(mask, rnp: rnp_p, gps, flags);
1970	}
1971
1972	/*
1973	* Record a quiescent state for the specified CPU to that CPU's rcu_data
1974	* structure. This must be called from the specified CPU.
1975	*/
1976	static void
1977	rcu_report_qs_rdp(struct rcu_data *rdp)
1978	{
1979	unsigned long flags;
1980	unsigned long mask;
1981	bool needacc = false;
1982	struct rcu_node *rnp;
1983
1984	WARN_ON_ONCE(rdp->cpu != smp_processor_id());
1985	rnp = rdp->mynode;
1986	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1987	if (rdp->cpu_no_qs.b.norm \|\| rdp->gp_seq != rnp->gp_seq \|\|
1988	rdp->gpwrap) {
1989
1990	/*
1991	* The grace period in which this quiescent state was
1992	* recorded has ended, so don't report it upwards.
1993	* We will instead need a new quiescent state that lies
1994	* within the current grace period.
1995	*/
1996	rdp->cpu_no_qs.b.norm = true; / need qs for new gp. /
1997	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1998	return;
1999	}
2000	mask = rdp->grpmask;
2001	rdp->core_needs_qs = false;
2002	if ((rnp->qsmask & mask) == `0`) {
2003	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2004	} else {
2005	/*
2006	* This GP can't end until cpu checks in, so all of our
2007	* callbacks can be processed during the next GP.
2008	*
2009	* NOCB kthreads have their own way to deal with that...
2010	*/
2011	if (!rcu_rdp_is_offloaded(rdp)) {
2012	/*
2013	* The current GP has not yet ended, so it
2014	* should not be possible for rcu_accelerate_cbs()
2015	* to return true. So complain, but don't awaken.
2016	*/
2017	WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
2018	} else if (!rcu_segcblist_completely_offloaded(rsclp: &rdp->cblist)) {
2019	/*
2020	* ...but NOCB kthreads may miss or delay callbacks acceleration
2021	* if in the middle of a (de-)offloading process.
2022	*/
2023	needacc = true;
2024	}
2025
2026	rcu_disable_urgency_upon_qs(rdp);
2027	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
2028	/ ^^^ Released rnp->lock /
2029
2030	if (needacc) {
2031	rcu_nocb_lock_irqsave(rdp, flags);
2032	rcu_accelerate_cbs_unlocked(rnp, rdp);
2033	rcu_nocb_unlock_irqrestore(rdp, flags);
2034	}
2035	}
2036	}
2037
2038	/*
2039	* Check to see if there is a new grace period of which this CPU
2040	* is not yet aware, and if so, set up local rcu_data state for it.
2041	* Otherwise, see if this CPU has just passed through its first
2042	* quiescent state for this grace period, and record that fact if so.
2043	*/
2044	static void
2045	rcu_check_quiescent_state(struct rcu_data *rdp)
2046	{
2047	/ Check for grace-period ends and beginnings. /
2048	note_gp_changes(rdp);
2049
2050	/*
2051	* Does this CPU still need to do its part for current grace period?
2052	* If no, return and let the other CPUs do their part as well.
2053	*/
2054	if (!rdp->core_needs_qs)
2055	return;
2056
2057	/*
2058	* Was there a quiescent state since the beginning of the grace
2059	* period? If no, then exit and wait for the next call.
2060	*/
2061	if (rdp->cpu_no_qs.b.norm)
2062	return;
2063
2064	/*
2065	* Tell RCU we are done (but rcu_report_qs_rdp() will be the
2066	* judge of that).
2067	*/
2068	rcu_report_qs_rdp(rdp);
2069	}
2070
2071	/ Return true if callback-invocation time limit exceeded. /
2072	static bool rcu_do_batch_check_time(long count, long tlimit,
2073	bool jlimit_check, unsigned long jlimit)
2074	{
2075	// Invoke local_clock() only once per 32 consecutive callbacks.
2076	return unlikely(tlimit) &&
2077	(!likely(count & `31`) \|\|
2078	(IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
2079	jlimit_check && time_after(jiffies, jlimit))) &&
2080	local_clock() >= tlimit;
2081	}
2082
2083	/*
2084	* Invoke any RCU callbacks that have made it to the end of their grace
2085	* period. Throttle as specified by rdp->blimit.
2086	*/
2087	static void rcu_do_batch(struct rcu_data *rdp)
2088	{
2089	long bl;
2090	long count = `0`;
2091	int div;
2092	bool __maybe_unused empty;
2093	unsigned long flags;
2094	unsigned long jlimit;
2095	bool jlimit_check = false;
2096	long pending;
2097	struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2098	struct rcu_head *rhp;
2099	long tlimit = `0`;
2100
2101	/ If no callbacks are ready, just return. /
2102	if (!rcu_segcblist_ready_cbs(rsclp: &rdp->cblist)) {
2103	trace_rcu_batch_start(rcuname: rcu_state.name,
2104	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist), blimit: `0`);
2105	trace_rcu_batch_end(rcuname: rcu_state.name, callbacks_invoked: `0`,
2106	cb: !rcu_segcblist_empty(rsclp: &rdp->cblist),
2107	nr: need_resched(), iit: is_idle_task(current),
2108	risk: rcu_is_callbacks_kthread(rdp));
2109	return;
2110	}
2111
2112	/*
2113	* Extract the list of ready callbacks, disabling IRQs to prevent
2114	* races with call_rcu() from interrupt handlers. Leave the
2115	* callback counts, as rcu_barrier() needs to be conservative.
2116	*/
2117	rcu_nocb_lock_irqsave(rdp, flags);
2118	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2119	pending = rcu_segcblist_get_seglen(rsclp: &rdp->cblist, RCU_DONE_TAIL);
2120	div = READ_ONCE(rcu_divisor);
2121	div = div < `0` ? `7` : div > sizeof(long) * `8` - `2` ? sizeof(long) * `8` - `2` : div;
2122	bl = max(rdp->blimit, pending >> div);
2123	if ((in_serving_softirq() \|\| rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) &&
2124	(IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) \|\| unlikely(bl > `100`))) {
2125	const long npj = NSEC_PER_SEC / HZ;
2126	long rrn = READ_ONCE(rcu_resched_ns);
2127
2128	rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
2129	tlimit = local_clock() + rrn;
2130	jlimit = jiffies + (rrn + npj + `1`) / npj;
2131	jlimit_check = true;
2132	}
2133	trace_rcu_batch_start(rcuname: rcu_state.name,
2134	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist), blimit: bl);
2135	rcu_segcblist_extract_done_cbs(rsclp: &rdp->cblist, rclp: &rcl);
2136	if (rcu_rdp_is_offloaded(rdp))
2137	rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
2138
2139	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCbDequeued"));
2140	rcu_nocb_unlock_irqrestore(rdp, flags);
2141
2142	/ Invoke callbacks. /
2143	tick_dep_set_task(current, bit: TICK_DEP_BIT_RCU);
2144	rhp = rcu_cblist_dequeue(rclp: &rcl);
2145
2146	for (; rhp; rhp = rcu_cblist_dequeue(rclp: &rcl)) {
2147	rcu_callback_t f;
2148
2149	count++;
2150	debug_rcu_head_unqueue(head: rhp);
2151
2152	rcu_lock_acquire(map: &rcu_callback_map);
2153	trace_rcu_invoke_callback(rcuname: rcu_state.name, rhp);
2154
2155	f = rhp->func;
2156	debug_rcu_head_callback(rhp);
2157	WRITE_ONCE(rhp->func, (rcu_callback_t)`0L`);
2158	f(rhp);
2159
2160	rcu_lock_release(map: &rcu_callback_map);
2161
2162	/*
2163	* Stop only if limit reached and CPU has something to do.
2164	*/
2165	if (in_serving_softirq()) {
2166	if (count >= bl && (need_resched() \|\| !is_idle_task(current)))
2167	break;
2168	/*
2169	* Make sure we don't spend too much time here and deprive other
2170	* softirq vectors of CPU cycles.
2171	*/
2172	if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit))
2173	break;
2174	} else {
2175	// In rcuc/rcuoc context, so no worries about
2176	// depriving other softirq vectors of CPU cycles.
2177	local_bh_enable();
2178	lockdep_assert_irqs_enabled();
2179	cond_resched_tasks_rcu_qs();
2180	lockdep_assert_irqs_enabled();
2181	local_bh_disable();
2182	// But rcuc kthreads can delay quiescent-state
2183	// reporting, so check time limits for them.
2184	if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING &&
2185	rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) {
2186	rdp->rcu_cpu_has_work = `1`;
2187	break;
2188	}
2189	}
2190	}
2191
2192	rcu_nocb_lock_irqsave(rdp, flags);
2193	rdp->n_cbs_invoked += count;
2194	trace_rcu_batch_end(rcuname: rcu_state.name, callbacks_invoked: count, cb: !!rcl.head, nr: need_resched(),
2195	iit: is_idle_task(current), risk: rcu_is_callbacks_kthread(rdp));
2196
2197	/ Update counts and requeue any remaining callbacks. /
2198	rcu_segcblist_insert_done_cbs(rsclp: &rdp->cblist, rclp: &rcl);
2199	rcu_segcblist_add_len(rsclp: &rdp->cblist, v: -count);
2200
2201	/ Reinstate batch limit if we have worked down the excess. /
2202	count = rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
2203	if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
2204	rdp->blimit = blimit;
2205
2206	/ Reset ->qlen_last_fqs_check trigger if enough CBs have drained. /
2207	if (count == `0` && rdp->qlen_last_fqs_check != `0`) {
2208	rdp->qlen_last_fqs_check = `0`;
2209	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
2210	} else if (count < rdp->qlen_last_fqs_check - qhimark)
2211	rdp->qlen_last_fqs_check = count;
2212
2213	/*
2214	* The following usually indicates a double call_rcu(). To track
2215	* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
2216	*/
2217	empty = rcu_segcblist_empty(rsclp: &rdp->cblist);
2218	WARN_ON_ONCE(count == `0` && !empty);
2219	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2220	count != `0` && empty);
2221	WARN_ON_ONCE(count == `0` && rcu_segcblist_n_segment_cbs(&rdp->cblist) != `0`);
2222	WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == `0`);
2223
2224	rcu_nocb_unlock_irqrestore(rdp, flags);
2225
2226	tick_dep_clear_task(current, bit: TICK_DEP_BIT_RCU);
2227	}
2228
2229	/*
2230	* This function is invoked from each scheduling-clock interrupt,
2231	* and checks to see if this CPU is in a non-context-switch quiescent
2232	* state, for example, user mode or idle loop. It also schedules RCU
2233	* core processing. If the current grace period has gone on too long,
2234	* it will ask the scheduler to manufacture a context switch for the sole
2235	* purpose of providing the needed quiescent state.
2236	*/
2237	void rcu_sched_clock_irq(int user)
2238	{
2239	unsigned long j;
2240
2241	if (IS_ENABLED(CONFIG_PROVE_RCU)) {
2242	j = jiffies;
2243	WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
2244	__this_cpu_write(rcu_data.last_sched_clock, j);
2245	}
2246	trace_rcu_utilization(TPS("Start scheduler-tick"));
2247	lockdep_assert_irqs_disabled();
2248	raw_cpu_inc(rcu_data.ticks_this_gp);
2249	/ The load-acquire pairs with the store-release setting to true. /
2250	if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
2251	/ Idle and userspace execution already are quiescent states. /
2252	if (!rcu_is_cpu_rrupt_from_idle() && !user) {
2253	set_tsk_need_resched(current);
2254	set_preempt_need_resched();
2255	}
2256	__this_cpu_write(rcu_data.rcu_urgent_qs, false);
2257	}
2258	rcu_flavor_sched_clock_irq(user);
2259	if (rcu_pending(user))
2260	invoke_rcu_core();
2261	if (user \|\| rcu_is_cpu_rrupt_from_idle())
2262	rcu_note_voluntary_context_switch(current);
2263	lockdep_assert_irqs_disabled();
2264
2265	trace_rcu_utilization(TPS("End scheduler-tick"));
2266	}
2267
2268	/*
2269	* Scan the leaf rcu_node structures. For each structure on which all
2270	* CPUs have reported a quiescent state and on which there are tasks
2271	* blocking the current grace period, initiate RCU priority boosting.
2272	* Otherwise, invoke the specified function to check dyntick state for
2273	* each CPU that has not yet reported a quiescent state.
2274	*/
2275	static void force_qs_rnp(int (f)(struct* rcu_data *rdp))
2276	{
2277	int cpu;
2278	unsigned long flags;
2279	struct rcu_node *rnp;
2280
2281	rcu_state.cbovld = rcu_state.cbovldnext;
2282	rcu_state.cbovldnext = false;
2283	rcu_for_each_leaf_node(rnp) {
2284	unsigned long mask = `0`;
2285	unsigned long rsmask = `0`;
2286
2287	cond_resched_tasks_rcu_qs();
2288	raw_spin_lock_irqsave_rcu_node(rnp, flags);
2289	rcu_state.cbovldnext \|= !!rnp->cbovldmask;
2290	if (rnp->qsmask == `0`) {
2291	if (rcu_preempt_blocked_readers_cgp(rnp)) {
2292	/*
2293	* No point in scanning bits because they
2294	* are all zero. But we might need to
2295	* priority-boost blocked readers.
2296	*/
2297	rcu_initiate_boost(rnp, flags);
2298	/ rcu_initiate_boost() releases rnp->lock /
2299	continue;
2300	}
2301	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2302	continue;
2303	}
2304	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
2305	struct rcu_data *rdp;
2306	int ret;
2307
2308	rdp = per_cpu_ptr(&rcu_data, cpu);
2309	ret = f(rdp);
2310	if (ret > `0`) {
2311	mask \|= rdp->grpmask;
2312	rcu_disable_urgency_upon_qs(rdp);
2313	}
2314	if (ret < `0`)
2315	rsmask \|= rdp->grpmask;
2316	}
2317	if (mask != `0`) {
2318	/ Idle/offline CPUs, report (releases rnp->lock). /
2319	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
2320	} else {
2321	/ Nothing to do here, so just drop the lock. /
2322	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2323	}
2324
2325	for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
2326	resched_cpu(cpu);
2327	}
2328	}
2329
2330	/*
2331	* Force quiescent states on reluctant CPUs, and also detect which
2332	* CPUs are in dyntick-idle mode.
2333	*/
2334	void rcu_force_quiescent_state(void)
2335	{
2336	unsigned long flags;
2337	bool ret;
2338	struct rcu_node *rnp;
2339	struct rcu_node *rnp_old = NULL;
2340
2341	/ Funnel through hierarchy to reduce memory contention. /
2342	rnp = raw_cpu_read(rcu_data.mynode);
2343	for (; rnp != NULL; rnp = rnp->parent) {
2344	ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) \|\|
2345	!raw_spin_trylock(&rnp->fqslock);
2346	if (rnp_old != NULL)
2347	raw_spin_unlock(&rnp_old->fqslock);
2348	if (ret)
2349	return;
2350	rnp_old = rnp;
2351	}
2352	/ rnp_old == rcu_get_root(), rnp == NULL. /
2353
2354	/ Reached the root of the rcu_node tree, acquire lock. /
2355	raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2356	raw_spin_unlock(&rnp_old->fqslock);
2357	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
2358	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2359	return; / Someone beat us to it. /
2360	}
2361	WRITE_ONCE(rcu_state.gp_flags,
2362	READ_ONCE(rcu_state.gp_flags) \| RCU_GP_FLAG_FQS);
2363	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2364	rcu_gp_kthread_wake();
2365	}
2366	EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2367
// Workqueue handler for an RCU reader for kernels enforcing strict RCU
// grace periods. The handler is an empty RCU read-side critical section;
// @work is not otherwise used.
static void strict_work_handler(struct work_struct *work)
{
	rcu_read_lock();
	rcu_read_unlock();
}
2375
2376	/ Perform RCU core processing work for the current CPU. /
2377	static __latent_entropy void rcu_core(void)
2378	{
2379	unsigned long flags;
2380	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
2381	struct rcu_node *rnp = rdp->mynode;
2382	/*
2383	* On RT rcu_core() can be preempted when IRQs aren't disabled.
2384	* Therefore this function can race with concurrent NOCB (de-)offloading
2385	* on this CPU and the below condition must be considered volatile.
2386	* However if we race with:
2387	*
2388	* _ Offloading: In the worst case we accelerate or process callbacks
2389	* concurrently with NOCB kthreads. We are guaranteed to
2390	* call rcu_nocb_lock() if that happens.
2391	*
2392	* _ Deoffloading: In the worst case we miss callbacks acceleration or
2393	* processing. This is fine because the early stage
2394	* of deoffloading invokes rcu_core() after setting
2395	* SEGCBLIST_RCU_CORE. So we guarantee that we'll process
2396	* what could have been dismissed without the need to wait
2397	* for the next rcu_pending() check in the next jiffy.
2398	*/
2399	const bool do_batch = !rcu_segcblist_completely_offloaded(rsclp: &rdp->cblist);
2400
2401	if (cpu_is_offline(smp_processor_id()))
2402	return;
2403	trace_rcu_utilization(TPS("Start RCU core"));
2404	WARN_ON_ONCE(!rdp->beenonline);
2405
2406	/ Report any deferred quiescent states if preemption enabled. /
2407	if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
2408	rcu_preempt_deferred_qs(current);
2409	} else if (rcu_preempt_need_deferred_qs(current)) {
2410	set_tsk_need_resched(current);
2411	set_preempt_need_resched();
2412	}
2413
2414	/ Update RCU state based on any recent quiescent states. /
2415	rcu_check_quiescent_state(rdp);
2416
2417	/ No grace period and unregistered callbacks? /
2418	if (!rcu_gp_in_progress() &&
2419	rcu_segcblist_is_enabled(rsclp: &rdp->cblist) && do_batch) {
2420	rcu_nocb_lock_irqsave(rdp, flags);
2421	if (!rcu_segcblist_restempty(rsclp: &rdp->cblist, RCU_NEXT_READY_TAIL))
2422	rcu_accelerate_cbs_unlocked(rnp, rdp);
2423	rcu_nocb_unlock_irqrestore(rdp, flags);
2424	}
2425
2426	rcu_check_gp_start_stall(rnp, rdp, gpssdelay: rcu_jiffies_till_stall_check());
2427
2428	/ If there are callbacks ready, invoke them. /
2429	if (do_batch && rcu_segcblist_ready_cbs(rsclp: &rdp->cblist) &&
2430	likely(READ_ONCE(rcu_scheduler_fully_active))) {
2431	rcu_do_batch(rdp);
2432	/ Re-invoke RCU core processing if there are callbacks remaining. /
2433	if (rcu_segcblist_ready_cbs(rsclp: &rdp->cblist))
2434	invoke_rcu_core();
2435	}
2436
2437	/ Do any needed deferred wakeups of rcuo kthreads. /
2438	do_nocb_deferred_wakeup(rdp);
2439	trace_rcu_utilization(TPS("End RCU core"));
2440
2441	// If strict GPs, schedule an RCU reader in a clean environment.
2442	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
2443	queue_work_on(cpu: rdp->cpu, wq: rcu_gp_wq, work: &rdp->strict_work);
2444	}
2445
/* Softirq-action wrapper around rcu_core(); @h is not used. */
static void rcu_core_si(struct softirq_action *h)
{
	rcu_core();
}
2450
2451	static void rcu_wake_cond(struct task_struct t, int* status)
2452	{
2453	/*
2454	* If the thread is yielding, only wake it when this
2455	* is invoked from idle
2456	*/
2457	if (t && (status != RCU_KTHREAD_YIELDING \|\| is_idle_task(current)))
2458	wake_up_process(tsk: t);
2459	}
2460
2461	static void invoke_rcu_core_kthread(void)
2462	{
2463	struct task_struct *t;
2464	unsigned long flags;
2465
2466	local_irq_save(flags);
2467	__this_cpu_write(rcu_data.rcu_cpu_has_work, `1`);
2468	t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
2469	if (t != NULL && t != current)
2470	rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
2471	local_irq_restore(flags);
2472	}
2473
2474	/*
2475	* Wake up this CPU's rcuc kthread to do RCU core processing.
2476	*/
2477	static void invoke_rcu_core(void)
2478	{
2479	if (!cpu_online(smp_processor_id()))
2480	return;
2481	if (use_softirq)
2482	raise_softirq(nr: RCU_SOFTIRQ);
2483	else
2484	invoke_rcu_core_kthread();
2485	}
2486
2487	static void rcu_cpu_kthread_park(unsigned int cpu)
2488	{
2489	per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
2490	}
2491
2492	static int rcu_cpu_kthread_should_run(unsigned int cpu)
2493	{
2494	return __this_cpu_read(rcu_data.rcu_cpu_has_work);
2495	}
2496
2497	/*
2498	* Per-CPU kernel thread that invokes RCU callbacks. This replaces
2499	* the RCU softirq used in configurations of RCU that do not support RCU
2500	* priority boosting.
2501	*/
2502	static void rcu_cpu_kthread(unsigned int cpu)
2503	{
2504	unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
2505	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
2506	unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
2507	int spincnt;
2508
2509	trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
2510	for (spincnt = `0`; spincnt < `10`; spincnt++) {
2511	WRITE_ONCE(*j, jiffies);
2512	local_bh_disable();
2513	*statusp = RCU_KTHREAD_RUNNING;
2514	local_irq_disable();
2515	work = *workp;
2516	WRITE_ONCE(*workp, `0`);
2517	local_irq_enable();
2518	if (work)
2519	rcu_core();
2520	local_bh_enable();
2521	if (!READ_ONCE(*workp)) {
2522	trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2523	*statusp = RCU_KTHREAD_WAITING;
2524	return;
2525	}
2526	}
2527	*statusp = RCU_KTHREAD_YIELDING;
2528	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2529	schedule_timeout_idle(timeout: `2`);
2530	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2531	*statusp = RCU_KTHREAD_WAITING;
2532	WRITE_ONCE(*j, jiffies);
2533	}
2534
2535	static struct smp_hotplug_thread rcu_cpu_thread_spec = {
2536	.store = &rcu_data.rcu_cpu_kthread_task,
2537	.thread_should_run = rcu_cpu_kthread_should_run,
2538	.thread_fn = rcu_cpu_kthread,
2539	.thread_comm = "rcuc/%u",
2540	.setup = rcu_cpu_kthread_setup,
2541	.park = rcu_cpu_kthread_park,
2542	};
2543
2544	/*
2545	* Spawn per-CPU RCU core processing kthreads.
2546	*/
2547	static int __init rcu_spawn_core_kthreads(void)
2548	{
2549	int cpu;
2550
2551	for_each_possible_cpu(cpu)
2552	per_cpu(rcu_data.rcu_cpu_has_work, cpu) = `0`;
2553	if (use_softirq)
2554	return `0`;
2555	WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
2556	"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
2557	return `0`;
2558	}
2559
2560	/*
2561	* Handle any core-RCU processing required by a call_rcu() invocation.
2562	*/
2563	static void __call_rcu_core(struct rcu_data rdp, struct* rcu_head *head,
2564	unsigned long flags)
2565	{
2566	/*
2567	* If called from an extended quiescent state, invoke the RCU
2568	* core in order to force a re-evaluation of RCU's idleness.
2569	*/
2570	if (!rcu_is_watching())
2571	invoke_rcu_core();
2572
2573	/ If interrupts were disabled or CPU offline, don't invoke RCU core. /
2574	if (irqs_disabled_flags(flags) \|\| cpu_is_offline(smp_processor_id()))
2575	return;
2576
2577	/*
2578	* Force the grace period if too many callbacks or too long waiting.
2579	* Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
2580	* if some other CPU has recently done so. Also, don't bother
2581	* invoking rcu_force_quiescent_state() if the newly enqueued callback
2582	* is the only one waiting for a grace period to complete.
2583	*/
2584	if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
2585	rdp->qlen_last_fqs_check + qhimark)) {
2586
2587	/ Are we ignoring a completed grace period? /
2588	note_gp_changes(rdp);
2589
2590	/ Start a new grace period if one not already started. /
2591	if (!rcu_gp_in_progress()) {
2592	rcu_accelerate_cbs_unlocked(rnp: rdp->mynode, rdp);
2593	} else {
2594	/ Give the grace period a kick. /
2595	rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
2596	if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
2597	rcu_segcblist_first_pend_cb(rsclp: &rdp->cblist) != head)
2598	rcu_force_quiescent_state();
2599	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
2600	rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
2601	}
2602	}
2603	}
2604
/*
 * RCU callback function to leak a callback.
 *
 * Intentionally empty: __call_rcu_common() installs it as ->func when
 * it detects a probable double call_rcu(), so that invoking the
 * callback does nothing.
 */
static void rcu_leak_callback(struct rcu_head *rhp)
{
}
2611
2612	/*
2613	* Check and if necessary update the leaf rcu_node structure's
2614	* ->cbovldmask bit corresponding to the current CPU based on that CPU's
2615	* number of queued RCU callbacks. The caller must hold the leaf rcu_node
2616	* structure's ->lock.
2617	*/
2618	static void check_cb_ovld_locked(struct rcu_data rdp, struct* rcu_node *rnp)
2619	{
2620	raw_lockdep_assert_held_rcu_node(rnp);
2621	if (qovld_calc <= `0`)
2622	return; // Early boot and wildcard value set.
2623	if (rcu_segcblist_n_cbs(rsclp: &rdp->cblist) >= qovld_calc)
2624	WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask \| rdp->grpmask);
2625	else
2626	WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
2627	}
2628
2629	/*
2630	* Check and if necessary update the leaf rcu_node structure's
2631	* ->cbovldmask bit corresponding to the current CPU based on that CPU's
2632	* number of queued RCU callbacks. No locks need be held, but the
2633	* caller must have disabled interrupts.
2634	*
2635	* Note that this function ignores the possibility that there are a lot
2636	* of callbacks all of which have already seen the end of their respective
2637	* grace periods. This omission is due to the need for no-CBs CPUs to
2638	* be holding ->nocb_lock to do this check, which is too heavy for a
2639	* common-case operation.
2640	*/
2641	static void check_cb_ovld(struct rcu_data *rdp)
2642	{
2643	struct rcu_node *const rnp = rdp->mynode;
2644
2645	if (qovld_calc <= `0` \|\|
2646	((rcu_segcblist_n_cbs(rsclp: &rdp->cblist) >= qovld_calc) ==
2647	!!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
2648	return; // Early boot wildcard value or already set correctly.
2649	raw_spin_lock_rcu_node(rnp);
2650	check_cb_ovld_locked(rdp, rnp);
2651	raw_spin_unlock_rcu_node(rnp);
2652	}
2653
2654	static void
2655	__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
2656	{
2657	static atomic_t doublefrees;
2658	unsigned long flags;
2659	bool lazy;
2660	struct rcu_data *rdp;
2661	bool was_alldone;
2662
2663	/ Misaligned rcu_head! /
2664	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - `1`));
2665
2666	if (debug_rcu_head_queue(head)) {
2667	/*
2668	* Probable double call_rcu(), so leak the callback.
2669	* Use rcu:rcu_callback trace event to find the previous
2670	* time callback was passed to call_rcu().
2671	*/
2672	if (atomic_inc_return(v: &doublefrees) < `4`) {
2673	pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
2674	mem_dump_obj(object: head);
2675	}
2676	WRITE_ONCE(head->func, rcu_leak_callback);
2677	return;
2678	}
2679	head->func = func;
2680	head->next = NULL;
2681	kasan_record_aux_stack_noalloc(ptr: head);
2682	local_irq_save(flags);
2683	rdp = this_cpu_ptr(&rcu_data);
2684	lazy = lazy_in && !rcu_async_should_hurry();
2685
2686	/ Add the callback to our list. /
2687	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
2688	// This can trigger due to call_rcu() from offline CPU:
2689	WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
2690	WARN_ON_ONCE(!rcu_is_watching());
2691	// Very early boot, before rcu_init(). Initialize if needed
2692	// and then drop through to queue the callback.
2693	if (rcu_segcblist_empty(rsclp: &rdp->cblist))
2694	rcu_segcblist_init(rsclp: &rdp->cblist);
2695	}
2696
2697	check_cb_ovld(rdp);
2698	if (rcu_nocb_try_bypass(rdp, rhp: head, was_alldone: &was_alldone, flags, lazy))
2699	return; // Enqueued onto ->nocb_bypass, so just leave.
2700	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
2701	rcu_segcblist_enqueue(rsclp: &rdp->cblist, rhp: head);
2702	if (__is_kvfree_rcu_offset((unsigned long)func))
2703	trace_rcu_kvfree_callback(rcuname: rcu_state.name, rhp: head,
2704	offset: (unsigned long)func,
2705	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist));
2706	else
2707	trace_rcu_callback(rcuname: rcu_state.name, rhp: head,
2708	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist));
2709
2710	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCBQueued"));
2711
2712	/ Go handle any RCU core processing required. /
2713	if (unlikely(rcu_rdp_is_offloaded(rdp))) {
2714	__call_rcu_nocb_wake(rdp, was_empty: was_alldone, flags); / unlocks /
2715	} else {
2716	__call_rcu_core(rdp, head, flags);
2717	local_irq_restore(flags);
2718	}
2719	}
2720
#ifdef CONFIG_RCU_LAZY
/**
 * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
 * flush all lazy callbacks (including the new one) to the main ->cblist while
 * doing so.
 *
 * @head: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all pre-existing RCU read-side
 * critical sections have completed.
 *
 * Use this API instead of call_rcu() if you don't want the callback to be
 * invoked after very long periods of time, which can happen on systems without
 * memory pressure and on systems which are lightly loaded or mostly idle.
 * This function will cause callbacks to be invoked sooner than later at the
 * expense of extra power. Other than that, this function is identical to, and
 * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
 * ordering and other functionality.
 */
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
	/* Never lazy: that is the whole point of the "hurry" variant. */
	__call_rcu_common(head, func, false);
}
EXPORT_SYMBOL_GPL(call_rcu_hurry);
#endif
2748
2749	/**
2750	* call_rcu() - Queue an RCU callback for invocation after a grace period.
2751	* By default the callbacks are 'lazy' and are kept hidden from the main
2752	* ->cblist to prevent starting of grace periods too soon.
2753	* If you desire grace periods to start very soon, use call_rcu_hurry().
2754	*
2755	* @head: structure to be used for queueing the RCU updates.
2756	* @func: actual callback function to be invoked after the grace period
2757	*
2758	* The callback function will be invoked some time after a full grace
2759	* period elapses, in other words after all pre-existing RCU read-side
2760	* critical sections have completed. However, the callback function
2761	* might well execute concurrently with RCU read-side critical sections
2762	* that started after call_rcu() was invoked.
2763	*
2764	* RCU read-side critical sections are delimited by rcu_read_lock()
2765	* and rcu_read_unlock(), and may be nested. In addition, but only in
2766	* v5.0 and later, regions of code across which interrupts, preemption,
2767	* or softirqs have been disabled also serve as RCU read-side critical
2768	* sections. This includes hardware interrupt handlers, softirq handlers,
2769	* and NMI handlers.
2770	*
2771	* Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing RCU read-side critical sections. On systems with more
2773	* than one CPU, this means that when "func()" is invoked, each CPU is
2774	* guaranteed to have executed a full memory barrier since the end of its
2775	* last RCU read-side critical section whose beginning preceded the call
2776	* to call_rcu(). It also means that each CPU executing an RCU read-side
2777	* critical section that continues beyond the start of "func()" must have
2778	* executed a memory barrier after the call_rcu() but before the beginning
2779	* of that RCU read-side critical section. Note that these guarantees
2780	* include CPUs that are offline, idle, or executing in user mode, as
2781	* well as CPUs that are executing in the kernel.
2782	*
2783	* Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
2784	* resulting RCU callback function "func()", then both CPU A and CPU B are
2785	* guaranteed to execute a full memory barrier during the time interval
2786	* between the call to call_rcu() and the invocation of "func()" -- even
2787	* if CPU A and CPU B are the same CPU (but again only if the system has
2788	* more than one CPU).
2789	*
2790	* Implementation of these memory-ordering guarantees is described here:
2791	* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
2792	*/
2793	void call_rcu(struct rcu_head *head, rcu_callback_t func)
2794	{
2795	__call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
2796	}
2797	EXPORT_SYMBOL_GPL(call_rcu);
2798
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
/* Number of in-flight batches per CPU (size of ->krw_arr[]). */
#define KFREE_N_BATCHES 2
/* Number of bulk channels (size of ->bulk_head[] and ->bulk_head_free[]). */
#define FREE_N_CHANNELS 2
2803
/**
 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
 * @list: List node. All blocks are linked between each other
 * @gp_snap: Snapshot of RCU state for objects placed to this bulk
 * @nr_records: Number of active pointers in the array
 * @records: Array of the kvfree_rcu() pointers
 *
 * Blocks are allocated one page at a time, so @records holds at most
 * KVFREE_BULK_MAX_ENTR pointers.
 */
struct kvfree_rcu_bulk_data {
	struct list_head list;
	struct rcu_gp_oldstate gp_snap;
	unsigned long nr_records;
	void *records[];
};
2817
/*
 * This macro defines how many entries the "records" array
 * will contain. It is based on the fact that the size of
 * kvfree_rcu_bulk_data structure becomes exactly one page:
 * whatever is left of PAGE_SIZE after the fixed header is
 * divided into pointer-sized slots.
 */
#define KVFREE_BULK_MAX_ENTR \
	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
2825
/**
 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
 * @head_free: List of kfree_rcu() objects waiting for a grace period
 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
 * @krcp: Pointer to @kfree_rcu_cpu structure
 *
 * The batch is filled by kfree_rcu_monitor() and emptied by
 * kfree_rcu_work(), both under the owning @krcp's ->lock.
 */
struct kfree_rcu_cpu_work {
	struct rcu_work rcu_work;
	struct rcu_head *head_free;
	struct rcu_gp_oldstate head_free_gp_snap;
	struct list_head bulk_head_free[FREE_N_CHANNELS];
	struct kfree_rcu_cpu *krcp;
};
2842
/**
 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
 * @head: List of kfree_rcu() objects not yet waiting for a grace period
 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
 * @lock: Synchronize access to this structure
 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
 * @initialized: The @rcu_work fields have been initialized
 * @head_count: Number of objects in rcu_head singular list
 * @bulk_count: Number of objects in bulk-list
 * @bkvcache:
 *	A simple cache list that contains objects for reuse purpose.
 *	In order to save some per-cpu space the list is singular.
 *	Even though it is lockless an access has to be protected by the
 *	per-cpu lock.
 * @page_cache_work: A work to refill the cache when it is empty
 * @backoff_page_cache_fill: Delay cache refills
 * @work_in_progress: Indicates that page_cache_work is running
 * @hrtimer: A hrtimer for scheduling a page_cache_work
 * @nr_bkv_objs: number of allocated objects at @bkvcache.
 *
 * This is a per-CPU structure.  The reason that it is not included in
 * the rcu_data structure is to permit this code to be extracted from
 * the RCU files.  Such extraction could allow further optimization of
 * the interactions with the slab allocators.
 */
struct kfree_rcu_cpu {
	// Objects queued on a linked list
	// through their rcu_head structures.
	struct rcu_head *head;
	unsigned long head_gp_snap;
	atomic_t head_count;

	// Objects queued on a bulk-list.
	struct list_head bulk_head[FREE_N_CHANNELS];
	atomic_t bulk_count[FREE_N_CHANNELS];

	// Batches handed off to wait for a grace period, plus the
	// lock and delayed work that drive the hand-off.
	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
	raw_spinlock_t lock;
	struct delayed_work monitor_work;
	bool initialized;

	// State for refilling the page cache.
	struct delayed_work page_cache_work;
	atomic_t backoff_page_cache_fill;
	atomic_t work_in_progress;
	struct hrtimer hrtimer;

	// Cache of spare pages for reuse as bulk blocks.
	struct llist_head bkvcache;
	int nr_bkv_objs;
};
2894
/*
 * Per-CPU kfree_rcu()/kvfree_rcu() batching state.  Only ->lock gets a
 * static initializer here.
 */
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
};
2898
2899	static __always_inline void
2900	debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
2901	{
2902	#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
2903	int i;
2904
2905	for (i = `0`; i < bhead->nr_records; i++)
2906	debug_rcu_head_unqueue(head: (struct rcu_head *)(bhead->records[i]));
2907	#endif
2908	}
2909
2910	static inline struct kfree_rcu_cpu *
2911	krc_this_cpu_lock(unsigned long *flags)
2912	{
2913	struct kfree_rcu_cpu *krcp;
2914
2915	local_irq_save(flags); // For safely calling this_cpu_ptr().*
2916	krcp = this_cpu_ptr(&krc);
2917	raw_spin_lock(&krcp->lock);
2918
2919	return krcp;
2920	}
2921
2922	static inline void
2923	krc_this_cpu_unlock(struct kfree_rcu_cpu krcp, unsigned* long flags)
2924	{
2925	raw_spin_unlock_irqrestore(&krcp->lock, flags);
2926	}
2927
2928	static inline struct kvfree_rcu_bulk_data *
2929	get_cached_bnode(struct kfree_rcu_cpu *krcp)
2930	{
2931	if (!krcp->nr_bkv_objs)
2932	return NULL;
2933
2934	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - `1`);
2935	return (struct kvfree_rcu_bulk_data *)
2936	llist_del_first(head: &krcp->bkvcache);
2937	}
2938
2939	static inline bool
2940	put_cached_bnode(struct kfree_rcu_cpu *krcp,
2941	struct kvfree_rcu_bulk_data *bnode)
2942	{
2943	// Check the limit.
2944	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
2945	return false;
2946
2947	llist_add(new: (struct llist_node *) bnode, head: &krcp->bkvcache);
2948	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + `1`);
2949	return true;
2950	}
2951
2952	static int
2953	drain_page_cache(struct kfree_rcu_cpu *krcp)
2954	{
2955	unsigned long flags;
2956	struct llist_node page_list, pos, *n;
2957	int freed = `0`;
2958
2959	if (!rcu_min_cached_objs)
2960	return `0`;
2961
2962	raw_spin_lock_irqsave(&krcp->lock, flags);
2963	page_list = llist_del_all(head: &krcp->bkvcache);
2964	WRITE_ONCE(krcp->nr_bkv_objs, `0`);
2965	raw_spin_unlock_irqrestore(&krcp->lock, flags);
2966
2967	llist_for_each_safe(pos, n, page_list) {
2968	free_page((unsigned long)pos);
2969	freed++;
2970	}
2971
2972	return freed;
2973	}
2974
2975	static void
2976	kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
2977	struct kvfree_rcu_bulk_data bnode, int* idx)
2978	{
2979	unsigned long flags;
2980	int i;
2981
2982	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
2983	debug_rcu_bhead_unqueue(bhead: bnode);
2984	rcu_lock_acquire(map: &rcu_callback_map);
2985	if (idx == `0`) { // kmalloc() / kfree().
2986	trace_rcu_invoke_kfree_bulk_callback(
2987	rcuname: rcu_state.name, nr_records: bnode->nr_records,
2988	p: bnode->records);
2989
2990	kfree_bulk(size: bnode->nr_records, p: bnode->records);
2991	} else { // vmalloc() / vfree().
2992	for (i = `0`; i < bnode->nr_records; i++) {
2993	trace_rcu_invoke_kvfree_callback(
2994	rcuname: rcu_state.name, rhp: bnode->records[i], offset: `0`);
2995
2996	vfree(addr: bnode->records[i]);
2997	}
2998	}
2999	rcu_lock_release(map: &rcu_callback_map);
3000	}
3001
3002	raw_spin_lock_irqsave(&krcp->lock, flags);
3003	if (put_cached_bnode(krcp, bnode))
3004	bnode = NULL;
3005	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3006
3007	if (bnode)
3008	free_page((unsigned long) bnode);
3009
3010	cond_resched_tasks_rcu_qs();
3011	}
3012
3013	static void
3014	kvfree_rcu_list(struct rcu_head *head)
3015	{
3016	struct rcu_head *next;
3017
3018	for (; head; head = next) {
3019	void ptr = (void* *) head->func;
3020	unsigned long offset = (void *) head - ptr;
3021
3022	next = head->next;
3023	debug_rcu_head_unqueue(head: (struct rcu_head *)ptr);
3024	rcu_lock_acquire(map: &rcu_callback_map);
3025	trace_rcu_invoke_kvfree_callback(rcuname: rcu_state.name, rhp: head, offset);
3026
3027	if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
3028	kvfree(addr: ptr);
3029
3030	rcu_lock_release(map: &rcu_callback_map);
3031	cond_resched_tasks_rcu_qs();
3032	}
3033	}
3034
3035	/*
3036	* This function is invoked in workqueue context after a grace period.
3037	* It frees all the objects queued on ->bulk_head_free or ->head_free.
3038	*/
3039	static void kfree_rcu_work(struct work_struct *work)
3040	{
3041	unsigned long flags;
3042	struct kvfree_rcu_bulk_data bnode, n;
3043	struct list_head bulk_head[FREE_N_CHANNELS];
3044	struct rcu_head *head;
3045	struct kfree_rcu_cpu *krcp;
3046	struct kfree_rcu_cpu_work *krwp;
3047	struct rcu_gp_oldstate head_gp_snap;
3048	int i;
3049
3050	krwp = container_of(to_rcu_work(work),
3051	struct kfree_rcu_cpu_work, rcu_work);
3052	krcp = krwp->krcp;
3053
3054	raw_spin_lock_irqsave(&krcp->lock, flags);
3055	// Channels 1 and 2.
3056	for (i = `0`; i < FREE_N_CHANNELS; i++)
3057	list_replace_init(old: &krwp->bulk_head_free[i], new: &bulk_head[i]);
3058
3059	// Channel 3.
3060	head = krwp->head_free;
3061	krwp->head_free = NULL;
3062	head_gp_snap = krwp->head_free_gp_snap;
3063	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3064
3065	// Handle the first two channels.
3066	for (i = `0`; i < FREE_N_CHANNELS; i++) {
3067	// Start from the tail page, so a GP is likely passed for it.
3068	list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
3069	kvfree_rcu_bulk(krcp, bnode, idx: i);
3070	}
3071
3072	/*
3073	* This is used when the "bulk" path can not be used for the
3074	* double-argument of kvfree_rcu(). This happens when the
3075	* page-cache is empty, which means that objects are instead
3076	* queued on a linked list through their rcu_head structures.
3077	* This list is named "Channel 3".
3078	*/
3079	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
3080	kvfree_rcu_list(head);
3081	}
3082
3083	static bool
3084	need_offload_krc(struct kfree_rcu_cpu *krcp)
3085	{
3086	int i;
3087
3088	for (i = `0`; i < FREE_N_CHANNELS; i++)
3089	if (!list_empty(head: &krcp->bulk_head[i]))
3090	return true;
3091
3092	return !!READ_ONCE(krcp->head);
3093	}
3094
3095	static bool
3096	need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
3097	{
3098	int i;
3099
3100	for (i = `0`; i < FREE_N_CHANNELS; i++)
3101	if (!list_empty(head: &krwp->bulk_head_free[i]))
3102	return true;
3103
3104	return !!krwp->head_free;
3105	}
3106
3107	static int krc_count(struct kfree_rcu_cpu *krcp)
3108	{
3109	int sum = atomic_read(v: &krcp->head_count);
3110	int i;
3111
3112	for (i = `0`; i < FREE_N_CHANNELS; i++)
3113	sum += atomic_read(v: &krcp->bulk_count[i]);
3114
3115	return sum;
3116	}
3117
3118	static void
3119	schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
3120	{
3121	long delay, delay_left;
3122
3123	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? `1`:KFREE_DRAIN_JIFFIES;
3124	if (delayed_work_pending(&krcp->monitor_work)) {
3125	delay_left = krcp->monitor_work.timer.expires - jiffies;
3126	if (delay < delay_left)
3127	mod_delayed_work(wq: system_wq, dwork: &krcp->monitor_work, delay);
3128	return;
3129	}
3130	queue_delayed_work(wq: system_wq, dwork: &krcp->monitor_work, delay);
3131	}
3132
3133	static void
3134	kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
3135	{
3136	struct list_head bulk_ready[FREE_N_CHANNELS];
3137	struct kvfree_rcu_bulk_data bnode, n;
3138	struct rcu_head *head_ready = NULL;
3139	unsigned long flags;
3140	int i;
3141
3142	raw_spin_lock_irqsave(&krcp->lock, flags);
3143	for (i = `0`; i < FREE_N_CHANNELS; i++) {
3144	INIT_LIST_HEAD(list: &bulk_ready[i]);
3145
3146	list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
3147	if (!poll_state_synchronize_rcu_full(rgosp: &bnode->gp_snap))
3148	break;
3149
3150	atomic_sub(i: bnode->nr_records, v: &krcp->bulk_count[i]);
3151	list_move(list: &bnode->list, head: &bulk_ready[i]);
3152	}
3153	}
3154
3155	if (krcp->head && poll_state_synchronize_rcu(oldstate: krcp->head_gp_snap)) {
3156	head_ready = krcp->head;
3157	atomic_set(v: &krcp->head_count, i: `0`);
3158	WRITE_ONCE(krcp->head, NULL);
3159	}
3160	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3161
3162	for (i = `0`; i < FREE_N_CHANNELS; i++) {
3163	list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
3164	kvfree_rcu_bulk(krcp, bnode, idx: i);
3165	}
3166
3167	if (head_ready)
3168	kvfree_rcu_list(head: head_ready);
3169	}
3170
3171	/*
3172	* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
3173	*/
3174	static void kfree_rcu_monitor(struct work_struct *work)
3175	{
3176	struct kfree_rcu_cpu *krcp = container_of(work,
3177	struct kfree_rcu_cpu, monitor_work.work);
3178	unsigned long flags;
3179	int i, j;
3180
3181	// Drain ready for reclaim.
3182	kvfree_rcu_drain_ready(krcp);
3183
3184	raw_spin_lock_irqsave(&krcp->lock, flags);
3185
3186	// Attempt to start a new batch.
3187	for (i = `0`; i < KFREE_N_BATCHES; i++) {
3188	struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
3189
3190	// Try to detach bulk_head or head and attach it, only when
3191	// all channels are free. Any channel is not free means at krwp
3192	// there is on-going rcu work to handle krwp's free business.
3193	if (need_wait_for_krwp_work(krwp))
3194	continue;
3195
3196	// kvfree_rcu_drain_ready() might handle this krcp, if so give up.
3197	if (need_offload_krc(krcp)) {
3198	// Channel 1 corresponds to the SLAB-pointer bulk path.
3199	// Channel 2 corresponds to vmalloc-pointer bulk path.
3200	for (j = `0`; j < FREE_N_CHANNELS; j++) {
3201	if (list_empty(head: &krwp->bulk_head_free[j])) {
3202	atomic_set(v: &krcp->bulk_count[j], i: `0`);
3203	list_replace_init(old: &krcp->bulk_head[j],
3204	new: &krwp->bulk_head_free[j]);
3205	}
3206	}
3207
3208	// Channel 3 corresponds to both SLAB and vmalloc
3209	// objects queued on the linked list.
3210	if (!krwp->head_free) {
3211	krwp->head_free = krcp->head;
3212	get_state_synchronize_rcu_full(rgosp: &krwp->head_free_gp_snap);
3213	atomic_set(v: &krcp->head_count, i: `0`);
3214	WRITE_ONCE(krcp->head, NULL);
3215	}
3216
3217	// One work is per one batch, so there are three
3218	// "free channels", the batch can handle. It can
3219	// be that the work is in the pending state when
3220	// channels have been detached following by each
3221	// other.
3222	queue_rcu_work(wq: system_wq, rwork: &krwp->rcu_work);
3223	}
3224	}
3225
3226	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3227
3228	// If there is nothing to detach, it means that our job is
3229	// successfully done here. In case of having at least one
3230	// of the channels that is still busy we should rearm the
3231	// work to repeat an attempt. Because previous batches are
3232	// still in progress.
3233	if (need_offload_krc(krcp))
3234	schedule_delayed_monitor_work(krcp);
3235	}
3236
3237	static enum hrtimer_restart
3238	schedule_page_work_fn(struct hrtimer *t)
3239	{
3240	struct kfree_rcu_cpu *krcp =
3241	container_of(t, struct kfree_rcu_cpu, hrtimer);
3242
3243	queue_delayed_work(wq: system_highpri_wq, dwork: &krcp->page_cache_work, delay: `0`);
3244	return HRTIMER_NORESTART;
3245	}
3246
3247	static void fill_page_cache_func(struct work_struct *work)
3248	{
3249	struct kvfree_rcu_bulk_data *bnode;
3250	struct kfree_rcu_cpu *krcp =
3251	container_of(work, struct kfree_rcu_cpu,
3252	page_cache_work.work);
3253	unsigned long flags;
3254	int nr_pages;
3255	bool pushed;
3256	int i;
3257
3258	nr_pages = atomic_read(v: &krcp->backoff_page_cache_fill) ?
3259	`1` : rcu_min_cached_objs;
3260
3261	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
3262	bnode = (struct kvfree_rcu_bulk_data *)
3263	__get_free_page(GFP_KERNEL \| __GFP_NORETRY \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
3264
3265	if (!bnode)
3266	break;
3267
3268	raw_spin_lock_irqsave(&krcp->lock, flags);
3269	pushed = put_cached_bnode(krcp, bnode);
3270	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3271
3272	if (!pushed) {
3273	free_page((unsigned long) bnode);
3274	break;
3275	}
3276	}
3277
3278	atomic_set(v: &krcp->work_in_progress, i: `0`);
3279	atomic_set(v: &krcp->backoff_page_cache_fill, i: `0`);
3280	}
3281
3282	static void
3283	run_page_cache_worker(struct kfree_rcu_cpu *krcp)
3284	{
3285	// If cache disabled, bail out.
3286	if (!rcu_min_cached_objs)
3287	return;
3288
3289	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
3290	!atomic_xchg(v: &krcp->work_in_progress, new: `1`)) {
3291	if (atomic_read(v: &krcp->backoff_page_cache_fill)) {
3292	queue_delayed_work(wq: system_wq,
3293	dwork: &krcp->page_cache_work,
3294	delay: msecs_to_jiffies(m: rcu_delay_page_cache_fill_msec));
3295	} else {
3296	hrtimer_init(timer: &krcp->hrtimer, CLOCK_MONOTONIC, mode: HRTIMER_MODE_REL);
3297	krcp->hrtimer.function = schedule_page_work_fn;
3298	hrtimer_start(timer: &krcp->hrtimer, tim: `0`, mode: HRTIMER_MODE_REL);
3299	}
3300	}
3301	}
3302
3303	// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
3304	// state specified by flags. If can_alloc is true, the caller must
3305	// be schedulable and not be holding any locks or mutexes that might be
3306	// acquired by the memory allocator or anything that it might invoke.
3307	// Returns true if ptr was successfully recorded, else the caller must
3308	// use a fallback.
3309	static inline bool
3310	add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
3311	unsigned long flags, void* *ptr, bool can_alloc)
3312	{
3313	struct kvfree_rcu_bulk_data *bnode;
3314	int idx;
3315
3316	*krcp = krc_this_cpu_lock(flags);
3317	if (unlikely(!(*krcp)->initialized))
3318	return false;
3319
3320	idx = !!is_vmalloc_addr(x: ptr);
3321	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
3322	struct kvfree_rcu_bulk_data, list);
3323
3324	/ Check if a new block is required. /
3325	if (!bnode \|\| bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
3326	bnode = get_cached_bnode(krcp: *krcp);
3327	if (!bnode && can_alloc) {
3328	krc_this_cpu_unlock(krcp: krcp, flags: flags);
3329
3330	// __GFP_NORETRY - allows a light-weight direct reclaim
3331	// what is OK from minimizing of fallback hitting point of
3332	// view. Apart of that it forbids any OOM invoking what is
3333	// also beneficial since we are about to release memory soon.
3334	//
3335	// __GFP_NOMEMALLOC - prevents from consuming of all the
3336	// memory reserves. Please note we have a fallback path.
3337	//
3338	// __GFP_NOWARN - it is supposed that an allocation can
3339	// be failed under low memory or high memory pressure
3340	// scenarios.
3341	bnode = (struct kvfree_rcu_bulk_data *)
3342	__get_free_page(GFP_KERNEL \| __GFP_NORETRY \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
3343	raw_spin_lock_irqsave(&(krcp)->lock, flags);
3344	}
3345
3346	if (!bnode)
3347	return false;
3348
3349	// Initialize the new block and attach it.
3350	bnode->nr_records = `0`;
3351	list_add(new: &bnode->list, head: &(*krcp)->bulk_head[idx]);
3352	}
3353
3354	// Finally insert and update the GP for this page.
3355	bnode->records[bnode->nr_records++] = ptr;
3356	get_state_synchronize_rcu_full(rgosp: &bnode->gp_snap);
3357	atomic_inc(v: &(*krcp)->bulk_count[idx]);
3358
3359	return true;
3360	}
3361
3362	/*
3363	* Queue a request for lazy invocation of the appropriate free routine
3364	* after a grace period. Please note that three paths are maintained,
3365	* two for the common case using arrays of pointers and a third one that
3366	* is used only when the main paths cannot be used, for example, due to
3367	* memory pressure.
3368	*
3369	* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
3370	* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
3371	* be free'd in workqueue context. This allows us to: batch requests together to
3372	* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
3373	*/
3374	void kvfree_call_rcu(struct rcu_head head, void* *ptr)
3375	{
3376	unsigned long flags;
3377	struct kfree_rcu_cpu *krcp;
3378	bool success;
3379
3380	/*
3381	* Please note there is a limitation for the head-less
3382	* variant, that is why there is a clear rule for such
3383	* objects: it can be used from might_sleep() context
3384	* only. For other places please embed an rcu_head to
3385	* your data.
3386	*/
3387	if (!head)
3388	might_sleep();
3389
3390	// Queue the object but don't yet schedule the batch.
3391	if (debug_rcu_head_queue(head: ptr)) {
3392	// Probable double kfree_rcu(), just leak.
3393	WARN_ONCE(`1`, "%s(): Double-freed call. rcu_head %p\n",
3394	__func__, head);
3395
3396	// Mark as success and leave.
3397	return;
3398	}
3399
3400	kasan_record_aux_stack_noalloc(ptr);
3401	success = add_ptr_to_bulk_krc_lock(krcp: &krcp, flags: &flags, ptr, can_alloc: !head);
3402	if (!success) {
3403	run_page_cache_worker(krcp);
3404
3405	if (head == NULL)
3406	// Inline if kvfree_rcu(one_arg) call.
3407	goto unlock_return;
3408
3409	head->func = ptr;
3410	head->next = krcp->head;
3411	WRITE_ONCE(krcp->head, head);
3412	atomic_inc(v: &krcp->head_count);
3413
3414	// Take a snapshot for this krcp.
3415	krcp->head_gp_snap = get_state_synchronize_rcu();
3416	success = true;
3417	}
3418
3419	/*
3420	* The kvfree_rcu() caller considers the pointer freed at this point
3421	* and likely removes any references to it. Since the actual slab
3422	* freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
3423	* this object (no scanning or false positives reporting).
3424	*/
3425	kmemleak_ignore(ptr);
3426
3427	// Set timer to drain after KFREE_DRAIN_JIFFIES.
3428	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
3429	schedule_delayed_monitor_work(krcp);
3430
3431	unlock_return:
3432	krc_this_cpu_unlock(krcp, flags);
3433
3434	/*
3435	* Inline kvfree() after synchronize_rcu(). We can do
3436	* it from might_sleep() context only, so the current
3437	* CPU can pass the QS state.
3438	*/
3439	if (!success) {
3440	debug_rcu_head_unqueue(head: (struct rcu_head *) ptr);
3441	synchronize_rcu();
3442	kvfree(addr: ptr);
3443	}
3444	}
3445	EXPORT_SYMBOL_GPL(kvfree_call_rcu);
3446
3447	static unsigned long
3448	kfree_rcu_shrink_count(struct shrinker shrink, struct* shrink_control *sc)
3449	{
3450	int cpu;
3451	unsigned long count = `0`;
3452
3453	/ Snapshot count of all CPUs /
3454	for_each_possible_cpu(cpu) {
3455	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3456
3457	count += krc_count(krcp);
3458	count += READ_ONCE(krcp->nr_bkv_objs);
3459	atomic_set(v: &krcp->backoff_page_cache_fill, i: `1`);
3460	}
3461
3462	return count == `0` ? SHRINK_EMPTY : count;
3463	}
3464
3465	static unsigned long
3466	kfree_rcu_shrink_scan(struct shrinker shrink, struct* shrink_control *sc)
3467	{
3468	int cpu, freed = `0`;
3469
3470	for_each_possible_cpu(cpu) {
3471	int count;
3472	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3473
3474	count = krc_count(krcp);
3475	count += drain_page_cache(krcp);
3476	kfree_rcu_monitor(work: &krcp->monitor_work.work);
3477
3478	sc->nr_to_scan -= count;
3479	freed += count;
3480
3481	if (sc->nr_to_scan <= `0`)
3482	break;
3483	}
3484
3485	return freed == `0` ? SHRINK_STOP : freed;
3486	}
3487
3488	void __init kfree_rcu_scheduler_running(void)
3489	{
3490	int cpu;
3491
3492	for_each_possible_cpu(cpu) {
3493	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3494
3495	if (need_offload_krc(krcp))
3496	schedule_delayed_monitor_work(krcp);
3497	}
3498	}
3499
3500	/*
3501	* During early boot, any blocking grace-period wait automatically
3502	* implies a grace period.
3503	*
3504	* Later on, this could in theory be the case for kernels built with
3505	* CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
3506	* is not a common case. Furthermore, this optimization would cause
3507	* the rcu_gp_oldstate structure to expand by 50%, so this potential
3508	* grace-period optimization is ignored once the scheduler is running.
3509	*/
3510	static int rcu_blocking_is_gp(void)
3511	{
3512	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
3513	might_sleep();
3514	return false;
3515	}
3516	return true;
3517	}
3518
3519	/**
3520	* synchronize_rcu - wait until a grace period has elapsed.
3521	*
3522	* Control will return to the caller some time after a full grace
3523	* period has elapsed, in other words after all currently executing RCU
3524	* read-side critical sections have completed. Note, however, that
3525	* upon return from synchronize_rcu(), the caller might well be executing
3526	* concurrently with new RCU read-side critical sections that began while
3527	* synchronize_rcu() was waiting.
3528	*
3529	* RCU read-side critical sections are delimited by rcu_read_lock()
3530	* and rcu_read_unlock(), and may be nested. In addition, but only in
3531	* v5.0 and later, regions of code across which interrupts, preemption,
3532	* or softirqs have been disabled also serve as RCU read-side critical
3533	* sections. This includes hardware interrupt handlers, softirq handlers,
3534	* and NMI handlers.
3535	*
3536	* Note that this guarantee implies further memory-ordering guarantees.
3537	* On systems with more than one CPU, when synchronize_rcu() returns,
3538	* each CPU is guaranteed to have executed a full memory barrier since
3539	* the end of its last RCU read-side critical section whose beginning
3540	* preceded the call to synchronize_rcu(). In addition, each CPU having
3541	* an RCU read-side critical section that extends beyond the return from
3542	* synchronize_rcu() is guaranteed to have executed a full memory barrier
3543	* after the beginning of synchronize_rcu() and before the beginning of
3544	* that RCU read-side critical section. Note that these guarantees include
3545	* CPUs that are offline, idle, or executing in user mode, as well as CPUs
3546	* that are executing in the kernel.
3547	*
3548	* Furthermore, if CPU A invoked synchronize_rcu(), which returned
3549	* to its caller on CPU B, then both CPU A and CPU B are guaranteed
3550	* to have executed a full memory barrier during the execution of
3551	* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
3552	* again only if the system has more than one CPU).
3553	*
3554	* Implementation of these memory-ordering guarantees is described here:
3555	* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
3556	*/
3557	void synchronize_rcu(void)
3558	{
3559	unsigned long flags;
3560	struct rcu_node *rnp;
3561
3562	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) \|\|
3563	lock_is_held(&rcu_lock_map) \|\|
3564	lock_is_held(&rcu_sched_lock_map),
3565	"Illegal synchronize_rcu() in RCU read-side critical section");
3566	if (!rcu_blocking_is_gp()) {
3567	if (rcu_gp_is_expedited())
3568	synchronize_rcu_expedited();
3569	else
3570	wait_rcu_gp(call_rcu_hurry);
3571	return;
3572	}
3573
3574	// Context allows vacuous grace periods.
3575	// Note well that this code runs with !PREEMPT && !SMP.
3576	// In addition, all code that advances grace periods runs at
3577	// process level. Therefore, this normal GP overlaps with other
3578	// normal GPs only by being fully nested within them, which allows
3579	// reuse of ->gp_seq_polled_snap.
3580	rcu_poll_gp_seq_start_unlocked(snap: &rcu_state.gp_seq_polled_snap);
3581	rcu_poll_gp_seq_end_unlocked(snap: &rcu_state.gp_seq_polled_snap);
3582
3583	// Update the normal grace-period counters to record
3584	// this grace period, but only those used by the boot CPU.
3585	// The rcu_scheduler_starting() will take care of the rest of
3586	// these counters.
3587	local_irq_save(flags);
3588	WARN_ON_ONCE(num_online_cpus() > `1`);
3589	rcu_state.gp_seq += (`1` << RCU_SEQ_CTR_SHIFT);
3590	for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
3591	rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
3592	local_irq_restore(flags);
3593	}
3594	EXPORT_SYMBOL_GPL(synchronize_rcu);
3595
3596	/**
3597	* get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
3598	* @rgosp: Place to put state cookie
3599	*
3600	* Stores into @rgosp a value that will always be treated by functions
3601	* like poll_state_synchronize_rcu_full() as a cookie whose grace period
3602	* has already completed.
3603	*/
3604	void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
3605	{
3606	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
3607	rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
3608	}
3609	EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
3610
3611	/**
3612	* get_state_synchronize_rcu - Snapshot current RCU state
3613	*
3614	* Returns a cookie that is used by a later call to cond_synchronize_rcu()
3615	* or poll_state_synchronize_rcu() to determine whether or not a full
3616	* grace period has elapsed in the meantime.
3617	*/
3618	unsigned long get_state_synchronize_rcu(void)
3619	{
3620	/*
3621	* Any prior manipulation of RCU-protected data must happen
3622	* before the load from ->gp_seq.
3623	*/
3624	smp_mb(); / ^^^ /
3625	return rcu_seq_snap(sp: &rcu_state.gp_seq_polled);
3626	}
3627	EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
3628
3629	/**
3630	* get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
3631	* @rgosp: location to place combined normal/expedited grace-period state
3632	*
3633	* Places the normal and expedited grace-period states in @rgosp. This
3634	* state value can be passed to a later call to cond_synchronize_rcu_full()
3635	* or poll_state_synchronize_rcu_full() to determine whether or not a
3636	* grace period (whether normal or expedited) has elapsed in the meantime.
3637	* The rcu_gp_oldstate structure takes up twice the memory of an unsigned
3638	* long, but is guaranteed to see all grace periods. In contrast, the
3639	* combined state occupies less memory, but can sometimes fail to take
3640	* grace periods into account.
3641	*
3642	* This does not guarantee that the needed grace period will actually
3643	* start.
3644	*/
3645	void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
3646	{
3647	struct rcu_node *rnp = rcu_get_root();
3648
3649	/*
3650	* Any prior manipulation of RCU-protected data must happen
3651	* before the loads from ->gp_seq and ->expedited_sequence.
3652	*/
3653	smp_mb(); / ^^^ /
3654	rgosp->rgos_norm = rcu_seq_snap(sp: &rnp->gp_seq);
3655	rgosp->rgos_exp = rcu_seq_snap(sp: &rcu_state.expedited_sequence);
3656	}
3657	EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
3658
3659	/*
3660	* Helper function for start_poll_synchronize_rcu() and
3661	* start_poll_synchronize_rcu_full().
3662	*/
3663	static void start_poll_synchronize_rcu_common(void)
3664	{
3665	unsigned long flags;
3666	bool needwake;
3667	struct rcu_data *rdp;
3668	struct rcu_node *rnp;
3669
3670	lockdep_assert_irqs_enabled();
3671	local_irq_save(flags);
3672	rdp = this_cpu_ptr(&rcu_data);
3673	rnp = rdp->mynode;
3674	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
3675	// Note it is possible for a grace period to have elapsed between
3676	// the above call to get_state_synchronize_rcu() and the below call
3677	// to rcu_seq_snap. This is OK, the worst that happens is that we
3678	// get a grace period that no one needed. These accesses are ordered
3679	// by smp_mb(), and we are accessing them in the opposite order
3680	// from which they are updated at grace-period start, as required.
3681	needwake = rcu_start_this_gp(rnp_start: rnp, rdp, gp_seq_req: rcu_seq_snap(sp: &rcu_state.gp_seq));
3682	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3683	if (needwake)
3684	rcu_gp_kthread_wake();
3685	}
3686
/**
 * start_poll_synchronize_rcu - Snapshot and start RCU grace period
 *
 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
 * or poll_state_synchronize_rcu() to determine whether or not a full
 * grace period has elapsed in the meantime.  If the needed grace period
 * is not already slated to start, notifies RCU core of the need for that
 * grace period.
 *
 * Interrupts must be enabled for the case where it is necessary to awaken
 * the grace-period kthread.
 */
unsigned long start_poll_synchronize_rcu(void)
{
	unsigned long gp_seq = get_state_synchronize_rcu();

	start_poll_synchronize_rcu_common();
	return gp_seq;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
3707
/**
 * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
 * @rgosp: location to place combined normal/expedited grace-period state
 *
 * Places the normal and expedited grace-period states in *@rgosp.  This
 * state value can be passed to a later call to cond_synchronize_rcu_full()
 * or poll_state_synchronize_rcu_full() to determine whether or not a
 * grace period (whether normal or expedited) has elapsed in the meantime.
 * If the needed grace period is not already slated to start, notifies
 * RCU core of the need for that grace period.
 *
 * Interrupts must be enabled for the case where it is necessary to awaken
 * the grace-period kthread.
 */
void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	get_state_synchronize_rcu_full(rgosp);

	start_poll_synchronize_rcu_common();
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
3729
3730	/**
3731	* poll_state_synchronize_rcu - Has the specified RCU grace period completed?
3732	* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
3733	*
3734	* If a full RCU grace period has elapsed since the earlier call from
3735	* which @oldstate was obtained, return @true, otherwise return @false.
3736	* If @false is returned, it is the caller's responsibility to invoke this
3737	* function later on until it does return @true. Alternatively, the caller
3738	* can explicitly wait for a grace period, for example, by passing @oldstate
3739	* to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
3740	* on the one hand or by directly invoking either synchronize_rcu() or
3741	* synchronize_rcu_expedited() on the other.
3742	*
3743	* Yes, this function does not take counter wrap into account.
3744	* But counter wrap is harmless. If the counter wraps, we have waited for
3745	* more than a billion grace periods (and way more on a 64-bit system!).
3746	* Those needing to keep old state values for very long time periods
3747	* (many hours even on 32-bit systems) should check them occasionally and
3748	* either refresh them or set a flag indicating that the grace period has
3749	* completed. Alternatively, they can use get_completed_synchronize_rcu()
3750	* to get a guaranteed-completed grace-period state.
3751	*
3752	* In addition, because oldstate compresses the grace-period state for
3753	* both normal and expedited grace periods into a single unsigned long,
3754	* it can miss a grace period when synchronize_rcu() runs concurrently
3755	* with synchronize_rcu_expedited(). If this is unacceptable, please
3756	* instead use the _full() variant of these polling APIs.
3757	*
3758	* This function provides the same memory-ordering guarantees that
3759	* would be provided by a synchronize_rcu() that was invoked at the call
3760	* to the function that provided @oldstate, and that returned at the end
3761	* of this function.
3762	*/
3763	bool poll_state_synchronize_rcu(unsigned long oldstate)
3764	{
3765	if (oldstate == RCU_GET_STATE_COMPLETED \|\|
3766	rcu_seq_done_exact(sp: &rcu_state.gp_seq_polled, s: oldstate)) {
3767	smp_mb(); / Ensure GP ends before subsequent accesses. /
3768	return true;
3769	}
3770	return false;
3771	}
3772	EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
3773
3774	/**
3775	* poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
3776	* @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
3777	*
3778	* If a full RCU grace period has elapsed since the earlier call from
3779	* which *rgosp was obtained, return @true, otherwise return @false.
3780	* If @false is returned, it is the caller's responsibility to invoke this
3781	* function later on until it does return @true. Alternatively, the caller
3782	* can explicitly wait for a grace period, for example, by passing @rgosp
3783	* to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
3784	*
3785	* Yes, this function does not take counter wrap into account.
3786	* But counter wrap is harmless. If the counter wraps, we have waited
3787	* for more than a billion grace periods (and way more on a 64-bit
3788	* system!). Those needing to keep rcu_gp_oldstate values for very
3789	* long time periods (many hours even on 32-bit systems) should check
3790	* them occasionally and either refresh them or set a flag indicating
3791	* that the grace period has completed. Alternatively, they can use
3792	* get_completed_synchronize_rcu_full() to get a guaranteed-completed
3793	* grace-period state.
3794	*
3795	* This function provides the same memory-ordering guarantees that would
3796	* be provided by a synchronize_rcu() that was invoked at the call to
3797	* the function that provided @rgosp, and that returned at the end of this
3798	* function. And this guarantee requires that the root rcu_node structure's
3799	* ->gp_seq field be checked instead of that of the rcu_state structure.
3800	* The problem is that the just-ending grace-period's callbacks can be
3801	* invoked between the time that the root rcu_node structure's ->gp_seq
3802	* field is updated and the time that the rcu_state structure's ->gp_seq
3803	* field is updated. Therefore, if a single synchronize_rcu() is to
3804	* cause a subsequent poll_state_synchronize_rcu_full() to return @true,
3805	* then the root rcu_node structure is the one that needs to be polled.
3806	*/
3807	bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
3808	{
3809	struct rcu_node *rnp = rcu_get_root();
3810
3811	smp_mb(); // Order against root rcu_node structure grace-period cleanup.
3812	if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED \|\|
3813	rcu_seq_done_exact(sp: &rnp->gp_seq, s: rgosp->rgos_norm) \|\|
3814	rgosp->rgos_exp == RCU_GET_STATE_COMPLETED \|\|
3815	rcu_seq_done_exact(sp: &rcu_state.expedited_sequence, s: rgosp->rgos_exp)) {
3816	smp_mb(); / Ensure GP ends before subsequent accesses. /
3817	return true;
3818	}
3819	return false;
3820	}
3821	EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
3822
/**
 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
 * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
 *
 * If a full RCU grace period has elapsed since the earlier call to
 * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
 * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
 *
 * Yes, this function does not take counter wrap into account.
 * But counter wrap is harmless.  If the counter wraps, we have waited for
 * more than 2 billion grace periods (and way more on a 64-bit system!),
 * so waiting for a couple of additional grace periods should be just fine.
 *
 * This function provides the same memory-ordering guarantees that
 * would be provided by a synchronize_rcu() that was invoked at the call
 * to the function that provided @oldstate and that returned at the end
 * of this function.
 */
void cond_synchronize_rcu(unsigned long oldstate)
{
	if (!poll_state_synchronize_rcu(oldstate))
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
3847
/**
 * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
 * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
 *
 * If a full RCU grace period has elapsed since the call to
 * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
 * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
 * obtained, just return.  Otherwise, invoke synchronize_rcu() to wait
 * for a full grace period.
 *
 * Yes, this function does not take counter wrap into account.
 * But counter wrap is harmless.  If the counter wraps, we have waited for
 * more than 2 billion grace periods (and way more on a 64-bit system!),
 * so waiting for a couple of additional grace periods should be just fine.
 *
 * This function provides the same memory-ordering guarantees that
 * would be provided by a synchronize_rcu() that was invoked at the call
 * to the function that provided @rgosp and that returned at the end of
 * this function.
 */
void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	if (!poll_state_synchronize_rcu_full(rgosp))
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
3874
3875	/*
3876	* Check to see if there is any immediate RCU-related work to be done by
3877	* the current CPU, returning 1 if so and zero otherwise. The checks are
3878	* in order of increasing expense: checks that can be carried out against
3879	* CPU-local state are performed first. However, we must check for CPU
3880	* stalls first, else we might not get a chance.
3881	*/
3882	static int rcu_pending(int user)
3883	{
3884	bool gp_in_progress;
3885	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
3886	struct rcu_node *rnp = rdp->mynode;
3887
3888	lockdep_assert_irqs_disabled();
3889
3890	/ Check for CPU stalls, if enabled. /
3891	check_cpu_stall(rdp);
3892
3893	/ Does this CPU need a deferred NOCB wakeup? /
3894	if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
3895	return `1`;
3896
3897	/ Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) /
3898	if ((user \|\| rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
3899	return `0`;
3900
3901	/ Is the RCU core waiting for a quiescent state from this CPU? /
3902	gp_in_progress = rcu_gp_in_progress();
3903	if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
3904	return `1`;
3905
3906	/ Does this CPU have callbacks ready to invoke? /
3907	if (!rcu_rdp_is_offloaded(rdp) &&
3908	rcu_segcblist_ready_cbs(rsclp: &rdp->cblist))
3909	return `1`;
3910
3911	/ Has RCU gone idle with this CPU needing another grace period? /
3912	if (!gp_in_progress && rcu_segcblist_is_enabled(rsclp: &rdp->cblist) &&
3913	!rcu_rdp_is_offloaded(rdp) &&
3914	!rcu_segcblist_restempty(rsclp: &rdp->cblist, RCU_NEXT_READY_TAIL))
3915	return `1`;
3916
3917	/ Have RCU grace period completed or started? /
3918	if (rcu_seq_current(sp: &rnp->gp_seq) != rdp->gp_seq \|\|
3919	unlikely(READ_ONCE(rdp->gpwrap))) / outside lock /
3920	return `1`;
3921
3922	/ nothing to do /
3923	return `0`;
3924	}
3925
3926	/*
3927	* Helper function for rcu_barrier() tracing. If tracing is disabled,
3928	* the compiler is expected to optimize this away.
3929	*/
3930	static void rcu_barrier_trace(const char s, int* cpu, unsigned long done)
3931	{
3932	trace_rcu_barrier(rcuname: rcu_state.name, s, cpu,
3933	cnt: atomic_read(v: &rcu_state.barrier_cpu_count), done);
3934	}
3935
3936	/*
3937	* RCU callback function for rcu_barrier(). If we are last, wake
3938	* up the task executing rcu_barrier().
3939	*
3940	* Note that the value of rcu_state.barrier_sequence must be captured
3941	* before the atomic_dec_and_test(). Otherwise, if this CPU is not last,
3942	* other CPUs might count the value down to zero before this CPU gets
3943	* around to invoking rcu_barrier_trace(), which might result in bogus
3944	* data from the next instance of rcu_barrier().
3945	*/
3946	static void rcu_barrier_callback(struct rcu_head *rhp)
3947	{
3948	unsigned long __maybe_unused s = rcu_state.barrier_sequence;
3949
3950	if (atomic_dec_and_test(v: &rcu_state.barrier_cpu_count)) {
3951	rcu_barrier_trace(TPS("LastCB"), cpu: -`1`, done: s);
3952	complete(&rcu_state.barrier_completion);
3953	} else {
3954	rcu_barrier_trace(TPS("CB"), cpu: -`1`, done: s);
3955	}
3956	}
3957
3958	/*
3959	* If needed, entrain an rcu_barrier() callback on rdp->cblist.
3960	*/
3961	static void rcu_barrier_entrain(struct rcu_data *rdp)
3962	{
3963	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
3964	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
3965	bool wake_nocb = false;
3966	bool was_alldone = false;
3967
3968	lockdep_assert_held(&rcu_state.barrier_lock);
3969	if (rcu_seq_state(s: lseq) \|\| !rcu_seq_state(s: gseq) \|\| rcu_seq_ctr(s: lseq) != rcu_seq_ctr(s: gseq))
3970	return;
3971	rcu_barrier_trace(TPS("IRQ"), cpu: -`1`, done: rcu_state.barrier_sequence);
3972	rdp->barrier_head.func = rcu_barrier_callback;
3973	debug_rcu_head_queue(head: &rdp->barrier_head);
3974	rcu_nocb_lock(rdp);
3975	/*
3976	* Flush bypass and wakeup rcuog if we add callbacks to an empty regular
3977	* queue. This way we don't wait for bypass timer that can reach seconds
3978	* if it's fully lazy.
3979	*/
3980	was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(rsclp: &rdp->cblist);
3981	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
3982	wake_nocb = was_alldone && rcu_segcblist_pend_cbs(rsclp: &rdp->cblist);
3983	if (rcu_segcblist_entrain(rsclp: &rdp->cblist, rhp: &rdp->barrier_head)) {
3984	atomic_inc(v: &rcu_state.barrier_cpu_count);
3985	} else {
3986	debug_rcu_head_unqueue(head: &rdp->barrier_head);
3987	rcu_barrier_trace(TPS("IRQNQ"), cpu: -`1`, done: rcu_state.barrier_sequence);
3988	}
3989	rcu_nocb_unlock(rdp);
3990	if (wake_nocb)
3991	wake_nocb_gp(rdp, force: false);
3992	smp_store_release(&rdp->barrier_seq_snap, gseq);
3993	}
3994
3995	/*
3996	* Called with preemption disabled, and from cross-cpu IRQ context.
3997	*/
3998	static void rcu_barrier_handler(void *cpu_in)
3999	{
4000	uintptr_t cpu = (uintptr_t)cpu_in;
4001	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4002
4003	lockdep_assert_irqs_disabled();
4004	WARN_ON_ONCE(cpu != rdp->cpu);
4005	WARN_ON_ONCE(cpu != smp_processor_id());
4006	raw_spin_lock(&rcu_state.barrier_lock);
4007	rcu_barrier_entrain(rdp);
4008	raw_spin_unlock(&rcu_state.barrier_lock);
4009	}
4010
4011	/**
4012	* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
4013	*
4014	* Note that this primitive does not necessarily wait for an RCU grace period
4015	* to complete. For example, if there are no RCU callbacks queued anywhere
4016	* in the system, then rcu_barrier() is within its rights to return
4017	* immediately, without waiting for anything, much less an RCU grace period.
4018	*/
4019	void rcu_barrier(void)
4020	{
4021	uintptr_t cpu;
4022	unsigned long flags;
4023	unsigned long gseq;
4024	struct rcu_data *rdp;
4025	unsigned long s = rcu_seq_snap(sp: &rcu_state.barrier_sequence);
4026
4027	rcu_barrier_trace(TPS("Begin"), cpu: -`1`, done: s);
4028
4029	/ Take mutex to serialize concurrent rcu_barrier() requests. /
4030	mutex_lock(&rcu_state.barrier_mutex);
4031
4032	/ Did someone else do our work for us? /
4033	if (rcu_seq_done(sp: &rcu_state.barrier_sequence, s)) {
4034	rcu_barrier_trace(TPS("EarlyExit"), cpu: -`1`, done: rcu_state.barrier_sequence);
4035	smp_mb(); / caller's subsequent code after above check. /
4036	mutex_unlock(lock: &rcu_state.barrier_mutex);
4037	return;
4038	}
4039
4040	/ Mark the start of the barrier operation. /
4041	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
4042	rcu_seq_start(sp: &rcu_state.barrier_sequence);
4043	gseq = rcu_state.barrier_sequence;
4044	rcu_barrier_trace(TPS("Inc1"), cpu: -`1`, done: rcu_state.barrier_sequence);
4045
4046	/*
4047	* Initialize the count to two rather than to zero in order
4048	* to avoid a too-soon return to zero in case of an immediate
4049	* invocation of the just-enqueued callback (or preemption of
4050	* this task). Exclude CPU-hotplug operations to ensure that no
4051	* offline non-offloaded CPU has callbacks queued.
4052	*/
4053	init_completion(x: &rcu_state.barrier_completion);
4054	atomic_set(v: &rcu_state.barrier_cpu_count, i: `2`);
4055	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4056
4057	/*
4058	* Force each CPU with callbacks to register a new callback.
4059	* When that callback is invoked, we will know that all of the
4060	* corresponding CPU's preceding callbacks have been invoked.
4061	*/
4062	for_each_possible_cpu(cpu) {
4063	rdp = per_cpu_ptr(&rcu_data, cpu);
4064	retry:
4065	if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
4066	continue;
4067	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
4068	if (!rcu_segcblist_n_cbs(rsclp: &rdp->cblist)) {
4069	WRITE_ONCE(rdp->barrier_seq_snap, gseq);
4070	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4071	rcu_barrier_trace(TPS("NQ"), cpu, done: rcu_state.barrier_sequence);
4072	continue;
4073	}
4074	if (!rcu_rdp_cpu_online(rdp)) {
4075	rcu_barrier_entrain(rdp);
4076	WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
4077	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4078	rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, done: rcu_state.barrier_sequence);
4079	continue;
4080	}
4081	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4082	if (smp_call_function_single(cpuid: cpu, func: rcu_barrier_handler, info: (void *)cpu, wait: `1`)) {
4083	schedule_timeout_uninterruptible(timeout: `1`);
4084	goto retry;
4085	}
4086	WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
4087	rcu_barrier_trace(TPS("OnlineQ"), cpu, done: rcu_state.barrier_sequence);
4088	}
4089
4090	/*
4091	* Now that we have an rcu_barrier_callback() callback on each
4092	* CPU, and thus each counted, remove the initial count.
4093	*/
4094	if (atomic_sub_and_test(i: `2`, v: &rcu_state.barrier_cpu_count))
4095	complete(&rcu_state.barrier_completion);
4096
4097	/ Wait for all rcu_barrier_callback() callbacks to be invoked. /
4098	wait_for_completion(&rcu_state.barrier_completion);
4099
4100	/ Mark the end of the barrier operation. /
4101	rcu_barrier_trace(TPS("Inc2"), cpu: -`1`, done: rcu_state.barrier_sequence);
4102	rcu_seq_end(sp: &rcu_state.barrier_sequence);
4103	gseq = rcu_state.barrier_sequence;
4104	for_each_possible_cpu(cpu) {
4105	rdp = per_cpu_ptr(&rcu_data, cpu);
4106
4107	WRITE_ONCE(rdp->barrier_seq_snap, gseq);
4108	}
4109
4110	/ Other rcu_barrier() invocations can now safely proceed. /
4111	mutex_unlock(lock: &rcu_state.barrier_mutex);
4112	}
4113	EXPORT_SYMBOL_GPL(rcu_barrier);
4114
4115	static unsigned long rcu_barrier_last_throttle;
4116
4117	/**
4118	* rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second
4119	*
4120	* This can be thought of as guard rails around rcu_barrier() that
4121	* permits unrestricted userspace use, at least assuming the hardware's
4122	* try_cmpxchg() is robust. There will be at most one call per second to
4123	* rcu_barrier() system-wide from use of this function, which means that
4124	* callers might needlessly wait a second or three.
4125	*
4126	* This is intended for use by test suites to avoid OOM by flushing RCU
4127	* callbacks from the previous test before starting the next. See the
4128	* rcutree.do_rcu_barrier module parameter for more information.
4129	*
4130	* Why not simply make rcu_barrier() more scalable? That might be
4131	* the eventual endpoint, but let's keep it simple for the time being.
4132	* Note that the module parameter infrastructure serializes calls to a
4133	* given .set() function, but should concurrent .set() invocation ever be
4134	* possible, we are ready!
4135	*/
4136	static void rcu_barrier_throttled(void)
4137	{
4138	unsigned long j = jiffies;
4139	unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
4140	unsigned long s = rcu_seq_snap(sp: &rcu_state.barrier_sequence);
4141
4142	while (time_in_range(j, old, old + HZ / `16`) \|\|
4143	!try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
4144	schedule_timeout_idle(HZ / `16`);
4145	if (rcu_seq_done(sp: &rcu_state.barrier_sequence, s)) {
4146	smp_mb(); / caller's subsequent code after above check. /
4147	return;
4148	}
4149	j = jiffies;
4150	old = READ_ONCE(rcu_barrier_last_throttle);
4151	}
4152	rcu_barrier();
4153	}
4154
4155	/*
4156	* Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
4157	* request arrives. We insist on a true value to allow for possible
4158	* future expansion.
4159	*/
4160	static int param_set_do_rcu_barrier(const char val, const* struct kernel_param *kp)
4161	{
4162	bool b;
4163	int ret;
4164
4165	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
4166	return -EAGAIN;
4167	ret = kstrtobool(s: val, res: &b);
4168	if (!ret && b) {
4169	atomic_inc(v: (atomic_t *)kp->arg);
4170	rcu_barrier_throttled();
4171	atomic_dec(v: (atomic_t *)kp->arg);
4172	}
4173	return ret;
4174	}
4175
4176	/*
4177	* Output the number of outstanding rcutree.do_rcu_barrier requests.
4178	*/
4179	static int param_get_do_rcu_barrier(char buffer, const* struct kernel_param *kp)
4180	{
4181	return sprintf(buf: buffer, fmt: "%d\n", atomic_read(v: (atomic_t *)kp->arg));
4182	}
4183
4184	static const struct kernel_param_ops do_rcu_barrier_ops = {
4185	.set = param_set_do_rcu_barrier,
4186	.get = param_get_do_rcu_barrier,
4187	};
4188	static atomic_t do_rcu_barrier;
4189	module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, `0644`);
4190
4191	/*
4192	* Compute the mask of online CPUs for the specified rcu_node structure.
4193	* This will not be stable unless the rcu_node structure's ->lock is
4194	* held, but the bit corresponding to the current CPU will be stable
4195	* in most contexts.
4196	*/
4197	static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
4198	{
4199	return READ_ONCE(rnp->qsmaskinitnext);
4200	}
4201
4202	/*
4203	* Is the CPU corresponding to the specified rcu_data structure online
4204	* from RCU's perspective? This perspective is given by that structure's
4205	* ->qsmaskinitnext field rather than by the global cpu_online_mask.
4206	*/
4207	static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
4208	{
4209	return !!(rdp->grpmask & rcu_rnp_online_cpus(rnp: rdp->mynode));
4210	}
4211
4212	bool rcu_cpu_online(int cpu)
4213	{
4214	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4215
4216	return rcu_rdp_cpu_online(rdp);
4217	}
4218
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

/*
 * Is the current CPU online as far as RCU is concerned?
 *
 * Disable preemption to avoid false positives that could otherwise
 * happen due to the current CPU number being sampled, this task being
 * preempted, its old CPU being taken offline, resuming on some other CPU,
 * then determining that its old CPU is now offline.
 *
 * Disable checking if in an NMI handler because we cannot safely
 * report errors from NMI handlers anyway.  In addition, it is OK to use
 * RCU on an offline processor during initial boot, hence the check for
 * rcu_scheduler_fully_active.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	bool ret = false;

	if (in_nmi() || !rcu_scheduler_fully_active)
		return true;
	preempt_disable_notrace();
	rdp = this_cpu_ptr(&rcu_data);
	/*
	 * Strictly, we care here about the case where the current CPU is
	 * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
	 * not being up to date. So arch_spin_is_locked() might have a
	 * false positive if it's held by some other CPU, but that's
	 * OK because that just means a false negative on the warning.
	 */
	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
		ret = true;
	preempt_enable_notrace();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
4258
4259	// Has rcu_init() been invoked? This is used (for example) to determine
4260	// whether spinlocks may be acquired safely.
4261	static bool rcu_init_invoked(void)
4262	{
4263	return !!rcu_state.n_online_cpus;
4264	}
4265
4266	/*
4267	* All CPUs for the specified rcu_node structure have gone offline,
4268	* and all tasks that were preempted within an RCU read-side critical
4269	* section while running on one of those CPUs have since exited their RCU
4270	* read-side critical section. Some other CPU is reporting this fact with
4271	* the specified rcu_node structure's ->lock held and interrupts disabled.
4272	* This function therefore goes up the tree of rcu_node structures,
4273	* clearing the corresponding bits in the ->qsmaskinit fields. Note that
4274	* the leaf rcu_node structure's ->qsmaskinit field has already been
4275	* updated.
4276	*
4277	* This function does check that the specified rcu_node structure has
4278	* all CPUs offline and no blocked tasks, so it is OK to invoke it
4279	* prematurely. That said, invoking it after the fact will cost you
4280	* a needless lock acquisition. So once it has done its work, don't
4281	* invoke it again.
4282	*/
4283	static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
4284	{
4285	long mask;
4286	struct rcu_node *rnp = rnp_leaf;
4287
4288	raw_lockdep_assert_held_rcu_node(rnp_leaf);
4289	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) \|\|
4290	WARN_ON_ONCE(rnp_leaf->qsmaskinit) \|\|
4291	WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
4292	return;
4293	for (;;) {
4294	mask = rnp->grpmask;
4295	rnp = rnp->parent;
4296	if (!rnp)
4297	break;
4298	raw_spin_lock_rcu_node(rnp); / irqs already disabled. /
4299	rnp->qsmaskinit &= ~mask;
4300	/ Between grace periods, so better already be zero! /
4301	WARN_ON_ONCE(rnp->qsmask);
4302	if (rnp->qsmaskinit) {
4303	raw_spin_unlock_rcu_node(rnp);
4304	/ irqs remain disabled. /
4305	return;
4306	}
4307	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
4308	}
4309	}
4310
4311	/*
4312	* Propagate ->qsinitmask bits up the rcu_node tree to account for the
4313	* first CPU in a given leaf rcu_node structure coming online. The caller
4314	* must hold the corresponding leaf rcu_node ->lock with interrupts
4315	* disabled.
4316	*/
4317	static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
4318	{
4319	long mask;
4320	long oldmask;
4321	struct rcu_node *rnp = rnp_leaf;
4322
4323	raw_lockdep_assert_held_rcu_node(rnp_leaf);
4324	WARN_ON_ONCE(rnp->wait_blkd_tasks);
4325	for (;;) {
4326	mask = rnp->grpmask;
4327	rnp = rnp->parent;
4328	if (rnp == NULL)
4329	return;
4330	raw_spin_lock_rcu_node(rnp); / Interrupts already disabled. /
4331	oldmask = rnp->qsmaskinit;
4332	rnp->qsmaskinit \|= mask;
4333	raw_spin_unlock_rcu_node(rnp); / Interrupts remain disabled. /
4334	if (oldmask)
4335	return;
4336	}
4337	}
4338
4339	/*
4340	* Do boot-time initialization of a CPU's per-CPU RCU data.
4341	*/
4342	static void __init
4343	rcu_boot_init_percpu_data(int cpu)
4344	{
4345	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
4346	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4347
4348	/ Set up local state, ensuring consistent view of global state. /
4349	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
4350	INIT_WORK(&rdp->strict_work, strict_work_handler);
4351	WARN_ON_ONCE(ct->dynticks_nesting != `1`);
4352	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
4353	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
4354	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
4355	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
4356	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
4357	rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
4358	rdp->last_sched_clock = jiffies;
4359	rdp->cpu = cpu;
4360	rcu_boot_init_nocb_percpu_data(rdp);
4361	}
4362
4363	/*
4364	* Invoked early in the CPU-online process, when pretty much all services
4365	* are available. The incoming CPU is not present.
4366	*
4367	* Initializes a CPU's per-CPU RCU data. Note that only one online or
4368	* offline event can be happening at a given time. Note also that we can
4369	* accept some slop in the rsp->gp_seq access due to the fact that this
4370	* CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
4371	* And any offloaded callbacks are being numbered elsewhere.
4372	*/
4373	int rcutree_prepare_cpu(unsigned int cpu)
4374	{
4375	unsigned long flags;
4376	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
4377	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4378	struct rcu_node *rnp = rcu_get_root();
4379
4380	/ Set up local state, ensuring consistent view of global state. /
4381	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4382	rdp->qlen_last_fqs_check = `0`;
4383	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
4384	rdp->blimit = blimit;
4385	ct->dynticks_nesting = `1`; / CPU not up, no tearing. /
4386	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
4387
4388	/*
4389	* Only non-NOCB CPUs that didn't have early-boot callbacks need to be
4390	* (re-)initialized.
4391	*/
4392	if (!rcu_segcblist_is_enabled(rsclp: &rdp->cblist))
4393	rcu_segcblist_init(rsclp: &rdp->cblist); / Re-enable callbacks. /
4394
4395	/*
4396	* Add CPU to leaf rcu_node pending-online bitmask. Any needed
4397	* propagation up the rcu_node tree will happen at the beginning
4398	* of the next grace period.
4399	*/
4400	rnp = rdp->mynode;
4401	raw_spin_lock_rcu_node(rnp); / irqs already disabled. /
4402	rdp->gp_seq = READ_ONCE(rnp->gp_seq);
4403	rdp->gp_seq_needed = rdp->gp_seq;
4404	rdp->cpu_no_qs.b.norm = true;
4405	rdp->core_needs_qs = false;
4406	rdp->rcu_iw_pending = false;
4407	rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
4408	rdp->rcu_iw_gp_seq = rdp->gp_seq - `1`;
4409	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, TPS("cpuonl"));
4410	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4411	rcu_spawn_one_boost_kthread(rnp);
4412	rcu_spawn_cpu_nocb_kthread(cpu);
4413	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + `1`);
4414
4415	return `0`;
4416	}
4417
4418	/*
4419	* Update RCU priority boot kthread affinity for CPU-hotplug changes.
4420	*/
4421	static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
4422	{
4423	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4424
4425	rcu_boost_kthread_setaffinity(rnp: rdp->mynode, outgoingcpu: outgoing);
4426	}
4427
4428	/*
4429	* Has the specified (known valid) CPU ever been fully online?
4430	*/
4431	bool rcu_cpu_beenfullyonline(int cpu)
4432	{
4433	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4434
4435	return smp_load_acquire(&rdp->beenonline);
4436	}
4437
4438	/*
4439	* Near the end of the CPU-online process. Pretty much all services
4440	* enabled, and the CPU is now very much alive.
4441	*/
4442	int rcutree_online_cpu(unsigned int cpu)
4443	{
4444	unsigned long flags;
4445	struct rcu_data *rdp;
4446	struct rcu_node *rnp;
4447
4448	rdp = per_cpu_ptr(&rcu_data, cpu);
4449	rnp = rdp->mynode;
4450	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4451	rnp->ffmask \|= rdp->grpmask;
4452	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4453	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
4454	return `0`; / Too early in boot for scheduler work. /
4455	sync_sched_exp_online_cleanup(cpu);
4456	rcutree_affinity_setting(cpu, outgoing: -`1`);
4457
4458	// Stop-machine done, so allow nohz_full to disable tick.
4459	tick_dep_clear(bit: TICK_DEP_BIT_RCU);
4460	return `0`;
4461	}
4462
4463	/*
4464	* Mark the specified CPU as being online so that subsequent grace periods
4465	* (both expedited and normal) will wait on it. Note that this means that
4466	* incoming CPUs are not allowed to use RCU read-side critical sections
4467	* until this function is called. Failing to observe this restriction
4468	* will result in lockdep splats.
4469	*
4470	* Note that this function is special in that it is invoked directly
4471	* from the incoming CPU rather than from the cpuhp_step mechanism.
4472	* This is because this function must be invoked at a precise location.
4473	* This incoming CPU must not have enabled interrupts yet.
4474	*
4475	* This mirrors the effects of rcutree_report_cpu_dead().
4476	*/
4477	void rcutree_report_cpu_starting(unsigned int cpu)
4478	{
4479	unsigned long mask;
4480	struct rcu_data *rdp;
4481	struct rcu_node *rnp;
4482	bool newcpu;
4483
4484	lockdep_assert_irqs_disabled();
4485	rdp = per_cpu_ptr(&rcu_data, cpu);
4486	if (rdp->cpu_started)
4487	return;
4488	rdp->cpu_started = true;
4489
4490	rnp = rdp->mynode;
4491	mask = rdp->grpmask;
4492	arch_spin_lock(&rcu_state.ofl_lock);
4493	rcu_dynticks_eqs_online();
4494	raw_spin_lock(&rcu_state.barrier_lock);
4495	raw_spin_lock_rcu_node(rnp);
4496	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext \| mask);
4497	raw_spin_unlock(&rcu_state.barrier_lock);
4498	newcpu = !(rnp->expmaskinitnext & mask);
4499	rnp->expmaskinitnext \|= mask;
4500	/ Allow lockless access for expedited grace periods. /
4501	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); / ^^^ /
4502	ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
4503	rcu_gpnum_ovf(rnp, rdp); / Offline-induced counter wrap? /
4504	rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
4505	rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
4506
4507	/ An incoming CPU should never be blocking a grace period. /
4508	if (WARN_ON_ONCE(rnp->qsmask & mask)) { / RCU waiting on incoming CPU? /
4509	/ rcu_report_qs_rnp() really wants some flags to restore /
4510	unsigned long flags;
4511
4512	local_irq_save(flags);
4513	rcu_disable_urgency_upon_qs(rdp);
4514	/ Report QS -after- changing ->qsmaskinitnext! /
4515	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
4516	} else {
4517	raw_spin_unlock_rcu_node(rnp);
4518	}
4519	arch_spin_unlock(&rcu_state.ofl_lock);
4520	smp_store_release(&rdp->beenonline, true);
4521	smp_mb(); / Ensure RCU read-side usage follows above initialization. /
4522	}
4523
4524	/*
4525	* The outgoing function has no further need of RCU, so remove it from
4526	* the rcu_node tree's ->qsmaskinitnext bit masks.
4527	*
4528	* Note that this function is special in that it is invoked directly
4529	* from the outgoing CPU rather than from the cpuhp_step mechanism.
4530	* This is because this function must be invoked at a precise location.
4531	*
4532	* This mirrors the effect of rcutree_report_cpu_starting().
4533	*/
4534	void rcutree_report_cpu_dead(void)
4535	{
4536	unsigned long flags;
4537	unsigned long mask;
4538	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
4539	struct rcu_node rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. /
4540
4541	/*
4542	* IRQS must be disabled from now on and until the CPU dies, or an interrupt
4543	* may introduce a new READ-side while it is actually off the QS masks.
4544	*/
4545	lockdep_assert_irqs_disabled();
4546	// Do any dangling deferred wakeups.
4547	do_nocb_deferred_wakeup(rdp);
4548
4549	rcu_preempt_deferred_qs(current);
4550
4551	/ Remove outgoing CPU from mask in the leaf rcu_node structure. /
4552	mask = rdp->grpmask;
4553	arch_spin_lock(&rcu_state.ofl_lock);
4554	raw_spin_lock_irqsave_rcu_node(rnp, flags); / Enforce GP memory-order guarantee. /
4555	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
4556	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
4557	if (rnp->qsmask & mask) { / RCU waiting on outgoing CPU? /
4558	/ Report quiescent state -before- changing ->qsmaskinitnext! /
4559	rcu_disable_urgency_upon_qs(rdp);
4560	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
4561	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4562	}
4563	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
4564	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4565	arch_spin_unlock(&rcu_state.ofl_lock);
4566	rdp->cpu_started = false;
4567	}
4568
4569	#ifdef CONFIG_HOTPLUG_CPU
4570	/*
4571	* The outgoing CPU has just passed through the dying-idle state, and we
4572	* are being invoked from the CPU that was IPIed to continue the offline
4573	* operation. Migrate the outgoing CPU's callbacks to the current CPU.
4574	*/
4575	void rcutree_migrate_callbacks(int cpu)
4576	{
4577	unsigned long flags;
4578	struct rcu_data *my_rdp;
4579	struct rcu_node *my_rnp;
4580	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4581	bool needwake;
4582
4583	if (rcu_rdp_is_offloaded(rdp) \|\|
4584	rcu_segcblist_empty(rsclp: &rdp->cblist))
4585	return; / No callbacks to migrate. /
4586
4587	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
4588	WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
4589	rcu_barrier_entrain(rdp);
4590	my_rdp = this_cpu_ptr(&rcu_data);
4591	my_rnp = my_rdp->mynode;
4592	rcu_nocb_lock(rdp: my_rdp); / irqs already disabled. /
4593	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
4594	raw_spin_lock_rcu_node(my_rnp); / irqs already disabled. /
4595	/ Leverage recent GPs and set GP for new callbacks. /
4596	needwake = rcu_advance_cbs(rnp: my_rnp, rdp) \|\|
4597	rcu_advance_cbs(rnp: my_rnp, rdp: my_rdp);
4598	rcu_segcblist_merge(dst_rsclp: &my_rdp->cblist, src_rsclp: &rdp->cblist);
4599	raw_spin_unlock(&rcu_state.barrier_lock); / irqs remain disabled. /
4600	needwake = needwake \|\| rcu_advance_cbs(rnp: my_rnp, rdp: my_rdp);
4601	rcu_segcblist_disable(rsclp: &rdp->cblist);
4602	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
4603	check_cb_ovld_locked(rdp: my_rdp, rnp: my_rnp);
4604	if (rcu_rdp_is_offloaded(rdp: my_rdp)) {
4605	raw_spin_unlock_rcu_node(my_rnp); / irqs remain disabled. /
4606	__call_rcu_nocb_wake(rdp: my_rdp, was_empty: true, flags);
4607	} else {
4608	rcu_nocb_unlock(rdp: my_rdp); / irqs remain disabled. /
4609	raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
4610	}
4611	if (needwake)
4612	rcu_gp_kthread_wake();
4613	lockdep_assert_irqs_enabled();
4614	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != `0` \|\|
4615	!rcu_segcblist_empty(&rdp->cblist),
4616	"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
4617	cpu, rcu_segcblist_n_cbs(&rdp->cblist),
4618	rcu_segcblist_first_cb(&rdp->cblist));
4619	}
4620
4621	/*
4622	* The CPU has been completely removed, and some other CPU is reporting
4623	* this fact from process context. Do the remainder of the cleanup.
4624	* There can only be one CPU hotplug operation at a time, so no need for
4625	* explicit locking.
4626	*/
4627	int rcutree_dead_cpu(unsigned int cpu)
4628	{
4629	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - `1`);
4630	// Stop-machine done, so allow nohz_full to disable tick.
4631	tick_dep_clear(bit: TICK_DEP_BIT_RCU);
4632	return `0`;
4633	}
4634
4635	/*
4636	* Near the end of the offline process. Trace the fact that this CPU
4637	* is going offline.
4638	*/
4639	int rcutree_dying_cpu(unsigned int cpu)
4640	{
4641	bool blkd;
4642	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4643	struct rcu_node *rnp = rdp->mynode;
4644
4645	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
4646	trace_rcu_grace_period(rcuname: rcu_state.name, READ_ONCE(rnp->gp_seq),
4647	gpevent: blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
4648	return `0`;
4649	}
4650
4651	/*
4652	* Near the beginning of the process. The CPU is still very much alive
4653	* with pretty much all services enabled.
4654	*/
4655	int rcutree_offline_cpu(unsigned int cpu)
4656	{
4657	unsigned long flags;
4658	struct rcu_data *rdp;
4659	struct rcu_node *rnp;
4660
4661	rdp = per_cpu_ptr(&rcu_data, cpu);
4662	rnp = rdp->mynode;
4663	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4664	rnp->ffmask &= ~rdp->grpmask;
4665	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4666
4667	rcutree_affinity_setting(cpu, outgoing: cpu);
4668
4669	// nohz_full CPUs need the tick for stop-machine to work quickly
4670	tick_dep_set(bit: TICK_DEP_BIT_RCU);
4671	return `0`;
4672	}
4673	#endif /* #ifdef CONFIG_HOTPLUG_CPU */
4674
4675	/*
4676	* On non-huge systems, use expedited RCU grace periods to make suspend
4677	* and hibernation run faster.
4678	*/
4679	static int rcu_pm_notify(struct notifier_block *self,
4680	unsigned long action, void *hcpu)
4681	{
4682	switch (action) {
4683	case PM_HIBERNATION_PREPARE:
4684	case PM_SUSPEND_PREPARE:
4685	rcu_async_hurry();
4686	rcu_expedite_gp();
4687	break;
4688	case PM_POST_HIBERNATION:
4689	case PM_POST_SUSPEND:
4690	rcu_unexpedite_gp();
4691	rcu_async_relax();
4692	break;
4693	default:
4694	break;
4695	}
4696	return NOTIFY_OK;
4697	}
4698
4699	#ifdef CONFIG_RCU_EXP_KTHREAD
4700	struct kthread_worker *rcu_exp_gp_kworker;
4701	struct kthread_worker *rcu_exp_par_gp_kworker;
4702
4703	static void __init rcu_start_exp_gp_kworkers(void)
4704	{
4705	const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
4706	const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
4707	struct sched_param param = { .sched_priority = kthread_prio };
4708
4709	rcu_exp_gp_kworker = kthread_create_worker(flags: `0`, namefmt: gp_kworker_name);
4710	if (IS_ERR_OR_NULL(ptr: rcu_exp_gp_kworker)) {
4711	pr_err("Failed to create %s!\n", gp_kworker_name);
4712	return;
4713	}
4714
4715	rcu_exp_par_gp_kworker = kthread_create_worker(flags: `0`, namefmt: par_gp_kworker_name);
4716	if (IS_ERR_OR_NULL(ptr: rcu_exp_par_gp_kworker)) {
4717	pr_err("Failed to create %s!\n", par_gp_kworker_name);
4718	kthread_destroy_worker(worker: rcu_exp_gp_kworker);
4719	return;
4720	}
4721
4722	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
4723	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
4724	&param);
4725	}
4726
4727	static inline void rcu_alloc_par_gp_wq(void)
4728	{
4729	}
4730	#else /* !CONFIG_RCU_EXP_KTHREAD */
4731	struct workqueue_struct *rcu_par_gp_wq;
4732
4733	static void __init rcu_start_exp_gp_kworkers(void)
4734	{
4735	}
4736
4737	static inline void rcu_alloc_par_gp_wq(void)
4738	{
4739	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, `0`);
4740	WARN_ON(!rcu_par_gp_wq);
4741	}
4742	#endif /* CONFIG_RCU_EXP_KTHREAD */
4743
4744	/*
4745	* Spawn the kthreads that handle RCU's grace periods.
4746	*/
4747	static int __init rcu_spawn_gp_kthread(void)
4748	{
4749	unsigned long flags;
4750	struct rcu_node *rnp;
4751	struct sched_param sp;
4752	struct task_struct *t;
4753	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
4754
4755	rcu_scheduler_fully_active = `1`;
4756	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
4757	if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
4758	return `0`;
4759	if (kthread_prio) {
4760	sp.sched_priority = kthread_prio;
4761	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
4762	}
4763	rnp = rcu_get_root();
4764	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4765	WRITE_ONCE(rcu_state.gp_activity, jiffies);
4766	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
4767	// Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
4768	smp_store_release(&rcu_state.gp_kthread, t); / ^^^ /
4769	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4770	wake_up_process(tsk: t);
4771	/ This is a pre-SMP initcall, we expect a single CPU /
4772	WARN_ON(num_online_cpus() > `1`);
4773	/*
4774	* Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
4775	* due to rcu_scheduler_fully_active.
4776	*/
4777	rcu_spawn_cpu_nocb_kthread(smp_processor_id());
4778	rcu_spawn_one_boost_kthread(rnp: rdp->mynode);
4779	rcu_spawn_core_kthreads();
4780	/ Create kthread worker for expedited GPs /
4781	rcu_start_exp_gp_kworkers();
4782	return `0`;
4783	}
4784	early_initcall(rcu_spawn_gp_kthread);
4785
4786	/*
4787	* This function is invoked towards the end of the scheduler's
4788	* initialization process. Before this is called, the idle task might
4789	* contain synchronous grace-period primitives (during which time, this idle
4790	* task is booting the system, and such primitives are no-ops). After this
4791	* function is called, any synchronous grace-period primitives are run as
4792	* expedited, with the requesting task driving the grace period forward.
4793	* A later core_initcall() rcu_set_runtime_mode() will switch to full
4794	* runtime RCU functionality.
4795	*/
4796	void rcu_scheduler_starting(void)
4797	{
4798	unsigned long flags;
4799	struct rcu_node *rnp;
4800
4801	WARN_ON(num_online_cpus() != `1`);
4802	WARN_ON(nr_context_switches() > `0`);
4803	rcu_test_sync_prims();
4804
4805	// Fix up the ->gp_seq counters.
4806	local_irq_save(flags);
4807	rcu_for_each_node_breadth_first(rnp)
4808	rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
4809	local_irq_restore(flags);
4810
4811	// Switch out of early boot mode.
4812	rcu_scheduler_active = RCU_SCHEDULER_INIT;
4813	rcu_test_sync_prims();
4814	}
4815
4816	/*
4817	* Helper function for rcu_init() that initializes the rcu_state structure.
4818	*/
4819	static void __init rcu_init_one(void)
4820	{
4821	static const char * const buf[] = RCU_NODE_NAME_INIT;
4822	static const char * const fqs[] = RCU_FQS_NAME_INIT;
4823	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4824	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4825
4826	int levelspread[RCU_NUM_LVLS]; / kids/node in each level. /
4827	int cpustride = `1`;
4828	int i;
4829	int j;
4830	struct rcu_node *rnp;
4831
4832	BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); / Fix buf[] init! /
4833
4834	/ Silence gcc 4.8 false positive about array index out of range. /
4835	if (rcu_num_lvls <= `0` \|\| rcu_num_lvls > RCU_NUM_LVLS)
4836	panic(fmt: "rcu_init_one: rcu_num_lvls out of range");
4837
4838	/ Initialize the level-tracking arrays. /
4839
4840	for (i = `1`; i < rcu_num_lvls; i++)
4841	rcu_state.level[i] =
4842	rcu_state.level[i - `1`] + num_rcu_lvl[i - `1`];
4843	rcu_init_levelspread(levelspread, levelcnt: num_rcu_lvl);
4844
4845	/ Initialize the elements themselves, starting from the leaves. /
4846
4847	for (i = rcu_num_lvls - `1`; i >= `0`; i--) {
4848	cpustride *= levelspread[i];
4849	rnp = rcu_state.level[i];
4850	for (j = `0`; j < num_rcu_lvl[i]; j++, rnp++) {
4851	raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4852	lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4853	&rcu_node_class[i], buf[i]);
4854	raw_spin_lock_init(&rnp->fqslock);
4855	lockdep_set_class_and_name(&rnp->fqslock,
4856	&rcu_fqs_class[i], fqs[i]);
4857	rnp->gp_seq = rcu_state.gp_seq;
4858	rnp->gp_seq_needed = rcu_state.gp_seq;
4859	rnp->completedqs = rcu_state.gp_seq;
4860	rnp->qsmask = `0`;
4861	rnp->qsmaskinit = `0`;
4862	rnp->grplo = j * cpustride;
4863	rnp->grphi = (j + `1`) * cpustride - `1`;
4864	if (rnp->grphi >= nr_cpu_ids)
4865	rnp->grphi = nr_cpu_ids - `1`;
4866	if (i == `0`) {
4867	rnp->grpnum = `0`;
4868	rnp->grpmask = `0`;
4869	rnp->parent = NULL;
4870	} else {
4871	rnp->grpnum = j % levelspread[i - `1`];
4872	rnp->grpmask = BIT(rnp->grpnum);
4873	rnp->parent = rcu_state.level[i - `1`] +
4874	j / levelspread[i - `1`];
4875	}
4876	rnp->level = i;
4877	INIT_LIST_HEAD(list: &rnp->blkd_tasks);
4878	rcu_init_one_nocb(rnp);
4879	init_waitqueue_head(&rnp->exp_wq[`0`]);
4880	init_waitqueue_head(&rnp->exp_wq[`1`]);
4881	init_waitqueue_head(&rnp->exp_wq[`2`]);
4882	init_waitqueue_head(&rnp->exp_wq[`3`]);
4883	spin_lock_init(&rnp->exp_lock);
4884	mutex_init(&rnp->boost_kthread_mutex);
4885	raw_spin_lock_init(&rnp->exp_poll_lock);
4886	rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
4887	INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
4888	}
4889	}
4890
4891	init_swait_queue_head(&rcu_state.gp_wq);
4892	init_swait_queue_head(&rcu_state.expedited_wq);
4893	rnp = rcu_first_leaf_node();
4894	for_each_possible_cpu(i) {
4895	while (i > rnp->grphi)
4896	rnp++;
4897	per_cpu_ptr(&rcu_data, i)->mynode = rnp;
4898	rcu_boot_init_percpu_data(cpu: i);
4899	}
4900	}
4901
4902	/*
4903	* Force priority from the kernel command-line into range.
4904	*/
4905	static void __init sanitize_kthread_prio(void)
4906	{
4907	int kthread_prio_in = kthread_prio;
4908
4909	if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < `2`
4910	&& IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
4911	kthread_prio = `2`;
4912	else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < `1`)
4913	kthread_prio = `1`;
4914	else if (kthread_prio < `0`)
4915	kthread_prio = `0`;
4916	else if (kthread_prio > `99`)
4917	kthread_prio = `99`;
4918
4919	if (kthread_prio != kthread_prio_in)
4920	pr_alert("%s: Limited prio to %d from %d\n",
4921	__func__, kthread_prio, kthread_prio_in);
4922	}
4923
4924	/*
4925	* Compute the rcu_node tree geometry from kernel parameters. This cannot
4926	* replace the definitions in tree.h because those are needed to size
4927	* the ->node array in the rcu_state structure.
4928	*/
4929	void rcu_init_geometry(void)
4930	{
4931	ulong d;
4932	int i;
4933	static unsigned long old_nr_cpu_ids;
4934	int rcu_capacity[RCU_NUM_LVLS];
4935	static bool initialized;
4936
4937	if (initialized) {
4938	/*
4939	* Warn if setup_nr_cpu_ids() had not yet been invoked,
4940	* unless nr_cpus_ids == NR_CPUS, in which case who cares?
4941	*/
4942	WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
4943	return;
4944	}
4945
4946	old_nr_cpu_ids = nr_cpu_ids;
4947	initialized = true;
4948
4949	/*
4950	* Initialize any unspecified boot parameters.
4951	* The default values of jiffies_till_first_fqs and
4952	* jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
4953	* value, which is a function of HZ, then adding one for each
4954	* RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
4955	*/
4956	d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
4957	if (jiffies_till_first_fqs == ULONG_MAX)
4958	jiffies_till_first_fqs = d;
4959	if (jiffies_till_next_fqs == ULONG_MAX)
4960	jiffies_till_next_fqs = d;
4961	adjust_jiffies_till_sched_qs();
4962
4963	/ If the compile-time values are accurate, just leave. /
4964	if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
4965	nr_cpu_ids == NR_CPUS)
4966	return;
4967	pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
4968	rcu_fanout_leaf, nr_cpu_ids);
4969
4970	/*
4971	* The boot-time rcu_fanout_leaf parameter must be at least two
4972	* and cannot exceed the number of bits in the rcu_node masks.
4973	* Complain and fall back to the compile-time values if this
4974	* limit is exceeded.
4975	*/
4976	if (rcu_fanout_leaf < `2` \|\|
4977	rcu_fanout_leaf > sizeof(unsigned long) * `8`) {
4978	rcu_fanout_leaf = RCU_FANOUT_LEAF;
4979	WARN_ON(`1`);
4980	return;
4981	}
4982
4983	/*
4984	* Compute number of nodes that can be handled an rcu_node tree
4985	* with the given number of levels.
4986	*/
4987	rcu_capacity[`0`] = rcu_fanout_leaf;
4988	for (i = `1`; i < RCU_NUM_LVLS; i++)
4989	rcu_capacity[i] = rcu_capacity[i - `1`] * RCU_FANOUT;
4990
4991	/*
4992	* The tree must be able to accommodate the configured number of CPUs.
4993	* If this limit is exceeded, fall back to the compile-time values.
4994	*/
4995	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - `1`]) {
4996	rcu_fanout_leaf = RCU_FANOUT_LEAF;
4997	WARN_ON(`1`);
4998	return;
4999	}
5000
5001	/ Calculate the number of levels in the tree. /
5002	for (i = `0`; nr_cpu_ids > rcu_capacity[i]; i++) {
5003	}
5004	rcu_num_lvls = i + `1`;
5005
5006	/ Calculate the number of rcu_nodes at each level of the tree. /
5007	for (i = `0`; i < rcu_num_lvls; i++) {
5008	int cap = rcu_capacity[(rcu_num_lvls - `1`) - i];
5009	num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
5010	}
5011
5012	/ Calculate the total number of rcu_node structures. /
5013	rcu_num_nodes = `0`;
5014	for (i = `0`; i < rcu_num_lvls; i++)
5015	rcu_num_nodes += num_rcu_lvl[i];
5016	}
5017
5018	/*
5019	* Dump out the structure of the rcu_node combining tree associated
5020	* with the rcu_state structure.
5021	*/
5022	static void __init rcu_dump_rcu_node_tree(void)
5023	{
5024	int level = `0`;
5025	struct rcu_node *rnp;
5026
5027	pr_info("rcu_node tree layout dump\n");
5028	pr_info(" ");
5029	rcu_for_each_node_breadth_first(rnp) {
5030	if (rnp->level != level) {
5031	pr_cont("\n");
5032	pr_info(" ");
5033	level = rnp->level;
5034	}
5035	pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
5036	}
5037	pr_cont("\n");
5038	}
5039
5040	struct workqueue_struct *rcu_gp_wq;
5041
5042	static void __init kfree_rcu_batch_init(void)
5043	{
5044	int cpu;
5045	int i, j;
5046	struct shrinker *kfree_rcu_shrinker;
5047
5048	/ Clamp it to [0:100] seconds interval. /
5049	if (rcu_delay_page_cache_fill_msec < `0` \|\|
5050	rcu_delay_page_cache_fill_msec > `100` * MSEC_PER_SEC) {
5051
5052	rcu_delay_page_cache_fill_msec =
5053	clamp(rcu_delay_page_cache_fill_msec, `0`,
5054	(int) (`100` * MSEC_PER_SEC));
5055
5056	pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
5057	rcu_delay_page_cache_fill_msec);
5058	}
5059
5060	for_each_possible_cpu(cpu) {
5061	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
5062
5063	for (i = `0`; i < KFREE_N_BATCHES; i++) {
5064	INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
5065	krcp->krw_arr[i].krcp = krcp;
5066
5067	for (j = `0`; j < FREE_N_CHANNELS; j++)
5068	INIT_LIST_HEAD(list: &krcp->krw_arr[i].bulk_head_free[j]);
5069	}
5070
5071	for (i = `0`; i < FREE_N_CHANNELS; i++)
5072	INIT_LIST_HEAD(list: &krcp->bulk_head[i]);
5073
5074	INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
5075	INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
5076	krcp->initialized = true;
5077	}
5078
5079	kfree_rcu_shrinker = shrinker_alloc(flags: `0`, fmt: "rcu-kfree");
5080	if (!kfree_rcu_shrinker) {
5081	pr_err("Failed to allocate kfree_rcu() shrinker!\n");
5082	return;
5083	}
5084
5085	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
5086	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
5087
5088	shrinker_register(shrinker: kfree_rcu_shrinker);
5089	}
5090
5091	void __init rcu_init(void)
5092	{
5093	int cpu = smp_processor_id();
5094
5095	rcu_early_boot_tests();
5096
5097	kfree_rcu_batch_init();
5098	rcu_bootup_announce();
5099	sanitize_kthread_prio();
5100	rcu_init_geometry();
5101	rcu_init_one();
5102	if (dump_tree)
5103	rcu_dump_rcu_node_tree();
5104	if (use_softirq)
5105	open_softirq(nr: RCU_SOFTIRQ, action: rcu_core_si);
5106
5107	/*
5108	* We don't need protection against CPU-hotplug here because
5109	* this is called early in boot, before either interrupts
5110	* or the scheduler are operational.
5111	*/
5112	pm_notifier(rcu_pm_notify, `0`);
5113	WARN_ON(num_online_cpus() > `1`); // Only one CPU this early in boot.
5114	rcutree_prepare_cpu(cpu);
5115	rcutree_report_cpu_starting(cpu);
5116	rcutree_online_cpu(cpu);
5117
5118	/ Create workqueue for Tree SRCU and for expedited GPs. /
5119	rcu_gp_wq = alloc_workqueue(fmt: "rcu_gp", flags: WQ_MEM_RECLAIM, max_active: `0`);
5120	WARN_ON(!rcu_gp_wq);
5121	rcu_alloc_par_gp_wq();
5122
5123	/ Fill in default value for rcutree.qovld boot parameter. /
5124	/ -After- the rcu_node ->lock fields are initialized! /
5125	if (qovld < `0`)
5126	qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
5127	else
5128	qovld_calc = qovld;
5129
5130	// Kick-start in case any polled grace periods started early.
5131	(void)start_poll_synchronize_rcu_expedited();
5132
5133	rcu_test_sync_prims();
5134	}
5135
5136	#include "tree_stall.h"
5137	#include "tree_exp.h"
5138	#include "tree_nocb.h"
5139	#include "tree_plugin.h"
5140

source code of linux/kernel/rcu/tree.c