timer.c source code [linux/kernel/time/timer.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Kernel internal timers
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*
7	* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
8	*
9	* 1997-09-10 Updated NTP code according to technical memorandum Jan '96
10	* "A Kernel Model for Precision Timekeeping" by Dave Mills
11	* 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
12	* serialize accesses to xtime/lost_ticks).
13	* Copyright (C) 1998 Andrea Arcangeli
14	* 1999-03-10 Improved NTP compatibility by Ulrich Windl
15	* 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
16	* 2000-10-05 Implemented scalable SMP per-CPU timer handling.
17	* Copyright (C) 2000, 2001, 2002 Ingo Molnar
18	* Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
19	*/
20
21	#include <linux/kernel_stat.h>
22	#include <linux/export.h>
23	#include <linux/interrupt.h>
24	#include <linux/percpu.h>
25	#include <linux/init.h>
26	#include <linux/mm.h>
27	#include <linux/swap.h>
28	#include <linux/pid_namespace.h>
29	#include <linux/notifier.h>
30	#include <linux/thread_info.h>
31	#include <linux/time.h>
32	#include <linux/jiffies.h>
33	#include <linux/posix-timers.h>
34	#include <linux/cpu.h>
35	#include <linux/syscalls.h>
36	#include <linux/delay.h>
37	#include <linux/tick.h>
38	#include <linux/kallsyms.h>
39	#include <linux/irq_work.h>
40	#include <linux/sched/signal.h>
41	#include <linux/sched/sysctl.h>
42	#include <linux/sched/nohz.h>
43	#include <linux/sched/debug.h>
44	#include <linux/slab.h>
45	#include <linux/compat.h>
46	#include <linux/random.h>
47	#include <linux/sysctl.h>
48
49	#include <linux/uaccess.h>
50	#include <asm/unistd.h>
51	#include <asm/div64.h>
52	#include <asm/timex.h>
53	#include <asm/io.h>
54
55	#include "tick-internal.h"
56	#include "timer_migration.h"
57
58	#define CREATE_TRACE_POINTS
59	#include <trace/events/timer.h>
60
/*
 * 64-bit (u64) jiffies counter. It is initialized to INITIAL_JIFFIES and
 * made available to other code through EXPORT_SYMBOL().
 */
61	__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
62
63	EXPORT_SYMBOL(jiffies_64);
64
65	/*
66	* The timer wheel has LVL_DEPTH array levels. Each level provides an array of
67	* LVL_SIZE buckets. Each level is driven by its own clock and therefore each
68	* level has a different granularity.
69	*
70	* The level granularity is: LVL_CLK_DIV ^ level
71	* The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
72	*
73	* The array level of a newly armed timer depends on the relative expiry
74	* time. The farther away the expiry time is, the higher the array level
75	* and therefore the coarser the granularity becomes.
76	*
77	* Contrary to the original timer wheel implementation, which aims for 'exact'
78	* expiry of the timers, this implementation removes the need for recascading
79	* the timers into the lower array levels. The previous 'classic' timer wheel
80	* implementation of the kernel already violated the 'exact' expiry by adding
81	* slack to the expiry time to provide batched expiration. The granularity
82	* levels provide implicit batching.
83	*
84	* This is an optimization of the original timer wheel implementation for the
85	* majority of the timer wheel use cases: timeouts. The vast majority of
86	* timeout timers (networking, disk I/O ...) are canceled before expiry. If
87	* the timeout expires it indicates that normal operation is disturbed, so it
88	* does not matter much whether the timeout comes with a slight delay.
89	*
90	* The only exception to this are networking timers with a small expiry
91	* time. They rely on the granularity. Those fit into the first wheel level,
92	* which has HZ granularity.
93	*
94	* We don't have cascading anymore. Timers with an expiry time above the
95	* capacity of the last wheel level are force expired at the maximum timeout
96	* value of the last wheel level. From data sampling we know that the maximum
97	* value observed is 5 days (network connection tracking), so this should not
98	* be an issue.
99	*
100	* The currently chosen array constants values are a good compromise between
101	* array size and granularity.
102	*
103	* This results in the following granularity and range levels:
104	*
105	* HZ 1000 steps
106	* Level Offset Granularity Range
107	* 0 0 1 ms 0 ms - 63 ms
108	* 1 64 8 ms 64 ms - 511 ms
109	* 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
110	* 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
111	* 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
112	* 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
113	* 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
114	* 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
115	* 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
116	*
117	* HZ 300
118	* Level Offset Granularity Range
119	* 0 0 3 ms 0 ms - 210 ms
120	* 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
121	* 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
122	* 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
123	* 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
124	* 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
125	* 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
126	* 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
127	* 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
128	*
129	* HZ 250
130	* Level Offset Granularity Range
131	* 0 0 4 ms 0 ms - 255 ms
132	* 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
133	* 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
134	* 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
135	* 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
136	* 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
137	* 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
138	* 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
139	* 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
140	*
141	* HZ 100
142	* Level Offset Granularity Range
143	* 0 0 10 ms 0 ms - 630 ms
144	* 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
145	* 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
146	* 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
147	* 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
148	* 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
149	* 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
150	* 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
151	*/
152
/* Clock divisor for the next level */
#define LVL_CLK_SHIFT	3
#define LVL_CLK_DIV	(1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK	(LVL_CLK_DIV - 1)
/* Shift (in bits) and granularity (in level-0 ticks) of wheel level n */
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 */
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/* Size of each clock level */
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
/* Index of the first bucket of level n inside the flat bucket array */
#define LVL_OFFS(n)	((n) * LVL_SIZE)

/* Level depth */
#if HZ > 100
# define LVL_DEPTH	9
# else
# define LVL_DEPTH	8
#endif

/* The cutoff (max. capacity of the wheel) */
#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting wheel size, i.e. the number of buckets of one wheel
 * (LVL_DEPTH levels of LVL_SIZE buckets each). If NOHZ is configured
 * several wheels (NR_BASES) are allocated per CPU so that there is
 * separate storage for the deferrable timers.
 */
#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)

#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES	3
# define BASE_LOCAL	0
# define BASE_GLOBAL	1
# define BASE_DEF	2
#else
/* Without NOHZ all base indices collapse onto the single base 0. */
# define NR_BASES	1
# define BASE_LOCAL	0
# define BASE_GLOBAL	0
# define BASE_DEF	0
#endif
205
206	/**
207	* struct timer_base - Per CPU timer base (number of bases depends on config)
208	* @lock: Lock protecting the timer_base
209	* @running_timer: When expiring timers, the lock is dropped. To make
210	* sure not to race against deleting/modifying a
211	* currently running timer, the pointer is set to the
212	* timer, which expires at the moment. If no timer is
213	* running, the pointer is NULL.
214	* @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around
215	* timer expiry callback execution and when trying to
216	* delete a running timer after the first attempt was
217	* not successful. It prevents priority inversion
218	* when callback was preempted on a remote CPU and a
219	* caller tries to delete the running timer. It also
220	* prevents a livelock, when the task which tries to
221	* delete a timer preempted the softirq thread which
222	* is running the timer callback function.
223	* @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter
224	* waiting for the end of the timer callback function
225	* execution.
226	* @clk: clock of the timer base; is updated before enqueue
227	* of a timer; during expiry, it is 1 offset ahead of
228	* jiffies to avoid endless requeuing to current
229	* jiffies
230	* @next_expiry: expiry value of the first timer; it is updated when
231	* finding the next timer and during enqueue; the
232	* value is not valid, when next_expiry_recalc is set
233	* @cpu: Number of CPU the timer base belongs to
234	* @next_expiry_recalc: States, whether a recalculation of next_expiry is
235	* required. Value is set true, when a timer was
236	* deleted.
237	* @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ
238	* code. This state is only used in standard
239	* base. Deferrable timers, which are enqueued remotely
240	* never wake up an idle CPU, so supporting it for
241	* that base is not needed.
242	* @timers_pending: Is set, when a timer is pending in the base. It is only
243	* reliable when next_expiry_recalc is not set.
244	* @pending_map: bitmap of the timer wheel; each bit reflects a
245	* bucket of the wheel. When a bit is set, at least a
246	* single timer is enqueued in the related bucket.
247	* @vectors: Array of lists; Each array member reflects a bucket
248	* of the timer wheel. The list contains all timers
249	* which are enqueued into a specific bucket.
250	*/
251	struct timer_base {
252	raw_spinlock_t lock;
253	struct timer_list *running_timer;
254	#ifdef CONFIG_PREEMPT_RT
255	spinlock_t expiry_lock;
256	atomic_t timer_waiters;
257	#endif
258	unsigned long clk;
259	unsigned long next_expiry;
260	unsigned int cpu;
261	bool next_expiry_recalc;
262	bool is_idle;
263	bool timers_pending;
264	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
265	struct hlist_head vectors[WHEEL_SIZE];
266	} ____cacheline_aligned;
267
/* One array of NR_BASES timer bases per CPU. */
268	static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
269
270	#ifdef CONFIG_NO_HZ_COMMON
271
272	static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
273	static DEFINE_MUTEX(timer_keys_mutex);
274
275	static void timer_update_keys(struct work_struct *work);
276	static DECLARE_WORK(timer_update_work, timer_update_keys);
277
278	#ifdef CONFIG_SMP
279	static unsigned int sysctl_timer_migration = `1`;
280
281	DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
282
283	static void timers_update_migration(void)
284	{
285	if (sysctl_timer_migration && tick_nohz_active)
286	static_branch_enable(&timers_migration_enabled);
287	else
288	static_branch_disable(&timers_migration_enabled);
289	}
290
291	#ifdef CONFIG_SYSCTL
292	static int timer_migration_handler(struct ctl_table table, int* write,
293	void buffer, size_t lenp, loff_t *ppos)
294	{
295	int ret;
296
297	mutex_lock(&timer_keys_mutex);
298	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
299	if (!ret && write)
300	timers_update_migration();
301	mutex_unlock(lock: &timer_keys_mutex);
302	return ret;
303	}
304
305	static struct ctl_table timer_sysctl[] = {
306	{
307	.procname = "timer_migration",
308	.data = &sysctl_timer_migration,
309	.maxlen = sizeof(unsigned int),
310	.mode = `0644`,
311	.proc_handler = timer_migration_handler,
312	.extra1 = SYSCTL_ZERO,
313	.extra2 = SYSCTL_ONE,
314	},
315	{}
316	};
317
318	static int __init timer_sysctl_init(void)
319	{
320	register_sysctl("kernel", timer_sysctl);
321	return `0`;
322	}
323	device_initcall(timer_sysctl_init);
324	#endif /* CONFIG_SYSCTL */
325	#else /* CONFIG_SMP */
/* !CONFIG_SMP: there is no migration key to update, so this is a no-op. */
static inline void timers_update_migration(void)
{
}
327	#endif /* !CONFIG_SMP */
328
329	static void timer_update_keys(struct work_struct *work)
330	{
331	mutex_lock(&timer_keys_mutex);
332	timers_update_migration();
333	static_branch_enable(&timers_nohz_active);
334	mutex_unlock(lock: &timer_keys_mutex);
335	}
336
337	void timers_update_nohz(void)
338	{
339	schedule_work(work: &timer_update_work);
340	}
341
342	static inline bool is_timers_nohz_active(void)
343	{
344	return static_branch_unlikely(&timers_nohz_active);
345	}
346	#else
347	static inline bool is_timers_nohz_active(void) { return false; }
348	#endif /* NO_HZ_COMMON */
349
350	static unsigned long round_jiffies_common(unsigned long j, int cpu,
351	bool force_up)
352	{
353	int rem;
354	unsigned long original = j;
355
356	/*
357	* We don't want all cpus firing their timers at once hitting the
358	* same lock or cachelines, so we skew each extra cpu with an extra
359	* 3 jiffies. This 3 jiffies came originally from the mm/ code which
360	* already did this.
361	* The skew is done by adding 3*cpunr, then round, then subtract this
362	* extra offset again.
363	*/
364	j += cpu * `3`;
365
366	rem = j % HZ;
367
368	/*
369	* If the target jiffie is just after a whole second (which can happen
370	* due to delays of the timer irq, long irq off times etc etc) then
371	* we should round down to the whole second, not up. Use 1/4th second
372	* as cutoff for this rounding as an extreme upper bound for this.
373	* But never round down if @force_up is set.
374	*/
375	if (rem < HZ/`4` && !force_up) / round down /
376	j = j - rem;
377	else / round up /
378	j = j - rem + HZ;
379
380	/ now that we have rounded, subtract the extra skew again /
381	j -= cpu * `3`;
382
383	/*
384	* Make sure j is still in the future. Otherwise return the
385	* unmodified value.
386	*/
387	return time_is_after_jiffies(j) ? j : original;
388	}
389
390	/**
391	* __round_jiffies - function to round jiffies to a full second
392	* @j: the time in (absolute) jiffies that should be rounded
393	* @cpu: the processor number on which the timeout will happen
394	*
395	* __round_jiffies() rounds an absolute time in the future (in jiffies)
396	* up or down to (approximately) full seconds. This is useful for timers
397	* for which the exact time they fire does not matter too much, as long as
398	* they fire approximately every X seconds.
399	*
400	* By rounding these timers to whole seconds, all such timers will fire
401	* at the same time, rather than at various times spread out. The goal
402	* of this is to have the CPU wake up less, which saves power.
403	*
404	* The exact rounding is skewed for each processor to avoid all
405	* processors firing at the exact same time, which could lead
406	* to lock contention or spurious cache line bouncing.
407	*
408	* The return value is the rounded version of the @j parameter.
409	*/
410	unsigned long __round_jiffies(unsigned long j, int cpu)
411	{
412	return round_jiffies_common(j, cpu, force_up: false);
413	}
414	EXPORT_SYMBOL_GPL(__round_jiffies);
415
416	/**
417	* __round_jiffies_relative - function to round jiffies to a full second
418	* @j: the time in (relative) jiffies that should be rounded
419	* @cpu: the processor number on which the timeout will happen
420	*
421	* __round_jiffies_relative() rounds a time delta in the future (in jiffies)
422	* up or down to (approximately) full seconds. This is useful for timers
423	* for which the exact time they fire does not matter too much, as long as
424	* they fire approximately every X seconds.
425	*
426	* By rounding these timers to whole seconds, all such timers will fire
427	* at the same time, rather than at various times spread out. The goal
428	* of this is to have the CPU wake up less, which saves power.
429	*
430	* The exact rounding is skewed for each processor to avoid all
431	* processors firing at the exact same time, which could lead
432	* to lock contention or spurious cache line bouncing.
433	*
434	* The return value is the rounded version of the @j parameter.
435	*/
436	unsigned long __round_jiffies_relative(unsigned long j, int cpu)
437	{
438	unsigned long j0 = jiffies;
439
440	/ Use j0 because jiffies might change while we run /
441	return round_jiffies_common(j: j + j0, cpu, force_up: false) - j0;
442	}
443	EXPORT_SYMBOL_GPL(__round_jiffies_relative);
444
445	/**
446	* round_jiffies - function to round jiffies to a full second
447	* @j: the time in (absolute) jiffies that should be rounded
448	*
449	* round_jiffies() rounds an absolute time in the future (in jiffies)
450	* up or down to (approximately) full seconds. This is useful for timers
451	* for which the exact time they fire does not matter too much, as long as
452	* they fire approximately every X seconds.
453	*
454	* By rounding these timers to whole seconds, all such timers will fire
455	* at the same time, rather than at various times spread out. The goal
456	* of this is to have the CPU wake up less, which saves power.
457	*
458	* The return value is the rounded version of the @j parameter.
459	*/
460	unsigned long round_jiffies(unsigned long j)
461	{
462	return round_jiffies_common(j, raw_smp_processor_id(), force_up: false);
463	}
464	EXPORT_SYMBOL_GPL(round_jiffies);
465
/**
 * round_jiffies_relative - round a relative timeout to a full second
 * @j: the time delta in (relative) jiffies that should be rounded
 *
 * Variant of __round_jiffies_relative() which uses the CPU the caller is
 * currently running on for the per-CPU skew. Intended for timers whose
 * exact expiry time does not matter much, so that such timers are
 * batched and the CPU has to wake up less often.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
	int this_cpu = raw_smp_processor_id();

	return __round_jiffies_relative(j, this_cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
486
487	/**
488	* __round_jiffies_up - function to round jiffies up to a full second
489	* @j: the time in (absolute) jiffies that should be rounded
490	* @cpu: the processor number on which the timeout will happen
491	*
492	* This is the same as __round_jiffies() except that it will never
493	* round down. This is useful for timeouts for which the exact time
494	* of firing does not matter too much, as long as they don't fire too
495	* early.
496	*/
497	unsigned long __round_jiffies_up(unsigned long j, int cpu)
498	{
499	return round_jiffies_common(j, cpu, force_up: true);
500	}
501	EXPORT_SYMBOL_GPL(__round_jiffies_up);
502
503	/**
504	* __round_jiffies_up_relative - function to round jiffies up to a full second
505	* @j: the time in (relative) jiffies that should be rounded
506	* @cpu: the processor number on which the timeout will happen
507	*
508	* This is the same as __round_jiffies_relative() except that it will never
509	* round down. This is useful for timeouts for which the exact time
510	* of firing does not matter too much, as long as they don't fire too
511	* early.
512	*/
513	unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
514	{
515	unsigned long j0 = jiffies;
516
517	/ Use j0 because jiffies might change while we run /
518	return round_jiffies_common(j: j + j0, cpu, force_up: true) - j0;
519	}
520	EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
521
522	/**
523	* round_jiffies_up - function to round jiffies up to a full second
524	* @j: the time in (absolute) jiffies that should be rounded
525	*
526	* This is the same as round_jiffies() except that it will never
527	* round down. This is useful for timeouts for which the exact time
528	* of firing does not matter too much, as long as they don't fire too
529	* early.
530	*/
531	unsigned long round_jiffies_up(unsigned long j)
532	{
533	return round_jiffies_common(j, raw_smp_processor_id(), force_up: true);
534	}
535	EXPORT_SYMBOL_GPL(round_jiffies_up);
536
/**
 * round_jiffies_up_relative - round a relative timeout up to a full second
 * @j: the time delta in (relative) jiffies that should be rounded
 *
 * Variant of __round_jiffies_up_relative() which uses the CPU the caller
 * is currently running on for the per-CPU skew. Like that function it
 * never rounds down, which suits timeouts that must not fire too early.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
	int this_cpu = raw_smp_processor_id();

	return __round_jiffies_up_relative(j, this_cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
551
552
553	static inline unsigned int timer_get_idx(struct timer_list *timer)
554	{
555	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
556	}
557
558	static inline void timer_set_idx(struct timer_list timer, unsigned* int idx)
559	{
560	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) \|
561	idx << TIMER_ARRAYSHIFT;
562	}
563
564	/*
565	* Helper function to calculate the array index for a given expiry
566	* time.
567	*/
568	static inline unsigned calc_index(unsigned long expires, unsigned lvl,
569	unsigned long *bucket_expiry)
570	{
571
572	/*
573	* The timer wheel has to guarantee that a timer does not fire
574	* early. Early expiry can happen due to:
575	* - Timer is armed at the edge of a tick
576	* - Truncation of the expiry time in the outer wheel levels
577	*
578	* Round up with level granularity to prevent this.
579	*/
580	expires = (expires >> LVL_SHIFT(lvl)) + `1`;
581	*bucket_expiry = expires << LVL_SHIFT(lvl);
582	return LVL_OFFS(lvl) + (expires & LVL_MASK);
583	}
584
585	static int calc_wheel_index(unsigned long expires, unsigned long clk,
586	unsigned long *bucket_expiry)
587	{
588	unsigned long delta = expires - clk;
589	unsigned int idx;
590
591	if (delta < LVL_START(`1`)) {
592	idx = calc_index(expires, lvl: `0`, bucket_expiry);
593	} else if (delta < LVL_START(`2`)) {
594	idx = calc_index(expires, lvl: `1`, bucket_expiry);
595	} else if (delta < LVL_START(`3`)) {
596	idx = calc_index(expires, lvl: `2`, bucket_expiry);
597	} else if (delta < LVL_START(`4`)) {
598	idx = calc_index(expires, lvl: `3`, bucket_expiry);
599	} else if (delta < LVL_START(`5`)) {
600	idx = calc_index(expires, lvl: `4`, bucket_expiry);
601	} else if (delta < LVL_START(`6`)) {
602	idx = calc_index(expires, lvl: `5`, bucket_expiry);
603	} else if (delta < LVL_START(`7`)) {
604	idx = calc_index(expires, lvl: `6`, bucket_expiry);
605	} else if (LVL_DEPTH > `8` && delta < LVL_START(`8`)) {
606	idx = calc_index(expires, lvl: `7`, bucket_expiry);
607	} else if ((long) delta < `0`) {
608	idx = clk & LVL_MASK;
609	*bucket_expiry = clk;
610	} else {
611	/*
612	* Force expire obscene large timeouts to expire at the
613	* capacity limit of the wheel.
614	*/
615	if (delta >= WHEEL_TIMEOUT_CUTOFF)
616	expires = clk + WHEEL_TIMEOUT_MAX;
617
618	idx = calc_index(expires, LVL_DEPTH - `1`, bucket_expiry);
619	}
620	return idx;
621	}
622
623	static void
624	trigger_dyntick_cpu(struct timer_base base, struct* timer_list *timer)
625	{
626	/*
627	* Deferrable timers do not prevent the CPU from entering dynticks and
628	* are not taken into account on the idle/nohz_full path. An IPI when a
629	* new deferrable timer is enqueued will wake up the remote CPU but
630	* nothing will be done with the deferrable timer base. Therefore skip
631	* the remote IPI for deferrable timers completely.
632	*/
633	if (!is_timers_nohz_active() \|\| timer->flags & TIMER_DEFERRABLE)
634	return;
635
636	/*
637	* We might have to IPI the remote CPU if the base is idle and the
638	* timer is pinned. If it is a non pinned timer, it is only queued
639	* on the remote CPU, when timer was running during queueing. Then
640	* everything is handled by remote CPU anyway. If the other CPU is
641	* on the way to idle then it can't set base->is_idle as we hold
642	* the base lock:
643	*/
644	if (base->is_idle) {
645	WARN_ON_ONCE(!(timer->flags & TIMER_PINNED \|\|
646	tick_nohz_full_cpu(base->cpu)));
647	wake_up_nohz_cpu(cpu: base->cpu);
648	}
649	}
650
651	/*
652	* Enqueue the timer into the hash bucket, mark it pending in
653	* the bitmap, store the index in the timer flags then wake up
654	* the target CPU if needed.
655	*/
656	static void enqueue_timer(struct timer_base base, struct* timer_list *timer,
657	unsigned int idx, unsigned long bucket_expiry)
658	{
659
660	hlist_add_head(n: &timer->entry, h: base->vectors + idx);
661	__set_bit(idx, base->pending_map);
662	timer_set_idx(timer, idx);
663
664	trace_timer_start(timer, bucket_expiry);
665
666	/*
667	* Check whether this is the new first expiring timer. The
668	* effective expiry time of the timer is required here
669	* (bucket_expiry) instead of timer->expires.
670	*/
671	if (time_before(bucket_expiry, base->next_expiry)) {
672	/*
673	* Set the next expiry time and kick the CPU so it
674	* can reevaluate the wheel:
675	*/
676	base->next_expiry = bucket_expiry;
677	base->timers_pending = true;
678	base->next_expiry_recalc = false;
679	trigger_dyntick_cpu(base, timer);
680	}
681	}
682
683	static void internal_add_timer(struct timer_base base, struct* timer_list *timer)
684	{
685	unsigned long bucket_expiry;
686	unsigned int idx;
687
688	idx = calc_wheel_index(expires: timer->expires, clk: base->clk, bucket_expiry: &bucket_expiry);
689	enqueue_timer(base, timer, idx, bucket_expiry);
690	}
691
692	#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
693
694	static const struct debug_obj_descr timer_debug_descr;
695
696	struct timer_hint {
697	void (function)(struct* timer_list *t);
698	long offset;
699	};
700
701	#define TIMER_HINT(fn, container, timr, hintfn) \
702	{ \
703	.function = fn, \
704	.offset = offsetof(container, hintfn) - \
705	offsetof(container, timr) \
706	}
707
708	static const struct timer_hint timer_hints[] = {
709	TIMER_HINT(delayed_work_timer_fn,
710	struct delayed_work, timer, work.func),
711	TIMER_HINT(kthread_delayed_work_timer_fn,
712	struct kthread_delayed_work, timer, work.func),
713	};
714
715	static void timer_debug_hint(void* *addr)
716	{
717	struct timer_list *timer = addr;
718	int i;
719
720	for (i = `0`; i < ARRAY_SIZE(timer_hints); i++) {
721	if (timer_hints[i].function == timer->function) {
722	void (*fn)(void*) = addr + timer_hints[i].offset;
723
724	return *fn;
725	}
726	}
727
728	return timer->function;
729	}
730
731	static bool timer_is_static_object(void *addr)
732	{
733	struct timer_list *timer = addr;
734
735	return (timer->entry.pprev == NULL &&
736	timer->entry.next == TIMER_ENTRY_STATIC);
737	}
738
739	/*
740	* timer_fixup_init is called when:
741	* - an active object is initialized
742	*/
743	static bool timer_fixup_init(void addr, enum* debug_obj_state state)
744	{
745	struct timer_list *timer = addr;
746
747	switch (state) {
748	case ODEBUG_STATE_ACTIVE:
749	del_timer_sync(timer);
750	debug_object_init(addr: timer, descr: &timer_debug_descr);
751	return true;
752	default:
753	return false;
754	}
755	}
756
/* Stub timer callback for improperly used timers. */
static void stub_timer(struct timer_list *unused)
{
	WARN_ON(1);
}
762
763	/*
764	* timer_fixup_activate is called when:
765	* - an active object is activated
766	* - an unknown non-static object is activated
767	*/
768	static bool timer_fixup_activate(void addr, enum* debug_obj_state state)
769	{
770	struct timer_list *timer = addr;
771
772	switch (state) {
773	case ODEBUG_STATE_NOTAVAILABLE:
774	timer_setup(timer, stub_timer, `0`);
775	return true;
776
777	case ODEBUG_STATE_ACTIVE:
778	WARN_ON(`1`);
779	fallthrough;
780	default:
781	return false;
782	}
783	}
784
785	/*
786	* timer_fixup_free is called when:
787	* - an active object is freed
788	*/
789	static bool timer_fixup_free(void addr, enum* debug_obj_state state)
790	{
791	struct timer_list *timer = addr;
792
793	switch (state) {
794	case ODEBUG_STATE_ACTIVE:
795	del_timer_sync(timer);
796	debug_object_free(addr: timer, descr: &timer_debug_descr);
797	return true;
798	default:
799	return false;
800	}
801	}
802
803	/*
804	* timer_fixup_assert_init is called when:
805	* - an untracked/uninit-ed object is found
806	*/
807	static bool timer_fixup_assert_init(void addr, enum* debug_obj_state state)
808	{
809	struct timer_list *timer = addr;
810
811	switch (state) {
812	case ODEBUG_STATE_NOTAVAILABLE:
813	timer_setup(timer, stub_timer, `0`);
814	return true;
815	default:
816	return false;
817	}
818	}
819
820	static const struct debug_obj_descr timer_debug_descr = {
821	.name = "timer_list",
822	.debug_hint = timer_debug_hint,
823	.is_static_object = timer_is_static_object,
824	.fixup_init = timer_fixup_init,
825	.fixup_activate = timer_fixup_activate,
826	.fixup_free = timer_fixup_free,
827	.fixup_assert_init = timer_fixup_assert_init,
828	};
829
830	static inline void debug_timer_init(struct timer_list *timer)
831	{
832	debug_object_init(addr: timer, descr: &timer_debug_descr);
833	}
834
835	static inline void debug_timer_activate(struct timer_list *timer)
836	{
837	debug_object_activate(addr: timer, descr: &timer_debug_descr);
838	}
839
840	static inline void debug_timer_deactivate(struct timer_list *timer)
841	{
842	debug_object_deactivate(addr: timer, descr: &timer_debug_descr);
843	}
844
845	static inline void debug_timer_assert_init(struct timer_list *timer)
846	{
847	debug_object_assert_init(addr: timer, descr: &timer_debug_descr);
848	}
849
/* Forward declaration; the definition follows the debug objects section. */
static void do_init_timer(struct timer_list *timer,
			  void (*func)(struct timer_list *),
			  unsigned int flags,
			  const char *name, struct lock_class_key *key);
854
855	void init_timer_on_stack_key(struct timer_list *timer,
856	void (func)(struct* timer_list *),
857	unsigned int flags,
858	const char name, struct* lock_class_key *key)
859	{
860	debug_object_init_on_stack(addr: timer, descr: &timer_debug_descr);
861	do_init_timer(timer, func, flags, name, key);
862	}
863	EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
864
865	void destroy_timer_on_stack(struct timer_list *timer)
866	{
867	debug_object_free(addr: timer, descr: &timer_debug_descr);
868	}
869	EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
870
871	#else
/* !CONFIG_DEBUG_OBJECTS_TIMERS: the debug object hooks are empty. */
static inline void debug_timer_init(struct timer_list *timer)
{
}

static inline void debug_timer_activate(struct timer_list *timer)
{
}

static inline void debug_timer_deactivate(struct timer_list *timer)
{
}

static inline void debug_timer_assert_init(struct timer_list *timer)
{
}
876	#endif
877
/*
 * Initialization hook: notifies the debug objects code first and emits
 * the timer_init trace event afterwards.
 */
static inline void debug_init(struct timer_list *timer)
{
	debug_timer_init(timer);	/* debug objects tracking */
	trace_timer_init(timer);	/* tracepoint */
}
883
/*
 * Deactivation hook: notifies the debug objects code first and emits
 * the timer_cancel trace event afterwards.
 */
static inline void debug_deactivate(struct timer_list *timer)
{
	debug_timer_deactivate(timer);	/* debug objects tracking */
	trace_timer_cancel(timer);	/* tracepoint */
}
889
/*
 * Assertion hook: forwards to the debug objects check that @timer has
 * been initialized. No trace event is emitted here.
 */
static inline void debug_assert_init(struct timer_list *timer)
{
	debug_timer_assert_init(timer);
}
894
895	static void do_init_timer(struct timer_list *timer,
896	void (func)(struct* timer_list *),
897	unsigned int flags,
898	const char name, struct* lock_class_key *key)
899	{
900	timer->entry.pprev = NULL;
901	timer->function = func;
902	if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
903	flags &= TIMER_INIT_FLAGS;
904	timer->flags = flags \| raw_smp_processor_id();
905	lockdep_init_map(lock: &timer->lockdep_map, name, key, subclass: `0`);
906	}
907
/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling any of the
 * other timer functions.
 */
void init_timer_key(struct timer_list *timer,
		    void (*func)(struct timer_list *), unsigned int flags,
		    const char *name, struct lock_class_key *key)
{
	debug_init(timer);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
928
929	static inline void detach_timer(struct timer_list *timer, bool clear_pending)
930	{
931	struct hlist_node *entry = &timer->entry;
932
933	debug_deactivate(timer);
934
935	__hlist_del(n: entry);
936	if (clear_pending)
937	entry->pprev = NULL;
938	entry->next = LIST_POISON2;
939	}
940
941	static int detach_if_pending(struct timer_list timer, struct* timer_base *base,
942	bool clear_pending)
943	{
944	unsigned idx = timer_get_idx(timer);
945
946	if (!timer_pending(timer))
947	return `0`;
948
949	if (hlist_is_singular_node(n: &timer->entry, h: base->vectors + idx)) {
950	__clear_bit(idx, base->pending_map);
951	base->next_expiry_recalc = true;
952	}
953
954	detach_timer(timer, clear_pending);
955	return `1`;
956	}
957
958	static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
959	{
960	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
961	struct timer_base *base;
962
963	base = per_cpu_ptr(&timer_bases[index], cpu);
964
965	/*
966	* If the timer is deferrable and NO_HZ_COMMON is set then we need
967	* to use the deferrable base.
968	*/
969	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
970	base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
971	return base;
972	}
973
974	static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
975	{
976	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
977	struct timer_base *base;
978
979	base = this_cpu_ptr(&timer_bases[index]);
980
981	/*
982	* If the timer is deferrable and NO_HZ_COMMON is set then we need
983	* to use the deferrable base.
984	*/
985	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
986	base = this_cpu_ptr(&timer_bases[BASE_DEF]);
987	return base;
988	}
989
990	static inline struct timer_base *get_timer_base(u32 tflags)
991	{
992	return get_timer_cpu_base(tflags, cpu: tflags & TIMER_CPUMASK);
993	}
994
995	static inline void __forward_timer_base(struct timer_base *base,
996	unsigned long basej)
997	{
998	/*
999	* Check whether we can forward the base. We can only do that when
1000	* @basej is past base->clk otherwise we might rewind base->clk.
1001	*/
1002	if (time_before_eq(basej, base->clk))
1003	return;
1004
1005	/*
1006	* If the next expiry value is > jiffies, then we fast forward to
1007	* jiffies otherwise we forward to the next expiry value.
1008	*/
1009	if (time_after(base->next_expiry, basej)) {
1010	base->clk = basej;
1011	} else {
1012	if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
1013	return;
1014	base->clk = base->next_expiry;
1015	}
1016
1017	}
1018
1019	static inline void forward_timer_base(struct timer_base *base)
1020	{
1021	__forward_timer_base(base, READ_ONCE(jiffies));
1022	}
1023
1024	/*
1025	* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
1026	* that all timers which are tied to this base are locked, and the base itself
1027	* is locked too.
1028	*
1029	* So __run_timers/migrate_timers can safely modify all timers which could
1030	* be found in the base->vectors array.
1031	*
1032	* When a timer is migrating then the TIMER_MIGRATING flag is set and we need
1033	* to wait until the migration is done.
1034	*/
1035	static struct timer_base lock_timer_base(struct* timer_list *timer,
1036	unsigned long *flags)
1037	__acquires(timer->base->lock)
1038	{
1039	for (;;) {
1040	struct timer_base *base;
1041	u32 tf;
1042
1043	/*
1044	* We need to use READ_ONCE() here, otherwise the compiler
1045	* might re-read @tf between the check for TIMER_MIGRATING
1046	* and spin_lock().
1047	*/
1048	tf = READ_ONCE(timer->flags);
1049
1050	if (!(tf & TIMER_MIGRATING)) {
1051	base = get_timer_base(tflags: tf);
1052	raw_spin_lock_irqsave(&base->lock, *flags);
1053	if (timer->flags == tf)
1054	return base;
1055	raw_spin_unlock_irqrestore(&base->lock, *flags);
1056	}
1057	cpu_relax();
1058	}
1059	}
1060
/* Option bits for the @options argument of __mod_timer() */
#define MOD_TIMER_PENDING_ONLY		0x01	/* Only touch a pending timer */
#define MOD_TIMER_REDUCE		0x02	/* Only shorten the expiry */
#define MOD_TIMER_NOTPENDING		0x04	/* Skip the pending shortcut */
1064
1065	static inline int
1066	__mod_timer(struct timer_list timer, unsigned* long expires, unsigned int options)
1067	{
1068	unsigned long clk = `0`, flags, bucket_expiry;
1069	struct timer_base base, new_base;
1070	unsigned int idx = UINT_MAX;
1071	int ret = `0`;
1072
1073	debug_assert_init(timer);
1074
1075	/*
1076	* This is a common optimization triggered by the networking code - if
1077	* the timer is re-modified to have the same timeout or ends up in the
1078	* same array bucket then just return:
1079	*/
1080	if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
1081	/*
1082	* The downside of this optimization is that it can result in
1083	* larger granularity than you would get from adding a new
1084	* timer with this expiry.
1085	*/
1086	long diff = timer->expires - expires;
1087
1088	if (!diff)
1089	return `1`;
1090	if (options & MOD_TIMER_REDUCE && diff <= `0`)
1091	return `1`;
1092
1093	/*
1094	* We lock timer base and calculate the bucket index right
1095	* here. If the timer ends up in the same bucket, then we
1096	* just update the expiry time and avoid the whole
1097	* dequeue/enqueue dance.
1098	*/
1099	base = lock_timer_base(timer, flags: &flags);
1100	/*
1101	* Has @timer been shutdown? This needs to be evaluated
1102	* while holding base lock to prevent a race against the
1103	* shutdown code.
1104	*/
1105	if (!timer->function)
1106	goto out_unlock;
1107
1108	forward_timer_base(base);
1109
1110	if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
1111	time_before_eq(timer->expires, expires)) {
1112	ret = `1`;
1113	goto out_unlock;
1114	}
1115
1116	clk = base->clk;
1117	idx = calc_wheel_index(expires, clk, bucket_expiry: &bucket_expiry);
1118
1119	/*
1120	* Retrieve and compare the array index of the pending
1121	* timer. If it matches set the expiry to the new value so a
1122	* subsequent call will exit in the expires check above.
1123	*/
1124	if (idx == timer_get_idx(timer)) {
1125	if (!(options & MOD_TIMER_REDUCE))
1126	timer->expires = expires;
1127	else if (time_after(timer->expires, expires))
1128	timer->expires = expires;
1129	ret = `1`;
1130	goto out_unlock;
1131	}
1132	} else {
1133	base = lock_timer_base(timer, flags: &flags);
1134	/*
1135	* Has @timer been shutdown? This needs to be evaluated
1136	* while holding base lock to prevent a race against the
1137	* shutdown code.
1138	*/
1139	if (!timer->function)
1140	goto out_unlock;
1141
1142	forward_timer_base(base);
1143	}
1144
1145	ret = detach_if_pending(timer, base, clear_pending: false);
1146	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
1147	goto out_unlock;
1148
1149	new_base = get_timer_this_cpu_base(tflags: timer->flags);
1150
1151	if (base != new_base) {
1152	/*
1153	* We are trying to schedule the timer on the new base.
1154	* However we can't change timer's base while it is running,
1155	* otherwise timer_delete_sync() can't detect that the timer's
1156	* handler yet has not finished. This also guarantees that the
1157	* timer is serialized wrt itself.
1158	*/
1159	if (likely(base->running_timer != timer)) {
1160	/ See the comment in lock_timer_base() /
1161	timer->flags \|= TIMER_MIGRATING;
1162
1163	raw_spin_unlock(&base->lock);
1164	base = new_base;
1165	raw_spin_lock(&base->lock);
1166	WRITE_ONCE(timer->flags,
1167	(timer->flags & ~TIMER_BASEMASK) \| base->cpu);
1168	forward_timer_base(base);
1169	}
1170	}
1171
1172	debug_timer_activate(timer);
1173
1174	timer->expires = expires;
1175	/*
1176	* If 'idx' was calculated above and the base time did not advance
1177	* between calculating 'idx' and possibly switching the base, only
1178	* enqueue_timer() is required. Otherwise we need to (re)calculate
1179	* the wheel index via internal_add_timer().
1180	*/
1181	if (idx != UINT_MAX && clk == base->clk)
1182	enqueue_timer(base, timer, idx, bucket_expiry);
1183	else
1184	internal_add_timer(base, timer);
1185
1186	out_unlock:
1187	raw_spin_unlock_irqrestore(&base->lock, flags);
1188
1189	return ret;
1190	}
1191
1192	/**
1193	* mod_timer_pending - Modify a pending timer's timeout
1194	* @timer: The pending timer to be modified
1195	* @expires: New absolute timeout in jiffies
1196	*
1197	* mod_timer_pending() is the same for pending timers as mod_timer(), but
1198	* will not activate inactive timers.
1199	*
1200	* If @timer->function == NULL then the start operation is silently
1201	* discarded.
1202	*
1203	* Return:
1204	* * %0 - The timer was inactive and not modified or was in
1205	* shutdown state and the operation was discarded
1206	* * %1 - The timer was active and requeued to expire at @expires
1207	*/
1208	int mod_timer_pending(struct timer_list timer, unsigned* long expires)
1209	{
1210	return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
1211	}
1212	EXPORT_SYMBOL(mod_timer_pending);
1213
/**
 * mod_timer - Modify a timer's timeout
 * @timer:	The timer to be modified
 * @expires:	New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the del_timer() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *	  state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *	  the timer was active and not modified because @expires did
 *	  not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
1246
1247	/**
1248	* timer_reduce - Modify a timer's timeout if it would reduce the timeout
1249	* @timer: The timer to be modified
1250	* @expires: New absolute timeout in jiffies
1251	*
1252	* timer_reduce() is very similar to mod_timer(), except that it will only
1253	* modify an enqueued timer if that would reduce the expiration time. If
1254	* @timer is not enqueued it starts the timer.
1255	*
1256	* If @timer->function == NULL then the start operation is silently
1257	* discarded.
1258	*
1259	* Return:
1260	* * %0 - The timer was inactive and started or was in shutdown
1261	* state and the operation was discarded
1262	* * %1 - The timer was active and requeued to expire at @expires or
1263	* the timer was active and not modified because @expires
1264	* did not change the effective expiry time such that the
1265	* timer would expire earlier than already scheduled
1266	*/
1267	int timer_reduce(struct timer_list timer, unsigned* long expires)
1268	{
1269	return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
1270	}
1271	EXPORT_SYMBOL(timer_reduce);
1272
1273	/**
1274	* add_timer - Start a timer
1275	* @timer: The timer to be started
1276	*
1277	* Start @timer to expire at @timer->expires in the future. @timer->expires
1278	* is the absolute expiry time measured in 'jiffies'. When the timer expires
1279	* timer->function(timer) will be invoked from soft interrupt context.
1280	*
1281	* The @timer->expires and @timer->function fields must be set prior
1282	* to calling this function.
1283	*
1284	* If @timer->function == NULL then the start operation is silently
1285	* discarded.
1286	*
1287	* If @timer->expires is already in the past @timer will be queued to
1288	* expire at the next timer tick.
1289	*
1290	* This can only operate on an inactive timer. Attempts to invoke this on
1291	* an active timer are rejected with a warning.
1292	*/
1293	void add_timer(struct timer_list *timer)
1294	{
1295	if (WARN_ON_ONCE(timer_pending(timer)))
1296	return;
1297	__mod_timer(timer, expires: timer->expires, MOD_TIMER_NOTPENDING);
1298	}
1299	EXPORT_SYMBOL(add_timer);
1300
1301	/**
1302	* add_timer_local() - Start a timer on the local CPU
1303	* @timer: The timer to be started
1304	*
1305	* Same as add_timer() except that the timer flag TIMER_PINNED is set.
1306	*
1307	* See add_timer() for further details.
1308	*/
1309	void add_timer_local(struct timer_list *timer)
1310	{
1311	if (WARN_ON_ONCE(timer_pending(timer)))
1312	return;
1313	timer->flags \|= TIMER_PINNED;
1314	__mod_timer(timer, expires: timer->expires, MOD_TIMER_NOTPENDING);
1315	}
1316	EXPORT_SYMBOL(add_timer_local);
1317
1318	/**
1319	* add_timer_global() - Start a timer without TIMER_PINNED flag set
1320	* @timer: The timer to be started
1321	*
1322	* Same as add_timer() except that the timer flag TIMER_PINNED is unset.
1323	*
1324	* See add_timer() for further details.
1325	*/
1326	void add_timer_global(struct timer_list *timer)
1327	{
1328	if (WARN_ON_ONCE(timer_pending(timer)))
1329	return;
1330	timer->flags &= ~TIMER_PINNED;
1331	__mod_timer(timer, expires: timer->expires, MOD_TIMER_NOTPENDING);
1332	}
1333	EXPORT_SYMBOL(add_timer_global);
1334
1335	/**
1336	* add_timer_on - Start a timer on a particular CPU
1337	* @timer: The timer to be started
1338	* @cpu: The CPU to start it on
1339	*
1340	* Same as add_timer() except that it starts the timer on the given CPU and
1341	* the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
1342	* the next round, add_timer_global() should be used instead as it unsets
1343	* the TIMER_PINNED flag.
1344	*
1345	* See add_timer() for further details.
1346	*/
1347	void add_timer_on(struct timer_list timer, int* cpu)
1348	{
1349	struct timer_base new_base, base;
1350	unsigned long flags;
1351
1352	debug_assert_init(timer);
1353
1354	if (WARN_ON_ONCE(timer_pending(timer)))
1355	return;
1356
1357	/ Make sure timer flags have TIMER_PINNED flag set /
1358	timer->flags \|= TIMER_PINNED;
1359
1360	new_base = get_timer_cpu_base(tflags: timer->flags, cpu);
1361
1362	/*
1363	* If @timer was on a different CPU, it should be migrated with the
1364	* old base locked to prevent other operations proceeding with the
1365	* wrong base locked. See lock_timer_base().
1366	*/
1367	base = lock_timer_base(timer, flags: &flags);
1368	/*
1369	* Has @timer been shutdown? This needs to be evaluated while
1370	* holding base lock to prevent a race against the shutdown code.
1371	*/
1372	if (!timer->function)
1373	goto out_unlock;
1374
1375	if (base != new_base) {
1376	timer->flags \|= TIMER_MIGRATING;
1377
1378	raw_spin_unlock(&base->lock);
1379	base = new_base;
1380	raw_spin_lock(&base->lock);
1381	WRITE_ONCE(timer->flags,
1382	(timer->flags & ~TIMER_BASEMASK) \| cpu);
1383	}
1384	forward_timer_base(base);
1385
1386	debug_timer_activate(timer);
1387	internal_add_timer(base, timer);
1388	out_unlock:
1389	raw_spin_unlock_irqrestore(&base->lock, flags);
1390	}
1391	EXPORT_SYMBOL_GPL(add_timer_on);
1392
1393	/**
1394	* __timer_delete - Internal function: Deactivate a timer
1395	* @timer: The timer to be deactivated
1396	* @shutdown: If true, this indicates that the timer is about to be
1397	* shutdown permanently.
1398	*
1399	* If @shutdown is true then @timer->function is set to NULL under the
1400	* timer base lock which prevents further rearming of the time. In that
1401	* case any attempt to rearm @timer after this function returns will be
1402	* silently ignored.
1403	*
1404	* Return:
1405	* * %0 - The timer was not pending
1406	* * %1 - The timer was pending and deactivated
1407	*/
1408	static int __timer_delete(struct timer_list *timer, bool shutdown)
1409	{
1410	struct timer_base *base;
1411	unsigned long flags;
1412	int ret = `0`;
1413
1414	debug_assert_init(timer);
1415
1416	/*
1417	* If @shutdown is set then the lock has to be taken whether the
1418	* timer is pending or not to protect against a concurrent rearm
1419	* which might hit between the lockless pending check and the lock
1420	* acquisition. By taking the lock it is ensured that such a newly
1421	* enqueued timer is dequeued and cannot end up with
1422	* timer->function == NULL in the expiry code.
1423	*
1424	* If timer->function is currently executed, then this makes sure
1425	* that the callback cannot requeue the timer.
1426	*/
1427	if (timer_pending(timer) \|\| shutdown) {
1428	base = lock_timer_base(timer, flags: &flags);
1429	ret = detach_if_pending(timer, base, clear_pending: true);
1430	if (shutdown)
1431	timer->function = NULL;
1432	raw_spin_unlock_irqrestore(&base->lock, flags);
1433	}
1434
1435	return ret;
1436	}
1437
1438	/**
1439	* timer_delete - Deactivate a timer
1440	* @timer: The timer to be deactivated
1441	*
1442	* The function only deactivates a pending timer, but contrary to
1443	* timer_delete_sync() it does not take into account whether the timer's
1444	* callback function is concurrently executed on a different CPU or not.
1445	* It neither prevents rearming of the timer. If @timer can be rearmed
1446	* concurrently then the return value of this function is meaningless.
1447	*
1448	* Return:
1449	* * %0 - The timer was not pending
1450	* * %1 - The timer was pending and deactivated
1451	*/
1452	int timer_delete(struct timer_list *timer)
1453	{
1454	return __timer_delete(timer, shutdown: false);
1455	}
1456	EXPORT_SYMBOL(timer_delete);
1457
1458	/**
1459	* timer_shutdown - Deactivate a timer and prevent rearming
1460	* @timer: The timer to be deactivated
1461	*
1462	* The function does not wait for an eventually running timer callback on a
1463	* different CPU but it prevents rearming of the timer. Any attempt to arm
1464	* @timer after this function returns will be silently ignored.
1465	*
1466	* This function is useful for teardown code and should only be used when
1467	* timer_shutdown_sync() cannot be invoked due to locking or context constraints.
1468	*
1469	* Return:
1470	* * %0 - The timer was not pending
1471	* * %1 - The timer was pending
1472	*/
1473	int timer_shutdown(struct timer_list *timer)
1474	{
1475	return __timer_delete(timer, shutdown: true);
1476	}
1477	EXPORT_SYMBOL_GPL(timer_shutdown);
1478
1479	/**
1480	* __try_to_del_timer_sync - Internal function: Try to deactivate a timer
1481	* @timer: Timer to deactivate
1482	* @shutdown: If true, this indicates that the timer is about to be
1483	* shutdown permanently.
1484	*
1485	* If @shutdown is true then @timer->function is set to NULL under the
1486	* timer base lock which prevents further rearming of the timer. Any
1487	* attempt to rearm @timer after this function returns will be silently
1488	* ignored.
1489	*
1490	* This function cannot guarantee that the timer cannot be rearmed
1491	* right after dropping the base lock if @shutdown is false. That
1492	* needs to be prevented by the calling code if necessary.
1493	*
1494	* Return:
1495	* * %0 - The timer was not pending
1496	* * %1 - The timer was pending and deactivated
1497	* * %-1 - The timer callback function is running on a different CPU
1498	*/
1499	static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
1500	{
1501	struct timer_base *base;
1502	unsigned long flags;
1503	int ret = -`1`;
1504
1505	debug_assert_init(timer);
1506
1507	base = lock_timer_base(timer, flags: &flags);
1508
1509	if (base->running_timer != timer)
1510	ret = detach_if_pending(timer, base, clear_pending: true);
1511	if (shutdown)
1512	timer->function = NULL;
1513
1514	raw_spin_unlock_irqrestore(&base->lock, flags);
1515
1516	return ret;
1517	}
1518
1519	/**
1520	* try_to_del_timer_sync - Try to deactivate a timer
1521	* @timer: Timer to deactivate
1522	*
1523	* This function tries to deactivate a timer. On success the timer is not
1524	* queued and the timer callback function is not running on any CPU.
1525	*
1526	* This function does not guarantee that the timer cannot be rearmed right
1527	* after dropping the base lock. That needs to be prevented by the calling
1528	* code if necessary.
1529	*
1530	* Return:
1531	* * %0 - The timer was not pending
1532	* * %1 - The timer was pending and deactivated
1533	* * %-1 - The timer callback function is running on a different CPU
1534	*/
1535	int try_to_del_timer_sync(struct timer_list *timer)
1536	{
1537	return __try_to_del_timer_sync(timer, shutdown: false);
1538	}
1539	EXPORT_SYMBOL(try_to_del_timer_sync);
1540
1541	#ifdef CONFIG_PREEMPT_RT
1542	static __init void timer_base_init_expiry_lock(struct timer_base *base)
1543	{
1544	spin_lock_init(&base->expiry_lock);
1545	}
1546
1547	static inline void timer_base_lock_expiry(struct timer_base *base)
1548	{
1549	spin_lock(&base->expiry_lock);
1550	}
1551
1552	static inline void timer_base_unlock_expiry(struct timer_base *base)
1553	{
1554	spin_unlock(&base->expiry_lock);
1555	}
1556
1557	/*
1558	* The counterpart to del_timer_wait_running().
1559	*
1560	* If there is a waiter for base->expiry_lock, then it was waiting for the
1561	* timer callback to finish. Drop expiry_lock and reacquire it. That allows
1562	* the waiter to acquire the lock and make progress.
1563	*/
1564	static void timer_sync_wait_running(struct timer_base *base)
1565	{
1566	if (atomic_read(&base->timer_waiters)) {
1567	raw_spin_unlock_irq(&base->lock);
1568	spin_unlock(&base->expiry_lock);
1569	spin_lock(&base->expiry_lock);
1570	raw_spin_lock_irq(&base->lock);
1571	}
1572	}
1573
1574	/*
1575	* This function is called on PREEMPT_RT kernels when the fast path
1576	* deletion of a timer failed because the timer callback function was
1577	* running.
1578	*
1579	* This prevents priority inversion, if the softirq thread on a remote CPU
1580	* got preempted, and it prevents a life lock when the task which tries to
1581	* delete a timer preempted the softirq thread running the timer callback
1582	* function.
1583	*/
1584	static void del_timer_wait_running(struct timer_list *timer)
1585	{
1586	u32 tf;
1587
1588	tf = READ_ONCE(timer->flags);
1589	if (!(tf & (TIMER_MIGRATING \| TIMER_IRQSAFE))) {
1590	struct timer_base *base = get_timer_base(tf);
1591
1592	/*
1593	* Mark the base as contended and grab the expiry lock,
1594	* which is held by the softirq across the timer
1595	* callback. Drop the lock immediately so the softirq can
1596	* expire the next timer. In theory the timer could already
1597	* be running again, but that's more than unlikely and just
1598	* causes another wait loop.
1599	*/
1600	atomic_inc(&base->timer_waiters);
1601	spin_lock_bh(&base->expiry_lock);
1602	atomic_dec(&base->timer_waiters);
1603	spin_unlock_bh(&base->expiry_lock);
1604	}
1605	}
1606	#else
/*
 * Empty fallbacks for kernels built without CONFIG_PREEMPT_RT: the expiry
 * lock helpers and the wait-for-running-callback helpers do nothing.
 */
static inline void timer_base_init_expiry_lock(struct timer_base *base)
{
}

static inline void timer_base_lock_expiry(struct timer_base *base)
{
}

static inline void timer_base_unlock_expiry(struct timer_base *base)
{
}

static inline void timer_sync_wait_running(struct timer_base *base)
{
}

static inline void del_timer_wait_running(struct timer_list *timer)
{
}
1612	#endif
1613
1614	/**
1615	* __timer_delete_sync - Internal function: Deactivate a timer and wait
1616	* for the handler to finish.
1617	* @timer: The timer to be deactivated
1618	* @shutdown: If true, @timer->function will be set to NULL under the
1619	* timer base lock which prevents rearming of @timer
1620	*
1621	* If @shutdown is not set the timer can be rearmed later. If the timer can
1622	* be rearmed concurrently, i.e. after dropping the base lock then the
1623	* return value is meaningless.
1624	*
1625	* If @shutdown is set then @timer->function is set to NULL under timer
1626	* base lock which prevents rearming of the timer. Any attempt to rearm
1627	* a shutdown timer is silently ignored.
1628	*
1629	* If the timer should be reused after shutdown it has to be initialized
1630	* again.
1631	*
1632	* Return:
1633	* * %0 - The timer was not pending
1634	* * %1 - The timer was pending and deactivated
1635	*/
1636	static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
1637	{
1638	int ret;
1639
1640	#ifdef CONFIG_LOCKDEP
1641	unsigned long flags;
1642
1643	/*
1644	* If lockdep gives a backtrace here, please reference
1645	* the synchronization rules above.
1646	*/
1647	local_irq_save(flags);
1648	lock_map_acquire(&timer->lockdep_map);
1649	lock_map_release(&timer->lockdep_map);
1650	local_irq_restore(flags);
1651	#endif
1652	/*
1653	* don't use it in hardirq context, because it
1654	* could lead to deadlock.
1655	*/
1656	WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));
1657
1658	/*
1659	* Must be able to sleep on PREEMPT_RT because of the slowpath in
1660	* del_timer_wait_running().
1661	*/
1662	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
1663	lockdep_assert_preemption_enabled();
1664
1665	do {
1666	ret = __try_to_del_timer_sync(timer, shutdown);
1667
1668	if (unlikely(ret < `0`)) {
1669	del_timer_wait_running(timer);
1670	cpu_relax();
1671	}
1672	} while (ret < `0`);
1673
1674	return ret;
1675	}
1676
1677	/**
1678	* timer_delete_sync - Deactivate a timer and wait for the handler to finish.
1679	* @timer: The timer to be deactivated
1680	*
1681	* Synchronization rules: Callers must prevent restarting of the timer,
1682	* otherwise this function is meaningless. It must not be called from
1683	* interrupt contexts unless the timer is an irqsafe one. The caller must
1684	* not hold locks which would prevent completion of the timer's callback
1685	* function. The timer's handler must not call add_timer_on(). Upon exit
1686	* the timer is not queued and the handler is not running on any CPU.
1687	*
1688	* For !irqsafe timers, the caller must not hold locks that are held in
1689	* interrupt context. Even if the lock has nothing to do with the timer in
1690	* question. Here's why::
1691	*
1692	* CPU0 CPU1
1693	* ---- ----
1694	* <SOFTIRQ>
1695	* call_timer_fn();
1696	* base->running_timer = mytimer;
1697	* spin_lock_irq(somelock);
1698	* <IRQ>
1699	* spin_lock(somelock);
1700	* timer_delete_sync(mytimer);
1701	* while (base->running_timer == mytimer);
1702	*
1703	* Now timer_delete_sync() will never return and never release somelock.
1704	* The interrupt on the other CPU is waiting to grab somelock but it has
1705	* interrupted the softirq that CPU0 is waiting to finish.
1706	*
1707	* This function cannot guarantee that the timer is not rearmed again by
1708	* some concurrent or preempting code, right after it dropped the base
1709	* lock. If there is the possibility of a concurrent rearm then the return
1710	* value of the function is meaningless.
1711	*
1712	* If such a guarantee is needed, e.g. for teardown situations then use
1713	* timer_shutdown_sync() instead.
1714	*
1715	* Return:
1716	* * %0 - The timer was not pending
1717	* * %1 - The timer was pending and deactivated
1718	*/
1719	int timer_delete_sync(struct timer_list *timer)
1720	{
1721	return __timer_delete_sync(timer, shutdown: false);
1722	}
1723	EXPORT_SYMBOL(timer_delete_sync);
1724
1725	/**
1726	* timer_shutdown_sync - Shutdown a timer and prevent rearming
1727	* @timer: The timer to be shutdown
1728	*
1729	* When the function returns it is guaranteed that:
1730	* - @timer is not queued
1731	* - The callback function of @timer is not running
1732	* - @timer cannot be enqueued again. Any attempt to rearm
1733	* @timer is silently ignored.
1734	*
1735	* See timer_delete_sync() for synchronization rules.
1736	*
1737	* This function is useful for final teardown of an infrastructure where
1738	* the timer is subject to a circular dependency problem.
1739	*
1740	* A common pattern for this is a timer and a workqueue where the timer can
1741	* schedule work and work can arm the timer. On shutdown the workqueue must
1742	* be destroyed and the timer must be prevented from rearming. Unless the
1743	* code has conditionals like 'if (mything->in_shutdown)' to prevent that
1744	* there is no way to get this correct with timer_delete_sync().
1745	*
1746	* timer_shutdown_sync() is solving the problem. The correct ordering of
1747	* calls in this case is:
1748	*
1749	* timer_shutdown_sync(&mything->timer);
1750	* workqueue_destroy(&mything->workqueue);
1751	*
1752	* After this 'mything' can be safely freed.
1753	*
1754	* This obviously implies that the timer is not required to be functional
1755	* for the rest of the shutdown operation.
1756	*
1757	* Return:
1758	* * %0 - The timer was not pending
1759	* * %1 - The timer was pending
1760	*/
1761	int timer_shutdown_sync(struct timer_list *timer)
1762	{
1763	return __timer_delete_sync(timer, shutdown: true);
1764	}
1765	EXPORT_SYMBOL_GPL(timer_shutdown_sync);
1766
1767	static void call_timer_fn(struct timer_list *timer,
1768	void (fn)(struct* timer_list *),
1769	unsigned long baseclk)
1770	{
1771	int count = preempt_count();
1772
1773	#ifdef CONFIG_LOCKDEP
1774	/*
1775	* It is permissible to free the timer from inside the
1776	* function that is called from it, this we need to take into
1777	* account for lockdep too. To avoid bogus "held lock freed"
1778	* warnings as well as problems when looking into
1779	* timer->lockdep_map, make a copy and use that here.
1780	*/
1781	struct lockdep_map lockdep_map;
1782
1783	lockdep_copy_map(to: &lockdep_map, from: &timer->lockdep_map);
1784	#endif
1785	/*
1786	* Couple the lock chain with the lock chain at
1787	* timer_delete_sync() by acquiring the lock_map around the fn()
1788	* call here and in timer_delete_sync().
1789	*/
1790	lock_map_acquire(&lockdep_map);
1791
1792	trace_timer_expire_entry(timer, baseclk);
1793	fn(timer);
1794	trace_timer_expire_exit(timer);
1795
1796	lock_map_release(&lockdep_map);
1797
1798	if (count != preempt_count()) {
1799	WARN_ONCE(`1`, "timer: %pS preempt leak: %08x -> %08x\n",
1800	fn, count, preempt_count());
1801	/*
1802	* Restore the preempt count. That gives us a decent
1803	* chance to survive and extract information. If the
1804	* callback kept a lock held, bad luck, but not worse
1805	* than the BUG() we had.
1806	*/
1807	preempt_count_set(pc: count);
1808	}
1809	}
1810
1811	static void expire_timers(struct timer_base base, struct* hlist_head *head)
1812	{
1813	/*
1814	* This value is required only for tracing. base->clk was
1815	* incremented directly before expire_timers was called. But expiry
1816	* is related to the old base->clk value.
1817	*/
1818	unsigned long baseclk = base->clk - `1`;
1819
1820	while (!hlist_empty(h: head)) {
1821	struct timer_list *timer;
1822	void (fn)(struct* timer_list *);
1823
1824	timer = hlist_entry(head->first, struct timer_list, entry);
1825
1826	base->running_timer = timer;
1827	detach_timer(timer, clear_pending: true);
1828
1829	fn = timer->function;
1830
1831	if (WARN_ON_ONCE(!fn)) {
1832	/ Should never happen. Emphasis on should! /
1833	base->running_timer = NULL;
1834	continue;
1835	}
1836
1837	if (timer->flags & TIMER_IRQSAFE) {
1838	raw_spin_unlock(&base->lock);
1839	call_timer_fn(timer, fn, baseclk);
1840	raw_spin_lock(&base->lock);
1841	base->running_timer = NULL;
1842	} else {
1843	raw_spin_unlock_irq(&base->lock);
1844	call_timer_fn(timer, fn, baseclk);
1845	raw_spin_lock_irq(&base->lock);
1846	base->running_timer = NULL;
1847	timer_sync_wait_running(base);
1848	}
1849	}
1850	}
1851
1852	static int collect_expired_timers(struct timer_base *base,
1853	struct hlist_head *heads)
1854	{
1855	unsigned long clk = base->clk = base->next_expiry;
1856	struct hlist_head *vec;
1857	int i, levels = `0`;
1858	unsigned int idx;
1859
1860	for (i = `0`; i < LVL_DEPTH; i++) {
1861	idx = (clk & LVL_MASK) + i * LVL_SIZE;
1862
1863	if (__test_and_clear_bit(idx, base->pending_map)) {
1864	vec = base->vectors + idx;
1865	hlist_move_list(old: vec, new: heads++);
1866	levels++;
1867	}
1868	/ Is it time to look at the next level? /
1869	if (clk & LVL_CLK_MASK)
1870	break;
1871	/ Shift clock for the next level granularity /
1872	clk >>= LVL_CLK_SHIFT;
1873	}
1874	return levels;
1875	}
1876
1877	/*
1878	* Find the next pending bucket of a level. Search from level start (@offset)
1879	* + @clk upwards and if nothing there, search from start of the level
1880	* (@offset) up to @offset + clk.
1881	*/
1882	static int next_pending_bucket(struct timer_base base, unsigned* offset,
1883	unsigned clk)
1884	{
1885	unsigned pos, start = offset + clk;
1886	unsigned end = offset + LVL_SIZE;
1887
1888	pos = find_next_bit(addr: base->pending_map, size: end, offset: start);
1889	if (pos < end)
1890	return pos - start;
1891
1892	pos = find_next_bit(addr: base->pending_map, size: start, offset);
1893	return pos < start ? pos + LVL_SIZE - start : -`1`;
1894	}
1895
1896	/*
1897	* Search the first expiring timer in the various clock levels. Caller must
1898	* hold base->lock.
1899	*
1900	* Store next expiry time in base->next_expiry.
1901	*/
1902	static void next_expiry_recalc(struct timer_base *base)
1903	{
1904	unsigned long clk, next, adj;
1905	unsigned lvl, offset = `0`;
1906
1907	next = base->clk + NEXT_TIMER_MAX_DELTA;
1908	clk = base->clk;
1909	for (lvl = `0`; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
1910	int pos = next_pending_bucket(base, offset, clk: clk & LVL_MASK);
1911	unsigned long lvl_clk = clk & LVL_CLK_MASK;
1912
1913	if (pos >= `0`) {
1914	unsigned long tmp = clk + (unsigned long) pos;
1915
1916	tmp <<= LVL_SHIFT(lvl);
1917	if (time_before(tmp, next))
1918	next = tmp;
1919
1920	/*
1921	* If the next expiration happens before we reach
1922	* the next level, no need to check further.
1923	*/
1924	if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
1925	break;
1926	}
1927	/*
1928	* Clock for the next level. If the current level clock lower
1929	* bits are zero, we look at the next level as is. If not we
1930	* need to advance it by one because that's going to be the
1931	* next expiring bucket in that level. base->clk is the next
1932	* expiring jiffie. So in case of:
1933	*
1934	* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1935	* 0 0 0 0 0 0
1936	*
1937	* we have to look at all levels @index 0. With
1938	*
1939	* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1940	* 0 0 0 0 0 2
1941	*
1942	* LVL0 has the next expiring bucket @index 2. The upper
1943	* levels have the next expiring bucket @index 1.
1944	*
1945	* In case that the propagation wraps the next level the same
1946	* rules apply:
1947	*
1948	* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1949	* 0 0 0 0 F 2
1950	*
1951	* So after looking at LVL0 we get:
1952	*
1953	* LVL5 LVL4 LVL3 LVL2 LVL1
1954	* 0 0 0 1 0
1955	*
1956	* So no propagation from LVL1 to LVL2 because that happened
1957	* with the add already, but then we need to propagate further
1958	* from LVL2 to LVL3.
1959	*
1960	* So the simple check whether the lower bits of the current
1961	* level are 0 or not is sufficient for all cases.
1962	*/
1963	adj = lvl_clk ? `1` : `0`;
1964	clk >>= LVL_CLK_SHIFT;
1965	clk += adj;
1966	}
1967
1968	base->next_expiry = next;
1969	base->next_expiry_recalc = false;
1970	base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
1971	}
1972
1973	#ifdef CONFIG_NO_HZ_COMMON
1974	/*
1975	* Check, if the next hrtimer event is before the next timer wheel
1976	* event:
1977	*/
1978	static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
1979	{
1980	u64 nextevt = hrtimer_get_next_event();
1981
1982	/*
1983	* If high resolution timers are enabled
1984	* hrtimer_get_next_event() returns KTIME_MAX.
1985	*/
1986	if (expires <= nextevt)
1987	return expires;
1988
1989	/*
1990	* If the next timer is already expired, return the tick base
1991	* time so the tick is fired immediately.
1992	*/
1993	if (nextevt <= basem)
1994	return basem;
1995
1996	/*
1997	* Round up to the next jiffie. High resolution timers are
1998	* off, so the hrtimers are expired in the tick and we need to
1999	* make sure that this tick really expires the timer to avoid
2000	* a ping pong of the nohz stop code.
2001	*
2002	* Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
2003	*/
2004	return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
2005	}
2006
2007	static unsigned long next_timer_interrupt(struct timer_base *base,
2008	unsigned long basej)
2009	{
2010	if (base->next_expiry_recalc)
2011	next_expiry_recalc(base);
2012
2013	/*
2014	* Move next_expiry for the empty base into the future to prevent an
2015	* unnecessary raise of the timer softirq when the next_expiry value
2016	* will be reached even if there is no timer pending.
2017	*
2018	* This update is also required to make timer_base::next_expiry values
2019	* easy comparable to find out which base holds the first pending timer.
2020	*/
2021	if (!base->timers_pending)
2022	base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
2023
2024	return base->next_expiry;
2025	}
2026
2027	static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
2028	struct timer_base *base_local,
2029	struct timer_base *base_global,
2030	struct timer_events *tevt)
2031	{
2032	unsigned long nextevt, nextevt_local, nextevt_global;
2033	bool local_first;
2034
2035	nextevt_local = next_timer_interrupt(base: base_local, basej);
2036	nextevt_global = next_timer_interrupt(base: base_global, basej);
2037
2038	local_first = time_before_eq(nextevt_local, nextevt_global);
2039
2040	nextevt = local_first ? nextevt_local : nextevt_global;
2041
2042	/*
2043	* If the @nextevt is at max. one tick away, use @nextevt and store
2044	* it in the local expiry value. The next global event is irrelevant in
2045	* this case and can be left as KTIME_MAX.
2046	*/
2047	if (time_before_eq(nextevt, basej + `1`)) {
2048	/ If we missed a tick already, force 0 delta /
2049	if (time_before(nextevt, basej))
2050	nextevt = basej;
2051	tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC;
2052
2053	/*
2054	* This is required for the remote check only but it doesn't
2055	* hurt, when it is done for both call sites:
2056	*
2057	* * The remote callers will only take care of the global timers
2058	* as local timers will be handled by CPU itself. When not
2059	* updating tevt->global with the already missed first global
2060	* timer, it is possible that it will be missed completely.
2061	*
2062	* * The local callers will ignore the tevt->global anyway, when
2063	* nextevt is max. one tick away.
2064	*/
2065	if (!local_first)
2066	tevt->global = tevt->local;
2067	return nextevt;
2068	}
2069
2070	/*
2071	* Update tevt.* values:
2072	*
2073	* If the local queue expires first, then the global event can be
2074	* ignored. If the global queue is empty, nothing to do either.
2075	*/
2076	if (!local_first && base_global->timers_pending)
2077	tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC;
2078
2079	if (base_local->timers_pending)
2080	tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC;
2081
2082	return nextevt;
2083	}
2084
2085	# ifdef CONFIG_SMP
2086	/**
2087	* fetch_next_timer_interrupt_remote() - Store next timers into @tevt
2088	* @basej: base time jiffies
2089	* @basem: base time clock monotonic
2090	* @tevt: Pointer to the storage for the expiry values
2091	* @cpu: Remote CPU
2092	*
2093	* Stores the next pending local and global timer expiry values in the
2094	* struct pointed to by @tevt. If a queue is empty the corresponding
2095	* field is set to KTIME_MAX. If local event expires before global
2096	* event, global event is set to KTIME_MAX as well.
2097	*
2098	* Caller needs to make sure timer base locks are held (use
2099	* timer_lock_remote_bases() for this purpose).
2100	*/
2101	void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
2102	struct timer_events *tevt,
2103	unsigned int cpu)
2104	{
2105	struct timer_base base_local, base_global;
2106
2107	/ Preset local / global events /
2108	tevt->local = tevt->global = KTIME_MAX;
2109
2110	base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
2111	base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
2112
2113	lockdep_assert_held(&base_local->lock);
2114	lockdep_assert_held(&base_global->lock);
2115
2116	fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt);
2117	}
2118
2119	/**
2120	* timer_unlock_remote_bases - unlock timer bases of cpu
2121	* @cpu: Remote CPU
2122	*
2123	* Unlocks the remote timer bases.
2124	*/
2125	void timer_unlock_remote_bases(unsigned int cpu)
2126	__releases(timer_bases[BASE_LOCAL]->lock)
2127	__releases(timer_bases[BASE_GLOBAL]->lock)
2128	{
2129	struct timer_base base_local, base_global;
2130
2131	base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
2132	base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
2133
2134	raw_spin_unlock(&base_global->lock);
2135	raw_spin_unlock(&base_local->lock);
2136	}
2137
2138	/**
2139	* timer_lock_remote_bases - lock timer bases of cpu
2140	* @cpu: Remote CPU
2141	*
2142	* Locks the remote timer bases.
2143	*/
2144	void timer_lock_remote_bases(unsigned int cpu)
2145	__acquires(timer_bases[BASE_LOCAL]->lock)
2146	__acquires(timer_bases[BASE_GLOBAL]->lock)
2147	{
2148	struct timer_base base_local, base_global;
2149
2150	base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
2151	base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
2152
2153	lockdep_assert_irqs_disabled();
2154
2155	raw_spin_lock(&base_local->lock);
2156	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
2157	}
2158
2159	/**
2160	* timer_base_is_idle() - Return whether timer base is set idle
2161	*
2162	* Returns value of local timer base is_idle value.
2163	*/
2164	bool timer_base_is_idle(void)
2165	{
2166	return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);
2167	}
2168
2169	static void __run_timer_base(struct timer_base *base);
2170
2171	/**
2172	* timer_expire_remote() - expire global timers of cpu
2173	* @cpu: Remote CPU
2174	*
2175	* Expire timers of global base of remote CPU.
2176	*/
2177	void timer_expire_remote(unsigned int cpu)
2178	{
2179	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
2180
2181	__run_timer_base(base);
2182	}
2183
2184	static void timer_use_tmigr(unsigned long basej, u64 basem,
2185	unsigned long nextevt, bool tick_stop_path,
2186	bool timer_base_idle, struct timer_events *tevt)
2187	{
2188	u64 next_tmigr;
2189
2190	if (timer_base_idle)
2191	next_tmigr = tmigr_cpu_new_timer(nextevt: tevt->global);
2192	else if (tick_stop_path)
2193	next_tmigr = tmigr_cpu_deactivate(nextevt: tevt->global);
2194	else
2195	next_tmigr = tmigr_quick_check(nextevt: tevt->global);
2196
2197	/*
2198	* If the CPU is the last going idle in timer migration hierarchy, make
2199	* sure the CPU will wake up in time to handle remote timers.
2200	* next_tmigr == KTIME_MAX if other CPUs are still active.
2201	*/
2202	if (next_tmigr < tevt->local) {
2203	u64 tmp;
2204
2205	/ If we missed a tick already, force 0 delta /
2206	if (next_tmigr < basem)
2207	next_tmigr = basem;
2208
2209	tmp = div_u64(dividend: next_tmigr - basem, TICK_NSEC);
2210
2211	nextevt = basej + (unsigned* long)tmp;
2212	tevt->local = next_tmigr;
2213	}
2214	}
2215	# else
2216	static void timer_use_tmigr(unsigned long basej, u64 basem,
2217	unsigned long nextevt, bool tick_stop_path,
2218	bool timer_base_idle, struct timer_events *tevt)
2219	{
2220	/*
2221	* Make sure first event is written into tevt->local to not miss a
2222	* timer on !SMP systems.
2223	*/
2224	tevt->local = min_t(u64, tevt->local, tevt->global);
2225	}
2226	# endif /* CONFIG_SMP */
2227
2228	static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
2229	bool *idle)
2230	{
2231	struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
2232	struct timer_base base_local, base_global;
2233	unsigned long nextevt;
2234	bool idle_is_possible;
2235
2236	/*
2237	* When the CPU is offline, the tick is cancelled and nothing is supposed
2238	* to try to stop it.
2239	*/
2240	if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
2241	if (idle)
2242	*idle = true;
2243	return tevt.local;
2244	}
2245
2246	base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
2247	base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
2248
2249	raw_spin_lock(&base_local->lock);
2250	raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
2251
2252	nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
2253	base_global, tevt: &tevt);
2254
2255	/*
2256	* If the next event is only one jiffie ahead there is no need to call
2257	* timer migration hierarchy related functions. The value for the next
2258	* global timer in @tevt struct equals then KTIME_MAX. This is also
2259	* true, when the timer base is idle.
2260	*
2261	* The proper timer migration hierarchy function depends on the callsite
2262	* and whether timer base is idle or not. @nextevt will be updated when
2263	* this CPU needs to handle the first timer migration hierarchy
2264	* event. See timer_use_tmigr() for detailed information.
2265	*/
2266	idle_is_possible = time_after(nextevt, basej + `1`);
2267	if (idle_is_possible)
2268	timer_use_tmigr(basej, basem, nextevt: &nextevt, tick_stop_path: idle,
2269	timer_base_idle: base_local->is_idle, tevt: &tevt);
2270
2271	/*
2272	* We have a fresh next event. Check whether we can forward the
2273	* base.
2274	*/
2275	__forward_timer_base(base: base_local, basej);
2276	__forward_timer_base(base: base_global, basej);
2277
2278	/*
2279	* Set base->is_idle only when caller is timer_base_try_to_set_idle()
2280	*/
2281	if (idle) {
2282	/*
2283	* Bases are idle if the next event is more than a tick
2284	* away. Caution: @nextevt could have changed by enqueueing a
2285	* global timer into timer migration hierarchy. Therefore a new
2286	* check is required here.
2287	*
2288	* If the base is marked idle then any timer add operation must
2289	* forward the base clk itself to keep granularity small. This
2290	* idle logic is only maintained for the BASE_LOCAL and
2291	* BASE_GLOBAL base, deferrable timers may still see large
2292	* granularity skew (by design).
2293	*/
2294	if (!base_local->is_idle && time_after(nextevt, basej + `1`)) {
2295	base_local->is_idle = true;
2296	/*
2297	* Global timers queued locally while running in a task
2298	* in nohz_full mode need a self-IPI to kick reprogramming
2299	* in IRQ tail.
2300	*/
2301	if (tick_nohz_full_cpu(cpu: base_local->cpu))
2302	base_global->is_idle = true;
2303	trace_timer_base_idle(is_idle: true, cpu: base_local->cpu);
2304	}
2305	*idle = base_local->is_idle;
2306
2307	/*
2308	* When timer base is not set idle, undo the effect of
2309	* tmigr_cpu_deactivate() to prevent inconsistent states - active
2310	* timer base but inactive timer migration hierarchy.
2311	*
2312	* When timer base was already marked idle, nothing will be
2313	* changed here.
2314	*/
2315	if (!base_local->is_idle && idle_is_possible)
2316	tmigr_cpu_activate();
2317	}
2318
2319	raw_spin_unlock(&base_global->lock);
2320	raw_spin_unlock(&base_local->lock);
2321
2322	return cmp_next_hrtimer_event(basem, expires: tevt.local);
2323	}
2324
2325	/**
2326	* get_next_timer_interrupt() - return the time (clock mono) of the next timer
2327	* @basej: base time jiffies
2328	* @basem: base time clock monotonic
2329	*
2330	* Returns the tick aligned clock monotonic time of the next pending timer or
2331	* KTIME_MAX if no timer is pending. If timer of global base was queued into
2332	* timer migration hierarchy, first global timer is not taken into account. If
2333	* it was the last CPU of timer migration hierarchy going idle, first global
2334	* event is taken into account.
2335	*/
2336	u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
2337	{
2338	return __get_next_timer_interrupt(basej, basem, NULL);
2339	}
2340
2341	/**
2342	* timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
2343	* @basej: base time jiffies
2344	* @basem: base time clock monotonic
2345	* @idle: pointer to store the value of timer_base->is_idle on return;
2346	* *idle contains the information whether tick was already stopped
2347	*
2348	* Returns the tick aligned clock monotonic time of the next pending timer or
2349	* KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
2350	* returned as well.
2351	*/
2352	u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
2353	{
2354	if (*idle)
2355	return KTIME_MAX;
2356
2357	return __get_next_timer_interrupt(basej, basem, idle);
2358	}
2359
2360	/**
2361	* timer_clear_idle - Clear the idle state of the timer base
2362	*
2363	* Called with interrupts disabled
2364	*/
2365	void timer_clear_idle(void)
2366	{
2367	/*
2368	* We do this unlocked. The worst outcome is a remote pinned timer
2369	* enqueue sending a pointless IPI, but taking the lock would just
2370	* make the window for sending the IPI a few instructions smaller
2371	* for the cost of taking the lock in the exit from idle
2372	* path. Required for BASE_LOCAL only.
2373	*/
2374	__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
2375	if (tick_nohz_full_cpu(smp_processor_id()))
2376	__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
2377	trace_timer_base_idle(is_idle: false, smp_processor_id());
2378
2379	/ Activate without holding the timer_base->lock /
2380	tmigr_cpu_activate();
2381	}
2382	#endif
2383
2384	/**
2385	* __run_timers - run all expired timers (if any) on this CPU.
2386	* @base: the timer vector to be processed.
2387	*/
2388	static inline void __run_timers(struct timer_base *base)
2389	{
2390	struct hlist_head heads[LVL_DEPTH];
2391	int levels;
2392
2393	lockdep_assert_held(&base->lock);
2394
2395	if (base->running_timer)
2396	return;
2397
2398	while (time_after_eq(jiffies, base->clk) &&
2399	time_after_eq(jiffies, base->next_expiry)) {
2400	levels = collect_expired_timers(base, heads);
2401	/*
2402	* The two possible reasons for not finding any expired
2403	* timer at this clk are that all matching timers have been
2404	* dequeued or no timer has been queued since
2405	* base::next_expiry was set to base::clk +
2406	* NEXT_TIMER_MAX_DELTA.
2407	*/
2408	WARN_ON_ONCE(!levels && !base->next_expiry_recalc
2409	&& base->timers_pending);
2410	/*
2411	* While executing timers, base->clk is set 1 offset ahead of
2412	* jiffies to avoid endless requeuing to current jiffies.
2413	*/
2414	base->clk++;
2415	next_expiry_recalc(base);
2416
2417	while (levels--)
2418	expire_timers(base, head: heads + levels);
2419	}
2420	}
2421
2422	static void __run_timer_base(struct timer_base *base)
2423	{
2424	if (time_before(jiffies, base->next_expiry))
2425	return;
2426
2427	timer_base_lock_expiry(base);
2428	raw_spin_lock_irq(&base->lock);
2429	__run_timers(base);
2430	raw_spin_unlock_irq(&base->lock);
2431	timer_base_unlock_expiry(base);
2432	}
2433
2434	static void run_timer_base(int index)
2435	{
2436	struct timer_base *base = this_cpu_ptr(&timer_bases[index]);
2437
2438	__run_timer_base(base);
2439	}
2440
2441	/*
2442	* This function runs timers and the timer-tq in bottom half context.
2443	*/
2444	static __latent_entropy void run_timer_softirq(struct softirq_action *h)
2445	{
2446	run_timer_base(BASE_LOCAL);
2447	if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
2448	run_timer_base(BASE_GLOBAL);
2449	run_timer_base(BASE_DEF);
2450
2451	if (is_timers_nohz_active())
2452	tmigr_handle_remote();
2453	}
2454	}
2455
2456	/*
2457	* Called by the local, per-CPU timer interrupt on SMP.
2458	*/
2459	static void run_local_timers(void)
2460	{
2461	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
2462
2463	hrtimer_run_queues();
2464
2465	for (int i = `0`; i < NR_BASES; i++, base++) {
2466	/ Raise the softirq only if required. /
2467	if (time_after_eq(jiffies, base->next_expiry) \|\|
2468	(i == BASE_DEF && tmigr_requires_handle_remote())) {
2469	raise_softirq(nr: TIMER_SOFTIRQ);
2470	return;
2471	}
2472	}
2473	}
2474
2475	/*
2476	* Called from the timer interrupt handler to charge one tick to the current
2477	* process. user_tick is 1 if the tick is user time, 0 for system.
2478	*/
2479	void update_process_times(int user_tick)
2480	{
2481	struct task_struct *p = current;
2482
2483	/ Note: this timer irq context must be accounted for as well. /
2484	account_process_tick(p, user: user_tick);
2485	run_local_timers();
2486	rcu_sched_clock_irq(user: user_tick);
2487	#ifdef CONFIG_IRQ_WORK
2488	if (in_irq())
2489	irq_work_tick();
2490	#endif
2491	scheduler_tick();
2492	if (IS_ENABLED(CONFIG_POSIX_TIMERS))
2493	run_posix_cpu_timers();
2494	}
2495
2496	/*
2497	* Since schedule_timeout()'s timer is defined on the stack, it must store
2498	* the target task on the stack as well.
2499	*/
2500	struct process_timer {
2501	struct timer_list timer;
2502	struct task_struct *task;
2503	};
2504
2505	static void process_timeout(struct timer_list *t)
2506	{
2507	struct process_timer *timeout = from_timer(timeout, t, timer);
2508
2509	wake_up_process(tsk: timeout->task);
2510	}
2511
2512	/**
2513	* schedule_timeout - sleep until timeout
2514	* @timeout: timeout value in jiffies
2515	*
2516	* Make the current task sleep until @timeout jiffies have elapsed.
2517	* The function behavior depends on the current task state
2518	* (see also set_current_state() description):
2519	*
2520	* %TASK_RUNNING - the scheduler is called, but the task does not sleep
2521	* at all. That happens because sched_submit_work() does nothing for
2522	* tasks in %TASK_RUNNING state.
2523	*
2524	* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2525	* pass before the routine returns unless the current task is explicitly
2526	* woken up, (e.g. by wake_up_process()).
2527	*
2528	* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2529	* delivered to the current task or the current task is explicitly woken
2530	* up.
2531	*
2532	* The current task state is guaranteed to be %TASK_RUNNING when this
2533	* routine returns.
2534	*
2535	* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2536	* the CPU away without a bound on the timeout. In this case the return
2537	* value will be %MAX_SCHEDULE_TIMEOUT.
2538	*
2539	* Returns 0 when the timer has expired otherwise the remaining time in
2540	* jiffies will be returned. In all cases the return value is guaranteed
2541	* to be non-negative.
2542	*/
2543	signed long __sched schedule_timeout(signed long timeout)
2544	{
2545	struct process_timer timer;
2546	unsigned long expire;
2547
2548	switch (timeout)
2549	{
2550	case MAX_SCHEDULE_TIMEOUT:
2551	/*
2552	* These two special cases are useful to be comfortable
2553	* in the caller. Nothing more. We could take
2554	* MAX_SCHEDULE_TIMEOUT from one of the negative value
2555	* but I' d like to return a valid offset (>=0) to allow
2556	* the caller to do everything it want with the retval.
2557	*/
2558	schedule();
2559	goto out;
2560	default:
2561	/*
2562	* Another bit of PARANOID. Note that the retval will be
2563	* 0 since no piece of kernel is supposed to do a check
2564	* for a negative retval of schedule_timeout() (since it
2565	* should never happens anyway). You just have the printk()
2566	* that will tell you if something is gone wrong and where.
2567	*/
2568	if (timeout < `0`) {
2569	printk(KERN_ERR "schedule_timeout: wrong timeout "
2570	"value %lx\n", timeout);
2571	dump_stack();
2572	__set_current_state(TASK_RUNNING);
2573	goto out;
2574	}
2575	}
2576
2577	expire = timeout + jiffies;
2578
2579	timer.task = current;
2580	timer_setup_on_stack(&timer.timer, process_timeout, `0`);
2581	__mod_timer(timer: &timer.timer, expires: expire, MOD_TIMER_NOTPENDING);
2582	schedule();
2583	del_timer_sync(timer: &timer.timer);
2584
2585	/ Remove the timer from the object tracker /
2586	destroy_timer_on_stack(&timer.timer);
2587
2588	timeout = expire - jiffies;
2589
2590	out:
2591	return timeout < `0` ? `0` : timeout;
2592	}
2593	EXPORT_SYMBOL(schedule_timeout);
2594
2595	/*
2596	* We can use __set_current_state() here because schedule_timeout() calls
2597	* schedule() unconditionally.
2598	*/
2599	signed long __sched schedule_timeout_interruptible(signed long timeout)
2600	{
2601	__set_current_state(TASK_INTERRUPTIBLE);
2602	return schedule_timeout(timeout);
2603	}
2604	EXPORT_SYMBOL(schedule_timeout_interruptible);
2605
2606	signed long __sched schedule_timeout_killable(signed long timeout)
2607	{
2608	__set_current_state(TASK_KILLABLE);
2609	return schedule_timeout(timeout);
2610	}
2611	EXPORT_SYMBOL(schedule_timeout_killable);
2612
2613	signed long __sched schedule_timeout_uninterruptible(signed long timeout)
2614	{
2615	__set_current_state(TASK_UNINTERRUPTIBLE);
2616	return schedule_timeout(timeout);
2617	}
2618	EXPORT_SYMBOL(schedule_timeout_uninterruptible);
2619
2620	/*
2621	* Like schedule_timeout_uninterruptible(), except this task will not contribute
2622	* to load average.
2623	*/
2624	signed long __sched schedule_timeout_idle(signed long timeout)
2625	{
2626	__set_current_state(TASK_IDLE);
2627	return schedule_timeout(timeout);
2628	}
2629	EXPORT_SYMBOL(schedule_timeout_idle);
2630
2631	#ifdef CONFIG_HOTPLUG_CPU
2632	static void migrate_timer_list(struct timer_base new_base, struct* hlist_head *head)
2633	{
2634	struct timer_list *timer;
2635	int cpu = new_base->cpu;
2636
2637	while (!hlist_empty(h: head)) {
2638	timer = hlist_entry(head->first, struct timer_list, entry);
2639	detach_timer(timer, clear_pending: false);
2640	timer->flags = (timer->flags & ~TIMER_BASEMASK) \| cpu;
2641	internal_add_timer(base: new_base, timer);
2642	}
2643	}
2644
2645	int timers_prepare_cpu(unsigned int cpu)
2646	{
2647	struct timer_base *base;
2648	int b;
2649
2650	for (b = `0`; b < NR_BASES; b++) {
2651	base = per_cpu_ptr(&timer_bases[b], cpu);
2652	base->clk = jiffies;
2653	base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
2654	base->next_expiry_recalc = false;
2655	base->timers_pending = false;
2656	base->is_idle = false;
2657	}
2658	return `0`;
2659	}
2660
2661	int timers_dead_cpu(unsigned int cpu)
2662	{
2663	struct timer_base *old_base;
2664	struct timer_base *new_base;
2665	int b, i;
2666
2667	for (b = `0`; b < NR_BASES; b++) {
2668	old_base = per_cpu_ptr(&timer_bases[b], cpu);
2669	new_base = get_cpu_ptr(&timer_bases[b]);
2670	/*
2671	* The caller is globally serialized and nobody else
2672	* takes two locks at once, deadlock is not possible.
2673	*/
2674	raw_spin_lock_irq(&new_base->lock);
2675	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
2676
2677	/*
2678	* The current CPUs base clock might be stale. Update it
2679	* before moving the timers over.
2680	*/
2681	forward_timer_base(base: new_base);
2682
2683	WARN_ON_ONCE(old_base->running_timer);
2684	old_base->running_timer = NULL;
2685
2686	for (i = `0`; i < WHEEL_SIZE; i++)
2687	migrate_timer_list(new_base, head: old_base->vectors + i);
2688
2689	raw_spin_unlock(&old_base->lock);
2690	raw_spin_unlock_irq(&new_base->lock);
2691	put_cpu_ptr(&timer_bases);
2692	}
2693	return `0`;
2694	}
2695
2696	#endif /* CONFIG_HOTPLUG_CPU */
2697
2698	static void __init init_timer_cpu(int cpu)
2699	{
2700	struct timer_base *base;
2701	int i;
2702
2703	for (i = `0`; i < NR_BASES; i++) {
2704	base = per_cpu_ptr(&timer_bases[i], cpu);
2705	base->cpu = cpu;
2706	raw_spin_lock_init(&base->lock);
2707	base->clk = jiffies;
2708	base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
2709	timer_base_init_expiry_lock(base);
2710	}
2711	}
2712
2713	static void __init init_timer_cpus(void)
2714	{
2715	int cpu;
2716
2717	for_each_possible_cpu(cpu)
2718	init_timer_cpu(cpu);
2719	}
2720
2721	void __init init_timers(void)
2722	{
2723	init_timer_cpus();
2724	posix_cputimers_init_work();
2725	open_softirq(nr: TIMER_SOFTIRQ, action: run_timer_softirq);
2726	}
2727
/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 *
 * Sleeps uninterruptibly and keeps sleeping until the whole timeout
 * (@msecs converted to jiffies, plus one jiffy) has been consumed.
 */
void msleep(unsigned int msecs)
{
	unsigned long timeout = msecs_to_jiffies(msecs) + 1;

	while (timeout)
		timeout = schedule_timeout_uninterruptible(timeout);
}

EXPORT_SYMBOL(msleep);
2741
2742	/**
2743	* msleep_interruptible - sleep waiting for signals
2744	* @msecs: Time in milliseconds to sleep for
2745	*/
2746	unsigned long msleep_interruptible(unsigned int msecs)
2747	{
2748	unsigned long timeout = msecs_to_jiffies(m: msecs) + `1`;
2749
2750	while (timeout && !signal_pending(current))
2751	timeout = schedule_timeout_interruptible(timeout);
2752	return jiffies_to_msecs(j: timeout);
2753	}
2754
2755	EXPORT_SYMBOL(msleep_interruptible);
2756
2757	/**
2758	* usleep_range_state - Sleep for an approximate time in a given state
2759	* @min: Minimum time in usecs to sleep
2760	* @max: Maximum time in usecs to sleep
2761	* @state: State of the current task that will be while sleeping
2762	*
2763	* In non-atomic context where the exact wakeup time is flexible, use
2764	* usleep_range_state() instead of udelay(). The sleep improves responsiveness
2765	* by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
2766	* power usage by allowing hrtimers to take advantage of an already-
2767	* scheduled interrupt instead of scheduling a new one just for this sleep.
2768	*/
2769	void __sched usleep_range_state(unsigned long min, unsigned long max,
2770	unsigned int state)
2771	{
2772	ktime_t exp = ktime_add_us(kt: ktime_get(), usec: min);
2773	u64 delta = (u64)(max - min) * NSEC_PER_USEC;
2774
2775	for (;;) {
2776	__set_current_state(state);
2777	/ Do not return before the requested sleep time has elapsed /
2778	if (!schedule_hrtimeout_range(expires: &exp, delta, mode: HRTIMER_MODE_ABS))
2779	break;
2780	}
2781	}
2782	EXPORT_SYMBOL(usleep_range_state);
2783

source code of linux/kernel/time/timer.c