// SPDX-License-Identifier: GPL-2.0
/*
 * Implement CPU time clocks for the POSIX clock interface.
 */

#include <linux/sched/signal.h>
#include <linux/sched/cputime.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
#include <linux/compat.h>
#include <linux/sched/deadline.h>
#include <linux/task_work.h>

#include "posix-timers.h"

static void posix_cpu_timer_rearm(struct k_itimer *timer);

void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
{
	posix_cputimers_init(pct);
	if (cpu_limit != RLIM_INFINITY) {
		pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
		pct->timers_active = true;
	}
}

/*
 * Called after updating RLIMIT_CPU to run cpu timer and update
 * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
 * necessary. Needs siglock protection since other code may update the
 * expiration cache as well.
 *
 * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
 * we cannot lock_task_sighand. Cannot fail if task is current.
 */
int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
	u64 nsecs = rlim_new * NSEC_PER_SEC;
	unsigned long irq_fl;

	if (!lock_task_sighand(task, &irq_fl))
		return -ESRCH;
	set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
	unlock_task_sighand(task, &irq_fl);
	return 0;
}
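
/*
 * Illustrative userspace sketch (not kernel code): update_rlimit_cpu() is
 * what makes a setrlimit(RLIMIT_CPU, ...) call take effect immediately. A
 * program arming a one second CPU limit might look like this, using the
 * standard <sys/resource.h> API (the limit values are made up):
 *
 *	struct rlimit rl = { .rlim_cur = 1, .rlim_max = 5 };
 *
 *	if (setrlimit(RLIMIT_CPU, &rl))		// soft limit 1s, hard limit 5s
 *		perror("setrlimit");
 *	// Once the process has consumed 1s of CPU time it receives SIGXCPU;
 *	// at 5s it receives SIGKILL (see check_rlimit() below).
 */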

/*
 * Functions for validating access to tasks.
 */
static struct pid *pid_for_clock(const clockid_t clock, bool gettime)
{
	const bool thread = !!CPUCLOCK_PERTHREAD(clock);
	const pid_t upid = CPUCLOCK_PID(clock);
	struct pid *pid;

	if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
		return NULL;

	/*
	 * If the encoded PID is 0, then the timer is targeted at current
	 * or the process to which current belongs.
	 */
	if (upid == 0)
		return thread ? task_pid(current) : task_tgid(current);

	pid = find_vpid(upid);
	if (!pid)
		return NULL;

	if (thread) {
		struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
		return (tsk && same_thread_group(tsk, current)) ? pid : NULL;
	}

	/*
	 * For clock_gettime(PROCESS) allow finding the process with the
	 * PID of the current task. The code needs the tgid of the process
	 * so that pid_task(pid, PIDTYPE_TGID) can be used to find the
	 * process.
	 */
	if (gettime && (pid == task_pid(current)))
		return task_tgid(current);

	/*
	 * For process clocks, require that the PID identifies a process.
	 */
	return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL;
}
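
/*
 * A sketch of the clockid encoding that pid_for_clock() decodes, assuming
 * the layout defined in posix-timers.h: CPU clock ids pack a PID and a
 * clock type into one clockid_t, with the PID one's-complemented so the
 * predefined clock ids (small negative numbers) stay distinct:
 *
 *	clockid = (~pid << 3) | clock_type
 *
 *	bit  2   : CPUCLOCK_PERTHREAD - thread clock if set, process if clear
 *	bits 0-1 : CPUCLOCK_PROF (0), CPUCLOCK_VIRT (1) or CPUCLOCK_SCHED (2)
 *	bits 3.. : ~pid, so CPUCLOCK_PID(clock) recovers it as ~(clock >> 3)
 *
 * A PID field of 0 means "current" (or current's process), as handled above.
 */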

static inline int validate_clock_permissions(const clockid_t clock)
{
	int ret;

	rcu_read_lock();
	ret = pid_for_clock(clock, false) ? 0 : -EINVAL;
	rcu_read_unlock();

	return ret;
}

static inline enum pid_type clock_pid_type(const clockid_t clock)
{
	return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID;
}

static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
{
	return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock));
}

/*
 * Update expiry time from increment, and increase overrun count,
 * given the current clock sample.
 */
static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
{
	u64 delta, incr, expires = timer->it.cpu.node.expires;
	int i;

	if (!timer->it_interval)
		return expires;

	if (now < expires)
		return expires;

	incr = timer->it_interval;
	delta = now + incr - expires;

	/* Don't use (incr*2 < delta), incr*2 might overflow. */
	for (i = 0; incr < delta - incr; i++)
		incr = incr << 1;

	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;

		timer->it.cpu.node.expires += incr;
		timer->it_overrun += 1LL << i;
		delta -= incr;
	}
	return timer->it.cpu.node.expires;
}
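
/*
 * Worked example of the doubling/halving scheme above (illustrative values):
 * with expires = 100, it_interval = 64 and now = 1000, delta starts at
 * 1000 + 64 - 100 = 964. The first loop doubles incr until it covers more
 * than half of delta: 64 -> 128 -> 256 -> 512 (i == 3). The second loop
 * walks back down, adding each power-of-two multiple of the interval that
 * still fits:
 *
 *	512 (8 periods) + 256 (4) + 128 (2) + 64 (1) = 15 overruns
 *
 * which advances expires to 100 + 15 * 64 = 1060, the first multiple of the
 * interval past now. This costs O(log(delta/incr)) iterations instead of
 * one iteration per missed period.
 */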

/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */
static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
{
	return !(~pct->bases[CPUCLOCK_PROF].nextevt |
		 ~pct->bases[CPUCLOCK_VIRT].nextevt |
		 ~pct->bases[CPUCLOCK_SCHED].nextevt);
}
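
/*
 * The bitwise trick above relies on ~U64_MAX == 0: the OR of the three
 * complements is zero if and only if every nextevt equals U64_MAX. A
 * single armed base is enough to make the expression non-zero and the
 * cache "active".
 */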

static int
posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
	int error = validate_clock_permissions(which_clock);

	if (!error) {
		tp->tv_sec = 0;
		tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
		if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
			/*
			 * If sched_clock is using a cycle counter, we
			 * don't have any idea of its true resolution
			 * exported, but it is much more than 1s/HZ.
			 */
			tp->tv_nsec = 1;
		}
	}
	return error;
}

static int
posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
{
	int error = validate_clock_permissions(clock);

	/*
	 * You can never reset a CPU clock, but we check for other errors
	 * in the call before failing with EPERM.
	 */
	return error ? : -EPERM;
}

/*
 * Sample a per-thread clock for the given task. clkid is validated.
 */
static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
{
	u64 utime, stime;

	if (clkid == CPUCLOCK_SCHED)
		return task_sched_runtime(p);

	task_cputime(p, &utime, &stime);

	switch (clkid) {
	case CPUCLOCK_PROF:
		return utime + stime;
	case CPUCLOCK_VIRT:
		return utime;
	default:
		WARN_ON_ONCE(1);
	}
	return 0;
}

static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
{
	samples[CPUCLOCK_PROF] = stime + utime;
	samples[CPUCLOCK_VIRT] = utime;
	samples[CPUCLOCK_SCHED] = rtime;
}

static void task_sample_cputime(struct task_struct *p, u64 *samples)
{
	u64 stime, utime;

	task_cputime(p, &utime, &stime);
	store_samples(samples, stime, utime, p->se.sum_exec_runtime);
}

static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
				       u64 *samples)
{
	u64 stime, utime, rtime;

	utime = atomic64_read(&at->utime);
	stime = atomic64_read(&at->stime);
	rtime = atomic64_read(&at->sum_exec_runtime);
	store_samples(samples, stime, utime, rtime);
}

/*
 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
 * to avoid race conditions with concurrent updates to cputime.
 */
static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
	u64 curr_cputime = atomic64_read(cputime);

	do {
		if (sum_cputime <= curr_cputime)
			return;
	} while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
}

static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
			      struct task_cputime *sum)
{
	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
}

/**
 * thread_group_sample_cputime - Sample cputime for a given task
 * @tsk:	Task for which cputime needs to be sampled
 * @samples:	Storage for time samples
 *
 * Called from sys_getitimer() to calculate the expiry time of an active
 * timer. That means group cputime accounting is already active. Called
 * with task sighand lock held.
 *
 * Updates @samples with an up-to-date sample of the thread group cputimes.
 */
void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;

	WARN_ON_ONCE(!pct->timers_active);

	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}

/**
 * thread_group_start_cputime - Start cputime and return a sample
 * @tsk:	Task for which cputime needs to be started
 * @samples:	Storage for time samples
 *
 * The thread group cputime accounting is avoided when there are no posix
 * CPU timers armed. Before starting a timer it's required to check whether
 * the time accounting is active. If not, a full update of the atomic
 * accounting store needs to be done and the accounting enabled.
 *
 * Updates @samples with an up-to-date sample of the thread group cputimes.
 */
static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;

	lockdep_assert_task_sighand_held(tsk);

	/* Check if cputimer isn't running. This is accessed without locking. */
	if (!READ_ONCE(pct->timers_active)) {
		struct task_cputime sum;

		/*
		 * The POSIX timer interface allows for absolute time expiry
		 * values through the TIMER_ABSTIME flag, therefore we have
		 * to synchronize the timer to the clock every time we start it.
		 */
		thread_group_cputime(tsk, &sum);
		update_gt_cputime(&cputimer->cputime_atomic, &sum);

		/*
		 * We're setting timers_active without a lock. Ensure this
		 * only gets written to in one operation. We set it after
		 * update_gt_cputime() as a small optimization, but
		 * barriers are not required because update_gt_cputime()
		 * can handle concurrent updates.
		 */
		WRITE_ONCE(pct->timers_active, true);
	}
	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}

static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
{
	struct task_cputime ct;

	thread_group_cputime(tsk, &ct);
	store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);
}

/*
 * Sample a process (thread group) clock for the given task clkid. If the
 * group's cputime accounting is already enabled, read the atomic
 * store. Otherwise a full update is required. clkid is already validated.
 */
static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
				  bool start)
{
	struct thread_group_cputimer *cputimer = &p->signal->cputimer;
	struct posix_cputimers *pct = &p->signal->posix_cputimers;
	u64 samples[CPUCLOCK_MAX];

	if (!READ_ONCE(pct->timers_active)) {
		if (start)
			thread_group_start_cputime(p, samples);
		else
			__thread_group_cputime(p, samples);
	} else {
		proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
	}

	return samples[clkid];
}

static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
{
	const clockid_t clkid = CPUCLOCK_WHICH(clock);
	struct task_struct *tsk;
	u64 t;

	rcu_read_lock();
	tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock));
	if (!tsk) {
		rcu_read_unlock();
		return -EINVAL;
	}

	if (CPUCLOCK_PERTHREAD(clock))
		t = cpu_clock_sample(clkid, tsk);
	else
		t = cpu_clock_sample_group(clkid, tsk, false);
	rcu_read_unlock();

	*tp = ns_to_timespec64(t);
	return 0;
}

/*
 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
 * new timer already all-zeros initialized.
 */
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
	static struct lock_class_key posix_cpu_timers_key;
	struct pid *pid;

	rcu_read_lock();
	pid = pid_for_clock(new_timer->it_clock, false);
	if (!pid) {
		rcu_read_unlock();
		return -EINVAL;
	}

	/*
	 * If posix timer expiry is handled in task work context then
	 * timer::it_lock can be taken without disabling interrupts as all
	 * other locking happens in task context. This requires a separate
	 * lock class key otherwise regular posix timer expiry would record
	 * the lock class being taken in interrupt context and generate a
	 * false positive warning.
	 */
	if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
		lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);

	new_timer->kclock = &clock_posix_cpu;
	timerqueue_init(&new_timer->it.cpu.node);
	new_timer->it.cpu.pid = get_pid(pid);
	rcu_read_unlock();
	return 0;
}
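
/*
 * Illustrative userspace sketch (not kernel code) of what ends up calling
 * posix_cpu_timer_create(): arming a per-process CPU time timer that
 * delivers SIGALRM after 500ms of consumed CPU time. The names are the
 * standard POSIX API; the values are made up for the example:
 *
 *	timer_t tid;
 *	struct sigevent sev = {
 *		.sigev_notify = SIGEV_SIGNAL,
 *		.sigev_signo  = SIGALRM,
 *	};
 *	struct itimerspec its = {
 *		.it_value    = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 },
 *		.it_interval = { 0, 0 },	// one-shot
 *	};
 *
 *	timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid);
 *	timer_settime(tid, 0, &its, NULL);	// -> posix_cpu_timer_set()
 */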

static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
					      struct task_struct *tsk)
{
	int clkidx = CPUCLOCK_WHICH(timer->it_clock);

	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		return tsk->posix_cputimers.bases + clkidx;
	else
		return tsk->signal->posix_cputimers.bases + clkidx;
}

/*
 * Force recalculating the base earliest expiration on the next tick.
 * This will also re-evaluate the need to keep around the process wide
 * cputime counter and tick dependency and eventually shut these down
 * if necessary.
 */
static void trigger_base_recalc_expires(struct k_itimer *timer,
					struct task_struct *tsk)
{
	struct posix_cputimer_base *base = timer_base(timer, tsk);

	base->nextevt = 0;
}

/*
 * Dequeue the timer and reset the base if it was its earliest expiration.
 * It makes sure the next tick recalculates the base next expiration so we
 * don't keep the costly process wide cputime counter around for a random
 * amount of time, along with the tick dependency.
 *
 * If another timer gets queued between this and the next tick, its
 * expiration will update the base next event if necessary on the next
 * tick.
 */
static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
{
	struct cpu_timer *ctmr = &timer->it.cpu;
	struct posix_cputimer_base *base;

	if (!cpu_timer_dequeue(ctmr))
		return;

	base = timer_base(timer, p);
	if (cpu_timer_getexpires(ctmr) == base->nextevt)
		trigger_base_recalc_expires(timer, p);
}

/*
 * Clean up a CPU-clock timer that is about to be destroyed.
 * This is called from timer deletion with the timer already locked.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again. (This happens when the timer is in the middle of firing.)
 */
static int posix_cpu_timer_del(struct k_itimer *timer)
{
	struct cpu_timer *ctmr = &timer->it.cpu;
	struct sighand_struct *sighand;
	struct task_struct *p;
	unsigned long flags;
	int ret = 0;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (!p)
		goto out;

	/*
	 * Protect against sighand release/switch in exit/exec and process/
	 * thread timer list entry concurrent read/writes.
	 */
	sighand = lock_task_sighand(p, &flags);
	if (unlikely(sighand == NULL)) {
		/*
		 * This raced with the reaping of the task. The exit cleanup
		 * should have removed this timer from the timer queue.
		 */
		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
	} else {
		if (timer->it.cpu.firing)
			ret = TIMER_RETRY;
		else
			disarm_timer(timer, p);

		unlock_task_sighand(p, &flags);
	}

out:
	rcu_read_unlock();
	if (!ret)
		put_pid(ctmr->pid);

	return ret;
}

static void cleanup_timerqueue(struct timerqueue_head *head)
{
	struct timerqueue_node *node;
	struct cpu_timer *ctmr;

	while ((node = timerqueue_getnext(head))) {
		timerqueue_del(head, node);
		ctmr = container_of(node, struct cpu_timer, node);
		ctmr->head = NULL;
	}
}

/*
 * Clean out CPU timers which are still armed when a thread exits. The
 * timers are only removed from the list. No other updates are done. The
 * corresponding posix timers are still accessible, but cannot be rearmed.
 *
 * This must be called with the siglock held.
 */
static void cleanup_timers(struct posix_cputimers *pct)
{
	cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead);
	cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead);
	cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);
}

/*
 * These are both called with the siglock held, when the current thread
 * is being reaped. When the final (leader) thread in the group is reaped,
 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
 */
void posix_cpu_timers_exit(struct task_struct *tsk)
{
	cleanup_timers(&tsk->posix_cputimers);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
	cleanup_timers(&tsk->signal->posix_cputimers);
}

/*
 * Insert the timer on the appropriate list before any timers that
 * expire later. This must be called with the sighand lock held.
 */
static void arm_timer(struct k_itimer *timer, struct task_struct *p)
{
	struct posix_cputimer_base *base = timer_base(timer, p);
	struct cpu_timer *ctmr = &timer->it.cpu;
	u64 newexp = cpu_timer_getexpires(ctmr);

	if (!cpu_timer_enqueue(&base->tqhead, ctmr))
		return;

	/*
	 * We are the new earliest-expiring POSIX 1.b timer, hence
	 * need to update expiration cache. Take into account that
	 * for process timers we share expiration cache with itimers
	 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
	 */
	if (newexp < base->nextevt)
		base->nextevt = newexp;

	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
	else
		tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
}

/*
 * The timer is locked, fire it and arrange for its reload.
 */
static void cpu_timer_fire(struct k_itimer *timer)
{
	struct cpu_timer *ctmr = &timer->it.cpu;

	if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
		/*
		 * The user doesn't want any signal.
		 */
		cpu_timer_setexpires(ctmr, 0);
	} else if (unlikely(timer->sigq == NULL)) {
		/*
		 * This is a special case for clock_nanosleep,
		 * not a normal timer from sys_timer_create.
		 */
		wake_up_process(timer->it_process);
		cpu_timer_setexpires(ctmr, 0);
	} else if (!timer->it_interval) {
		/*
		 * One-shot timer. Clear it as soon as it's fired.
		 */
		posix_timer_event(timer, 0);
		cpu_timer_setexpires(ctmr, 0);
	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
		/*
		 * The signal did not get queued because the signal
		 * was ignored, so we won't get any callback to
		 * reload the timer. But we need to keep it
		 * ticking in case the signal is deliverable next time.
		 */
		posix_cpu_timer_rearm(timer);
		++timer->it_requeue_pending;
	}
}

/*
 * Guts of sys_timer_settime for CPU timers.
 * This is called with the timer locked and interrupts disabled.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again. (This happens when the timer is in the middle of firing.)
 */
static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
			       struct itimerspec64 *new, struct itimerspec64 *old)
{
	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	u64 old_expires, new_expires, old_incr, val;
	struct cpu_timer *ctmr = &timer->it.cpu;
	struct sighand_struct *sighand;
	struct task_struct *p;
	unsigned long flags;
	int ret = 0;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (!p) {
		/*
		 * If p has just been reaped, we can no
		 * longer get any information about it at all.
		 */
		rcu_read_unlock();
		return -ESRCH;
	}

	/*
	 * Use the to_ktime conversion because that clamps the maximum
	 * value to KTIME_MAX and avoids multiplication overflows.
	 */
	new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value));

	/*
	 * Protect against sighand release/switch in exit/exec and p->cpu_timers
	 * and p->signal->cpu_timers read/write in arm_timer()
	 */
	sighand = lock_task_sighand(p, &flags);
	/*
	 * If p has just been reaped, we can no
	 * longer get any information about it at all.
	 */
	if (unlikely(sighand == NULL)) {
		rcu_read_unlock();
		return -ESRCH;
	}

	/*
	 * Disarm any old timer after extracting its expiry time.
	 */
	old_incr = timer->it_interval;
	old_expires = cpu_timer_getexpires(ctmr);

	if (unlikely(timer->it.cpu.firing)) {
		timer->it.cpu.firing = -1;
		ret = TIMER_RETRY;
	} else {
		cpu_timer_dequeue(ctmr);
	}

	/*
	 * We need to sample the current value to convert the new
	 * value from relative to absolute, and to convert the
	 * old value from absolute to relative. To set a process
	 * timer, we need a sample to balance the thread expiry
	 * times (in arm_timer). With an absolute time, we must
	 * check if it's already passed. In short, we need a sample.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		val = cpu_clock_sample(clkid, p);
	else
		val = cpu_clock_sample_group(clkid, p, true);

	if (old) {
		if (old_expires == 0) {
			old->it_value.tv_sec = 0;
			old->it_value.tv_nsec = 0;
		} else {
			/*
			 * Update the timer in case it has overrun already.
			 * If it has, we'll report it as having overrun and
			 * with the next reloaded timer already ticking,
			 * though we are swallowing that pending
			 * notification here to install the new setting.
			 */
			u64 exp = bump_cpu_timer(timer, val);

			if (val < exp) {
				old_expires = exp - val;
				old->it_value = ns_to_timespec64(old_expires);
			} else {
				old->it_value.tv_nsec = 1;
				old->it_value.tv_sec = 0;
			}
		}
	}

	if (unlikely(ret)) {
		/*
		 * We are colliding with the timer actually firing.
		 * Punt after filling in the timer's old value, and
		 * disable this firing since we are already reporting
		 * it as an overrun (thanks to bump_cpu_timer above).
		 */
		unlock_task_sighand(p, &flags);
		goto out;
	}

	if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
		new_expires += val;
	}

	/*
	 * Install the new expiry time (or zero).
	 * For a timer with no notification action, we don't actually
	 * arm the timer (we'll just fake it for timer_gettime).
	 */
	cpu_timer_setexpires(ctmr, new_expires);
	if (new_expires != 0 && val < new_expires) {
		arm_timer(timer, p);
	}

	unlock_task_sighand(p, &flags);
	/*
	 * Install the new reload setting, and
	 * set up the signal and overrun bookkeeping.
	 */
	timer->it_interval = timespec64_to_ktime(new->it_interval);

	/*
	 * This acts as a modification timestamp for the timer,
	 * so any automatic reload attempt will punt on seeing
	 * that we have reset the timer manually.
	 */
	timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
		~REQUEUE_PENDING;
	timer->it_overrun_last = 0;
	timer->it_overrun = -1;

	if (val >= new_expires) {
		if (new_expires != 0) {
			/*
			 * The designated time already passed, so we notify
			 * immediately, even if the thread never runs to
			 * accumulate more time on this clock.
			 */
			cpu_timer_fire(timer);
		}

		/*
		 * Make sure we don't keep around the process wide cputime
		 * counter or the tick dependency if they are not necessary.
		 */
		sighand = lock_task_sighand(p, &flags);
		if (!sighand)
			goto out;

		if (!cpu_timer_queued(ctmr))
			trigger_base_recalc_expires(timer, p);

		unlock_task_sighand(p, &flags);
	}
out:
	rcu_read_unlock();
	if (old)
		old->it_interval = ns_to_timespec64(old_incr);

	return ret;
}

static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
{
	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	struct cpu_timer *ctmr = &timer->it.cpu;
	u64 now, expires = cpu_timer_getexpires(ctmr);
	struct task_struct *p;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (!p)
		goto out;

	/*
	 * Easy part: convert the reload time.
	 */
	itp->it_interval = ktime_to_timespec64(timer->it_interval);

	if (!expires)
		goto out;

	/*
	 * Sample the clock to take the difference with the expiry time.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		now = cpu_clock_sample(clkid, p);
	else
		now = cpu_clock_sample_group(clkid, p, false);

	if (now < expires) {
		itp->it_value = ns_to_timespec64(expires - now);
	} else {
		/*
		 * The timer should have expired already, but the firing
		 * hasn't taken place yet. Say it's just about to expire.
		 */
		itp->it_value.tv_nsec = 1;
		itp->it_value.tv_sec = 0;
	}
out:
	rcu_read_unlock();
}

#define MAX_COLLECTED	20

static u64 collect_timerqueue(struct timerqueue_head *head,
			      struct list_head *firing, u64 now)
{
	struct timerqueue_node *next;
	int i = 0;

	while ((next = timerqueue_getnext(head))) {
		struct cpu_timer *ctmr;
		u64 expires;

		ctmr = container_of(next, struct cpu_timer, node);
		expires = cpu_timer_getexpires(ctmr);
		/* Limit the number of timers to expire at once */
		if (++i == MAX_COLLECTED || now < expires)
			return expires;

		ctmr->firing = 1;
		/* See posix_cpu_timer_wait_running() */
		rcu_assign_pointer(ctmr->handling, current);
		cpu_timer_dequeue(ctmr);
		list_add_tail(&ctmr->elist, firing);
	}

	return U64_MAX;
}

static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
				    struct list_head *firing)
{
	struct posix_cputimer_base *base = pct->bases;
	int i;

	for (i = 0; i < CPUCLOCK_MAX; i++, base++) {
		base->nextevt = collect_timerqueue(&base->tqhead, firing,
						   samples[i]);
	}
}

static inline void check_dl_overrun(struct task_struct *tsk)
{
	if (tsk->dl.dl_overrun) {
		tsk->dl.dl_overrun = 0;
		send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
	}
}

static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
{
	if (time < limit)
		return false;

	if (print_fatal_signals) {
		pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
			rt ? "RT" : "CPU", hard ? "hard" : "soft",
			current->comm, task_pid_nr(current));
	}
	send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID);
	return true;
}

/*
 * Check for any per-thread CPU timers that have fired and move them off
 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
 * tsk->it_*_expires values to reflect the remaining thread CPU timers.
 */
static void check_thread_timers(struct task_struct *tsk,
				struct list_head *firing)
{
	struct posix_cputimers *pct = &tsk->posix_cputimers;
	u64 samples[CPUCLOCK_MAX];
	unsigned long soft;

	if (dl_task(tsk))
		check_dl_overrun(tsk);

	if (expiry_cache_is_inactive(pct))
		return;

	task_sample_cputime(tsk, samples);
	collect_posix_cputimers(pct, samples, firing);

	/*
	 * Check for the special case thread timers.
	 */
	soft = task_rlimit(tsk, RLIMIT_RTTIME);
	if (soft != RLIM_INFINITY) {
		/* Task RT timeout is accounted in jiffies. RTTIME is usec */
		unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
		unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);

		/* At the hard limit, send SIGKILL. No further action. */
		if (hard != RLIM_INFINITY &&
		    check_rlimit(rttime, hard, SIGKILL, true, true))
			return;

		/* At the soft limit, send a SIGXCPU every second */
		if (check_rlimit(rttime, soft, SIGXCPU, true, false)) {
			soft += USEC_PER_SEC;
			tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
		}
	}

	if (expiry_cache_is_inactive(pct))
		tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
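
/*
 * Worked example of the jiffies-to-usec conversion above, with made-up
 * numbers: on a HZ=250 kernel each jiffy is USEC_PER_SEC / HZ = 4000us,
 * so an accumulated tsk->rt.timeout of 500 jiffies compares against
 * RLIMIT_RTTIME as 500 * 4000 = 2,000,000us, i.e. two seconds of runtime
 * for the realtime task.
 */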

static inline void stop_process_timers(struct signal_struct *sig)
{
	struct posix_cputimers *pct = &sig->posix_cputimers;

	/* Turn off the active flag. This is done without locking. */
	WRITE_ONCE(pct->timers_active, false);
	tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}

static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
			     u64 *expires, u64 cur_time, int signo)
{
	if (!it->expires)
		return;

	if (cur_time >= it->expires) {
		if (it->incr)
			it->expires += it->incr;
		else
			it->expires = 0;

		trace_itimer_expire(signo == SIGPROF ?
				    ITIMER_PROF : ITIMER_VIRTUAL,
				    task_tgid(tsk), cur_time);
		send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
	}

	if (it->expires && it->expires < *expires)
		*expires = it->expires;
}

/*
 * Check for any process-wide CPU timers that have fired and move them
 * off the tsk->signal timer lists onto the firing list. Per-thread
 * timers have already been taken off.
 */
static void check_process_timers(struct task_struct *tsk,
				 struct list_head *firing)
{
	struct signal_struct *const sig = tsk->signal;
	struct posix_cputimers *pct = &sig->posix_cputimers;
	u64 samples[CPUCLOCK_MAX];
	unsigned long soft;

	/*
	 * If there are no active process wide timers (POSIX 1.b, itimers,
	 * RLIMIT_CPU) nothing to check. Also skip the process wide timer
	 * processing when there is already another task handling them.
	 */
	if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
		return;

	/*
	 * Signify that a thread is checking for process timers.
	 * Write access to this field is protected by the sighand lock.
	 */
	pct->expiry_active = true;

	/*
	 * Collect the current process totals. Group accounting is active
	 * so the sample can be taken directly.
	 */
	proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples);
	collect_posix_cputimers(pct, samples, firing);

	/*
	 * Check for the special case process timers.
	 */
	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF],
			 &pct->bases[CPUCLOCK_PROF].nextevt,
			 samples[CPUCLOCK_PROF], SIGPROF);
	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT],
			 &pct->bases[CPUCLOCK_VIRT].nextevt,
			 samples[CPUCLOCK_VIRT], SIGVTALRM);

	soft = task_rlimit(tsk, RLIMIT_CPU);
	if (soft != RLIM_INFINITY) {
		/* RLIMIT_CPU is in seconds. Samples are nanoseconds */
		unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
		u64 ptime = samples[CPUCLOCK_PROF];
		u64 softns = (u64)soft * NSEC_PER_SEC;
		u64 hardns = (u64)hard * NSEC_PER_SEC;

		/* At the hard limit, send SIGKILL. No further action. */
		if (hard != RLIM_INFINITY &&
		    check_rlimit(ptime, hardns, SIGKILL, false, true))
			return;

		/* At the soft limit, send a SIGXCPU every second */
		if (check_rlimit(ptime, softns, SIGXCPU, false, false)) {
			sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1;
			softns += NSEC_PER_SEC;
		}

		/* Update the expiry cache */
		if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
			pct->bases[CPUCLOCK_PROF].nextevt = softns;
	}

	if (expiry_cache_is_inactive(pct))
		stop_process_timers(sig);

	pct->expiry_active = false;
}

/*
 * This is called from the signal code (via posixtimer_rearm)
 * when the last timer signal was delivered and we have to reload the timer.
 */
static void posix_cpu_timer_rearm(struct k_itimer *timer)
{
	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	struct task_struct *p;
	struct sighand_struct *sighand;
	unsigned long flags;
	u64 now;

	rcu_read_lock();
	p = cpu_timer_task_rcu(timer);
	if (!p)
		goto out;

	/* Protect timer list r/w in arm_timer() */
	sighand = lock_task_sighand(p, &flags);
	if (unlikely(sighand == NULL))
		goto out;

	/*
	 * Fetch the current sample and update the timer's expiry time.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock))
		now = cpu_clock_sample(clkid, p);
	else
		now = cpu_clock_sample_group(clkid, p, true);

	bump_cpu_timer(timer, now);

	/*
	 * Now re-arm for the new expiry time.
	 */
	arm_timer(timer, p);
	unlock_task_sighand(p, &flags);
out:
	rcu_read_unlock();
}

/**
 * task_cputimers_expired - Check whether posix CPU timers are expired
 *
 * @samples:	Array of current samples for the CPUCLOCK clocks
 * @pct:	Pointer to a posix_cputimers container
 *
 * Returns true if any member of @samples is greater than the corresponding
 * member of @pct->bases[CLK].nextevt. False otherwise
 */
static inline bool
task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct)
{
	int i;

	for (i = 0; i < CPUCLOCK_MAX; i++) {
		if (samples[i] >= pct->bases[i].nextevt)
			return true;
	}
	return false;
}

/**
 * fastpath_timer_check - POSIX CPU timers fast path.
 *
 * @tsk:	The task (thread) being checked.
 *
 * Check the task and thread group timers. If both are zero (there are no
 * timers set) return false. Otherwise snapshot the task and thread group
 * timers and compare them with the corresponding expiration times. Return
 * true if a timer has expired, else return false.
 */
static inline bool fastpath_timer_check(struct task_struct *tsk)
{
	struct posix_cputimers *pct = &tsk->posix_cputimers;
	struct signal_struct *sig;

	if (!expiry_cache_is_inactive(pct)) {
		u64 samples[CPUCLOCK_MAX];

		task_sample_cputime(tsk, samples);
		if (task_cputimers_expired(samples, pct))
			return true;
	}

	sig = tsk->signal;
	pct = &sig->posix_cputimers;
	/*
	 * Check if thread group timers expired when timers are active and
	 * no other thread in the group is already handling expiry for
	 * thread group cputimers. These fields are read without the
	 * sighand lock. However, this is fine because this is meant to be
	 * a fastpath heuristic to determine whether we should try to
	 * acquire the sighand lock to handle timer expiry.
	 *
	 * In the worst case scenario, if concurrently timers_active is set
	 * or expiry_active is cleared, but the current thread doesn't see
	 * the change yet, the timer checks are delayed until the next
	 * thread in the group gets a scheduler interrupt to handle the
	 * timer. This isn't an issue in practice because these types of
	 * delays with signals actually getting sent are expected.
	 */
	if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) {
		u64 samples[CPUCLOCK_MAX];

		proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic,
					   samples);

		if (task_cputimers_expired(samples, pct))
			return true;
	}

	if (dl_task(tsk) && tsk->dl.dl_overrun)
		return true;

	return false;
}

static void handle_posix_cpu_timers(struct task_struct *tsk);

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
	struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);

	mutex_lock(&cw->mutex);
	handle_posix_cpu_timers(current);
	mutex_unlock(&cw->mutex);
}

/*
 * Invoked from the posix-timer core when a cancel operation failed because
 * the timer is marked firing. The caller holds rcu_read_lock(), which
 * protects the timer and the task which is expiring it from being freed.
 */
static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
	struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);

	/* Has the handling task completed expiry already? */
	if (!tsk)
		return;

	/* Ensure that the task cannot go away */
	get_task_struct(tsk);
	/* Now drop the RCU protection so the mutex can be locked */
	rcu_read_unlock();
	/* Wait on the expiry mutex */
	mutex_lock(&tsk->posix_cputimers_work.mutex);
	/* Release it immediately again. */
	mutex_unlock(&tsk->posix_cputimers_work.mutex);
	/* Drop the task reference. */
	put_task_struct(tsk);
	/* Relock RCU so the callsite is balanced */
	rcu_read_lock();
}

static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
{
	/* Ensure that timr->it.cpu.handling task cannot go away */
	rcu_read_lock();
	spin_unlock_irq(&timr->it_lock);
	posix_cpu_timer_wait_running(timr);
	rcu_read_unlock();
	/* @timr is on stack and is valid */
	spin_lock_irq(&timr->it_lock);
}

/*
 * Clear existing posix CPU timers task work.
 */
void clear_posix_cputimers_work(struct task_struct *p)
{
	/*
	 * A copied work entry from the old task is not meaningful, clear it.
	 * N.B. init_task_work will not do this.
	 */
	memset(&p->posix_cputimers_work.work, 0,
	       sizeof(p->posix_cputimers_work.work));
	init_task_work(&p->posix_cputimers_work.work,
		       posix_cpu_timers_work);
	mutex_init(&p->posix_cputimers_work.mutex);
	p->posix_cputimers_work.scheduled = false;
}

/*
 * Initialize posix CPU timers task work in init task. Out of line to
 * keep the callback static and to avoid header recursion hell.
 */
void __init posix_cputimers_init_work(void)
{
	clear_posix_cputimers_work(current);
}

/*
 * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
 * in hard interrupt context or in task context with interrupts
 * disabled. Aside from that the writer/reader interaction is always in the
 * context of the current task, which means they are strictly per CPU.
 */
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
	return tsk->posix_cputimers_work.scheduled;
}

static inline void __run_posix_cpu_timers(struct task_struct *tsk)
{
	if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
		return;

	/* Schedule task work to actually expire the timers */
	tsk->posix_cputimers_work.scheduled = true;
	task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
}

static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
						unsigned long start)
{
	bool ret = true;

	/*
	 * On !RT kernels interrupts are disabled while collecting expired
	 * timers, so no tick can happen and the fast path check can be
	 * reenabled without further checks.
	 */
	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
		tsk->posix_cputimers_work.scheduled = false;
		return true;
	}

	/*
	 * On RT enabled kernels ticks can happen while the expired timers
	 * are collected under sighand lock. But any tick which observes
	 * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
	 * checks. So reenabling the tick work has to be done carefully:
	 *
	 * Disable interrupts and run the fast path check if jiffies have
	 * advanced since the collecting of expired timers started. If
	 * jiffies have not advanced or the fast path check did not find
	 * newly expired timers, reenable the fast path check in the timer
	 * interrupt. If there are newly expired timers, return false and
	 * let the collection loop repeat.
	 */
	local_irq_disable();
	if (start != jiffies && fastpath_timer_check(tsk))
		ret = false;
	else
		tsk->posix_cputimers_work.scheduled = false;
	local_irq_enable();

	return ret;
}
#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
static inline void __run_posix_cpu_timers(struct task_struct *tsk)
{
	lockdep_posixtimer_enter();
	handle_posix_cpu_timers(tsk);
	lockdep_posixtimer_exit();
}

static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
	cpu_relax();
}

static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
{
	spin_unlock_irq(&timr->it_lock);
	cpu_relax();
	spin_lock_irq(&timr->it_lock);
}

static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
	return false;
}

static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
						unsigned long start)
{
	return true;
}
#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */

static void handle_posix_cpu_timers(struct task_struct *tsk)
{
	struct k_itimer *timer, *next;
	unsigned long flags, start;
	LIST_HEAD(firing);

	if (!lock_task_sighand(tsk, &flags))
		return;

	do {
		/*
		 * On RT locking sighand lock does not disable interrupts,
		 * so this needs to be careful vs. ticks. Store the current
		 * jiffies value.
		 */
		start = READ_ONCE(jiffies);
		barrier();

		/*
		 * Here we take off tsk->signal->cpu_timers[N] and
		 * tsk->cpu_timers[N] all the timers that are firing, and
		 * put them on the firing list.
		 */
		check_thread_timers(tsk, &firing);

		check_process_timers(tsk, &firing);

		/*
		 * The above timer checks have updated the expiry cache and
		 * because nothing can have queued or modified timers after
		 * sighand lock was taken above it is guaranteed to be
		 * consistent. So the next timer interrupt fastpath check
		 * will find valid data.
		 *
		 * If timer expiry runs in the timer interrupt context then
		 * the loop is not relevant as timers will be directly
		 * expired in interrupt context. The stub function below
		 * returns always true which allows the compiler to
		 * optimize the loop out.
		 *
		 * If timer expiry is deferred to task work context then
		 * the following rules apply:
		 *
		 * - On !RT kernels no tick can have happened on this CPU
		 *   after sighand lock was acquired because interrupts are
		 *   disabled. So reenabling task work before dropping
		 *   sighand lock and reenabling interrupts is race free.
		 *
		 * - On RT kernels ticks might have happened but the tick
		 *   work ignored posix CPU timer handling because the
		 *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
		 *   must be done very carefully including a check whether
		 *   ticks have happened since the start of the timer
		 *   expiry checks. posix_cpu_timers_enable_work() takes
		 *   care of that and eventually lets the expiry checks
		 *   run again.
		 */
	} while (!posix_cpu_timers_enable_work(tsk, start));

	/*
	 * We must release sighand lock before taking any timer's lock.
	 * There is a potential race with timer deletion here, as the
	 * siglock now protects our private firing list. We have set
	 * the firing flag in each timer, so that a deletion attempt
	 * that gets the timer lock before we do will give it up and
	 * spin until we've taken care of that timer below.
	 */
	unlock_task_sighand(tsk, &flags);

	/*
	 * Now that all the timers on our list have the firing flag,
	 * no one will touch their list entries but us. We'll take
	 * each timer's lock before clearing its firing flag, so no
	 * timer call will interfere.
	 */
	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
		int cpu_firing;

		/*
		 * spin_lock() is sufficient here even independent of the
		 * expiry context. If expiry happens in hard interrupt
		 * context it's obvious. For task work context it's safe
		 * because all other operations on timer::it_lock happen in
		 * task context (syscall or exit).
		 */
		spin_lock(&timer->it_lock);
		list_del_init(&timer->it.cpu.elist);
		cpu_firing = timer->it.cpu.firing;
		timer->it.cpu.firing = 0;
		/*
		 * The firing flag is -1 if we collided with a reset
		 * of the timer, which already reported this
		 * almost-firing as an overrun. So don't generate an event.
		 */
		if (likely(cpu_firing >= 0))
			cpu_timer_fire(timer);
		/* See posix_cpu_timer_wait_running() */
		rcu_assign_pointer(timer->it.cpu.handling, NULL);
		spin_unlock(&timer->it_lock);
	}
}

/*
 * This is called from the timer interrupt handler. The irq handler has
 * already updated our counts. We need to check if any timers fire now.
 * Interrupts are disabled.
 */
void run_posix_cpu_timers(void)
{
	struct task_struct *tsk = current;

	lockdep_assert_irqs_disabled();

	/*
	 * If the actual expiry is deferred to task work context and the
	 * work is already scheduled there is no point to do anything here.
	 */
	if (posix_cpu_timers_work_scheduled(tsk))
		return;

	/*
	 * The fast path checks that there are no expired thread or thread
	 * group timers. If that's so, just return.
	 */
	if (!fastpath_timer_check(tsk))
		return;

	__run_posix_cpu_timers(tsk);
}

/*
 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
 * The tsk->sighand->siglock must be held by the caller.
 */
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
			   u64 *newval, u64 *oldval)
{
	u64 now, *nextevt;

	if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
		return;

	nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
	now = cpu_clock_sample_group(clkid, tsk, true);

	if (oldval) {
		/*
		 * We are setting itimer. The *oldval is absolute and we update
		 * it to be relative, *newval argument is relative and we update
		 * it to be absolute.
		 */
		if (*oldval) {
			if (*oldval <= now) {
				/* Just about to fire. */
				*oldval = TICK_NSEC;
			} else {
				*oldval -= now;
			}
		}

		if (*newval)
			*newval += now;
	}

	/*
	 * Update expiration cache if this is the earliest timer. The
	 * CPUCLOCK_PROF expiry cache is also used by RLIMIT_CPU.
	 */
	if (*newval < *nextevt)
		*nextevt = *newval;

	tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
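
/*
 * Illustrative userspace sketch (not kernel code): the classic consumer of
 * set_process_cpu_timer() is setitimer(). Arming a 100ms profiling itimer
 * ends up here with clkid == CPUCLOCK_PROF and a relative *newval that is
 * converted to an absolute expiry against the group's PROF clock:
 *
 *	struct itimerval itv = {
 *		.it_value    = { .tv_sec = 0, .tv_usec = 100 * 1000 },
 *		.it_interval = { .tv_sec = 0, .tv_usec = 100 * 1000 },
 *	};
 *
 *	setitimer(ITIMER_PROF, &itv, NULL);	// SIGPROF every 100ms of CPU
 */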

static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
			    const struct timespec64 *rqtp)
{
	struct itimerspec64 it;
	struct k_itimer timer;
	u64 expires;
	int error;

	/*
	 * Set up a temporary timer and then wait for it to go off.
	 */
	memset(&timer, 0, sizeof timer);
	spin_lock_init(&timer.it_lock);
	timer.it_clock = which_clock;
	timer.it_overrun = -1;
	error = posix_cpu_timer_create(&timer);
	timer.it_process = current;

	if (!error) {
		static struct itimerspec64 zero_it;
		struct restart_block *restart;

		memset(&it, 0, sizeof(it));
		it.it_value = *rqtp;

		spin_lock_irq(&timer.it_lock);
		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
		if (error) {
			spin_unlock_irq(&timer.it_lock);
			return error;
		}

		while (!signal_pending(current)) {
			if (!cpu_timer_getexpires(&timer.it.cpu)) {
				/*
				 * Our timer fired and was reset, below
				 * deletion can not fail.
				 */
				posix_cpu_timer_del(&timer);
				spin_unlock_irq(&timer.it_lock);
				return 0;
			}

			/*
			 * Block until cpu_timer_fire (or a signal) wakes us.
			 */
			__set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&timer.it_lock);
			schedule();
			spin_lock_irq(&timer.it_lock);
		}

		/*
		 * We were interrupted by a signal.
		 */
		expires = cpu_timer_getexpires(&timer.it.cpu);
		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
		if (!error) {
			/* Timer is now unarmed, deletion can not fail. */
			posix_cpu_timer_del(&timer);
		} else {
			while (error == TIMER_RETRY) {
				posix_cpu_timer_wait_running_nsleep(&timer);
				error = posix_cpu_timer_del(&timer);
			}
		}

		spin_unlock_irq(&timer.it_lock);

		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
			/*
			 * It actually did fire already.
			 */
			return 0;
		}

		error = -ERESTART_RESTARTBLOCK;
		/*
		 * Report back to the user the time still remaining.
		 */
		restart = &current->restart_block;
		restart->nanosleep.expires = expires;
		if (restart->nanosleep.type != TT_NONE)
			error = nanosleep_copyout(restart, &it.it_value);
	}

	return error;
}

static long posix_cpu_nsleep_restart(struct restart_block *restart_block);

static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
			    const struct timespec64 *rqtp)
{
	struct restart_block *restart_block = &current->restart_block;
	int error;

	/*
	 * Diagnose required errors first.
	 */
	if (CPUCLOCK_PERTHREAD(which_clock) &&
	    (CPUCLOCK_PID(which_clock) == 0 ||
	     CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
		return -EINVAL;

	error = do_cpu_nanosleep(which_clock, flags, rqtp);

	if (error == -ERESTART_RESTARTBLOCK) {

		if (flags & TIMER_ABSTIME)
			return -ERESTARTNOHAND;

		restart_block->nanosleep.clockid = which_clock;
		set_restart_fn(restart_block, posix_cpu_nsleep_restart);
	}
	return error;
}

static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
	clockid_t which_clock = restart_block->nanosleep.clockid;
	struct timespec64 t;

	t = ns_to_timespec64(restart_block->nanosleep.expires);

	return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
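
/*
 * Illustrative userspace sketch (not kernel code) of the nsleep path above:
 * sleeping until the process has consumed one more second of CPU time. The
 * relative interval is converted to an absolute expiry by
 * posix_cpu_timer_set(), and an interrupted sleep restarts with
 * TIMER_ABSTIME via posix_cpu_nsleep_restart():
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	// Returns 0 once 1s of additional process CPU time has elapsed.
 *	clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &ts, NULL);
 */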

#define PROCESS_CLOCK	make_process_cpuclock(0, CPUCLOCK_SCHED)
#define THREAD_CLOCK	make_thread_cpuclock(0, CPUCLOCK_SCHED)

static int process_cpu_clock_getres(const clockid_t which_clock,
				    struct timespec64 *tp)
{
	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
}
static int process_cpu_clock_get(const clockid_t which_clock,
				 struct timespec64 *tp)
{
	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
}
static int process_cpu_timer_create(struct k_itimer *timer)
{
	timer->it_clock = PROCESS_CLOCK;
	return posix_cpu_timer_create(timer);
}
static int process_cpu_nsleep(const clockid_t which_clock, int flags,
			      const struct timespec64 *rqtp)
{
	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
}
static int thread_cpu_clock_getres(const clockid_t which_clock,
				   struct timespec64 *tp)
{
	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
}
static int thread_cpu_clock_get(const clockid_t which_clock,
				struct timespec64 *tp)
{
	return posix_cpu_clock_get(THREAD_CLOCK, tp);
}
static int thread_cpu_timer_create(struct k_itimer *timer)
{
	timer->it_clock = THREAD_CLOCK;
	return posix_cpu_timer_create(timer);
}

const struct k_clock clock_posix_cpu = {
	.clock_getres		= posix_cpu_clock_getres,
	.clock_set		= posix_cpu_clock_set,
	.clock_get_timespec	= posix_cpu_clock_get,
	.timer_create		= posix_cpu_timer_create,
	.nsleep			= posix_cpu_nsleep,
	.timer_set		= posix_cpu_timer_set,
	.timer_del		= posix_cpu_timer_del,
	.timer_get		= posix_cpu_timer_get,
	.timer_rearm		= posix_cpu_timer_rearm,
	.timer_wait_running	= posix_cpu_timer_wait_running,
};

const struct k_clock clock_process = {
	.clock_getres		= process_cpu_clock_getres,
	.clock_get_timespec	= process_cpu_clock_get,
	.timer_create		= process_cpu_timer_create,
	.nsleep			= process_cpu_nsleep,
};

const struct k_clock clock_thread = {
	.clock_getres		= thread_cpu_clock_getres,
	.clock_get_timespec	= thread_cpu_clock_get,
	.timer_create		= thread_cpu_timer_create,
};