// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
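
/*
 * Illustrative sketch (not part of the kernel API; my_ss_charge and the
 * my_pcpu_stat field are hypothetical): a controller that bumps a per-cpu
 * counter would record the update so that a later flush can find this
 * cgroup, mirroring cgroup_base_stat_cputime_account_begin/end() below.
 *
 *	static void my_ss_charge(struct cgroup *cgrp, u64 amount)
 *	{
 *		struct my_pcpu_stat *ps = get_cpu_ptr(cgrp->my_pcpu_stat);
 *
 *		ps->counter += amount;
 *		cgroup_rstat_updated(cgrp, smp_processor_id());
 *		put_cpu_ptr(ps);
 *	}
 */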

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
 * the traversal and %NULL return indicates the end. During traversal,
 * each returned cgroup is unlinked from the tree. Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it. We
	 * can pick whatever unvisited node as the starting point.
	 */
	if (!pos) {
		pos = root;
		/* return NULL if this subtree is not on-list */
		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
			return NULL;
	} else {
		pos = cgroup_parent(pos);
	}

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != pos) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;
	return pos;
}
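
/*
 * Worked example (hypothetical hierarchy): with root A, its child B and
 * B's child C all on the updated tree for a cpu, successive calls
 * starting from @pos == NULL with @root == A return C, then B, then A,
 * then NULL - each child before its parent, and each cgroup unlinked as
 * it is returned.
 */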

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * __diag_* below are needed to dismiss the missing prototype warning.
 */
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
		  "kfuncs which will be used in BPF programs");

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__diag_pop();
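
/*
 * Sketch of the bpf side (an illustration, not code in this file; the
 * program name is hypothetical). A tracing prog can attach to the hook
 * above to flush its own per-cpu stats:
 *
 *	SEC("fentry/bpf_rstat_flush")
 *	int BPF_PROG(my_flusher, struct cgroup *cgrp, struct cgroup *parent,
 *		     int cpu)
 *	{
 *		// fold this prog's per-cpu counters for @cgrp into @parent
 *		return 0;
 *	}
 *
 * On the collection side such a prog calls the cgroup_rstat_updated()
 * kfunc after recording a sample, and readers call the sleepable
 * cgroup_rstat_flush() kfunc before reporting.
 */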

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;
		unsigned long flags;

		/*
		 * The _irqsave() is needed because cgroup_rstat_lock is
		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
		 * this lock with the _irq() suffix only disables interrupts on
		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
		 * interrupts on both configurations. The _irqsave() ensures
		 * that interrupts are always disabled and later restored.
		 */
		raw_spin_lock_irqsave(cpu_lock, flags);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock_irqrestore(cpu_lock, flags);

		/* play nice and yield if necessary */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}
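
/*
 * Typical use of the hold/release pair (sketch; this mirrors what
 * cgroup_base_stat_cputime_show() below actually does):
 *
 *	cgroup_rstat_flush_hold(cgrp);
 *	usage = cgrp->bstat.cputime.sum_exec_runtime;
 *	cgroup_rstat_flush_release();
 *
 * Holding the lock keeps the flushed counters stable while they're read.
 */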

int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}
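
/*
 * Worked example with made-up numbers: if the per-cpu snapshot has
 * cputime.utime == 100 and rstatc->last_bstat.cputime.utime == 60, the
 * delta of 40 is added to cgrp->bstat, to rstatc->last_bstat (now 100,
 * so the next flush only sees new activity) and to rstatc->subtree_bstat
 * for the per-cpu rollup toward the parent.
 */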

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;
		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;
	struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (cgroup_parent(cgrp)) {
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	} else {
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	}

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);