// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
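
/*
 * Illustrative sketch (not part of the kernel API; my_ss_charge and the
 * my_pcpu_stat field are hypothetical): a controller that bumps a per-cpu
 * counter would record the update so that a later flush can find this
 * cgroup, mirroring cgroup_base_stat_cputime_account_begin/end() below.
 *
 *	static void my_ss_charge(struct cgroup *cgrp, u64 amount)
 *	{
 *		struct my_pcpu_stat *ps = get_cpu_ptr(cgrp->my_pcpu_stat);
 *
 *		ps->counter += amount;
 *		cgroup_rstat_updated(cgrp, smp_processor_id());
 *		put_cpu_ptr(ps);
 *	}
 */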

/**
 * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
 * @pos: current position
 * @root: root of the tree to traverse
 * @cpu: target cpu
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
 * the traversal and %NULL return indicates the end. During traversal,
 * each returned cgroup is unlinked from the tree. Must be called with the
 * matching cgroup_rstat_cpu_lock held.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, if a child is visited, its parent is
 * guaranteed to be visited afterwards.
 */
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
						   struct cgroup *root, int cpu)
{
	struct cgroup_rstat_cpu *rstatc;
	struct cgroup *parent;

	if (pos == root)
		return NULL;

	/*
	 * We're gonna walk down to the first leaf and visit/remove it. We
	 * can pick whatever unvisited node as the starting point.
	 */
	if (!pos) {
		pos = root;
		/* return NULL if this subtree is not on-list */
		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
			return NULL;
	} else {
		pos = cgroup_parent(pos);
	}

	/* walk down to the first leaf */
	while (true) {
		rstatc = cgroup_rstat_cpu(pos, cpu);
		if (rstatc->updated_children == pos)
			break;
		pos = rstatc->updated_children;
	}

	/*
	 * Unlink @pos from the tree. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 * However, due to the way we traverse, @pos will be the first
	 * child in most cases. The only exception is @root.
	 */
	parent = cgroup_parent(pos);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != pos) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;
	return pos;
}
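
/*
 * Worked example (hypothetical hierarchy): with root A, its child B and
 * B's child C all on the updated tree for a cpu, successive calls
 * starting from @pos == NULL with @root == A return C, then B, then A,
 * then NULL - each child before its parent, and each cgroup unlinked as
 * it is returned.
 */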

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * __diag_* below are needed to dismiss the missing prototype warning.
 */
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
		  "kfuncs which will be used in BPF programs");

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__diag_pop();
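
/*
 * Sketch of the bpf side (an illustration, not code in this file; the
 * program name is hypothetical). A tracing prog can attach to the hook
 * above to flush its own per-cpu stats:
 *
 *	SEC("fentry/bpf_rstat_flush")
 *	int BPF_PROG(my_flusher, struct cgroup *cgrp, struct cgroup *parent,
 *		     int cpu)
 *	{
 *		// fold this prog's per-cpu counters for @cgrp into @parent
 *		return 0;
 *	}
 *
 * On the collection side such a prog calls the cgroup_rstat_updated()
 * kfunc after recording a sample, and readers call the sleepable
 * cgroup_rstat_flush() kfunc before reporting.
 */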

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
						       cpu);
		struct cgroup *pos = NULL;
		unsigned long flags;

		/*
		 * The _irqsave() is needed because cgroup_rstat_lock is
		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
		 * this lock with the _irq() suffix only disables interrupts on
		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
		 * interrupts on both configurations. The _irqsave() ensures
		 * that interrupts are always disabled and later restored.
		 */
		raw_spin_lock_irqsave(cpu_lock, flags);
		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}
		raw_spin_unlock_irqrestore(cpu_lock, flags);

		/* play nice and yield if necessary */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}
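
/*
 * Typical use of the hold/release pair (sketch; this mirrors what
 * cgroup_base_stat_cputime_show() below actually does):
 *
 *	cgroup_rstat_flush_hold(cgrp);
 *	usage = cgrp->bstat.cputime.sum_exec_runtime;
 *	cgroup_rstat_flush_release();
 *
 * Holding the lock keeps the flushed counters stable while they're read.
 */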

int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}
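
/*
 * Worked example with made-up numbers: if the per-cpu snapshot has
 * cputime.utime == 100 and rstatc->last_bstat.cputime.utime == 60, the
 * delta of 40 is added to cgrp->bstat, to rstatc->last_bstat (now 100,
 * so the next flush only sees new activity) and to rstatc->subtree_bstat
 * for the per-cpu rollup toward the parent.
 */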

static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;
		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;
	struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (cgroup_parent(cgrp)) {
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	} else {
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	}

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_SET8_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_SET8_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);