// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/sort.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
#include <linux/fs_parser.h>

#include <trace/events/cgroup.h>

/*
 * pidlists linger for the following amount of time before being destroyed.
 * The goal is to avoid frequent destruction in the middle of consecutive
 * read calls.  Expiring in the middle is a performance problem, not a
 * correctness one.  1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;

/* disable named v1 mounts */
static bool cgroup_no_v1_named;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/* protects cgroup_subsys->release_agent_path */
static DEFINE_SPINLOCK(release_agent_path_lock);

bool cgroup1_ssid_disabled(int ssid)
{
	return cgroup_no_v1_mask & (1 << ssid);
}
48
49/**
50 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
51 * @from: attach to all cgroups of a given task
52 * @tsk: the task to be attached
53 *
54 * Return: %0 on success or a negative errno code on failure
55 */
56int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
57{
58 struct cgroup_root *root;
59 int retval = 0;
60
61 cgroup_lock();
62 cgroup_attach_lock(lock_threadgroup: true);
63 for_each_root(root) {
64 struct cgroup *from_cgrp;
65
66 spin_lock_irq(lock: &css_set_lock);
67 from_cgrp = task_cgroup_from_root(task: from, root);
68 spin_unlock_irq(lock: &css_set_lock);
69
70 retval = cgroup_attach_task(dst_cgrp: from_cgrp, leader: tsk, threadgroup: false);
71 if (retval)
72 break;
73 }
74 cgroup_attach_unlock(lock_threadgroup: true);
75 cgroup_unlock();
76
77 return retval;
78}
79EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
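
/*
 * Illustrative use (an assumption, not code from this file): a driver that
 * spawns a helper kthread on behalf of a user task can mirror that task's
 * cgroup memberships with:
 *
 *	err = cgroup_attach_task_all(current, helper_task);
 */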

/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that, if a task is forking while being migrated, the new child
 * ends up either visible in the source cgroup after the parent's migration
 * is complete or put into the target cgroup.  No task can slip out of
 * migration through forking.
 *
 * Return: %0 on success or a negative errno code on failure
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	if (cgroup_on_dfl(to))
		return -EINVAL;

	ret = cgroup_migrate_vet_dst(to);
	if (ret)
		return ret;

	cgroup_lock();

	cgroup_attach_lock(true);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &mgctx);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
		css_task_iter_start(&from->self, 0, &it);

		do {
			task = css_task_iter_next(&it);
		} while (task && (task->flags & PF_EXITING));

		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			ret = cgroup_migrate(task, false, &mgctx);
			if (!ret)
				TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
	cgroup_migrate_finish(&mgctx);
	cgroup_attach_unlock(true);
	cgroup_unlock();
	return ret;
}
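
/*
 * Illustrative caller (an assumption about usage elsewhere in the tree):
 * cpuset uses this helper to evacuate tasks from a v1 cpuset that has lost
 * all of its CPUs or memory nodes, roughly:
 *
 *	cgroup_transfer_tasks(parent_cs->css.cgroup, cs->css.cgroup);
 */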

/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks.  So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks").  We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted.  doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
};

/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		kvfree(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}

/*
 * pidlist_uniq - given a sorted pid list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so src starts at 1.  trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src-1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}
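
/*
 * Example: a sorted input { 3, 3, 5, 7, 7, 7 } of length 6 is compacted in
 * place so that its first three slots read { 3, 5, 7 }, and pidlist_uniq()
 * returns 3.
 */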

/*
 * The two pid files - tasks and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs a differently sorted list,
 * making it impossible to use, for example, a single rbtree of member tasks
 * sorted by task pointer.  As pidlists can be fairly large, allocating one
 * per open file is dangerous, so cgroup had to implement a shared pool of
 * pidlists keyed by cgroup and namespace.
 */
static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

/*
 * find the appropriate pidlist for our purpose (given procs vs tasks)
 * returns with the lock on that pidlist already held, and takes care
 * of the use count, or returns NULL with no locks held if we're out of
 * memory.
 */
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

	/* entry not found; create a new one */
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l)
		return l;

	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
	l->key.type = type;
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}

/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct css_task_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	css_task_iter_start(&cgrp->self, 0, &it);
	while ((tsk = css_task_iter_next(&it))) {
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	css_task_iter_end(&it);
	length = n;
	/* now sort & strip out duplicates (tgids or recycled thread PIDs) */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	length = pidlist_uniq(array, length);

	l = cgroup_pidlist_find_create(cgrp, type);
	if (!l) {
		kvfree(array);
		return -ENOMEM;
	}

	/* store array, freeing old if necessary */
	kvfree(l->list);
	l->list = array;
	l->length = length;
	*lp = l;
	return 0;
}

/*
 * seq_file methods for the tasks/procs files.  The seq_file position is the
 * next pid to display; the seq_file iterator is a pointer to the pid
 * in the cgroup->l->list array.
 */

static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start).  Use a binary search to find the
	 * next pid to display, if any.
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup_file_ctx *ctx = of->priv;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @ctx->procs1.pidlist indicates that this isn't the first
	 * start() after open.  If the matching pidlist is around, we can use
	 * that.  Look for it.  Note that @ctx->procs1.pidlist can't be used
	 * directly.  It could already have been destroyed.
	 */
	if (ctx->procs1.pidlist)
		ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed in between.  Create a new one.
	 */
	if (!ctx->procs1.pidlist) {
		ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
		if (ret)
			return ERR_PTR(ret);
	}
	l = ctx->procs1.pidlist;

	if (pid) {
		int end = l->length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] < pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}
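
/*
 * Illustrative seq_file flow for one read(2) of "tasks" (a sketch of the
 * contract, not extra behavior): start() takes pidlist_mutex and returns a
 * pointer to the first pid >= *pos, show() prints that pid, next() steps
 * through the array, and stop() drops the mutex and arms the delayed
 * destruction of the shared pidlist.
 */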

static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_file_ctx *ctx = of->priv;
	struct cgroup_pidlist *l = ctx->procs1.pidlist;

	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				 CGROUP_PIDLIST_DESTROY_DELAY);
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}

static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_file_ctx *ctx = of->priv;
	struct cgroup_pidlist *l = ctx->procs1.pidlist;
	pid_t *p = v;
	pid_t *end = l->list + l->length;
	/*
	 * Advance to the next pid in the array.  If this goes off the
	 * end, we're done.
	 */
	p++;
	if (p >= end) {
		(*pos)++;
		return NULL;
	} else {
		*pos = *p;
		return p;
	}
}

static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", *(int *)v);

	return 0;
}

static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
				     char *buf, size_t nbytes, loff_t off,
				     bool threadgroup)
{
	struct cgroup *cgrp;
	struct task_struct *task;
	const struct cred *cred, *tcred;
	ssize_t ret;
	bool locked;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	task = cgroup_procs_write_start(buf, threadgroup, &locked);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

	/*
	 * Even if we're attaching all tasks in the thread group, we only need
	 * to check permissions on one of them.  Check permissions using the
	 * credentials from file open to protect against inherited fd attacks.
	 */
	cred = of->file->f_cred;
	tcred = get_task_cred(task);
	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
	    !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->euid, tcred->suid))
		ret = -EACCES;
	put_cred(tcred);
	if (ret)
		goto out_finish;

	ret = cgroup_attach_task(cgrp, task, threadgroup);

out_finish:
	cgroup_procs_write_finish(task, locked);
out_unlock:
	cgroup_kn_unlock(of->kn);

	return ret ?: nbytes;
}
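
/*
 * Userspace view (standard cgroup v1 usage, paths illustrative):
 *
 *	echo $$   > /sys/fs/cgroup/cpu/mygrp/cgroup.procs  # whole thread group
 *	echo $TID > /sys/fs/cgroup/cpu/mygrp/tasks         # single thread
 */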

static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, true);
}

static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, false);
}

static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;
	struct cgroup_file_ctx *ctx;

	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	/*
	 * Release agent gets called with all capabilities,
	 * require capabilities to set release agent.
	 */
	ctx = of->priv;
	if ((ctx->ns->user_ns != &init_user_ns) ||
	    !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;
	spin_lock(&release_agent_path_lock);
	strscpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	cgroup_kn_unlock(of->kn);
	return nbytes;
}
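
/*
 * Example (the helper path is hypothetical):
 *
 *	echo /sbin/cgroup_cleanup > /sys/fs/cgroup/memory/release_agent
 *
 * installs the binary that cgroup1_release_agent() below will execute.
 */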

static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	spin_lock(&release_agent_path_lock);
	seq_puts(seq, cgrp->root->release_agent_path);
	spin_unlock(&release_agent_path_lock);
	seq_putc(seq, '\n');
	return 0;
}

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return notify_on_release(css->cgroup);
}

static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	return 0;
}

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}

static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	return 0;
}

/* cgroup core interface files for the legacy hierarchies */
struct cftype cgroup1_base_files[] = {
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup1_procs_write,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
	{
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write = cgroup1_tasks_write,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write = cgroup_release_agent_write,
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};

/* Display information about each subsystem and each hierarchy */
int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * Grab the subsystems state racily.  No need to add avenue to
	 * cgroup_mutex contention.
	 */

	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->legacy_name, ss->root->hierarchy_id,
			   atomic_read(&ss->root->nr_cgrps),
			   cgroup_ssid_enabled(i));

	return 0;
}
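
/*
 * Example /proc/cgroups output produced by the format above (values vary
 * by kernel configuration):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset	0	1	1
 *	cpu	0	1	1
 */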

/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 *
 * Return: %0 on success or a negative errno code on failure
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup *cgrp;
	struct css_task_iter it;
	struct task_struct *tsk;

	/* the kernfs_node should belong to cgroupfs and be a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

	/*
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
	 */
	rcu_read_lock();
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (!cgrp || !cgroup_tryget(cgrp)) {
		rcu_read_unlock();
		return -ENOENT;
	}
	rcu_read_unlock();

	css_task_iter_start(&cgrp->self, 0, &it);
	while ((tsk = css_task_iter_next(&it))) {
		switch (READ_ONCE(tsk->__state)) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (tsk->in_iowait)
				stats->nr_io_wait++;
			break;
		}
	}
	css_task_iter_end(&it);

	cgroup_put(cgrp);
	return 0;
}

void cgroup1_check_for_release(struct cgroup *cgrp)
{
	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
		schedule_work(&cgrp->release_agent_work);
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
void cgroup1_release_agent(struct work_struct *work)
{
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
	char *pathbuf, *agentbuf;
	char *argv[3], *envp[3];
	int ret;

	/* snoop agent path and exit early if empty */
	if (!cgrp->root->release_agent_path[0])
		return;

	/* prepare argument buffers */
	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!pathbuf || !agentbuf)
		goto out_free;

	spin_lock(&release_agent_path_lock);
	strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
	spin_unlock(&release_agent_path_lock);
	if (!agentbuf[0])
		goto out_free;

	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
	if (ret < 0 || ret >= PATH_MAX)
		goto out_free;

	argv[0] = agentbuf;
	argv[1] = pathbuf;
	argv[2] = NULL;

	/* minimal command environment */
	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
out_free:
	kfree(agentbuf);
	kfree(pathbuf);
}
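
/*
 * Illustrative invocation: with release_agent_path set to "/sbin/agent"
 * (a hypothetical helper) and a cgroup at ".../mygrp/child" emptying out,
 * the kernel effectively runs "/sbin/agent /mygrp/child" with the minimal
 * environment built above.
 */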

/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			  const char *new_name_str)
{
	struct cgroup *cgrp = kn->priv;
	int ret;

	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
	if (strchr(new_name_str, '\n'))
		return -EINVAL;

	if (kernfs_type(kn) != KERNFS_DIR)
		return -ENOTDIR;
	if (kn->parent != new_parent)
		return -EIO;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  kernfs_rename() doesn't require active_ref
	 * protection.  Break them before grabbing cgroup_mutex.
	 */
	kernfs_break_active_protection(new_parent);
	kernfs_break_active_protection(kn);

	cgroup_lock();

	ret = kernfs_rename(kn, new_parent, new_name_str);
	if (!ret)
		TRACE_CGROUP_PATH(rename, cgrp);

	cgroup_unlock();

	kernfs_unbreak_active_protection(kn);
	kernfs_unbreak_active_protection(new_parent);
	return ret;
}

static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_subsys *ss;
	int ssid;

	for_each_subsys(ss, ssid)
		if (root->subsys_mask & (1 << ssid))
			seq_show_option(seq, ss->legacy_name, NULL);
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");
	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
		seq_puts(seq, ",cpuset_v2_mode");
	if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
		seq_puts(seq, ",favordynmods");

	spin_lock(&release_agent_path_lock);
	if (strlen(root->release_agent_path))
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
	spin_unlock(&release_agent_path_lock);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_show_option(seq, "name", root->name);
	return 0;
}

enum cgroup1_param {
	Opt_all,
	Opt_clone_children,
	Opt_cpuset_v2_mode,
	Opt_name,
	Opt_none,
	Opt_noprefix,
	Opt_release_agent,
	Opt_xattr,
	Opt_favordynmods,
	Opt_nofavordynmods,
};

const struct fs_parameter_spec cgroup1_fs_parameters[] = {
	fsparam_flag  ("all",		Opt_all),
	fsparam_flag  ("clone_children", Opt_clone_children),
	fsparam_flag  ("cpuset_v2_mode", Opt_cpuset_v2_mode),
	fsparam_string("name",		Opt_name),
	fsparam_flag  ("none",		Opt_none),
	fsparam_flag  ("noprefix",	Opt_noprefix),
	fsparam_string("release_agent",	Opt_release_agent),
	fsparam_flag  ("xattr",		Opt_xattr),
	fsparam_flag  ("favordynmods",	Opt_favordynmods),
	fsparam_flag  ("nofavordynmods", Opt_nofavordynmods),
	{}
};
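
/*
 * Example mounts these parameters accept (illustrative, per the cgroup-v1
 * documentation):
 *
 *	mount -t cgroup -o cpuset,noprefix cpuset /sys/fs/cgroup/cpuset
 *	mount -t cgroup -o none,name=systemd none /sys/fs/cgroup/systemd
 */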

int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct cgroup_subsys *ss;
	struct fs_parse_result result;
	int opt, i;

	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
	if (opt == -ENOPARAM) {
		int ret;

		ret = vfs_parse_fs_param_source(fc, param);
		if (ret != -ENOPARAM)
			return ret;
		for_each_subsys(ss, i) {
			if (strcmp(param->key, ss->legacy_name))
				continue;
			if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
				return invalfc(fc, "Disabled controller '%s'",
					       param->key);
			ctx->subsys_mask |= (1 << i);
			return 0;
		}
		return invalfc(fc, "Unknown subsys name '%s'", param->key);
	}
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_none:
		/* Explicitly have no subsystems */
		ctx->none = true;
		break;
	case Opt_all:
		ctx->all_ss = true;
		break;
	case Opt_noprefix:
		ctx->flags |= CGRP_ROOT_NOPREFIX;
		break;
	case Opt_clone_children:
		ctx->cpuset_clone_children = true;
		break;
	case Opt_cpuset_v2_mode:
		ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
		break;
	case Opt_xattr:
		ctx->flags |= CGRP_ROOT_XATTR;
		break;
	case Opt_favordynmods:
		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
		break;
	case Opt_nofavordynmods:
		ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
		break;
	case Opt_release_agent:
		/* Specifying two release agents is forbidden */
		if (ctx->release_agent)
			return invalfc(fc, "release_agent respecified");
		/*
		 * Release agent gets called with all capabilities,
		 * require capabilities to set release agent.
		 */
		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
			return invalfc(fc, "Setting release_agent not allowed");
		ctx->release_agent = param->string;
		param->string = NULL;
		break;
	case Opt_name:
		/* blocked by boot param? */
		if (cgroup_no_v1_named)
			return -ENOENT;
		/* Can't specify an empty name */
		if (!param->size)
			return invalfc(fc, "Empty name");
		if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
			return invalfc(fc, "Name too long");
		/* Must match [\w.-]+ */
		for (i = 0; i < param->size; i++) {
			char c = param->string[i];
			if (isalnum(c))
				continue;
			if ((c == '.') || (c == '-') || (c == '_'))
				continue;
			return invalfc(fc, "Invalid name");
		}
		/* Specifying two names is forbidden */
		if (ctx->name)
			return invalfc(fc, "name respecified");
		ctx->name = param->string;
		param->string = NULL;
		break;
	}
	return 0;
}

static int check_cgroupfs_options(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	u16 mask = U16_MAX;
	u16 enabled = 0;
	struct cgroup_subsys *ss;
	int i;

#ifdef CONFIG_CPUSETS
	mask = ~((u16)1 << cpuset_cgrp_id);
#endif
	for_each_subsys(ss, i)
		if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
			enabled |= 1 << i;

	ctx->subsys_mask &= enabled;

	/*
	 * In absence of 'none', 'name=' and subsystem name options,
	 * let's default to 'all'.
	 */
	if (!ctx->subsys_mask && !ctx->none && !ctx->name)
		ctx->all_ss = true;

	if (ctx->all_ss) {
		/* Mutually exclusive option 'all' + subsystem name */
		if (ctx->subsys_mask)
			return invalfc(fc, "subsys name conflicts with all");
		/* 'all' => select all the subsystems */
		ctx->subsys_mask = enabled;
	}

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
	if (!ctx->subsys_mask && !ctx->name)
		return invalfc(fc, "Need name or subsystem set");

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
	if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
		return invalfc(fc, "noprefix used incorrectly");

	/* Can't specify "none" and some subsystems */
	if (ctx->subsys_mask && ctx->none)
		return invalfc(fc, "none used incorrectly");

	return 0;
}

int cgroup1_reconfigure(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	int ret = 0;
	u16 added_mask, removed_mask;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* See what subsystems are wanted */
	ret = check_cgroupfs_options(fc);
	if (ret)
		goto out_unlock;

	if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			task_tgid_nr(current), current->comm);

	added_mask = ctx->subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~ctx->subsys_mask;

	/* Don't allow flags or name to change at remount */
	if ((ctx->flags ^ root->flags) ||
	    (ctx->name && strcmp(ctx->name, root->name))) {
		errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
			ctx->flags, ctx->name ?: "", root->flags, root->name);
		ret = -EINVAL;
		goto out_unlock;
	}

	/* remounting is not allowed for populated hierarchies */
	if (!list_empty(&root->cgrp.self.children)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, added_mask);
	if (ret)
		goto out_unlock;

	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));

	if (ctx->release_agent) {
		spin_lock(&release_agent_path_lock);
		strcpy(root->release_agent_path, ctx->release_agent);
		spin_unlock(&release_agent_path_lock);
	}

	trace_cgroup_remount(root);

 out_unlock:
	cgroup_unlock();
	return ret;
}

struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
	.rename			= cgroup1_rename,
	.show_options		= cgroup1_show_options,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};

/*
 * The guts of cgroup1 mount - find or create cgroup_root to use.
 * Called with cgroup_mutex held; returns 0 on success, -E... on
 * error and a positive value when the candidate root is busy dying.
 * On success it stashes a reference to cgroup_root into given
 * cgroup_fs_context; that reference is *NOT* counting towards the
 * cgroup_root refcount.
 */
static int cgroup1_root_to_use(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct cgroup_root *root;
	struct cgroup_subsys *ss;
	int i, ret;

	/* First find the desired set of subsystems */
	ret = check_cgroupfs_options(fc);
	if (ret)
		return ret;

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveliness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(ctx->subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
			return 1;	/* restart */
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but subsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (ctx->name) {
			if (strcmp(ctx->name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((ctx->subsys_mask || ctx->none) &&
		    (ctx->subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			return -EBUSY;
		}

		if (root->flags ^ ctx->flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		ctx->root = root;
		return 0;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create a new one without subsys specification.
	 */
	if (!ctx->subsys_mask && !ctx->none)
		return invalfc(fc, "No subsys list or none specified");

	/* Hierarchies may only be created in the initial cgroup namespace. */
	if (ctx->ns != &init_cgroup_ns)
		return -EPERM;

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return -ENOMEM;

	ctx->root = root;
	init_cgroup_root(ctx);

	ret = cgroup_setup_root(root, ctx->subsys_mask);
	if (!ret)
		cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
	else
		cgroup_free_root(root);

	return ret;
}

int cgroup1_get_tree(struct fs_context *fc)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	int ret;

	/* Check if the caller has permission to mount. */
	if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	ret = cgroup1_root_to_use(fc);
	if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
		ret = 1;	/* restart */

	cgroup_unlock();

	if (!ret)
		ret = cgroup_do_get_tree(fc);

	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
		fc_drop_locked(fc);
		ret = 1;
	}

	if (unlikely(ret > 0)) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}

static int __init cgroup1_wq_init(void)
{
	/*
	 * Used to destroy pidlists.  Kept separate from other workqueues so
	 * that it can serve as the flush domain for
	 * cgroup1_pidlist_destroy_all().  Cap @max_active at 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
						    0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);
	return 0;
}
core_initcall(cgroup1_wq_init);

static int __init cgroup_no_v1(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		if (!strcmp(token, "all")) {
			cgroup_no_v1_mask = U16_MAX;
			continue;
		}

		if (!strcmp(token, "named")) {
			cgroup_no_v1_named = true;
			continue;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;

			cgroup_no_v1_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
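
/*
 * Boot parameter examples (see kernel-parameters.txt):
 * "cgroup_no_v1=memory,cpu" blocks those controllers on v1 hierarchies,
 * "cgroup_no_v1=all" blocks every controller, and "cgroup_no_v1=named"
 * disables named (name=) v1 mounts.
 */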
1309
