commoncap.c source code [linux/security/commoncap.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/* Common capabilities, needed by capability.o.
3	 */
4
5	#include <linux/capability.h>
6	#include <linux/audit.h>
7	#include <linux/init.h>
8	#include <linux/kernel.h>
9	#include <linux/lsm_hooks.h>
10	#include <linux/file.h>
11	#include <linux/mm.h>
12	#include <linux/mman.h>
13	#include <linux/pagemap.h>
14	#include <linux/swap.h>
15	#include <linux/skbuff.h>
16	#include <linux/netlink.h>
17	#include <linux/ptrace.h>
18	#include <linux/xattr.h>
19	#include <linux/hugetlb.h>
20	#include <linux/mount.h>
21	#include <linux/sched.h>
22	#include <linux/prctl.h>
23	#include <linux/securebits.h>
24	#include <linux/user_namespace.h>
25	#include <linux/binfmts.h>
26	#include <linux/personality.h>
27	#include <linux/mnt_idmapping.h>
28
29	/*
30	* If a non-root user executes a setuid-root binary in
31	* !secure(SECURE_NOROOT) mode, then we raise capabilities.
32	* However if fE is also set, then the intent is for only
33	* the file capabilities to be applied, and the setuid-root
34	* bit is left on either to change the uid (plausible) or
35	* to get full privilege on a kernel without file capabilities
36	* support. So in that case we do not raise capabilities.
37	*
38	* Warn if that happens, once per boot.
39	*/
40	static void warn_setuid_and_fcaps_mixed(const char *fname)
41	{
42	static int warned;
43	if (!warned) {
44	printk(KERN_INFO "warning: `%s' has both setuid-root and"
45	" effective capabilities. Therefore not raising all"
46	" capabilities.\n", fname);
47	warned = `1`;
48	}
49	}
50
51	/**
52	* cap_capable - Determine whether a task has a particular effective capability
53	* @cred: The credentials to use
54	* @targ_ns: The user namespace in which we need the capability
55	* @cap: The capability to check for
56	* @opts: Bitmask of options defined in include/linux/security.h
57	*
58	* Determine whether the nominated task has the specified capability amongst
59	* its effective set, returning 0 if it does, -ve if it does not.
60	*
61	* NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
62	* and has_capability() functions. That is, it has the reverse semantics:
63	* cap_has_capability() returns 0 when a task has a capability, but the
64	* kernel's capable() and has_capability() returns 1 for this case.
65	*/
66	int cap_capable(const struct cred cred, struct* user_namespace *targ_ns,
67	int cap, unsigned int opts)
68	{
69	struct user_namespace *ns = targ_ns;
70
71	/ See if cred has the capability in the target user namespace*
72	* by examining the target user namespace and all of the target
73	* user namespace's parents.
74	*/
75	for (;;) {
76	/ Do we have the necessary capabilities? /
77	if (ns == cred->user_ns)
78	return cap_raised(cred->cap_effective, cap) ? `0` : -EPERM;
79
80	/*
81	* If we're already at a lower level than we're looking for,
82	* we're done searching.
83	*/
84	if (ns->level <= cred->user_ns->level)
85	return -EPERM;
86
87	/*
88	* The owner of the user namespace in the parent of the
89	* user namespace has all caps.
90	*/
91	if ((ns->parent == cred->user_ns) && uid_eq(left: ns->owner, right: cred->euid))
92	return `0`;
93
94	/*
95	* If you have a capability in a parent user ns, then you have
96	* it over all children user namespaces as well.
97	*/
98	ns = ns->parent;
99	}
100
101	/ We never get here /
102	}
103
104	/**
105	* cap_settime - Determine whether the current process may set the system clock
106	* @ts: The time to set
107	* @tz: The timezone to set
108	*
109	* Determine whether the current process may set the system clock and timezone
110	* information, returning 0 if permission granted, -ve if denied.
111	*/
112	int cap_settime(const struct timespec64 ts, const* struct timezone *tz)
113	{
114	if (!capable(CAP_SYS_TIME))
115	return -EPERM;
116	return `0`;
117	}
118
119	/**
120	* cap_ptrace_access_check - Determine whether the current process may access
121	* another
122	* @child: The process to be accessed
123	* @mode: The mode of attachment.
124	*
125	* If we are in the same or an ancestor user_ns and have all the target
126	* task's capabilities, then ptrace access is allowed.
127	* If we have the ptrace capability to the target user_ns, then ptrace
128	* access is allowed.
129	* Else denied.
130	*
131	* Determine whether a process may access another, returning 0 if permission
132	* granted, -ve if denied.
133	*/
134	int cap_ptrace_access_check(struct task_struct child, unsigned* int mode)
135	{
136	int ret = `0`;
137	const struct cred cred, child_cred;
138	const kernel_cap_t *caller_caps;
139
140	rcu_read_lock();
141	cred = current_cred();
142	child_cred = __task_cred(child);
143	if (mode & PTRACE_MODE_FSCREDS)
144	caller_caps = &cred->cap_effective;
145	else
146	caller_caps = &cred->cap_permitted;
147	if (cred->user_ns == child_cred->user_ns &&
148	cap_issubset(a: child_cred->cap_permitted, set: *caller_caps))
149	goto out;
150	if (ns_capable(ns: child_cred->user_ns, CAP_SYS_PTRACE))
151	goto out;
152	ret = -EPERM;
153	out:
154	rcu_read_unlock();
155	return ret;
156	}
157
158	/**
159	* cap_ptrace_traceme - Determine whether another process may trace the current
160	* @parent: The task proposed to be the tracer
161	*
162	* If parent is in the same or an ancestor user_ns and has all current's
163	* capabilities, then ptrace access is allowed.
164	* If parent has the ptrace capability to current's user_ns, then ptrace
165	* access is allowed.
166	* Else denied.
167	*
168	* Determine whether the nominated task is permitted to trace the current
169	* process, returning 0 if permission is granted, -ve if denied.
170	*/
171	int cap_ptrace_traceme(struct task_struct *parent)
172	{
173	int ret = `0`;
174	const struct cred cred, child_cred;
175
176	rcu_read_lock();
177	cred = __task_cred(parent);
178	child_cred = current_cred();
179	if (cred->user_ns == child_cred->user_ns &&
180	cap_issubset(a: child_cred->cap_permitted, set: cred->cap_permitted))
181	goto out;
182	if (has_ns_capability(t: parent, ns: child_cred->user_ns, CAP_SYS_PTRACE))
183	goto out;
184	ret = -EPERM;
185	out:
186	rcu_read_unlock();
187	return ret;
188	}
189
190	/**
191	* cap_capget - Retrieve a task's capability sets
192	* @target: The task from which to retrieve the capability sets
193	* @effective: The place to record the effective set
194	* @inheritable: The place to record the inheritable set
195	* @permitted: The place to record the permitted set
196	*
197	* This function retrieves the capabilities of the nominated task and returns
198	* them to the caller.
199	*/
200	int cap_capget(const struct task_struct target, kernel_cap_t effective,
201	kernel_cap_t inheritable, kernel_cap_t permitted)
202	{
203	const struct cred *cred;
204
205	/ Derived from kernel/capability.c:sys_capget. /
206	rcu_read_lock();
207	cred = __task_cred(target);
208	*effective = cred->cap_effective;
209	*inheritable = cred->cap_inheritable;
210	*permitted = cred->cap_permitted;
211	rcu_read_unlock();
212	return `0`;
213	}
214
215	/*
216	* Determine whether the inheritable capabilities are limited to the old
217	* permitted set. Returns 1 if they are limited, 0 if they are not.
218	*/
219	static inline int cap_inh_is_capped(void)
220	{
221	/ they are so limited unless the current task has the CAP_SETPCAP*
222	* capability
223	*/
224	if (cap_capable(current_cred(), current_cred()->user_ns,
225	CAP_SETPCAP, CAP_OPT_NONE) == `0`)
226	return `0`;
227	return `1`;
228	}
229
230	/**
231	* cap_capset - Validate and apply proposed changes to current's capabilities
232	* @new: The proposed new credentials; alterations should be made here
233	* @old: The current task's current credentials
234	* @effective: A pointer to the proposed new effective capabilities set
235	* @inheritable: A pointer to the proposed new inheritable capabilities set
236	* @permitted: A pointer to the proposed new permitted capabilities set
237	*
238	* This function validates and applies a proposed mass change to the current
239	* process's capability sets. The changes are made to the proposed new
240	* credentials, and assuming no error, will be committed by the caller of LSM.
241	*/
242	int cap_capset(struct cred *new,
243	const struct cred *old,
244	const kernel_cap_t *effective,
245	const kernel_cap_t *inheritable,
246	const kernel_cap_t *permitted)
247	{
248	if (cap_inh_is_capped() &&
249	!cap_issubset(a: *inheritable,
250	set: cap_combine(a: old->cap_inheritable,
251	b: old->cap_permitted)))
252	/ incapable of using this inheritable set /
253	return -EPERM;
254
255	if (!cap_issubset(a: *inheritable,
256	set: cap_combine(a: old->cap_inheritable,
257	b: old->cap_bset)))
258	/ no new pI capabilities outside bounding set /
259	return -EPERM;
260
261	/ verify restrictions on target's new Permitted set /
262	if (!cap_issubset(a: *permitted, set: old->cap_permitted))
263	return -EPERM;
264
265	/ verify the _new_Effective_ is a subset of the _new_Permitted_ /
266	if (!cap_issubset(a: effective, set: permitted))
267	return -EPERM;
268
269	new->cap_effective = *effective;
270	new->cap_inheritable = *inheritable;
271	new->cap_permitted = *permitted;
272
273	/*
274	* Mask off ambient bits that are no longer both permitted and
275	* inheritable.
276	*/
277	new->cap_ambient = cap_intersect(a: new->cap_ambient,
278	b: cap_intersect(a: *permitted,
279	b: *inheritable));
280	if (WARN_ON(!cap_ambient_invariant_ok(new)))
281	return -EINVAL;
282	return `0`;
283	}
284
285	/**
286	* cap_inode_need_killpriv - Determine if inode change affects privileges
287	* @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
288	*
289	* Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
290	* affects the security markings on that inode, and if it is, should
291	* inode_killpriv() be invoked or the change rejected.
292	*
293	* Return: 1 if security.capability has a value, meaning inode_killpriv()
294	* is required, 0 otherwise, meaning inode_killpriv() is not required.
295	*/
296	int cap_inode_need_killpriv(struct dentry *dentry)
297	{
298	struct inode *inode = d_backing_inode(upper: dentry);
299	int error;
300
301	error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, `0`);
302	return error > `0`;
303	}
304
305	/**
306	* cap_inode_killpriv - Erase the security markings on an inode
307	*
308	* @idmap: idmap of the mount the inode was found from
309	* @dentry: The inode/dentry to alter
310	*
311	* Erase the privilege-enhancing security markings on an inode.
312	*
313	* If the inode has been found through an idmapped mount the idmap of
314	* the vfsmount must be passed through @idmap. This function will then
315	* take care to map the inode according to @idmap before checking
316	* permissions. On non-idmapped mounts or if permission checking is to be
317	* performed on the raw inode simply pass @nop_mnt_idmap.
318	*
319	* Return: 0 if successful, -ve on error.
320	*/
321	int cap_inode_killpriv(struct mnt_idmap idmap, struct* dentry *dentry)
322	{
323	int error;
324
325	error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);
326	if (error == -EOPNOTSUPP)
327	error = `0`;
328	return error;
329	}
330
331	static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
332	{
333	struct user_namespace *ns;
334	kuid_t kroot;
335
336	if (!vfsuid_valid(uid: rootvfsuid))
337	return false;
338
339	kroot = vfsuid_into_kuid(vfsuid: rootvfsuid);
340	for (ns = current_user_ns();; ns = ns->parent) {
341	if (from_kuid(to: ns, uid: kroot) == `0`)
342	return true;
343	if (ns == &init_user_ns)
344	break;
345	}
346
347	return false;
348	}
349
350	static __u32 sansflags(__u32 m)
351	{
352	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
353	}
354
355	static bool is_v2header(int size, const struct vfs_cap_data *cap)
356	{
357	if (size != XATTR_CAPS_SZ_2)
358	return false;
359	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
360	}
361
362	static bool is_v3header(int size, const struct vfs_cap_data *cap)
363	{
364	if (size != XATTR_CAPS_SZ_3)
365	return false;
366	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
367	}
368
369	/*
370	* getsecurity: We are called for security.* before any attempt to read the
371	* xattr from the inode itself.
372	*
373	* This gives us a chance to read the on-disk value and convert it. If we
374	* return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
375	*
376	* Note we are not called by vfs_getxattr_alloc(), but that is only called
377	* by the integrity subsystem, which really wants the unconverted values -
378	* so that's good.
379	*/
380	int cap_inode_getsecurity(struct mnt_idmap *idmap,
381	struct inode inode, const* char name, void* **buffer,
382	bool alloc)
383	{
384	int size;
385	kuid_t kroot;
386	vfsuid_t vfsroot;
387	u32 nsmagic, magic;
388	uid_t root, mappedroot;
389	char *tmpbuf = NULL;
390	struct vfs_cap_data *cap;
391	struct vfs_ns_cap_data *nscap = NULL;
392	struct dentry *dentry;
393	struct user_namespace *fs_ns;
394
395	if (strcmp(name, "capability") != `0`)
396	return -EOPNOTSUPP;
397
398	dentry = d_find_any_alias(inode);
399	if (!dentry)
400	return -EINVAL;
401	size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, xattr_value: &tmpbuf,
402	size: sizeof(struct vfs_ns_cap_data), GFP_NOFS);
403	dput(dentry);
404	/ gcc11 complains if we don't check for !tmpbuf /
405	if (size < `0` \|\| !tmpbuf)
406	goto out_free;
407
408	fs_ns = inode->i_sb->s_user_ns;
409	cap = (struct vfs_cap_data *) tmpbuf;
410	if (is_v2header(size, cap)) {
411	root = `0`;
412	} else if (is_v3header(size, cap)) {
413	nscap = (struct vfs_ns_cap_data *) tmpbuf;
414	root = le32_to_cpu(nscap->rootid);
415	} else {
416	size = -EINVAL;
417	goto out_free;
418	}
419
420	kroot = make_kuid(from: fs_ns, uid: root);
421
422	/ If this is an idmapped mount shift the kuid. /
423	vfsroot = make_vfsuid(idmap, fs_userns: fs_ns, kuid: kroot);
424
425	/ If the root kuid maps to a valid uid in current ns, then return*
426	* this as a nscap. */
427	mappedroot = from_kuid(current_user_ns(), uid: vfsuid_into_kuid(vfsuid: vfsroot));
428	if (mappedroot != (uid_t)-`1` && mappedroot != (uid_t)`0`) {
429	size = sizeof(struct vfs_ns_cap_data);
430	if (alloc) {
431	if (!nscap) {
432	/ v2 -> v3 conversion /
433	nscap = kzalloc(size, GFP_ATOMIC);
434	if (!nscap) {
435	size = -ENOMEM;
436	goto out_free;
437	}
438	nsmagic = VFS_CAP_REVISION_3;
439	magic = le32_to_cpu(cap->magic_etc);
440	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
441	nsmagic \|= VFS_CAP_FLAGS_EFFECTIVE;
442	memcpy(&nscap->data, &cap->data, sizeof(__le32) * `2` * VFS_CAP_U32);
443	nscap->magic_etc = cpu_to_le32(nsmagic);
444	} else {
445	/ use allocated v3 buffer /
446	tmpbuf = NULL;
447	}
448	nscap->rootid = cpu_to_le32(mappedroot);
449	*buffer = nscap;
450	}
451	goto out_free;
452	}
453
454	if (!rootid_owns_currentns(rootvfsuid: vfsroot)) {
455	size = -EOVERFLOW;
456	goto out_free;
457	}
458
459	/ This comes from a parent namespace. Return as a v2 capability /
460	size = sizeof(struct vfs_cap_data);
461	if (alloc) {
462	if (nscap) {
463	/ v3 -> v2 conversion /
464	cap = kzalloc(size, GFP_ATOMIC);
465	if (!cap) {
466	size = -ENOMEM;
467	goto out_free;
468	}
469	magic = VFS_CAP_REVISION_2;
470	nsmagic = le32_to_cpu(nscap->magic_etc);
471	if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
472	magic \|= VFS_CAP_FLAGS_EFFECTIVE;
473	memcpy(&cap->data, &nscap->data, sizeof(__le32) * `2` * VFS_CAP_U32);
474	cap->magic_etc = cpu_to_le32(magic);
475	} else {
476	/ use unconverted v2 /
477	tmpbuf = NULL;
478	}
479	*buffer = cap;
480	}
481	out_free:
482	kfree(objp: tmpbuf);
483	return size;
484	}
485
486	/**
487	* rootid_from_xattr - translate root uid of vfs caps
488	*
489	* @value: vfs caps value which may be modified by this function
490	* @size: size of @ivalue
491	* @task_ns: user namespace of the caller
492	*/
493	static vfsuid_t rootid_from_xattr(const void *value, size_t size,
494	struct user_namespace *task_ns)
495	{
496	const struct vfs_ns_cap_data *nscap = value;
497	uid_t rootid = `0`;
498
499	if (size == XATTR_CAPS_SZ_3)
500	rootid = le32_to_cpu(nscap->rootid);
501
502	return VFSUIDT_INIT(make_kuid(task_ns, rootid));
503	}
504
505	static bool validheader(size_t size, const struct vfs_cap_data *cap)
506	{
507	return is_v2header(size, cap) \|\| is_v3header(size, cap);
508	}
509
510	/**
511	* cap_convert_nscap - check vfs caps
512	*
513	* @idmap: idmap of the mount the inode was found from
514	* @dentry: used to retrieve inode to check permissions on
515	* @ivalue: vfs caps value which may be modified by this function
516	* @size: size of @ivalue
517	*
518	* User requested a write of security.capability. If needed, update the
519	* xattr to change from v2 to v3, or to fixup the v3 rootid.
520	*
521	* If the inode has been found through an idmapped mount the idmap of
522	* the vfsmount must be passed through @idmap. This function will then
523	* take care to map the inode according to @idmap before checking
524	* permissions. On non-idmapped mounts or if permission checking is to be
525	* performed on the raw inode simply pass @nop_mnt_idmap.
526	*
527	* Return: On success, return the new size; on error, return < 0.
528	*/
529	int cap_convert_nscap(struct mnt_idmap idmap, struct* dentry *dentry,
530	const void **ivalue, size_t size)
531	{
532	struct vfs_ns_cap_data *nscap;
533	uid_t nsrootid;
534	const struct vfs_cap_data cap = ivalue;
535	__u32 magic, nsmagic;
536	struct inode *inode = d_backing_inode(upper: dentry);
537	struct user_namespace *task_ns = current_user_ns(),
538	*fs_ns = inode->i_sb->s_user_ns;
539	kuid_t rootid;
540	vfsuid_t vfsrootid;
541	size_t newsize;
542
543	if (!*ivalue)
544	return -EINVAL;
545	if (!validheader(size, cap))
546	return -EINVAL;
547	if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
548	return -EPERM;
549	if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap))
550	if (ns_capable(ns: inode->i_sb->s_user_ns, CAP_SETFCAP))
551	/ user is privileged, just write the v2 /
552	return size;
553
554	vfsrootid = rootid_from_xattr(value: *ivalue, size, task_ns);
555	if (!vfsuid_valid(uid: vfsrootid))
556	return -EINVAL;
557
558	rootid = from_vfsuid(idmap, fs_userns: fs_ns, vfsuid: vfsrootid);
559	if (!uid_valid(uid: rootid))
560	return -EINVAL;
561
562	nsrootid = from_kuid(to: fs_ns, uid: rootid);
563	if (nsrootid == -`1`)
564	return -EINVAL;
565
566	newsize = sizeof(struct vfs_ns_cap_data);
567	nscap = kmalloc(size: newsize, GFP_ATOMIC);
568	if (!nscap)
569	return -ENOMEM;
570	nscap->rootid = cpu_to_le32(nsrootid);
571	nsmagic = VFS_CAP_REVISION_3;
572	magic = le32_to_cpu(cap->magic_etc);
573	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
574	nsmagic \|= VFS_CAP_FLAGS_EFFECTIVE;
575	nscap->magic_etc = cpu_to_le32(nsmagic);
576	memcpy(&nscap->data, &cap->data, sizeof(__le32) * `2` * VFS_CAP_U32);
577
578	*ivalue = nscap;
579	return newsize;
580	}
581
582	/*
583	* Calculate the new process capability sets from the capability sets attached
584	* to a file.
585	*/
586	static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
587	struct linux_binprm *bprm,
588	bool *effective,
589	bool *has_fcap)
590	{
591	struct cred *new = bprm->cred;
592	int ret = `0`;
593
594	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
595	*effective = true;
596
597	if (caps->magic_etc & VFS_CAP_REVISION_MASK)
598	*has_fcap = true;
599
600	/*
601	* pP' = (X & fP) \| (pI & fI)
602	* The addition of pA' is handled later.
603	*/
604	new->cap_permitted.val =
605	(new->cap_bset.val & caps->permitted.val) \|
606	(new->cap_inheritable.val & caps->inheritable.val);
607
608	if (caps->permitted.val & ~new->cap_permitted.val)
609	/ insufficient to execute correctly /
610	ret = -EPERM;
611
612	/*
613	* For legacy apps, with no internal support for recognizing they
614	* do not have enough capabilities, we return an error if they are
615	* missing some "forced" (aka file-permitted) capabilities.
616	*/
617	return *effective ? ret : `0`;
618	}
619
620	/**
621	* get_vfs_caps_from_disk - retrieve vfs caps from disk
622	*
623	* @idmap: idmap of the mount the inode was found from
624	* @dentry: dentry from which @inode is retrieved
625	* @cpu_caps: vfs capabilities
626	*
627	* Extract the on-exec-apply capability sets for an executable file.
628	*
629	* If the inode has been found through an idmapped mount the idmap of
630	* the vfsmount must be passed through @idmap. This function will then
631	* take care to map the inode according to @idmap before checking
632	* permissions. On non-idmapped mounts or if permission checking is to be
633	* performed on the raw inode simply pass @nop_mnt_idmap.
634	*/
635	int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
636	const struct dentry *dentry,
637	struct cpu_vfs_cap_data *cpu_caps)
638	{
639	struct inode *inode = d_backing_inode(upper: dentry);
640	__u32 magic_etc;
641	int size;
642	struct vfs_ns_cap_data data, *nscaps = &data;
643	struct vfs_cap_data caps = (struct* vfs_cap_data *) &data;
644	kuid_t rootkuid;
645	vfsuid_t rootvfsuid;
646	struct user_namespace *fs_ns;
647
648	memset(cpu_caps, `0`, sizeof(struct cpu_vfs_cap_data));
649
650	if (!inode)
651	return -ENODATA;
652
653	fs_ns = inode->i_sb->s_user_ns;
654	size = __vfs_getxattr((struct dentry *)dentry, inode,
655	XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
656	if (size == -ENODATA \|\| size == -EOPNOTSUPP)
657	/ no data, that's ok /
658	return -ENODATA;
659
660	if (size < `0`)
661	return size;
662
663	if (size < sizeof(magic_etc))
664	return -EINVAL;
665
666	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
667
668	rootkuid = make_kuid(from: fs_ns, uid: `0`);
669	switch (magic_etc & VFS_CAP_REVISION_MASK) {
670	case VFS_CAP_REVISION_1:
671	if (size != XATTR_CAPS_SZ_1)
672	return -EINVAL;
673	break;
674	case VFS_CAP_REVISION_2:
675	if (size != XATTR_CAPS_SZ_2)
676	return -EINVAL;
677	break;
678	case VFS_CAP_REVISION_3:
679	if (size != XATTR_CAPS_SZ_3)
680	return -EINVAL;
681	rootkuid = make_kuid(from: fs_ns, le32_to_cpu(nscaps->rootid));
682	break;
683
684	default:
685	return -EINVAL;
686	}
687
688	rootvfsuid = make_vfsuid(idmap, fs_userns: fs_ns, kuid: rootkuid);
689	if (!vfsuid_valid(uid: rootvfsuid))
690	return -ENODATA;
691
692	/ Limit the caps to the mounter of the filesystem*
693	* or the more limited uid specified in the xattr.
694	*/
695	if (!rootid_owns_currentns(rootvfsuid))
696	return -ENODATA;
697
698	cpu_caps->permitted.val = le32_to_cpu(caps->data[`0`].permitted);
699	cpu_caps->inheritable.val = le32_to_cpu(caps->data[`0`].inheritable);
700
701	/*
702	* Rev1 had just a single 32-bit word, later expanded
703	* to a second one for the high bits
704	*/
705	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
706	cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[`1`].permitted) << `32`;
707	cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[`1`].inheritable) << `32`;
708	}
709
710	cpu_caps->permitted.val &= CAP_VALID_MASK;
711	cpu_caps->inheritable.val &= CAP_VALID_MASK;
712
713	cpu_caps->rootid = vfsuid_into_kuid(vfsuid: rootvfsuid);
714
715	return `0`;
716	}
717
718	/*
719	* Attempt to get the on-exec apply capability sets for an executable file from
720	* its xattrs and, if present, apply them to the proposed credentials being
721	* constructed by execve().
722	*/
723	static int get_file_caps(struct linux_binprm bprm, const* struct file *file,
724	bool effective, bool has_fcap)
725	{
726	int rc = `0`;
727	struct cpu_vfs_cap_data vcaps;
728
729	cap_clear(bprm->cred->cap_permitted);
730
731	if (!file_caps_enabled)
732	return `0`;
733
734	if (!mnt_may_suid(mnt: file->f_path.mnt))
735	return `0`;
736
737	/*
738	* This check is redundant with mnt_may_suid() but is kept to make
739	* explicit that capability bits are limited to s_user_ns and its
740	* descendants.
741	*/
742	if (!current_in_userns(target_ns: file->f_path.mnt->mnt_sb->s_user_ns))
743	return `0`;
744
745	rc = get_vfs_caps_from_disk(idmap: file_mnt_idmap(file),
746	dentry: file->f_path.dentry, cpu_caps: &vcaps);
747	if (rc < `0`) {
748	if (rc == -EINVAL)
749	printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
750	bprm->filename);
751	else if (rc == -ENODATA)
752	rc = `0`;
753	goto out;
754	}
755
756	rc = bprm_caps_from_vfs_caps(caps: &vcaps, bprm, effective, has_fcap);
757
758	out:
759	if (rc)
760	cap_clear(bprm->cred->cap_permitted);
761
762	return rc;
763	}
764
765	static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
766
767	static inline bool __is_real(kuid_t uid, struct cred *cred)
768	{ return uid_eq(left: cred->uid, right: uid); }
769
770	static inline bool __is_eff(kuid_t uid, struct cred *cred)
771	{ return uid_eq(left: cred->euid, right: uid); }
772
773	static inline bool __is_suid(kuid_t uid, struct cred *cred)
774	{ return !__is_real(uid, cred) && __is_eff(uid, cred); }
775
776	/*
777	* handle_privileged_root - Handle case of privileged root
778	* @bprm: The execution parameters, including the proposed creds
779	* @has_fcap: Are any file capabilities set?
780	* @effective: Do we have effective root privilege?
781	* @root_uid: This namespace' root UID WRT initial USER namespace
782	*
783	* Handle the case where root is privileged and hasn't been neutered by
784	* SECURE_NOROOT. If file capabilities are set, they won't be combined with
785	* set UID root and nothing is changed. If we are root, cap_permitted is
786	* updated. If we have become set UID root, the effective bit is set.
787	*/
788	static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
789	bool *effective, kuid_t root_uid)
790	{
791	const struct cred *old = current_cred();
792	struct cred *new = bprm->cred;
793
794	if (!root_privileged())
795	return;
796	/*
797	* If the legacy file capability is set, then don't set privs
798	* for a setuid root binary run by a non-root user. Do set it
799	* for a root user just to cause least surprise to an admin.
800	*/
801	if (has_fcap && __is_suid(uid: root_uid, cred: new)) {
802	warn_setuid_and_fcaps_mixed(fname: bprm->filename);
803	return;
804	}
805	/*
806	* To support inheritance of root-permissions and suid-root
807	* executables under compatibility mode, we override the
808	* capability sets for the file.
809	*/
810	if (__is_eff(uid: root_uid, cred: new) \|\| __is_real(uid: root_uid, cred: new)) {
811	/ pP' = (cap_bset & ~0) \| (pI & ~0) /
812	new->cap_permitted = cap_combine(a: old->cap_bset,
813	b: old->cap_inheritable);
814	}
815	/*
816	* If only the real uid is 0, we do not set the effective bit.
817	*/
818	if (__is_eff(uid: root_uid, cred: new))
819	*effective = true;
820	}
821
/*
 * Capability-set comparison helpers.  Each argument that names a cred is
 * parenthesized so that any pointer expression (e.g. &obj) can be passed,
 * and each expansion is parenthesized so it composes safely inside larger
 * expressions.
 */

/* True if @target's cap_<field> holds bits that @source's cap_<field> lacks. */
#define __cap_gained(field, target, source) \
	(!cap_issubset((target)->cap_##field, (source)->cap_##field))
/* True if @cred's cap_<target> holds bits that its cap_<source> lacks. */
#define __cap_grew(target, source, cred) \
	(!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
/* True if CAP_FULL_SET is a subset of @cred's cap_<field>. */
#define __cap_full(field, cred) \
	(cap_issubset(CAP_FULL_SET, (cred)->cap_##field))
828
829	static inline bool __is_setuid(struct cred new, const* struct cred *old)
830	{ return !uid_eq(left: new->euid, right: old->uid); }
831
832	static inline bool __is_setgid(struct cred new, const* struct cred *old)
833	{ return !gid_eq(left: new->egid, right: old->gid); }
834
835	/*
836	* 1) Audit candidate if current->cap_effective is set
837	*
838	* We do not bother to audit if 3 things are true:
839	* 1) cap_effective has all caps
840	* 2) we became root OR are were already root
841	* 3) root is supposed to have all caps (SECURE_NOROOT)
842	* Since this is just a normal root execing a process.
843	*
844	* Number 1 above might fail if you don't have a full bset, but I think
845	* that is interesting information to audit.
846	*
847	* A number of other conditions require logging:
848	* 2) something prevented setuid root getting all caps
849	* 3) non-setuid root gets fcaps
850	* 4) non-setuid root gets ambient
851	*/
852	static inline bool nonroot_raised_pE(struct cred new, const* struct cred *old,
853	kuid_t root, bool has_fcap)
854	{
855	bool ret = false;
856
857	if ((__cap_grew(effective, ambient, new) &&
858	!(__cap_full(effective, new) &&
859	(__is_eff(uid: root, cred: new) \|\| __is_real(uid: root, cred: new)) &&
860	root_privileged())) \|\|
861	(root_privileged() &&
862	__is_suid(uid: root, cred: new) &&
863	!__cap_full(effective, new)) \|\|
864	(!__is_setuid(new, old) &&
865	((has_fcap &&
866	__cap_gained(permitted, new, old)) \|\|
867	__cap_gained(ambient, new, old))))
868
869	ret = true;
870
871	return ret;
872	}
873
874	/**
875	* cap_bprm_creds_from_file - Set up the proposed credentials for execve().
876	* @bprm: The execution parameters, including the proposed creds
877	* @file: The file to pull the credentials from
878	*
879	* Set up the proposed credentials for a new execution context being
880	* constructed by execve(). The proposed creds in @bprm->cred is altered,
881	* which won't take effect immediately.
882	*
883	* Return: 0 if successful, -ve on error.
884	*/
885	int cap_bprm_creds_from_file(struct linux_binprm bprm, const* struct file *file)
886	{
887	/ Process setpcap binaries and capabilities for uid 0 /
888	const struct cred *old = current_cred();
889	struct cred *new = bprm->cred;
890	bool effective = false, has_fcap = false, is_setid;
891	int ret;
892	kuid_t root_uid;
893
894	if (WARN_ON(!cap_ambient_invariant_ok(old)))
895	return -EPERM;
896
897	ret = get_file_caps(bprm, file, effective: &effective, has_fcap: &has_fcap);
898	if (ret < `0`)
899	return ret;
900
901	root_uid = make_kuid(from: new->user_ns, uid: `0`);
902
903	handle_privileged_root(bprm, has_fcap, effective: &effective, root_uid);
904
905	/ if we have fs caps, clear dangerous personality flags /
906	if (__cap_gained(permitted, new, old))
907	bprm->per_clear \|= PER_CLEAR_ON_SETID;
908
909	/ Don't let someone trace a set[ug]id/setpcap binary with the revised*
910	* credentials unless they have the appropriate permit.
911	*
912	* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
913	*/
914	is_setid = __is_setuid(new, old) \|\| __is_setgid(new, old);
915
916	if ((is_setid \|\| __cap_gained(permitted, new, old)) &&
917	((bprm->unsafe & ~LSM_UNSAFE_PTRACE) \|\|
918	!ptracer_capable(current, ns: new->user_ns))) {
919	/ downgrade; they get no more than they had, and maybe less /
920	if (!ns_capable(ns: new->user_ns, CAP_SETUID) \|\|
921	(bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
922	new->euid = new->uid;
923	new->egid = new->gid;
924	}
925	new->cap_permitted = cap_intersect(a: new->cap_permitted,
926	b: old->cap_permitted);
927	}
928
929	new->suid = new->fsuid = new->euid;
930	new->sgid = new->fsgid = new->egid;
931
932	/ File caps or setid cancels ambient. /
933	if (has_fcap \|\| is_setid)
934	cap_clear(new->cap_ambient);
935
936	/*
937	* Now that we've computed pA', update pP' to give:
938	* pP' = (X & fP) \| (pI & fI) \| pA'
939	*/
940	new->cap_permitted = cap_combine(a: new->cap_permitted, b: new->cap_ambient);
941
942	/*
943	* Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
944	* this is the same as pE' = (fE ? pP' : 0) \| pA'.
945	*/
946	if (effective)
947	new->cap_effective = new->cap_permitted;
948	else
949	new->cap_effective = new->cap_ambient;
950
951	if (WARN_ON(!cap_ambient_invariant_ok(new)))
952	return -EPERM;
953
954	if (nonroot_raised_pE(new, old, root: root_uid, has_fcap)) {
955	ret = audit_log_bprm_fcaps(bprm, new, old);
956	if (ret < `0`)
957	return ret;
958	}
959
960	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
961
962	if (WARN_ON(!cap_ambient_invariant_ok(new)))
963	return -EPERM;
964
965	/ Check for privilege-elevated exec. /
966	if (is_setid \|\|
967	(!__is_real(uid: root_uid, cred: new) &&
968	(effective \|\|
969	__cap_grew(permitted, ambient, new))))
970	bprm->secureexec = `1`;
971
972	return `0`;
973	}
974
975	/**
976	* cap_inode_setxattr - Determine whether an xattr may be altered
977	* @dentry: The inode/dentry being altered
978	* @name: The name of the xattr to be changed
979	* @value: The value that the xattr will be changed to
980	* @size: The size of value
981	* @flags: The replacement flag
982	*
983	* Determine whether an xattr may be altered or set on an inode, returning 0 if
984	* permission is granted, -ve if denied.
985	*
986	* This is used to make sure security xattrs don't get updated or set by those
987	* who aren't privileged to do so.
988	*/
989	int cap_inode_setxattr(struct dentry dentry, const* char *name,
990	const void value, size_t size, int* flags)
991	{
992	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
993
994	/ Ignore non-security xattrs /
995	if (strncmp(name, XATTR_SECURITY_PREFIX,
996	XATTR_SECURITY_PREFIX_LEN) != `0`)
997	return `0`;
998
999	/*
1000	* For XATTR_NAME_CAPS the check will be done in
1001	* cap_convert_nscap(), called by setxattr()
1002	*/
1003	if (strcmp(name, XATTR_NAME_CAPS) == `0`)
1004	return `0`;
1005
1006	if (!ns_capable(ns: user_ns, CAP_SYS_ADMIN))
1007	return -EPERM;
1008	return `0`;
1009	}
1010
1011	/**
1012	* cap_inode_removexattr - Determine whether an xattr may be removed
1013	*
1014	* @idmap: idmap of the mount the inode was found from
1015	* @dentry: The inode/dentry being altered
1016	* @name: The name of the xattr to be changed
1017	*
1018	* Determine whether an xattr may be removed from an inode, returning 0 if
1019	* permission is granted, -ve if denied.
1020	*
1021	* If the inode has been found through an idmapped mount the idmap of
1022	* the vfsmount must be passed through @idmap. This function will then
1023	* take care to map the inode according to @idmap before checking
1024	* permissions. On non-idmapped mounts or if permission checking is to be
1025	* performed on the raw inode simply pass @nop_mnt_idmap.
1026	*
1027	* This is used to make sure security xattrs don't get removed by those who
1028	* aren't privileged to remove them.
1029	*/
1030	int cap_inode_removexattr(struct mnt_idmap *idmap,
1031	struct dentry dentry, const* char *name)
1032	{
1033	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
1034
1035	/ Ignore non-security xattrs /
1036	if (strncmp(name, XATTR_SECURITY_PREFIX,
1037	XATTR_SECURITY_PREFIX_LEN) != `0`)
1038	return `0`;
1039
1040	if (strcmp(name, XATTR_NAME_CAPS) == `0`) {
1041	/ security.capability gets namespaced /
1042	struct inode *inode = d_backing_inode(upper: dentry);
1043	if (!inode)
1044	return -EINVAL;
1045	if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
1046	return -EPERM;
1047	return `0`;
1048	}
1049
1050	if (!ns_capable(ns: user_ns, CAP_SYS_ADMIN))
1051	return -EPERM;
1052	return `0`;
1053	}
1054
1055	/*
1056	* cap_emulate_setxuid() fixes the effective / permitted capabilities of
1057	* a process after a call to setuid, setreuid, or setresuid.
1058	*
1059	* 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
1060	* {r,e,s}uid != 0, the permitted and effective capabilities are
1061	* cleared.
1062	*
1063	* 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
1064	* capabilities of the process are cleared.
1065	*
1066	* 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
1067	* capabilities are set to the permitted capabilities.
1068	*
1069	* fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
1070	* never happen.
1071	*
1072	* -astor
1073	*
1074	* cevans - New behaviour, Oct '99
1075	* A process may, via prctl(), elect to keep its capabilities when it
1076	* calls setuid() and switches away from uid==0. Both permitted and
1077	* effective sets will be retained.
1078	* Without this change, it was impossible for a daemon to drop only some
1079	* of its privilege. The call to setuid(!=0) would drop all privileges!
1080	* Keeping uid 0 is not an option because uid 0 owns too many vital
1081	* files..
1082	* Thanks to Olaf Kirch and Peter Benie for spotting this.
1083	*/
1084	static inline void cap_emulate_setxuid(struct cred new, const* struct cred *old)
1085	{
1086	kuid_t root_uid = make_kuid(from: old->user_ns, uid: `0`);
1087
1088	if ((uid_eq(left: old->uid, right: root_uid) \|\|
1089	uid_eq(left: old->euid, right: root_uid) \|\|
1090	uid_eq(left: old->suid, right: root_uid)) &&
1091	(!uid_eq(left: new->uid, right: root_uid) &&
1092	!uid_eq(left: new->euid, right: root_uid) &&
1093	!uid_eq(left: new->suid, right: root_uid))) {
1094	if (!issecure(SECURE_KEEP_CAPS)) {
1095	cap_clear(new->cap_permitted);
1096	cap_clear(new->cap_effective);
1097	}
1098
1099	/*
1100	* Pre-ambient programs expect setresuid to nonroot followed
1101	* by exec to drop capabilities. We should make sure that
1102	* this remains the case.
1103	*/
1104	cap_clear(new->cap_ambient);
1105	}
1106	if (uid_eq(left: old->euid, right: root_uid) && !uid_eq(left: new->euid, right: root_uid))
1107	cap_clear(new->cap_effective);
1108	if (!uid_eq(left: old->euid, right: root_uid) && uid_eq(left: new->euid, right: root_uid))
1109	new->cap_effective = new->cap_permitted;
1110	}
1111
1112	/**
1113	* cap_task_fix_setuid - Fix up the results of setuid() call
1114	* @new: The proposed credentials
1115	* @old: The current task's current credentials
1116	* @flags: Indications of what has changed
1117	*
1118	* Fix up the results of setuid() call before the credential changes are
1119	* actually applied.
1120	*
1121	* Return: 0 to grant the changes, -ve to deny them.
1122	*/
1123	int cap_task_fix_setuid(struct cred new, const* struct cred old, int* flags)
1124	{
1125	switch (flags) {
1126	case LSM_SETID_RE:
1127	case LSM_SETID_ID:
1128	case LSM_SETID_RES:
1129	/ juggle the capabilities to follow [RES]UID changes unless*
1130	* otherwise suppressed */
1131	if (!issecure(SECURE_NO_SETUID_FIXUP))
1132	cap_emulate_setxuid(new, old);
1133	break;
1134
1135	case LSM_SETID_FS:
1136	/ juggle the capabilities to follow FSUID changes, unless*
1137	* otherwise suppressed
1138	*
1139	* FIXME - is fsuser used for all CAP_FS_MASK capabilities?
1140	* if not, we might be a bit too harsh here.
1141	*/
1142	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
1143	kuid_t root_uid = make_kuid(from: old->user_ns, uid: `0`);
1144	if (uid_eq(left: old->fsuid, right: root_uid) && !uid_eq(left: new->fsuid, right: root_uid))
1145	new->cap_effective =
1146	cap_drop_fs_set(a: new->cap_effective);
1147
1148	if (!uid_eq(left: old->fsuid, right: root_uid) && uid_eq(left: new->fsuid, right: root_uid))
1149	new->cap_effective =
1150	cap_raise_fs_set(a: new->cap_effective,
1151	permitted: new->cap_permitted);
1152	}
1153	break;
1154
1155	default:
1156	return -EINVAL;
1157	}
1158
1159	return `0`;
1160	}
1161
1162	/*
1163	* Rationale: code calling task_setscheduler, task_setioprio, and
1164	* task_setnice, assumes that
1165	* . if capable(cap_sys_nice), then those actions should be allowed
1166	* . if not capable(cap_sys_nice), but acting on your own processes,
1167	* then those actions should be allowed
1168	* This is insufficient now since you can call code without suid, but
1169	* yet with increased caps.
1170	* So we check for increased caps on the target process.
1171	*/
1172	static int cap_safe_nice(struct task_struct *p)
1173	{
1174	int is_subset, ret = `0`;
1175
1176	rcu_read_lock();
1177	is_subset = cap_issubset(__task_cred(p)->cap_permitted,
1178	current_cred()->cap_permitted);
1179	if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1180	ret = -EPERM;
1181	rcu_read_unlock();
1182
1183	return ret;
1184	}
1185
/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.  The decision is delegated to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
	return cap_safe_nice(p);
}
1199
/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined here)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task.  The decision is delegated to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
	return cap_safe_nice(p);
}
1214
/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined here)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  The decision is delegated to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
	return cap_safe_nice(p);
}
1229
1230	/*
1231	* Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from
1232	* the current task's bounding set. Returns 0 on success, -ve on error.
1233	*/
1234	static int cap_prctl_drop(unsigned long cap)
1235	{
1236	struct cred *new;
1237
1238	if (!ns_capable(current_user_ns(), CAP_SETPCAP))
1239	return -EPERM;
1240	if (!cap_valid(cap))
1241	return -EINVAL;
1242
1243	new = prepare_creds();
1244	if (!new)
1245	return -ENOMEM;
1246	cap_lower(new->cap_bset, cap);
1247	return commit_creds(new);
1248	}
1249
1250	/**
1251	* cap_task_prctl - Implement process control functions for this security module
1252	* @option: The process control function requested
1253	* @arg2: The argument data for this function
1254	* @arg3: The argument data for this function
1255	* @arg4: The argument data for this function
1256	* @arg5: The argument data for this function
1257	*
1258	* Allow process control functions (sys_prctl()) to alter capabilities; may
1259	* also deny access to other functions not otherwise implemented here.
1260	*
1261	* Return: 0 or +ve on success, -ENOSYS if this function is not implemented
1262	* here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM
1263	* modules will consider performing the function.
1264	*/
1265	int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
1266	unsigned long arg4, unsigned long arg5)
1267	{
1268	const struct cred *old = current_cred();
1269	struct cred *new;
1270
1271	switch (option) {
1272	case PR_CAPBSET_READ:
1273	if (!cap_valid(arg2))
1274	return -EINVAL;
1275	return !!cap_raised(old->cap_bset, arg2);
1276
1277	case PR_CAPBSET_DROP:
1278	return cap_prctl_drop(cap: arg2);
1279
1280	/*
1281	* The next four prctl's remain to assist with transitioning a
1282	* system from legacy UID=0 based privilege (when filesystem
1283	* capabilities are not in use) to a system using filesystem
1284	* capabilities only - as the POSIX.1e draft intended.
1285	*
1286	* Note:
1287	*
1288	* PR_SET_SECUREBITS =
1289	* issecure_mask(SECURE_KEEP_CAPS_LOCKED)
1290	* \| issecure_mask(SECURE_NOROOT)
1291	* \| issecure_mask(SECURE_NOROOT_LOCKED)
1292	* \| issecure_mask(SECURE_NO_SETUID_FIXUP)
1293	* \| issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
1294	*
1295	* will ensure that the current process and all of its
1296	* children will be locked into a pure
1297	* capability-based-privilege environment.
1298	*/
1299	case PR_SET_SECUREBITS:
1300	if ((((old->securebits & SECURE_ALL_LOCKS) >> `1`)
1301	& (old->securebits ^ arg2)) /[1]/
1302	\|\| ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /[2]/
1303	\|\| (arg2 & ~(SECURE_ALL_LOCKS \| SECURE_ALL_BITS)) /[3]/
1304	\|\| (cap_capable(current_cred(),
1305	current_cred()->user_ns,
1306	CAP_SETPCAP,
1307	CAP_OPT_NONE) != `0`) /[4]/
1308	/*
1309	* [1] no changing of bits that are locked
1310	* [2] no unlocking of locks
1311	* [3] no setting of unsupported bits
1312	* [4] doing anything requires privilege (go read about
1313	* the "sendmail capabilities bug")
1314	*/
1315	)
1316	/ cannot change a locked bit /
1317	return -EPERM;
1318
1319	new = prepare_creds();
1320	if (!new)
1321	return -ENOMEM;
1322	new->securebits = arg2;
1323	return commit_creds(new);
1324
1325	case PR_GET_SECUREBITS:
1326	return old->securebits;
1327
1328	case PR_GET_KEEPCAPS:
1329	return !!issecure(SECURE_KEEP_CAPS);
1330
1331	case PR_SET_KEEPCAPS:
1332	if (arg2 > `1`) / Note, we rely on arg2 being unsigned here /
1333	return -EINVAL;
1334	if (issecure(SECURE_KEEP_CAPS_LOCKED))
1335	return -EPERM;
1336
1337	new = prepare_creds();
1338	if (!new)
1339	return -ENOMEM;
1340	if (arg2)
1341	new->securebits \|= issecure_mask(SECURE_KEEP_CAPS);
1342	else
1343	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
1344	return commit_creds(new);
1345
1346	case PR_CAP_AMBIENT:
1347	if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
1348	if (arg3 \| arg4 \| arg5)
1349	return -EINVAL;
1350
1351	new = prepare_creds();
1352	if (!new)
1353	return -ENOMEM;
1354	cap_clear(new->cap_ambient);
1355	return commit_creds(new);
1356	}
1357
1358	if (((!cap_valid(arg3)) \| arg4 \| arg5))
1359	return -EINVAL;
1360
1361	if (arg2 == PR_CAP_AMBIENT_IS_SET) {
1362	return !!cap_raised(current_cred()->cap_ambient, arg3);
1363	} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
1364	arg2 != PR_CAP_AMBIENT_LOWER) {
1365	return -EINVAL;
1366	} else {
1367	if (arg2 == PR_CAP_AMBIENT_RAISE &&
1368	(!cap_raised(current_cred()->cap_permitted, arg3) \|\|
1369	!cap_raised(current_cred()->cap_inheritable,
1370	arg3) \|\|
1371	issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
1372	return -EPERM;
1373
1374	new = prepare_creds();
1375	if (!new)
1376	return -ENOMEM;
1377	if (arg2 == PR_CAP_AMBIENT_RAISE)
1378	cap_raise(new->cap_ambient, arg3);
1379	else
1380	cap_lower(new->cap_ambient, arg3);
1381	return commit_creds(new);
1382	}
1383
1384	default:
1385	/ No functionality available - continue with default /
1386	return -ENOSYS;
1387	}
1388	}
1389
1390	/**
1391	* cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
1392	* @mm: The VM space in which the new mapping is to be made
1393	* @pages: The size of the mapping
1394	*
1395	* Determine whether the allocation of a new virtual mapping by the current
1396	* task is permitted.
1397	*
1398	* Return: 1 if permission is granted, 0 if not.
1399	*/
1400	int cap_vm_enough_memory(struct mm_struct mm, long* pages)
1401	{
1402	int cap_sys_admin = `0`;
1403
1404	if (cap_capable(current_cred(), targ_ns: &init_user_ns,
1405	CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == `0`)
1406	cap_sys_admin = `1`;
1407
1408	return cap_sys_admin;
1409	}
1410
1411	/**
1412	* cap_mmap_addr - check if able to map given addr
1413	* @addr: address attempting to be mapped
1414	*
1415	* If the process is attempting to map memory below dac_mmap_min_addr they need
1416	* CAP_SYS_RAWIO. The other parameters to this function are unused by the
1417	* capability security module.
1418	*
1419	* Return: 0 if this mapping should be allowed or -EPERM if not.
1420	*/
1421	int cap_mmap_addr(unsigned long addr)
1422	{
1423	int ret = `0`;
1424
1425	if (addr < dac_mmap_min_addr) {
1426	ret = cap_capable(current_cred(), targ_ns: &init_user_ns, CAP_SYS_RAWIO,
1427	CAP_OPT_NONE);
1428	/ set PF_SUPERPRIV if it turns out we allow the low mmap /
1429	if (ret == `0`)
1430	current->flags \|= PF_SUPERPRIV;
1431	}
1432	return ret;
1433	}
1434
/**
 * cap_mmap_file - capability hook for file mappings
 * @file: file being mapped (not examined here)
 * @reqprot: protection requested by the caller (not examined here)
 * @prot: protection that will be applied (not examined here)
 * @flags: mapping flags (not examined here)
 *
 * The capability module places no restriction on file mappings.
 *
 * Return: always 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
		  unsigned long prot, unsigned long flags)
{
	return 0;
}
1440
#ifdef CONFIG_SECURITY

/* LSM hooks implemented by the capability module. */
static struct security_hook_list capability_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(capable, cap_capable),
	LSM_HOOK_INIT(settime, cap_settime),
	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
	LSM_HOOK_INIT(capget, cap_capget),
	LSM_HOOK_INIT(capset, cap_capset),
	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/* Register the hook table above under the name "capability". */
static int __init capability_init(void)
{
	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
				"capability");
	return 0;
}

/* LSM descriptor; LSM_ORDER_FIRST is requested for this module. */
DEFINE_LSM(capability) = {
	.name = "capability",
	.order = LSM_ORDER_FIRST,
	.init = capability_init,
};

#endif /* CONFIG_SECURITY */
1478

source code of linux/security/commoncap.c