commoncap.c source code [linux/security/commoncap.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */
4
5	#include <linux/capability.h>
6	#include <linux/audit.h>
7	#include <linux/init.h>
8	#include <linux/kernel.h>
9	#include <linux/lsm_hooks.h>
10	#include <linux/file.h>
11	#include <linux/mm.h>
12	#include <linux/mman.h>
13	#include <linux/pagemap.h>
14	#include <linux/swap.h>
15	#include <linux/skbuff.h>
16	#include <linux/netlink.h>
17	#include <linux/ptrace.h>
18	#include <linux/xattr.h>
19	#include <linux/hugetlb.h>
20	#include <linux/mount.h>
21	#include <linux/sched.h>
22	#include <linux/prctl.h>
23	#include <linux/securebits.h>
24	#include <linux/user_namespace.h>
25	#include <linux/binfmts.h>
26	#include <linux/personality.h>
27	#include <linux/mnt_idmapping.h>
28	#include <uapi/linux/lsm.h>
29
30	/*
31	* If a non-root user executes a setuid-root binary in
32	* !secure(SECURE_NOROOT) mode, then we raise capabilities.
33	* However if fE is also set, then the intent is for only
34	* the file capabilities to be applied, and the setuid-root
35	* bit is left on either to change the uid (plausible) or
36	* to get full privilege on a kernel without file capabilities
37	* support. So in that case we do not raise capabilities.
38	*
39	* Warn if that happens, once per boot.
40	*/
41	static void warn_setuid_and_fcaps_mixed(const char *fname)
42	{
43	static int warned;
44	if (!warned) {
45	printk(KERN_INFO "warning: `%s' has both setuid-root and"
46	" effective capabilities. Therefore not raising all"
47	" capabilities.\n", fname);
48	warned = `1`;
49	}
50	}
51
52	/**
53	* cap_capable - Determine whether a task has a particular effective capability
54	* @cred: The credentials to use
55	* @targ_ns: The user namespace in which we need the capability
56	* @cap: The capability to check for
57	* @opts: Bitmask of options defined in include/linux/security.h
58	*
59	* Determine whether the nominated task has the specified capability amongst
60	* its effective set, returning 0 if it does, -ve if it does not.
61	*
62	* NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
63	* and has_capability() functions. That is, it has the reverse semantics:
64	* cap_has_capability() returns 0 when a task has a capability, but the
65	* kernel's capable() and has_capability() returns 1 for this case.
66	*/
67	int cap_capable(const struct cred cred, struct* user_namespace *targ_ns,
68	int cap, unsigned int opts)
69	{
70	struct user_namespace *ns = targ_ns;
71
72	/ See if cred has the capability in the target user namespace*
73	* by examining the target user namespace and all of the target
74	* user namespace's parents.
75	*/
76	for (;;) {
77	/ Do we have the necessary capabilities? /
78	if (ns == cred->user_ns)
79	return cap_raised(cred->cap_effective, cap) ? `0` : -EPERM;
80
81	/*
82	* If we're already at a lower level than we're looking for,
83	* we're done searching.
84	*/
85	if (ns->level <= cred->user_ns->level)
86	return -EPERM;
87
88	/*
89	* The owner of the user namespace in the parent of the
90	* user namespace has all caps.
91	*/
92	if ((ns->parent == cred->user_ns) && uid_eq(left: ns->owner, right: cred->euid))
93	return `0`;
94
95	/*
96	* If you have a capability in a parent user ns, then you have
97	* it over all children user namespaces as well.
98	*/
99	ns = ns->parent;
100	}
101
102	/ We never get here /
103	}
104
105	/**
106	* cap_settime - Determine whether the current process may set the system clock
107	* @ts: The time to set
108	* @tz: The timezone to set
109	*
110	* Determine whether the current process may set the system clock and timezone
111	* information, returning 0 if permission granted, -ve if denied.
112	*/
113	int cap_settime(const struct timespec64 ts, const* struct timezone *tz)
114	{
115	if (!capable(CAP_SYS_TIME))
116	return -EPERM;
117	return `0`;
118	}
119
120	/**
121	* cap_ptrace_access_check - Determine whether the current process may access
122	* another
123	* @child: The process to be accessed
124	* @mode: The mode of attachment.
125	*
126	* If we are in the same or an ancestor user_ns and have all the target
127	* task's capabilities, then ptrace access is allowed.
128	* If we have the ptrace capability to the target user_ns, then ptrace
129	* access is allowed.
130	* Else denied.
131	*
132	* Determine whether a process may access another, returning 0 if permission
133	* granted, -ve if denied.
134	*/
135	int cap_ptrace_access_check(struct task_struct child, unsigned* int mode)
136	{
137	int ret = `0`;
138	const struct cred cred, child_cred;
139	const kernel_cap_t *caller_caps;
140
141	rcu_read_lock();
142	cred = current_cred();
143	child_cred = __task_cred(child);
144	if (mode & PTRACE_MODE_FSCREDS)
145	caller_caps = &cred->cap_effective;
146	else
147	caller_caps = &cred->cap_permitted;
148	if (cred->user_ns == child_cred->user_ns &&
149	cap_issubset(a: child_cred->cap_permitted, set: *caller_caps))
150	goto out;
151	if (ns_capable(ns: child_cred->user_ns, CAP_SYS_PTRACE))
152	goto out;
153	ret = -EPERM;
154	out:
155	rcu_read_unlock();
156	return ret;
157	}
158
159	/**
160	* cap_ptrace_traceme - Determine whether another process may trace the current
161	* @parent: The task proposed to be the tracer
162	*
163	* If parent is in the same or an ancestor user_ns and has all current's
164	* capabilities, then ptrace access is allowed.
165	* If parent has the ptrace capability to current's user_ns, then ptrace
166	* access is allowed.
167	* Else denied.
168	*
169	* Determine whether the nominated task is permitted to trace the current
170	* process, returning 0 if permission is granted, -ve if denied.
171	*/
172	int cap_ptrace_traceme(struct task_struct *parent)
173	{
174	int ret = `0`;
175	const struct cred cred, child_cred;
176
177	rcu_read_lock();
178	cred = __task_cred(parent);
179	child_cred = current_cred();
180	if (cred->user_ns == child_cred->user_ns &&
181	cap_issubset(a: child_cred->cap_permitted, set: cred->cap_permitted))
182	goto out;
183	if (has_ns_capability(t: parent, ns: child_cred->user_ns, CAP_SYS_PTRACE))
184	goto out;
185	ret = -EPERM;
186	out:
187	rcu_read_unlock();
188	return ret;
189	}
190
191	/**
192	* cap_capget - Retrieve a task's capability sets
193	* @target: The task from which to retrieve the capability sets
194	* @effective: The place to record the effective set
195	* @inheritable: The place to record the inheritable set
196	* @permitted: The place to record the permitted set
197	*
198	* This function retrieves the capabilities of the nominated task and returns
199	* them to the caller.
200	*/
201	int cap_capget(const struct task_struct target, kernel_cap_t effective,
202	kernel_cap_t inheritable, kernel_cap_t permitted)
203	{
204	const struct cred *cred;
205
206	/ Derived from kernel/capability.c:sys_capget. /
207	rcu_read_lock();
208	cred = __task_cred(target);
209	*effective = cred->cap_effective;
210	*inheritable = cred->cap_inheritable;
211	*permitted = cred->cap_permitted;
212	rcu_read_unlock();
213	return `0`;
214	}
215
216	/*
217	* Determine whether the inheritable capabilities are limited to the old
218	* permitted set. Returns 1 if they are limited, 0 if they are not.
219	*/
220	static inline int cap_inh_is_capped(void)
221	{
222	/ they are so limited unless the current task has the CAP_SETPCAP*
223	* capability
224	*/
225	if (cap_capable(current_cred(), current_cred()->user_ns,
226	CAP_SETPCAP, CAP_OPT_NONE) == `0`)
227	return `0`;
228	return `1`;
229	}
230
231	/**
232	* cap_capset - Validate and apply proposed changes to current's capabilities
233	* @new: The proposed new credentials; alterations should be made here
234	* @old: The current task's current credentials
235	* @effective: A pointer to the proposed new effective capabilities set
236	* @inheritable: A pointer to the proposed new inheritable capabilities set
237	* @permitted: A pointer to the proposed new permitted capabilities set
238	*
239	* This function validates and applies a proposed mass change to the current
240	* process's capability sets. The changes are made to the proposed new
241	* credentials, and assuming no error, will be committed by the caller of LSM.
242	*/
243	int cap_capset(struct cred *new,
244	const struct cred *old,
245	const kernel_cap_t *effective,
246	const kernel_cap_t *inheritable,
247	const kernel_cap_t *permitted)
248	{
249	if (cap_inh_is_capped() &&
250	!cap_issubset(a: *inheritable,
251	set: cap_combine(a: old->cap_inheritable,
252	b: old->cap_permitted)))
253	/ incapable of using this inheritable set /
254	return -EPERM;
255
256	if (!cap_issubset(a: *inheritable,
257	set: cap_combine(a: old->cap_inheritable,
258	b: old->cap_bset)))
259	/ no new pI capabilities outside bounding set /
260	return -EPERM;
261
262	/ verify restrictions on target's new Permitted set /
263	if (!cap_issubset(a: *permitted, set: old->cap_permitted))
264	return -EPERM;
265
266	/ verify the _new_Effective_ is a subset of the _new_Permitted_ /
267	if (!cap_issubset(a: effective, set: permitted))
268	return -EPERM;
269
270	new->cap_effective = *effective;
271	new->cap_inheritable = *inheritable;
272	new->cap_permitted = *permitted;
273
274	/*
275	* Mask off ambient bits that are no longer both permitted and
276	* inheritable.
277	*/
278	new->cap_ambient = cap_intersect(a: new->cap_ambient,
279	b: cap_intersect(a: *permitted,
280	b: *inheritable));
281	if (WARN_ON(!cap_ambient_invariant_ok(new)))
282	return -EINVAL;
283	return `0`;
284	}
285
286	/**
287	* cap_inode_need_killpriv - Determine if inode change affects privileges
288	* @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
289	*
290	* Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
291	* affects the security markings on that inode, and if it is, should
292	* inode_killpriv() be invoked or the change rejected.
293	*
294	* Return: 1 if security.capability has a value, meaning inode_killpriv()
295	* is required, 0 otherwise, meaning inode_killpriv() is not required.
296	*/
297	int cap_inode_need_killpriv(struct dentry *dentry)
298	{
299	struct inode *inode = d_backing_inode(upper: dentry);
300	int error;
301
302	error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, `0`);
303	return error > `0`;
304	}
305
306	/**
307	* cap_inode_killpriv - Erase the security markings on an inode
308	*
309	* @idmap: idmap of the mount the inode was found from
310	* @dentry: The inode/dentry to alter
311	*
312	* Erase the privilege-enhancing security markings on an inode.
313	*
314	* If the inode has been found through an idmapped mount the idmap of
315	* the vfsmount must be passed through @idmap. This function will then
316	* take care to map the inode according to @idmap before checking
317	* permissions. On non-idmapped mounts or if permission checking is to be
318	* performed on the raw inode simply pass @nop_mnt_idmap.
319	*
320	* Return: 0 if successful, -ve on error.
321	*/
322	int cap_inode_killpriv(struct mnt_idmap idmap, struct* dentry *dentry)
323	{
324	int error;
325
326	error = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);
327	if (error == -EOPNOTSUPP)
328	error = `0`;
329	return error;
330	}
331
332	static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
333	{
334	struct user_namespace *ns;
335	kuid_t kroot;
336
337	if (!vfsuid_valid(uid: rootvfsuid))
338	return false;
339
340	kroot = vfsuid_into_kuid(vfsuid: rootvfsuid);
341	for (ns = current_user_ns();; ns = ns->parent) {
342	if (from_kuid(to: ns, uid: kroot) == `0`)
343	return true;
344	if (ns == &init_user_ns)
345	break;
346	}
347
348	return false;
349	}
350
351	static __u32 sansflags(__u32 m)
352	{
353	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
354	}
355
356	static bool is_v2header(int size, const struct vfs_cap_data *cap)
357	{
358	if (size != XATTR_CAPS_SZ_2)
359	return false;
360	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
361	}
362
363	static bool is_v3header(int size, const struct vfs_cap_data *cap)
364	{
365	if (size != XATTR_CAPS_SZ_3)
366	return false;
367	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
368	}
369
370	/*
371	* getsecurity: We are called for security.* before any attempt to read the
372	* xattr from the inode itself.
373	*
374	* This gives us a chance to read the on-disk value and convert it. If we
375	* return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
376	*
377	* Note we are not called by vfs_getxattr_alloc(), but that is only called
378	* by the integrity subsystem, which really wants the unconverted values -
379	* so that's good.
380	*/
381	int cap_inode_getsecurity(struct mnt_idmap *idmap,
382	struct inode inode, const* char name, void* **buffer,
383	bool alloc)
384	{
385	int size;
386	kuid_t kroot;
387	vfsuid_t vfsroot;
388	u32 nsmagic, magic;
389	uid_t root, mappedroot;
390	char *tmpbuf = NULL;
391	struct vfs_cap_data *cap;
392	struct vfs_ns_cap_data *nscap = NULL;
393	struct dentry *dentry;
394	struct user_namespace *fs_ns;
395
396	if (strcmp(name, "capability") != `0`)
397	return -EOPNOTSUPP;
398
399	dentry = d_find_any_alias(inode);
400	if (!dentry)
401	return -EINVAL;
402	size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, xattr_value: &tmpbuf,
403	size: sizeof(struct vfs_ns_cap_data), GFP_NOFS);
404	dput(dentry);
405	/ gcc11 complains if we don't check for !tmpbuf /
406	if (size < `0` \|\| !tmpbuf)
407	goto out_free;
408
409	fs_ns = inode->i_sb->s_user_ns;
410	cap = (struct vfs_cap_data *) tmpbuf;
411	if (is_v2header(size, cap)) {
412	root = `0`;
413	} else if (is_v3header(size, cap)) {
414	nscap = (struct vfs_ns_cap_data *) tmpbuf;
415	root = le32_to_cpu(nscap->rootid);
416	} else {
417	size = -EINVAL;
418	goto out_free;
419	}
420
421	kroot = make_kuid(from: fs_ns, uid: root);
422
423	/ If this is an idmapped mount shift the kuid. /
424	vfsroot = make_vfsuid(idmap, fs_userns: fs_ns, kuid: kroot);
425
426	/ If the root kuid maps to a valid uid in current ns, then return*
427	* this as a nscap. */
428	mappedroot = from_kuid(current_user_ns(), uid: vfsuid_into_kuid(vfsuid: vfsroot));
429	if (mappedroot != (uid_t)-`1` && mappedroot != (uid_t)`0`) {
430	size = sizeof(struct vfs_ns_cap_data);
431	if (alloc) {
432	if (!nscap) {
433	/ v2 -> v3 conversion /
434	nscap = kzalloc(size, GFP_ATOMIC);
435	if (!nscap) {
436	size = -ENOMEM;
437	goto out_free;
438	}
439	nsmagic = VFS_CAP_REVISION_3;
440	magic = le32_to_cpu(cap->magic_etc);
441	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
442	nsmagic \|= VFS_CAP_FLAGS_EFFECTIVE;
443	memcpy(&nscap->data, &cap->data, sizeof(__le32) * `2` * VFS_CAP_U32);
444	nscap->magic_etc = cpu_to_le32(nsmagic);
445	} else {
446	/ use allocated v3 buffer /
447	tmpbuf = NULL;
448	}
449	nscap->rootid = cpu_to_le32(mappedroot);
450	*buffer = nscap;
451	}
452	goto out_free;
453	}
454
455	if (!rootid_owns_currentns(rootvfsuid: vfsroot)) {
456	size = -EOVERFLOW;
457	goto out_free;
458	}
459
460	/ This comes from a parent namespace. Return as a v2 capability /
461	size = sizeof(struct vfs_cap_data);
462	if (alloc) {
463	if (nscap) {
464	/ v3 -> v2 conversion /
465	cap = kzalloc(size, GFP_ATOMIC);
466	if (!cap) {
467	size = -ENOMEM;
468	goto out_free;
469	}
470	magic = VFS_CAP_REVISION_2;
471	nsmagic = le32_to_cpu(nscap->magic_etc);
472	if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
473	magic \|= VFS_CAP_FLAGS_EFFECTIVE;
474	memcpy(&cap->data, &nscap->data, sizeof(__le32) * `2` * VFS_CAP_U32);
475	cap->magic_etc = cpu_to_le32(magic);
476	} else {
477	/ use unconverted v2 /
478	tmpbuf = NULL;
479	}
480	*buffer = cap;
481	}
482	out_free:
483	kfree(objp: tmpbuf);
484	return size;
485	}
486
487	/**
488	* rootid_from_xattr - translate root uid of vfs caps
489	*
490	* @value: vfs caps value which may be modified by this function
491	* @size: size of @ivalue
492	* @task_ns: user namespace of the caller
493	*/
494	static vfsuid_t rootid_from_xattr(const void *value, size_t size,
495	struct user_namespace *task_ns)
496	{
497	const struct vfs_ns_cap_data *nscap = value;
498	uid_t rootid = `0`;
499
500	if (size == XATTR_CAPS_SZ_3)
501	rootid = le32_to_cpu(nscap->rootid);
502
503	return VFSUIDT_INIT(make_kuid(task_ns, rootid));
504	}
505
506	static bool validheader(size_t size, const struct vfs_cap_data *cap)
507	{
508	return is_v2header(size, cap) \|\| is_v3header(size, cap);
509	}
510
511	/**
512	* cap_convert_nscap - check vfs caps
513	*
514	* @idmap: idmap of the mount the inode was found from
515	* @dentry: used to retrieve inode to check permissions on
516	* @ivalue: vfs caps value which may be modified by this function
517	* @size: size of @ivalue
518	*
519	* User requested a write of security.capability. If needed, update the
520	* xattr to change from v2 to v3, or to fixup the v3 rootid.
521	*
522	* If the inode has been found through an idmapped mount the idmap of
523	* the vfsmount must be passed through @idmap. This function will then
524	* take care to map the inode according to @idmap before checking
525	* permissions. On non-idmapped mounts or if permission checking is to be
526	* performed on the raw inode simply pass @nop_mnt_idmap.
527	*
528	* Return: On success, return the new size; on error, return < 0.
529	*/
530	int cap_convert_nscap(struct mnt_idmap idmap, struct* dentry *dentry,
531	const void **ivalue, size_t size)
532	{
533	struct vfs_ns_cap_data *nscap;
534	uid_t nsrootid;
535	const struct vfs_cap_data cap = ivalue;
536	__u32 magic, nsmagic;
537	struct inode *inode = d_backing_inode(upper: dentry);
538	struct user_namespace *task_ns = current_user_ns(),
539	*fs_ns = inode->i_sb->s_user_ns;
540	kuid_t rootid;
541	vfsuid_t vfsrootid;
542	size_t newsize;
543
544	if (!*ivalue)
545	return -EINVAL;
546	if (!validheader(size, cap))
547	return -EINVAL;
548	if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
549	return -EPERM;
550	if (size == XATTR_CAPS_SZ_2 && (idmap == &nop_mnt_idmap))
551	if (ns_capable(ns: inode->i_sb->s_user_ns, CAP_SETFCAP))
552	/ user is privileged, just write the v2 /
553	return size;
554
555	vfsrootid = rootid_from_xattr(value: *ivalue, size, task_ns);
556	if (!vfsuid_valid(uid: vfsrootid))
557	return -EINVAL;
558
559	rootid = from_vfsuid(idmap, fs_userns: fs_ns, vfsuid: vfsrootid);
560	if (!uid_valid(uid: rootid))
561	return -EINVAL;
562
563	nsrootid = from_kuid(to: fs_ns, uid: rootid);
564	if (nsrootid == -`1`)
565	return -EINVAL;
566
567	newsize = sizeof(struct vfs_ns_cap_data);
568	nscap = kmalloc(size: newsize, GFP_ATOMIC);
569	if (!nscap)
570	return -ENOMEM;
571	nscap->rootid = cpu_to_le32(nsrootid);
572	nsmagic = VFS_CAP_REVISION_3;
573	magic = le32_to_cpu(cap->magic_etc);
574	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
575	nsmagic \|= VFS_CAP_FLAGS_EFFECTIVE;
576	nscap->magic_etc = cpu_to_le32(nsmagic);
577	memcpy(&nscap->data, &cap->data, sizeof(__le32) * `2` * VFS_CAP_U32);
578
579	*ivalue = nscap;
580	return newsize;
581	}
582
583	/*
584	* Calculate the new process capability sets from the capability sets attached
585	* to a file.
586	*/
587	static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
588	struct linux_binprm *bprm,
589	bool *effective,
590	bool *has_fcap)
591	{
592	struct cred *new = bprm->cred;
593	int ret = `0`;
594
595	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
596	*effective = true;
597
598	if (caps->magic_etc & VFS_CAP_REVISION_MASK)
599	*has_fcap = true;
600
601	/*
602	* pP' = (X & fP) \| (pI & fI)
603	* The addition of pA' is handled later.
604	*/
605	new->cap_permitted.val =
606	(new->cap_bset.val & caps->permitted.val) \|
607	(new->cap_inheritable.val & caps->inheritable.val);
608
609	if (caps->permitted.val & ~new->cap_permitted.val)
610	/ insufficient to execute correctly /
611	ret = -EPERM;
612
613	/*
614	* For legacy apps, with no internal support for recognizing they
615	* do not have enough capabilities, we return an error if they are
616	* missing some "forced" (aka file-permitted) capabilities.
617	*/
618	return *effective ? ret : `0`;
619	}
620
621	/**
622	* get_vfs_caps_from_disk - retrieve vfs caps from disk
623	*
624	* @idmap: idmap of the mount the inode was found from
625	* @dentry: dentry from which @inode is retrieved
626	* @cpu_caps: vfs capabilities
627	*
628	* Extract the on-exec-apply capability sets for an executable file.
629	*
630	* If the inode has been found through an idmapped mount the idmap of
631	* the vfsmount must be passed through @idmap. This function will then
632	* take care to map the inode according to @idmap before checking
633	* permissions. On non-idmapped mounts or if permission checking is to be
634	* performed on the raw inode simply pass @nop_mnt_idmap.
635	*/
636	int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
637	const struct dentry *dentry,
638	struct cpu_vfs_cap_data *cpu_caps)
639	{
640	struct inode *inode = d_backing_inode(upper: dentry);
641	__u32 magic_etc;
642	int size;
643	struct vfs_ns_cap_data data, *nscaps = &data;
644	struct vfs_cap_data caps = (struct* vfs_cap_data *) &data;
645	kuid_t rootkuid;
646	vfsuid_t rootvfsuid;
647	struct user_namespace *fs_ns;
648
649	memset(cpu_caps, `0`, sizeof(struct cpu_vfs_cap_data));
650
651	if (!inode)
652	return -ENODATA;
653
654	fs_ns = inode->i_sb->s_user_ns;
655	size = __vfs_getxattr((struct dentry *)dentry, inode,
656	XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
657	if (size == -ENODATA \|\| size == -EOPNOTSUPP)
658	/ no data, that's ok /
659	return -ENODATA;
660
661	if (size < `0`)
662	return size;
663
664	if (size < sizeof(magic_etc))
665	return -EINVAL;
666
667	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
668
669	rootkuid = make_kuid(from: fs_ns, uid: `0`);
670	switch (magic_etc & VFS_CAP_REVISION_MASK) {
671	case VFS_CAP_REVISION_1:
672	if (size != XATTR_CAPS_SZ_1)
673	return -EINVAL;
674	break;
675	case VFS_CAP_REVISION_2:
676	if (size != XATTR_CAPS_SZ_2)
677	return -EINVAL;
678	break;
679	case VFS_CAP_REVISION_3:
680	if (size != XATTR_CAPS_SZ_3)
681	return -EINVAL;
682	rootkuid = make_kuid(from: fs_ns, le32_to_cpu(nscaps->rootid));
683	break;
684
685	default:
686	return -EINVAL;
687	}
688
689	rootvfsuid = make_vfsuid(idmap, fs_userns: fs_ns, kuid: rootkuid);
690	if (!vfsuid_valid(uid: rootvfsuid))
691	return -ENODATA;
692
693	/ Limit the caps to the mounter of the filesystem*
694	* or the more limited uid specified in the xattr.
695	*/
696	if (!rootid_owns_currentns(rootvfsuid))
697	return -ENODATA;
698
699	cpu_caps->permitted.val = le32_to_cpu(caps->data[`0`].permitted);
700	cpu_caps->inheritable.val = le32_to_cpu(caps->data[`0`].inheritable);
701
702	/*
703	* Rev1 had just a single 32-bit word, later expanded
704	* to a second one for the high bits
705	*/
706	if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
707	cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[`1`].permitted) << `32`;
708	cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[`1`].inheritable) << `32`;
709	}
710
711	cpu_caps->permitted.val &= CAP_VALID_MASK;
712	cpu_caps->inheritable.val &= CAP_VALID_MASK;
713
714	cpu_caps->rootid = vfsuid_into_kuid(vfsuid: rootvfsuid);
715
716	return `0`;
717	}
718
719	/*
720	* Attempt to get the on-exec apply capability sets for an executable file from
721	* its xattrs and, if present, apply them to the proposed credentials being
722	* constructed by execve().
723	*/
724	static int get_file_caps(struct linux_binprm bprm, const* struct file *file,
725	bool effective, bool has_fcap)
726	{
727	int rc = `0`;
728	struct cpu_vfs_cap_data vcaps;
729
730	cap_clear(bprm->cred->cap_permitted);
731
732	if (!file_caps_enabled)
733	return `0`;
734
735	if (!mnt_may_suid(mnt: file->f_path.mnt))
736	return `0`;
737
738	/*
739	* This check is redundant with mnt_may_suid() but is kept to make
740	* explicit that capability bits are limited to s_user_ns and its
741	* descendants.
742	*/
743	if (!current_in_userns(target_ns: file->f_path.mnt->mnt_sb->s_user_ns))
744	return `0`;
745
746	rc = get_vfs_caps_from_disk(idmap: file_mnt_idmap(file),
747	dentry: file->f_path.dentry, cpu_caps: &vcaps);
748	if (rc < `0`) {
749	if (rc == -EINVAL)
750	printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
751	bprm->filename);
752	else if (rc == -ENODATA)
753	rc = `0`;
754	goto out;
755	}
756
757	rc = bprm_caps_from_vfs_caps(caps: &vcaps, bprm, effective, has_fcap);
758
759	out:
760	if (rc)
761	cap_clear(bprm->cred->cap_permitted);
762
763	return rc;
764	}
765
766	static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
767
768	static inline bool __is_real(kuid_t uid, struct cred *cred)
769	{ return uid_eq(left: cred->uid, right: uid); }
770
771	static inline bool __is_eff(kuid_t uid, struct cred *cred)
772	{ return uid_eq(left: cred->euid, right: uid); }
773
774	static inline bool __is_suid(kuid_t uid, struct cred *cred)
775	{ return !__is_real(uid, cred) && __is_eff(uid, cred); }
776
777	/*
778	* handle_privileged_root - Handle case of privileged root
779	* @bprm: The execution parameters, including the proposed creds
780	* @has_fcap: Are any file capabilities set?
781	* @effective: Do we have effective root privilege?
782	* @root_uid: This namespace' root UID WRT initial USER namespace
783	*
784	* Handle the case where root is privileged and hasn't been neutered by
785	* SECURE_NOROOT. If file capabilities are set, they won't be combined with
786	* set UID root and nothing is changed. If we are root, cap_permitted is
787	* updated. If we have become set UID root, the effective bit is set.
788	*/
789	static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
790	bool *effective, kuid_t root_uid)
791	{
792	const struct cred *old = current_cred();
793	struct cred *new = bprm->cred;
794
795	if (!root_privileged())
796	return;
797	/*
798	* If the legacy file capability is set, then don't set privs
799	* for a setuid root binary run by a non-root user. Do set it
800	* for a root user just to cause least surprise to an admin.
801	*/
802	if (has_fcap && __is_suid(uid: root_uid, cred: new)) {
803	warn_setuid_and_fcaps_mixed(fname: bprm->filename);
804	return;
805	}
806	/*
807	* To support inheritance of root-permissions and suid-root
808	* executables under compatibility mode, we override the
809	* capability sets for the file.
810	*/
811	if (__is_eff(uid: root_uid, cred: new) \|\| __is_real(uid: root_uid, cred: new)) {
812	/ pP' = (cap_bset & ~0) \| (pI & ~0) /
813	new->cap_permitted = cap_combine(a: old->cap_bset,
814	b: old->cap_inheritable);
815	}
816	/*
817	* If only the real uid is 0, we do not set the effective bit.
818	*/
819	if (__is_eff(uid: root_uid, cred: new))
820	*effective = true;
821	}
822
/*
 * Capability-set comparison helpers.  @field/@target/@source in
 * __cap_gained() and __cap_grew()/__cap_full() are pasted onto "cap_" to
 * select a member of struct cred.
 *
 * __cap_gained(field, target, source): target's set is not a subset of
 *	source's set of the same name.
 * __cap_grew(target, source, cred): within one cred, set @target is not a
 *	subset of set @source.
 * __cap_full(field, cred): the named set of @cred contains CAP_FULL_SET.
 */
#define __cap_gained(field, target, source) \
	(!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
	(!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
	(cap_issubset(CAP_FULL_SET, (cred)->cap_##field))
829
830	static inline bool __is_setuid(struct cred new, const* struct cred *old)
831	{ return !uid_eq(left: new->euid, right: old->uid); }
832
833	static inline bool __is_setgid(struct cred new, const* struct cred *old)
834	{ return !gid_eq(left: new->egid, right: old->gid); }
835
836	/*
837	* 1) Audit candidate if current->cap_effective is set
838	*
839	* We do not bother to audit if 3 things are true:
840	* 1) cap_effective has all caps
841	* 2) we became root OR are were already root
842	* 3) root is supposed to have all caps (SECURE_NOROOT)
843	* Since this is just a normal root execing a process.
844	*
845	* Number 1 above might fail if you don't have a full bset, but I think
846	* that is interesting information to audit.
847	*
848	* A number of other conditions require logging:
849	* 2) something prevented setuid root getting all caps
850	* 3) non-setuid root gets fcaps
851	* 4) non-setuid root gets ambient
852	*/
853	static inline bool nonroot_raised_pE(struct cred new, const* struct cred *old,
854	kuid_t root, bool has_fcap)
855	{
856	bool ret = false;
857
858	if ((__cap_grew(effective, ambient, new) &&
859	!(__cap_full(effective, new) &&
860	(__is_eff(uid: root, cred: new) \|\| __is_real(uid: root, cred: new)) &&
861	root_privileged())) \|\|
862	(root_privileged() &&
863	__is_suid(uid: root, cred: new) &&
864	!__cap_full(effective, new)) \|\|
865	(!__is_setuid(new, old) &&
866	((has_fcap &&
867	__cap_gained(permitted, new, old)) \|\|
868	__cap_gained(ambient, new, old))))
869
870	ret = true;
871
872	return ret;
873	}
874
875	/**
876	* cap_bprm_creds_from_file - Set up the proposed credentials for execve().
877	* @bprm: The execution parameters, including the proposed creds
878	* @file: The file to pull the credentials from
879	*
880	* Set up the proposed credentials for a new execution context being
881	* constructed by execve(). The proposed creds in @bprm->cred is altered,
882	* which won't take effect immediately.
883	*
884	* Return: 0 if successful, -ve on error.
885	*/
886	int cap_bprm_creds_from_file(struct linux_binprm bprm, const* struct file *file)
887	{
888	/ Process setpcap binaries and capabilities for uid 0 /
889	const struct cred *old = current_cred();
890	struct cred *new = bprm->cred;
891	bool effective = false, has_fcap = false, is_setid;
892	int ret;
893	kuid_t root_uid;
894
895	if (WARN_ON(!cap_ambient_invariant_ok(old)))
896	return -EPERM;
897
898	ret = get_file_caps(bprm, file, effective: &effective, has_fcap: &has_fcap);
899	if (ret < `0`)
900	return ret;
901
902	root_uid = make_kuid(from: new->user_ns, uid: `0`);
903
904	handle_privileged_root(bprm, has_fcap, effective: &effective, root_uid);
905
906	/ if we have fs caps, clear dangerous personality flags /
907	if (__cap_gained(permitted, new, old))
908	bprm->per_clear \|= PER_CLEAR_ON_SETID;
909
910	/ Don't let someone trace a set[ug]id/setpcap binary with the revised*
911	* credentials unless they have the appropriate permit.
912	*
913	* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
914	*/
915	is_setid = __is_setuid(new, old) \|\| __is_setgid(new, old);
916
917	if ((is_setid \|\| __cap_gained(permitted, new, old)) &&
918	((bprm->unsafe & ~LSM_UNSAFE_PTRACE) \|\|
919	!ptracer_capable(current, ns: new->user_ns))) {
920	/ downgrade; they get no more than they had, and maybe less /
921	if (!ns_capable(ns: new->user_ns, CAP_SETUID) \|\|
922	(bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
923	new->euid = new->uid;
924	new->egid = new->gid;
925	}
926	new->cap_permitted = cap_intersect(a: new->cap_permitted,
927	b: old->cap_permitted);
928	}
929
930	new->suid = new->fsuid = new->euid;
931	new->sgid = new->fsgid = new->egid;
932
933	/ File caps or setid cancels ambient. /
934	if (has_fcap \|\| is_setid)
935	cap_clear(new->cap_ambient);
936
937	/*
938	* Now that we've computed pA', update pP' to give:
939	* pP' = (X & fP) \| (pI & fI) \| pA'
940	*/
941	new->cap_permitted = cap_combine(a: new->cap_permitted, b: new->cap_ambient);
942
943	/*
944	* Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
945	* this is the same as pE' = (fE ? pP' : 0) \| pA'.
946	*/
947	if (effective)
948	new->cap_effective = new->cap_permitted;
949	else
950	new->cap_effective = new->cap_ambient;
951
952	if (WARN_ON(!cap_ambient_invariant_ok(new)))
953	return -EPERM;
954
955	if (nonroot_raised_pE(new, old, root: root_uid, has_fcap)) {
956	ret = audit_log_bprm_fcaps(bprm, new, old);
957	if (ret < `0`)
958	return ret;
959	}
960
961	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
962
963	if (WARN_ON(!cap_ambient_invariant_ok(new)))
964	return -EPERM;
965
966	/ Check for privilege-elevated exec. /
967	if (is_setid \|\|
968	(!__is_real(uid: root_uid, cred: new) &&
969	(effective \|\|
970	__cap_grew(permitted, ambient, new))))
971	bprm->secureexec = `1`;
972
973	return `0`;
974	}
975
976	/**
977	* cap_inode_setxattr - Determine whether an xattr may be altered
978	* @dentry: The inode/dentry being altered
979	* @name: The name of the xattr to be changed
980	* @value: The value that the xattr will be changed to
981	* @size: The size of value
982	* @flags: The replacement flag
983	*
984	* Determine whether an xattr may be altered or set on an inode, returning 0 if
985	* permission is granted, -ve if denied.
986	*
987	* This is used to make sure security xattrs don't get updated or set by those
988	* who aren't privileged to do so.
989	*/
990	int cap_inode_setxattr(struct dentry dentry, const* char *name,
991	const void value, size_t size, int* flags)
992	{
993	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
994
995	/ Ignore non-security xattrs /
996	if (strncmp(name, XATTR_SECURITY_PREFIX,
997	XATTR_SECURITY_PREFIX_LEN) != `0`)
998	return `0`;
999
1000	/*
1001	* For XATTR_NAME_CAPS the check will be done in
1002	* cap_convert_nscap(), called by setxattr()
1003	*/
1004	if (strcmp(name, XATTR_NAME_CAPS) == `0`)
1005	return `0`;
1006
1007	if (!ns_capable(ns: user_ns, CAP_SYS_ADMIN))
1008	return -EPERM;
1009	return `0`;
1010	}
1011
1012	/**
1013	* cap_inode_removexattr - Determine whether an xattr may be removed
1014	*
1015	* @idmap: idmap of the mount the inode was found from
1016	* @dentry: The inode/dentry being altered
1017	* @name: The name of the xattr to be changed
1018	*
1019	* Determine whether an xattr may be removed from an inode, returning 0 if
1020	* permission is granted, -ve if denied.
1021	*
1022	* If the inode has been found through an idmapped mount the idmap of
1023	* the vfsmount must be passed through @idmap. This function will then
1024	* take care to map the inode according to @idmap before checking
1025	* permissions. On non-idmapped mounts or if permission checking is to be
1026	* performed on the raw inode simply pass @nop_mnt_idmap.
1027	*
1028	* This is used to make sure security xattrs don't get removed by those who
1029	* aren't privileged to remove them.
1030	*/
1031	int cap_inode_removexattr(struct mnt_idmap *idmap,
1032	struct dentry dentry, const* char *name)
1033	{
1034	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
1035
1036	/ Ignore non-security xattrs /
1037	if (strncmp(name, XATTR_SECURITY_PREFIX,
1038	XATTR_SECURITY_PREFIX_LEN) != `0`)
1039	return `0`;
1040
1041	if (strcmp(name, XATTR_NAME_CAPS) == `0`) {
1042	/ security.capability gets namespaced /
1043	struct inode *inode = d_backing_inode(upper: dentry);
1044	if (!inode)
1045	return -EINVAL;
1046	if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
1047	return -EPERM;
1048	return `0`;
1049	}
1050
1051	if (!ns_capable(ns: user_ns, CAP_SYS_ADMIN))
1052	return -EPERM;
1053	return `0`;
1054	}
1055
1056	/*
1057	* cap_emulate_setxuid() fixes the effective / permitted capabilities of
1058	* a process after a call to setuid, setreuid, or setresuid.
1059	*
1060	* 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
1061	* {r,e,s}uid != 0, the permitted and effective capabilities are
1062	* cleared.
1063	*
1064	* 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
1065	* capabilities of the process are cleared.
1066	*
1067	* 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
1068	* capabilities are set to the permitted capabilities.
1069	*
1070	* fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
1071	* never happen.
1072	*
1073	* -astor
1074	*
1075	* cevans - New behaviour, Oct '99
1076	* A process may, via prctl(), elect to keep its capabilities when it
1077	* calls setuid() and switches away from uid==0. Both permitted and
1078	* effective sets will be retained.
1079	* Without this change, it was impossible for a daemon to drop only some
1080	* of its privilege. The call to setuid(!=0) would drop all privileges!
1081	* Keeping uid 0 is not an option because uid 0 owns too many vital
1082	* files..
1083	* Thanks to Olaf Kirch and Peter Benie for spotting this.
1084	*/
1085	static inline void cap_emulate_setxuid(struct cred new, const* struct cred *old)
1086	{
1087	kuid_t root_uid = make_kuid(from: old->user_ns, uid: `0`);
1088
1089	if ((uid_eq(left: old->uid, right: root_uid) \|\|
1090	uid_eq(left: old->euid, right: root_uid) \|\|
1091	uid_eq(left: old->suid, right: root_uid)) &&
1092	(!uid_eq(left: new->uid, right: root_uid) &&
1093	!uid_eq(left: new->euid, right: root_uid) &&
1094	!uid_eq(left: new->suid, right: root_uid))) {
1095	if (!issecure(SECURE_KEEP_CAPS)) {
1096	cap_clear(new->cap_permitted);
1097	cap_clear(new->cap_effective);
1098	}
1099
1100	/*
1101	* Pre-ambient programs expect setresuid to nonroot followed
1102	* by exec to drop capabilities. We should make sure that
1103	* this remains the case.
1104	*/
1105	cap_clear(new->cap_ambient);
1106	}
1107	if (uid_eq(left: old->euid, right: root_uid) && !uid_eq(left: new->euid, right: root_uid))
1108	cap_clear(new->cap_effective);
1109	if (!uid_eq(left: old->euid, right: root_uid) && uid_eq(left: new->euid, right: root_uid))
1110	new->cap_effective = new->cap_permitted;
1111	}
1112
1113	/**
1114	* cap_task_fix_setuid - Fix up the results of setuid() call
1115	* @new: The proposed credentials
1116	* @old: The current task's current credentials
1117	* @flags: Indications of what has changed
1118	*
1119	* Fix up the results of setuid() call before the credential changes are
1120	* actually applied.
1121	*
1122	* Return: 0 to grant the changes, -ve to deny them.
1123	*/
1124	int cap_task_fix_setuid(struct cred new, const* struct cred old, int* flags)
1125	{
1126	switch (flags) {
1127	case LSM_SETID_RE:
1128	case LSM_SETID_ID:
1129	case LSM_SETID_RES:
1130	/ juggle the capabilities to follow [RES]UID changes unless*
1131	* otherwise suppressed */
1132	if (!issecure(SECURE_NO_SETUID_FIXUP))
1133	cap_emulate_setxuid(new, old);
1134	break;
1135
1136	case LSM_SETID_FS:
1137	/ juggle the capabilities to follow FSUID changes, unless*
1138	* otherwise suppressed
1139	*
1140	* FIXME - is fsuser used for all CAP_FS_MASK capabilities?
1141	* if not, we might be a bit too harsh here.
1142	*/
1143	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
1144	kuid_t root_uid = make_kuid(from: old->user_ns, uid: `0`);
1145	if (uid_eq(left: old->fsuid, right: root_uid) && !uid_eq(left: new->fsuid, right: root_uid))
1146	new->cap_effective =
1147	cap_drop_fs_set(a: new->cap_effective);
1148
1149	if (!uid_eq(left: old->fsuid, right: root_uid) && uid_eq(left: new->fsuid, right: root_uid))
1150	new->cap_effective =
1151	cap_raise_fs_set(a: new->cap_effective,
1152	permitted: new->cap_permitted);
1153	}
1154	break;
1155
1156	default:
1157	return -EINVAL;
1158	}
1159
1160	return `0`;
1161	}
1162
1163	/*
1164	* Rationale: code calling task_setscheduler, task_setioprio, and
1165	* task_setnice, assumes that
1166	* . if capable(cap_sys_nice), then those actions should be allowed
1167	* . if not capable(cap_sys_nice), but acting on your own processes,
1168	* then those actions should be allowed
1169	* This is insufficient now since you can call code without suid, but
1170	* yet with increased caps.
1171	* So we check for increased caps on the target process.
1172	*/
1173	static int cap_safe_nice(struct task_struct *p)
1174	{
1175	int is_subset, ret = `0`;
1176
1177	rcu_read_lock();
1178	is_subset = cap_issubset(__task_cred(p)->cap_permitted,
1179	current_cred()->cap_permitted);
1180	if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1181	ret = -EPERM;
1182	rcu_read_unlock();
1183
1184	return ret;
1185	}
1186
/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.  The decision is delegated to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
	return cap_safe_nice(p);
}
1200
/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task.  @ioprio is not examined; the decision is delegated to
 * cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
	return cap_safe_nice(p);
}
1215
/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  @nice is not examined; the decision is delegated to
 * cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
	return cap_safe_nice(p);
}
1230
1231	/*
1232	* Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from
1233	* the current task's bounding set. Returns 0 on success, -ve on error.
1234	*/
1235	static int cap_prctl_drop(unsigned long cap)
1236	{
1237	struct cred *new;
1238
1239	if (!ns_capable(current_user_ns(), CAP_SETPCAP))
1240	return -EPERM;
1241	if (!cap_valid(cap))
1242	return -EINVAL;
1243
1244	new = prepare_creds();
1245	if (!new)
1246	return -ENOMEM;
1247	cap_lower(new->cap_bset, cap);
1248	return commit_creds(new);
1249	}
1250
1251	/**
1252	* cap_task_prctl - Implement process control functions for this security module
1253	* @option: The process control function requested
1254	* @arg2: The argument data for this function
1255	* @arg3: The argument data for this function
1256	* @arg4: The argument data for this function
1257	* @arg5: The argument data for this function
1258	*
1259	* Allow process control functions (sys_prctl()) to alter capabilities; may
1260	* also deny access to other functions not otherwise implemented here.
1261	*
1262	* Return: 0 or +ve on success, -ENOSYS if this function is not implemented
1263	* here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM
1264	* modules will consider performing the function.
1265	*/
1266	int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
1267	unsigned long arg4, unsigned long arg5)
1268	{
1269	const struct cred *old = current_cred();
1270	struct cred *new;
1271
1272	switch (option) {
1273	case PR_CAPBSET_READ:
1274	if (!cap_valid(arg2))
1275	return -EINVAL;
1276	return !!cap_raised(old->cap_bset, arg2);
1277
1278	case PR_CAPBSET_DROP:
1279	return cap_prctl_drop(cap: arg2);
1280
1281	/*
1282	* The next four prctl's remain to assist with transitioning a
1283	* system from legacy UID=0 based privilege (when filesystem
1284	* capabilities are not in use) to a system using filesystem
1285	* capabilities only - as the POSIX.1e draft intended.
1286	*
1287	* Note:
1288	*
1289	* PR_SET_SECUREBITS =
1290	* issecure_mask(SECURE_KEEP_CAPS_LOCKED)
1291	* \| issecure_mask(SECURE_NOROOT)
1292	* \| issecure_mask(SECURE_NOROOT_LOCKED)
1293	* \| issecure_mask(SECURE_NO_SETUID_FIXUP)
1294	* \| issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
1295	*
1296	* will ensure that the current process and all of its
1297	* children will be locked into a pure
1298	* capability-based-privilege environment.
1299	*/
1300	case PR_SET_SECUREBITS:
1301	if ((((old->securebits & SECURE_ALL_LOCKS) >> `1`)
1302	& (old->securebits ^ arg2)) /[1]/
1303	\|\| ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /[2]/
1304	\|\| (arg2 & ~(SECURE_ALL_LOCKS \| SECURE_ALL_BITS)) /[3]/
1305	\|\| (cap_capable(current_cred(),
1306	current_cred()->user_ns,
1307	CAP_SETPCAP,
1308	CAP_OPT_NONE) != `0`) /[4]/
1309	/*
1310	* [1] no changing of bits that are locked
1311	* [2] no unlocking of locks
1312	* [3] no setting of unsupported bits
1313	* [4] doing anything requires privilege (go read about
1314	* the "sendmail capabilities bug")
1315	*/
1316	)
1317	/ cannot change a locked bit /
1318	return -EPERM;
1319
1320	new = prepare_creds();
1321	if (!new)
1322	return -ENOMEM;
1323	new->securebits = arg2;
1324	return commit_creds(new);
1325
1326	case PR_GET_SECUREBITS:
1327	return old->securebits;
1328
1329	case PR_GET_KEEPCAPS:
1330	return !!issecure(SECURE_KEEP_CAPS);
1331
1332	case PR_SET_KEEPCAPS:
1333	if (arg2 > `1`) / Note, we rely on arg2 being unsigned here /
1334	return -EINVAL;
1335	if (issecure(SECURE_KEEP_CAPS_LOCKED))
1336	return -EPERM;
1337
1338	new = prepare_creds();
1339	if (!new)
1340	return -ENOMEM;
1341	if (arg2)
1342	new->securebits \|= issecure_mask(SECURE_KEEP_CAPS);
1343	else
1344	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
1345	return commit_creds(new);
1346
1347	case PR_CAP_AMBIENT:
1348	if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
1349	if (arg3 \| arg4 \| arg5)
1350	return -EINVAL;
1351
1352	new = prepare_creds();
1353	if (!new)
1354	return -ENOMEM;
1355	cap_clear(new->cap_ambient);
1356	return commit_creds(new);
1357	}
1358
1359	if (((!cap_valid(arg3)) \| arg4 \| arg5))
1360	return -EINVAL;
1361
1362	if (arg2 == PR_CAP_AMBIENT_IS_SET) {
1363	return !!cap_raised(current_cred()->cap_ambient, arg3);
1364	} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
1365	arg2 != PR_CAP_AMBIENT_LOWER) {
1366	return -EINVAL;
1367	} else {
1368	if (arg2 == PR_CAP_AMBIENT_RAISE &&
1369	(!cap_raised(current_cred()->cap_permitted, arg3) \|\|
1370	!cap_raised(current_cred()->cap_inheritable,
1371	arg3) \|\|
1372	issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
1373	return -EPERM;
1374
1375	new = prepare_creds();
1376	if (!new)
1377	return -ENOMEM;
1378	if (arg2 == PR_CAP_AMBIENT_RAISE)
1379	cap_raise(new->cap_ambient, arg3);
1380	else
1381	cap_lower(new->cap_ambient, arg3);
1382	return commit_creds(new);
1383	}
1384
1385	default:
1386	/ No functionality available - continue with default /
1387	return -ENOSYS;
1388	}
1389	}
1390
1391	/**
1392	* cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
1393	* @mm: The VM space in which the new mapping is to be made
1394	* @pages: The size of the mapping
1395	*
1396	* Determine whether the allocation of a new virtual mapping by the current
1397	* task is permitted.
1398	*
1399	* Return: 1 if permission is granted, 0 if not.
1400	*/
1401	int cap_vm_enough_memory(struct mm_struct mm, long* pages)
1402	{
1403	int cap_sys_admin = `0`;
1404
1405	if (cap_capable(current_cred(), targ_ns: &init_user_ns,
1406	CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == `0`)
1407	cap_sys_admin = `1`;
1408
1409	return cap_sys_admin;
1410	}
1411
1412	/**
1413	* cap_mmap_addr - check if able to map given addr
1414	* @addr: address attempting to be mapped
1415	*
1416	* If the process is attempting to map memory below dac_mmap_min_addr they need
1417	* CAP_SYS_RAWIO. The other parameters to this function are unused by the
1418	* capability security module.
1419	*
1420	* Return: 0 if this mapping should be allowed or -EPERM if not.
1421	*/
1422	int cap_mmap_addr(unsigned long addr)
1423	{
1424	int ret = `0`;
1425
1426	if (addr < dac_mmap_min_addr) {
1427	ret = cap_capable(current_cred(), targ_ns: &init_user_ns, CAP_SYS_RAWIO,
1428	CAP_OPT_NONE);
1429	/ set PF_SUPERPRIV if it turns out we allow the low mmap /
1430	if (ret == `0`)
1431	current->flags \|= PF_SUPERPRIV;
1432	}
1433	return ret;
1434	}
1435
/*
 * cap_mmap_file - file mapping hook of the capability module.
 *
 * None of the arguments are examined; the mapping is always allowed as far
 * as this function is concerned, so it unconditionally returns 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
		  unsigned long prot, unsigned long flags)
{
	return 0;
}
1441
1442	#ifdef CONFIG_SECURITY
1443
1444	static const struct lsm_id capability_lsmid = {
1445	.name = "capability",
1446	.id = LSM_ID_CAPABILITY,
1447	};
1448
1449	static struct security_hook_list capability_hooks[] __ro_after_init = {
1450	LSM_HOOK_INIT(capable, cap_capable),
1451	LSM_HOOK_INIT(settime, cap_settime),
1452	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
1453	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
1454	LSM_HOOK_INIT(capget, cap_capget),
1455	LSM_HOOK_INIT(capset, cap_capset),
1456	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
1457	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
1458	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
1459	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
1460	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
1461	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
1462	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
1463	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
1464	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
1465	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
1466	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
1467	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
1468	};
1469
1470	static int __init capability_init(void)
1471	{
1472	security_add_hooks(hooks: capability_hooks, ARRAY_SIZE(capability_hooks),
1473	lsmid: &capability_lsmid);
1474	return `0`;
1475	}
1476
1477	DEFINE_LSM(capability) = {
1478	.name = "capability",
1479	.order = LSM_ORDER_FIRST,
1480	.init = capability_init,
1481	};
1482
1483	#endif /* CONFIG_SECURITY */
1484

source code of linux/security/commoncap.c