seccomp.c source code [linux/kernel/seccomp.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/kernel/seccomp.c
4	*
5	* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
6	*
7	* Copyright (C) 2012 Google, Inc.
8	* Will Drewry <wad@chromium.org>
9	*
10	* This defines a simple but solid secure-computing facility.
11	*
12	* Mode 1 uses a fixed list of allowed system calls.
13	* Mode 2 allows user-defined system call filters in the form
14	* of Berkeley Packet Filters/Linux Socket Filters.
15	*/
16	#define pr_fmt(fmt) "seccomp: " fmt
17
18	#include <linux/refcount.h>
19	#include <linux/audit.h>
20	#include <linux/compat.h>
21	#include <linux/coredump.h>
22	#include <linux/kmemleak.h>
23	#include <linux/nospec.h>
24	#include <linux/prctl.h>
25	#include <linux/sched.h>
26	#include <linux/sched/task_stack.h>
27	#include <linux/seccomp.h>
28	#include <linux/slab.h>
29	#include <linux/syscalls.h>
30	#include <linux/sysctl.h>
31
/*
 * Internal-only mode value, one past SECCOMP_MODE_FILTER.  Deliberately
 * not exposed in any header.
 */
#define SECCOMP_MODE_DEAD	(SECCOMP_MODE_FILTER + 1)
34
35	#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
36	#include <asm/syscall.h>
37	#endif
38
39	#ifdef CONFIG_SECCOMP_FILTER
40	#include <linux/file.h>
41	#include <linux/filter.h>
42	#include <linux/pid.h>
43	#include <linux/ptrace.h>
44	#include <linux/capability.h>
45	#include <linux/uaccess.h>
46	#include <linux/anon_inodes.h>
47	#include <linux/lockdep.h>
48
/*
 * SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced with the wrong
 * direction flag in its ioctl number.  This is that broken number; the
 * kernel has to keep supporting it until all userspaces stop using the
 * wrong command number.
 */
#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR	SECCOMP_IOR(2, __u64)
56
/* States a user notification (struct seccomp_knotif) moves through. */
enum notify_state {
	SECCOMP_NOTIFY_INIT,		/* created, not (or no longer) read by the handler */
	SECCOMP_NOTIFY_SENT,		/* the handler has read it off the notify fd */
	SECCOMP_NOTIFY_REPLIED,		/* the userspace handler has replied */
};
62
63	struct seccomp_knotif {
64	/ The struct pid of the task whose filter triggered the notification /
65	struct task_struct *task;
66
67	/ The "cookie" for this request; this is unique for this filter. /
68	u64 id;
69
70	/*
71	* The seccomp data. This pointer is valid the entire time this
72	* notification is active, since it comes from __seccomp_filter which
73	* eclipses the entire lifecycle here.
74	*/
75	const struct seccomp_data *data;
76
77	/*
78	* Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
79	* struct seccomp_knotif is created and starts out in INIT. Once the
80	* handler reads the notification off of an FD, it transitions to SENT.
81	* If a signal is received the state transitions back to INIT and
82	* another message is sent. When the userspace handler replies, state
83	* transitions to REPLIED.
84	*/
85	enum notify_state state;
86
87	/ The return values, only valid when in SECCOMP_NOTIFY_REPLIED /
88	int error;
89	long val;
90	u32 flags;
91
92	/*
93	* Signals when this has changed states, such as the listener
94	* dying, a new seccomp addfd message, or changing to REPLIED
95	*/
96	struct completion ready;
97
98	struct list_head list;
99
100	/ outstanding addfd requests /
101	struct list_head addfd;
102	};
103
104	/**
105	* struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
106	*
107	* @file: A reference to the file to install in the other task
108	* @fd: The fd number to install it at. If the fd number is -1, it means the
109	* installing process should allocate the fd as normal.
110	* @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
111	* is allowed.
112	* @ioctl_flags: The flags used for the seccomp_addfd ioctl.
113	* @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd
114	* @ret: The return value of the installing process. It is set to the fd num
115	* upon success (>= 0).
116	* @completion: Indicates that the installing process has completed fd
117	* installation, or gone away (either due to successful
118	* reply, or signal)
119	* @list: list_head for chaining seccomp_kaddfd together.
120	*
121	*/
122	struct seccomp_kaddfd {
123	struct file *file;
124	int fd;
125	unsigned int flags;
126	__u32 ioctl_flags;
127
128	union {
129	bool setfd;
130	/ To only be set on reply /
131	int ret;
132	};
133	struct completion completion;
134	struct list_head list;
135	};
136
137	/**
138	* struct notification - container for seccomp userspace notifications. Since
139	* most seccomp filters will not have notification listeners attached and this
140	* structure is fairly large, we store the notification-specific stuff in a
141	* separate structure.
142	*
143	* @requests: A semaphore that users of this notification can wait on for
144	* changes. Actual reads and writes are still controlled with
145	* filter->notify_lock.
146	* @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
147	* @next_id: The id of the next request.
148	* @notifications: A list of struct seccomp_knotif elements.
149	*/
150
151	struct notification {
152	atomic_t requests;
153	u32 flags;
154	u64 next_id;
155	struct list_head notifications;
156	};
157
158	#ifdef SECCOMP_ARCH_NATIVE
159	/**
160	* struct action_cache - per-filter cache of seccomp actions per
161	* arch/syscall pair
162	*
163	* @allow_native: A bitmap where each bit represents whether the
164	* filter will always allow the syscall, for the
165	* native architecture.
166	* @allow_compat: A bitmap where each bit represents whether the
167	* filter will always allow the syscall, for the
168	* compat architecture.
169	*/
170	struct action_cache {
171	DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
172	#ifdef SECCOMP_ARCH_COMPAT
173	DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
174	#endif
175	};
176	#else
/* Without SECCOMP_ARCH_NATIVE there is nothing to cache: empty placeholder. */
struct action_cache {
};
178
/*
 * Stub used when SECCOMP_ARCH_NATIVE is not defined: always reports a
 * cache miss.
 */
static inline bool
seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
			  const struct seccomp_data *sd)
{
	return false;
}
184
/* Stub used when SECCOMP_ARCH_NATIVE is not defined: nothing to prepare. */
static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
{
}
188	#endif /* SECCOMP_ARCH_NATIVE */
189
190	/**
191	* struct seccomp_filter - container for seccomp BPF programs
192	*
193	* @refs: Reference count to manage the object lifetime.
194	* A filter's reference count is incremented for each directly
195	* attached task, once for the dependent filter, and if
196	* requested for the user notifier. When @refs reaches zero,
197	* the filter can be freed.
198	* @users: A filter's @users count is incremented for each directly
199	* attached task (filter installation, fork(), thread_sync),
200	* and once for the dependent filter (tracked in filter->prev).
201	* When it reaches zero it indicates that no direct or indirect
202	* users of that filter exist. No new tasks can get associated with
203	* this filter after reaching 0. The @users count is always smaller
204	* or equal to @refs. Hence, reaching 0 for @users does not mean
205	* the filter can be freed.
206	* @cache: cache of arch/syscall mappings to actions
207	* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
208	* @wait_killable_recv: Put notifying process in killable state once the
209	* notification is received by the userspace listener.
210	* @prev: points to a previously installed, or inherited, filter
211	* @prog: the BPF program to evaluate
212	* @notif: the struct that holds all notification related information
213	* @notify_lock: A lock for all notification-related accesses.
214	* @wqh: A wait queue for poll if a notifier is in use.
215	*
216	* seccomp_filter objects are organized in a tree linked via the @prev
217	* pointer. For any task, it appears to be a singly-linked list starting
218	* with current->seccomp.filter, the most recently attached or inherited filter.
219	* However, multiple filters may share a @prev node, by way of fork(), which
220	* results in a unidirectional tree existing in memory. This is similar to
221	* how namespaces work.
222	*
223	* seccomp_filter objects should never be modified after being attached
224	* to a task_struct (other than @refs).
225	*/
226	struct seccomp_filter {
227	refcount_t refs;
228	refcount_t users;
229	bool log;
230	bool wait_killable_recv;
231	struct action_cache cache;
232	struct seccomp_filter *prev;
233	struct bpf_prog *prog;
234	struct notification *notif;
235	struct mutex notify_lock;
236	wait_queue_head_t wqh;
237	};
238
/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH	((1 << 18) / sizeof(struct sock_filter))
241
242	/*
243	* Endianness is explicitly ignored and left for BPF program authors to manage
244	* as per the specific architecture.
245	*/
246	static void populate_seccomp_data(struct seccomp_data *sd)
247	{
248	/*
249	* Instead of using current_pt_reg(), we're already doing the work
250	* to safely fetch "current", so just use "task" everywhere below.
251	*/
252	struct task_struct *task = current;
253	struct pt_regs *regs = task_pt_regs(task);
254	unsigned long args[`6`];
255
256	sd->nr = syscall_get_nr(task, regs);
257	sd->arch = syscall_get_arch(task);
258	syscall_get_arguments(task, regs, args);
259	sd->args[`0`] = args[`0`];
260	sd->args[`1`] = args[`1`];
261	sd->args[`2`] = args[`2`];
262	sd->args[`3`] = args[`3`];
263	sd->args[`4`] = args[`4`];
264	sd->args[`5`] = args[`5`];
265	sd->instruction_pointer = KSTK_EIP(task);
266	}
267
268	/**
269	* seccomp_check_filter - verify seccomp filter code
270	* @filter: filter to verify
271	* @flen: length of filter
272	*
273	* Takes a previously checked filter (by bpf_check_classic) and
274	* redirects all filter code that loads struct sk_buff data
275	* and related data through seccomp_bpf_load. It also
276	* enforces length and alignment checking of those loads.
277	*
278	* Returns 0 if the rule set is legal or -EINVAL if not.
279	*/
280	static int seccomp_check_filter(struct sock_filter filter, unsigned* int flen)
281	{
282	int pc;
283	for (pc = `0`; pc < flen; pc++) {
284	struct sock_filter *ftest = &filter[pc];
285	u16 code = ftest->code;
286	u32 k = ftest->k;
287
288	switch (code) {
289	case BPF_LD \| BPF_W \| BPF_ABS:
290	ftest->code = BPF_LDX \| BPF_W \| BPF_ABS;
291	/ 32-bit aligned and not out of bounds. /
292	if (k >= sizeof(struct seccomp_data) \|\| k & `3`)
293	return -EINVAL;
294	continue;
295	case BPF_LD \| BPF_W \| BPF_LEN:
296	ftest->code = BPF_LD \| BPF_IMM;
297	ftest->k = sizeof(struct seccomp_data);
298	continue;
299	case BPF_LDX \| BPF_W \| BPF_LEN:
300	ftest->code = BPF_LDX \| BPF_IMM;
301	ftest->k = sizeof(struct seccomp_data);
302	continue;
303	/ Explicitly include allowed calls. /
304	case BPF_RET \| BPF_K:
305	case BPF_RET \| BPF_A:
306	case BPF_ALU \| BPF_ADD \| BPF_K:
307	case BPF_ALU \| BPF_ADD \| BPF_X:
308	case BPF_ALU \| BPF_SUB \| BPF_K:
309	case BPF_ALU \| BPF_SUB \| BPF_X:
310	case BPF_ALU \| BPF_MUL \| BPF_K:
311	case BPF_ALU \| BPF_MUL \| BPF_X:
312	case BPF_ALU \| BPF_DIV \| BPF_K:
313	case BPF_ALU \| BPF_DIV \| BPF_X:
314	case BPF_ALU \| BPF_AND \| BPF_K:
315	case BPF_ALU \| BPF_AND \| BPF_X:
316	case BPF_ALU \| BPF_OR \| BPF_K:
317	case BPF_ALU \| BPF_OR \| BPF_X:
318	case BPF_ALU \| BPF_XOR \| BPF_K:
319	case BPF_ALU \| BPF_XOR \| BPF_X:
320	case BPF_ALU \| BPF_LSH \| BPF_K:
321	case BPF_ALU \| BPF_LSH \| BPF_X:
322	case BPF_ALU \| BPF_RSH \| BPF_K:
323	case BPF_ALU \| BPF_RSH \| BPF_X:
324	case BPF_ALU \| BPF_NEG:
325	case BPF_LD \| BPF_IMM:
326	case BPF_LDX \| BPF_IMM:
327	case BPF_MISC \| BPF_TAX:
328	case BPF_MISC \| BPF_TXA:
329	case BPF_LD \| BPF_MEM:
330	case BPF_LDX \| BPF_MEM:
331	case BPF_ST:
332	case BPF_STX:
333	case BPF_JMP \| BPF_JA:
334	case BPF_JMP \| BPF_JEQ \| BPF_K:
335	case BPF_JMP \| BPF_JEQ \| BPF_X:
336	case BPF_JMP \| BPF_JGE \| BPF_K:
337	case BPF_JMP \| BPF_JGE \| BPF_X:
338	case BPF_JMP \| BPF_JGT \| BPF_K:
339	case BPF_JMP \| BPF_JGT \| BPF_X:
340	case BPF_JMP \| BPF_JSET \| BPF_K:
341	case BPF_JMP \| BPF_JSET \| BPF_X:
342	continue;
343	default:
344	return -EINVAL;
345	}
346	}
347	return `0`;
348	}
349
350	#ifdef SECCOMP_ARCH_NATIVE
/*
 * Test bit @syscall_nr in @bitmap, which holds @bitmap_size bits.
 * Out-of-range syscall numbers (negative or >= @bitmap_size) yield false.
 * The index is clamped with array_index_nospec() before the bitmap is
 * read.
 */
static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
						    size_t bitmap_size,
						    int syscall_nr)
{
	if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
		return false;
	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);

	return test_bit(syscall_nr, bitmap);
}
361
362	/**
363	* seccomp_cache_check_allow - lookup seccomp cache
364	* @sfilter: The seccomp filter
365	* @sd: The seccomp data to lookup the cache with
366	*
367	* Returns true if the seccomp_data is cached and allowed.
368	*/
369	static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
370	const struct seccomp_data *sd)
371	{
372	int syscall_nr = sd->nr;
373	const struct action_cache *cache = &sfilter->cache;
374
375	#ifndef SECCOMP_ARCH_COMPAT
376	/ A native-only architecture doesn't need to check sd->arch. /
377	return seccomp_cache_check_allow_bitmap(cache->allow_native,
378	SECCOMP_ARCH_NATIVE_NR,
379	syscall_nr);
380	#else
381	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
382	return seccomp_cache_check_allow_bitmap(bitmap: cache->allow_native,
383	SECCOMP_ARCH_NATIVE_NR,
384	syscall_nr);
385	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
386	return seccomp_cache_check_allow_bitmap(bitmap: cache->allow_compat,
387	SECCOMP_ARCH_COMPAT_NR,
388	syscall_nr);
389	#endif /* SECCOMP_ARCH_COMPAT */
390
391	WARN_ON_ONCE(true);
392	return false;
393	}
394	#endif /* SECCOMP_ARCH_NATIVE */
395
396	#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
397	/**
398	* seccomp_run_filters - evaluates all seccomp filters against @sd
399	* @sd: optional seccomp data to be passed to filters
400	* @match: stores struct seccomp_filter that resulted in the return value,
401	* unless filter returned SECCOMP_RET_ALLOW, in which case it will
402	* be unchanged.
403	*
404	* Returns valid seccomp BPF response codes.
405	*/
406	static u32 seccomp_run_filters(const struct seccomp_data *sd,
407	struct seccomp_filter **match)
408	{
409	u32 ret = SECCOMP_RET_ALLOW;
410	/ Make sure cross-thread synced filter points somewhere sane. /
411	struct seccomp_filter *f =
412	READ_ONCE(current->seccomp.filter);
413
414	/ Ensure unexpected behavior doesn't result in failing open. /
415	if (WARN_ON(f == NULL))
416	return SECCOMP_RET_KILL_PROCESS;
417
418	if (seccomp_cache_check_allow(sfilter: f, sd))
419	return SECCOMP_RET_ALLOW;
420
421	/*
422	* All filters in the list are evaluated and the lowest BPF return
423	* value always takes priority (ignoring the DATA).
424	*/
425	for (; f; f = f->prev) {
426	u32 cur_ret = bpf_prog_run_pin_on_cpu(prog: f->prog, ctx: sd);
427
428	if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
429	ret = cur_ret;
430	*match = f;
431	}
432	}
433	return ret;
434	}
435	#endif /* CONFIG_SECCOMP_FILTER */
436
437	static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
438	{
439	assert_spin_locked(&current->sighand->siglock);
440
441	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
442	return false;
443
444	return true;
445	}
446
447	void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
448
449	static inline void seccomp_assign_mode(struct task_struct *task,
450	unsigned long seccomp_mode,
451	unsigned long flags)
452	{
453	assert_spin_locked(&task->sighand->siglock);
454
455	task->seccomp.mode = seccomp_mode;
456	/*
457	* Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and
458	* filter) is set.
459	*/
460	smp_mb__before_atomic();
461	/ Assume default seccomp processes want spec flaw mitigation. /
462	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == `0`)
463	arch_seccomp_spec_mitigate(task);
464	set_task_syscall_work(task, SECCOMP);
465	}
466
467	#ifdef CONFIG_SECCOMP_FILTER
468	/ Returns 1 if the parent is an ancestor of the child. /
469	static int is_ancestor(struct seccomp_filter *parent,
470	struct seccomp_filter *child)
471	{
472	/ NULL is the root ancestor. /
473	if (parent == NULL)
474	return `1`;
475	for (; child; child = child->prev)
476	if (child == parent)
477	return `1`;
478	return `0`;
479	}
480
481	/**
482	* seccomp_can_sync_threads: checks if all threads can be synchronized
483	*
484	* Expects sighand and cred_guard_mutex locks to be held.
485	*
486	* Returns 0 on success, -ve on error, or the pid of a thread which was
487	* either not in the correct seccomp mode or did not have an ancestral
488	* seccomp filter.
489	*/
490	static inline pid_t seccomp_can_sync_threads(void)
491	{
492	struct task_struct thread, caller;
493
494	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
495	assert_spin_locked(&current->sighand->siglock);
496
497	/ Validate all threads being eligible for synchronization. /
498	caller = current;
499	for_each_thread(caller, thread) {
500	pid_t failed;
501
502	/ Skip current, since it is initiating the sync. /
503	if (thread == caller)
504	continue;
505
506	if (thread->seccomp.mode == SECCOMP_MODE_DISABLED \|\|
507	(thread->seccomp.mode == SECCOMP_MODE_FILTER &&
508	is_ancestor(parent: thread->seccomp.filter,
509	child: caller->seccomp.filter)))
510	continue;
511
512	/ Return the first thread that cannot be synchronized. /
513	failed = task_pid_vnr(tsk: thread);
514	/ If the pid cannot be resolved, then return -ESRCH /
515	if (WARN_ON(failed == `0`))
516	failed = -ESRCH;
517	return failed;
518	}
519
520	return `0`;
521	}
522
523	static inline void seccomp_filter_free(struct seccomp_filter *filter)
524	{
525	if (filter) {
526	bpf_prog_destroy(fp: filter->prog);
527	kfree(objp: filter);
528	}
529	}
530
531	static void __seccomp_filter_orphan(struct seccomp_filter *orig)
532	{
533	while (orig && refcount_dec_and_test(r: &orig->users)) {
534	if (waitqueue_active(wq_head: &orig->wqh))
535	wake_up_poll(&orig->wqh, EPOLLHUP);
536	orig = orig->prev;
537	}
538	}
539
540	static void __put_seccomp_filter(struct seccomp_filter *orig)
541	{
542	/ Clean up single-reference branches iteratively. /
543	while (orig && refcount_dec_and_test(r: &orig->refs)) {
544	struct seccomp_filter *freeme = orig;
545	orig = orig->prev;
546	seccomp_filter_free(filter: freeme);
547	}
548	}
549
/* Release a task's hold on the filter tree rooted at @orig. */
static void __seccomp_filter_release(struct seccomp_filter *orig)
{
	/* Notify about any unused filters in the task's former filter tree. */
	__seccomp_filter_orphan(orig);
	/* Finally drop all references to the task's former tree. */
	__put_seccomp_filter(orig);
}
557
558	/**
559	* seccomp_filter_release - Detach the task from its filter tree,
560	* drop its reference count, and notify
561	* about unused filters
562	*
563	* @tsk: task the filter should be released from.
564	*
565	* This function should only be called when the task is exiting as
566	* it detaches it from its filter tree. As such, READ_ONCE() and
567	* barriers are not needed here, as would normally be needed.
568	*/
569	void seccomp_filter_release(struct task_struct *tsk)
570	{
571	struct seccomp_filter *orig = tsk->seccomp.filter;
572
573	/ We are effectively holding the siglock by not having any sighand. /
574	WARN_ON(tsk->sighand != NULL);
575
576	/ Detach task from its filter tree. /
577	tsk->seccomp.filter = NULL;
578	__seccomp_filter_release(orig);
579	}
580
581	/**
582	* seccomp_sync_threads: sets all threads to use current's filter
583	*
584	* @flags: SECCOMP_FILTER_FLAG_* flags to set during sync.
585	*
586	* Expects sighand and cred_guard_mutex locks to be held, and for
587	* seccomp_can_sync_threads() to have returned success already
588	* without dropping the locks.
589	*
590	*/
591	static inline void seccomp_sync_threads(unsigned long flags)
592	{
593	struct task_struct thread, caller;
594
595	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
596	assert_spin_locked(&current->sighand->siglock);
597
598	/ Synchronize all threads. /
599	caller = current;
600	for_each_thread(caller, thread) {
601	/ Skip current, since it needs no changes. /
602	if (thread == caller)
603	continue;
604
605	/ Get a task reference for the new leaf node. /
606	get_seccomp_filter(tsk: caller);
607
608	/*
609	* Drop the task reference to the shared ancestor since
610	* current's path will hold a reference. (This also
611	* allows a put before the assignment.)
612	*/
613	__seccomp_filter_release(orig: thread->seccomp.filter);
614
615	/ Make our new filter tree visible. /
616	smp_store_release(&thread->seccomp.filter,
617	caller->seccomp.filter);
618	atomic_set(v: &thread->seccomp.filter_count,
619	i: atomic_read(v: &caller->seccomp.filter_count));
620
621	/*
622	* Don't let an unprivileged task work around
623	* the no_new_privs restriction by creating
624	* a thread that sets it up, enters seccomp,
625	* then dies.
626	*/
627	if (task_no_new_privs(p: caller))
628	task_set_no_new_privs(p: thread);
629
630	/*
631	* Opt the other thread into seccomp if needed.
632	* As threads are considered to be trust-realm
633	* equivalent (see ptrace_may_access), it is safe to
634	* allow one thread to transition the other.
635	*/
636	if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
637	seccomp_assign_mode(task: thread, SECCOMP_MODE_FILTER,
638	flags);
639	}
640	}
641
642	/**
643	* seccomp_prepare_filter: Prepares a seccomp filter for use.
644	* @fprog: BPF program to install
645	*
646	* Returns filter on success or an ERR_PTR on failure.
647	*/
648	static struct seccomp_filter seccomp_prepare_filter(struct* sock_fprog *fprog)
649	{
650	struct seccomp_filter *sfilter;
651	int ret;
652	const bool save_orig =
653	#if defined(CONFIG_CHECKPOINT_RESTORE) \|\| defined(SECCOMP_ARCH_NATIVE)
654	true;
655	#else
656	false;
657	#endif
658
659	if (fprog->len == `0` \|\| fprog->len > BPF_MAXINSNS)
660	return ERR_PTR(error: -EINVAL);
661
662	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
663
664	/*
665	* Installing a seccomp filter requires that the task has
666	* CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
667	* This avoids scenarios where unprivileged tasks can affect the
668	* behavior of privileged children.
669	*/
670	if (!task_no_new_privs(current) &&
671	!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
672	return ERR_PTR(error: -EACCES);
673
674	/ Allocate a new seccomp_filter /
675	sfilter = kzalloc(size: sizeof(*sfilter), GFP_KERNEL \| __GFP_NOWARN);
676	if (!sfilter)
677	return ERR_PTR(error: -ENOMEM);
678
679	mutex_init(&sfilter->notify_lock);
680	ret = bpf_prog_create_from_user(pfp: &sfilter->prog, fprog,
681	trans: seccomp_check_filter, save_orig);
682	if (ret < `0`) {
683	kfree(objp: sfilter);
684	return ERR_PTR(error: ret);
685	}
686
687	refcount_set(r: &sfilter->refs, n: `1`);
688	refcount_set(r: &sfilter->users, n: `1`);
689	init_waitqueue_head(&sfilter->wqh);
690
691	return sfilter;
692	}
693
694	/**
695	* seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
696	* @user_filter: pointer to the user data containing a sock_fprog.
697	*
698	* Returns 0 on success and non-zero otherwise.
699	*/
700	static struct seccomp_filter *
701	seccomp_prepare_user_filter(const char __user *user_filter)
702	{
703	struct sock_fprog fprog;
704	struct seccomp_filter *filter = ERR_PTR(error: -EFAULT);
705
706	#ifdef CONFIG_COMPAT
707	if (in_compat_syscall()) {
708	struct compat_sock_fprog fprog32;
709	if (copy_from_user(to: &fprog32, from: user_filter, n: sizeof(fprog32)))
710	goto out;
711	fprog.len = fprog32.len;
712	fprog.filter = compat_ptr(uptr: fprog32.filter);
713	} else / falls through to the if below. /
714	#endif
715	if (copy_from_user(to: &fprog, from: user_filter, n: sizeof(fprog)))
716	goto out;
717	filter = seccomp_prepare_filter(fprog: &fprog);
718	out:
719	return filter;
720	}
721
722	#ifdef SECCOMP_ARCH_NATIVE
723	/**
724	* seccomp_is_const_allow - check if filter is constant allow with given data
725	* @fprog: The BPF programs
726	* @sd: The seccomp data to check against, only syscall number and arch
727	* number are considered constant.
728	*/
729	static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
730	struct seccomp_data *sd)
731	{
732	unsigned int reg_value = `0`;
733	unsigned int pc;
734	bool op_res;
735
736	if (WARN_ON_ONCE(!fprog))
737	return false;
738
739	for (pc = `0`; pc < fprog->len; pc++) {
740	struct sock_filter *insn = &fprog->filter[pc];
741	u16 code = insn->code;
742	u32 k = insn->k;
743
744	switch (code) {
745	case BPF_LD \| BPF_W \| BPF_ABS:
746	switch (k) {
747	case offsetof(struct seccomp_data, nr):
748	reg_value = sd->nr;
749	break;
750	case offsetof(struct seccomp_data, arch):
751	reg_value = sd->arch;
752	break;
753	default:
754	/ can't optimize (non-constant value load) /
755	return false;
756	}
757	break;
758	case BPF_RET \| BPF_K:
759	/ reached return with constant values only, check allow /
760	return k == SECCOMP_RET_ALLOW;
761	case BPF_JMP \| BPF_JA:
762	pc += insn->k;
763	break;
764	case BPF_JMP \| BPF_JEQ \| BPF_K:
765	case BPF_JMP \| BPF_JGE \| BPF_K:
766	case BPF_JMP \| BPF_JGT \| BPF_K:
767	case BPF_JMP \| BPF_JSET \| BPF_K:
768	switch (BPF_OP(code)) {
769	case BPF_JEQ:
770	op_res = reg_value == k;
771	break;
772	case BPF_JGE:
773	op_res = reg_value >= k;
774	break;
775	case BPF_JGT:
776	op_res = reg_value > k;
777	break;
778	case BPF_JSET:
779	op_res = !!(reg_value & k);
780	break;
781	default:
782	/ can't optimize (unknown jump) /
783	return false;
784	}
785
786	pc += op_res ? insn->jt : insn->jf;
787	break;
788	case BPF_ALU \| BPF_AND \| BPF_K:
789	reg_value &= k;
790	break;
791	default:
792	/ can't optimize (unknown insn) /
793	return false;
794	}
795	}
796
797	/ ran off the end of the filter?! /
798	WARN_ON(`1`);
799	return false;
800	}
801
802	static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
803	void bitmap, const* void *bitmap_prev,
804	size_t bitmap_size, int arch)
805	{
806	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
807	struct seccomp_data sd;
808	int nr;
809
810	if (bitmap_prev) {
811	/ The new filter must be as restrictive as the last. /
812	bitmap_copy(dst: bitmap, src: bitmap_prev, nbits: bitmap_size);
813	} else {
814	/ Before any filters, all syscalls are always allowed. /
815	bitmap_fill(dst: bitmap, nbits: bitmap_size);
816	}
817
818	for (nr = `0`; nr < bitmap_size; nr++) {
819	/ No bitmap change: not a cacheable action. /
820	if (!test_bit(nr, bitmap))
821	continue;
822
823	sd.nr = nr;
824	sd.arch = arch;
825
826	/ No bitmap change: continue to always allow. /
827	if (seccomp_is_const_allow(fprog, sd: &sd))
828	continue;
829
830	/*
831	* Not a cacheable action: always run filters.
832	* atomic clear_bit() not needed, filter not visible yet.
833	*/
834	__clear_bit(nr, bitmap);
835	}
836	}
837
838	/**
839	* seccomp_cache_prepare - emulate the filter to find cacheable syscalls
840	* @sfilter: The seccomp filter
841	*
842	* Returns 0 if successful or -errno if error occurred.
843	*/
844	static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
845	{
846	struct action_cache *cache = &sfilter->cache;
847	const struct action_cache *cache_prev =
848	sfilter->prev ? &sfilter->prev->cache : NULL;
849
850	seccomp_cache_prepare_bitmap(sfilter, bitmap: cache->allow_native,
851	bitmap_prev: cache_prev ? cache_prev->allow_native : NULL,
852	SECCOMP_ARCH_NATIVE_NR,
853	SECCOMP_ARCH_NATIVE);
854
855	#ifdef SECCOMP_ARCH_COMPAT
856	seccomp_cache_prepare_bitmap(sfilter, bitmap: cache->allow_compat,
857	bitmap_prev: cache_prev ? cache_prev->allow_compat : NULL,
858	SECCOMP_ARCH_COMPAT_NR,
859	SECCOMP_ARCH_COMPAT);
860	#endif /* SECCOMP_ARCH_COMPAT */
861	}
862	#endif /* SECCOMP_ARCH_NATIVE */
863
864	/**
865	* seccomp_attach_filter: validate and attach filter
866	* @flags: flags to change filter behavior
867	* @filter: seccomp filter to add to the current process
868	*
869	* Caller must be holding current->sighand->siglock lock.
870	*
871	* Returns 0 on success, -ve on error, or
872	* - in TSYNC mode: the pid of a thread which was either not in the correct
873	* seccomp mode or did not have an ancestral seccomp filter
874	* - in NEW_LISTENER mode: the fd of the new listener
875	*/
876	static long seccomp_attach_filter(unsigned int flags,
877	struct seccomp_filter *filter)
878	{
879	unsigned long total_insns;
880	struct seccomp_filter *walker;
881
882	assert_spin_locked(&current->sighand->siglock);
883
884	/ Validate resulting filter length. /
885	total_insns = filter->prog->len;
886	for (walker = current->seccomp.filter; walker; walker = walker->prev)
887	total_insns += walker->prog->len + `4`; / 4 instr penalty /
888	if (total_insns > MAX_INSNS_PER_PATH)
889	return -ENOMEM;
890
891	/ If thread sync has been requested, check that it is possible. /
892	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
893	int ret;
894
895	ret = seccomp_can_sync_threads();
896	if (ret) {
897	if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
898	return -ESRCH;
899	else
900	return ret;
901	}
902	}
903
904	/ Set log flag, if present. /
905	if (flags & SECCOMP_FILTER_FLAG_LOG)
906	filter->log = true;
907
908	/ Set wait killable flag, if present. /
909	if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
910	filter->wait_killable_recv = true;
911
912	/*
913	* If there is an existing filter, make it the prev and don't drop its
914	* task reference.
915	*/
916	filter->prev = current->seccomp.filter;
917	seccomp_cache_prepare(sfilter: filter);
918	current->seccomp.filter = filter;
919	atomic_inc(v: &current->seccomp.filter_count);
920
921	/ Now that the new filter is in place, synchronize to all threads. /
922	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
923	seccomp_sync_threads(flags);
924
925	return `0`;
926	}
927
928	static void __get_seccomp_filter(struct seccomp_filter *filter)
929	{
930	refcount_inc(r: &filter->refs);
931	}
932
933	/ get_seccomp_filter - increments the reference count of the filter on @tsk /
934	void get_seccomp_filter(struct task_struct *tsk)
935	{
936	struct seccomp_filter *orig = tsk->seccomp.filter;
937	if (!orig)
938	return;
939	__get_seccomp_filter(filter: orig);
940	refcount_inc(r: &orig->users);
941	}
942
943	#endif /* CONFIG_SECCOMP_FILTER */
944
/* For use with seccomp_actions_logged: one bit per loggable action. */
#define SECCOMP_LOG_KILL_PROCESS	(1 << 0)
#define SECCOMP_LOG_KILL_THREAD		(1 << 1)
#define SECCOMP_LOG_TRAP		(1 << 2)
#define SECCOMP_LOG_ERRNO		(1 << 3)
#define SECCOMP_LOG_TRACE		(1 << 4)
#define SECCOMP_LOG_LOG			(1 << 5)
#define SECCOMP_LOG_ALLOW		(1 << 6)
#define SECCOMP_LOG_USER_NOTIF		(1 << 7)
954
955	static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS \|
956	SECCOMP_LOG_KILL_THREAD \|
957	SECCOMP_LOG_TRAP \|
958	SECCOMP_LOG_ERRNO \|
959	SECCOMP_LOG_USER_NOTIF \|
960	SECCOMP_LOG_TRACE \|
961	SECCOMP_LOG_LOG;
962
963	static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
964	bool requested)
965	{
966	bool log = false;
967
968	switch (action) {
969	case SECCOMP_RET_ALLOW:
970	break;
971	case SECCOMP_RET_TRAP:
972	log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
973	break;
974	case SECCOMP_RET_ERRNO:
975	log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
976	break;
977	case SECCOMP_RET_TRACE:
978	log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
979	break;
980	case SECCOMP_RET_USER_NOTIF:
981	log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
982	break;
983	case SECCOMP_RET_LOG:
984	log = seccomp_actions_logged & SECCOMP_LOG_LOG;
985	break;
986	case SECCOMP_RET_KILL_THREAD:
987	log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
988	break;
989	case SECCOMP_RET_KILL_PROCESS:
990	default:
991	log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
992	}
993
994	/*
995	* Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
996	* FILTER_FLAG_LOG bit was set. The admin has the ability to silence
997	* any action from being logged by removing the action name from the
998	* seccomp_actions_logged sysctl.
999	*/
1000	if (!log)
1001	return;
1002
1003	audit_seccomp(syscall, signr, code: action);
1004	}
1005
1006	/*
1007	* Secure computing mode 1 allows only read/write/exit/sigreturn.
1008	* To be fully secure this must be combined with rlimit
1009	* to limit the stack allocations too.
1010	*/
1011	static const int mode1_syscalls[] = {
1012	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
1013	-`1`, / negative terminated /
1014	};
1015
1016	static void __secure_computing_strict(int this_syscall)
1017	{
1018	const int *allowed_syscalls = mode1_syscalls;
1019	#ifdef CONFIG_COMPAT
1020	if (in_compat_syscall())
1021	allowed_syscalls = get_compat_mode1_syscalls();
1022	#endif
1023	do {
1024	if (*allowed_syscalls == this_syscall)
1025	return;
1026	} while (*++allowed_syscalls != -`1`);
1027
1028	#ifdef SECCOMP_DEBUG
1029	dump_stack();
1030	#endif
1031	current->seccomp.mode = SECCOMP_MODE_DEAD;
1032	seccomp_log(syscall: this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, requested: true);
1033	do_exit(SIGKILL);
1034	}
1035
1036	#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
1037	void secure_computing_strict(int this_syscall)
1038	{
1039	int mode = current->seccomp.mode;
1040
1041	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1042	unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1043	return;
1044
1045	if (mode == SECCOMP_MODE_DISABLED)
1046	return;
1047	else if (mode == SECCOMP_MODE_STRICT)
1048	__secure_computing_strict(this_syscall);
1049	else
1050	BUG();
1051	}
1052	#else
1053
1054	#ifdef CONFIG_SECCOMP_FILTER
1055	static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
1056	{
1057	/*
1058	* Note: overflow is ok here, the id just needs to be unique per
1059	* filter.
1060	*/
1061	lockdep_assert_held(&filter->notify_lock);
1062	return filter->notif->next_id++;
1063	}
1064
1065	static void seccomp_handle_addfd(struct seccomp_kaddfd addfd, struct* seccomp_knotif *n)
1066	{
1067	int fd;
1068
1069	/*
1070	* Remove the notification, and reset the list pointers, indicating
1071	* that it has been handled.
1072	*/
1073	list_del_init(entry: &addfd->list);
1074	if (!addfd->setfd)
1075	fd = receive_fd(file: addfd->file, o_flags: addfd->flags);
1076	else
1077	fd = receive_fd_replace(new_fd: addfd->fd, file: addfd->file, o_flags: addfd->flags);
1078	addfd->ret = fd;
1079
1080	if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
1081	/ If we fail reset and return an error to the notifier /
1082	if (fd < `0`) {
1083	n->state = SECCOMP_NOTIFY_SENT;
1084	} else {
1085	/ Return the FD we just added /
1086	n->flags = `0`;
1087	n->error = `0`;
1088	n->val = fd;
1089	}
1090	}
1091
1092	/*
1093	* Mark the notification as completed. From this point, addfd mem
1094	* might be invalidated and we can't safely read it anymore.
1095	*/
1096	complete(&addfd->completion);
1097	}
1098
1099	static bool should_sleep_killable(struct seccomp_filter *match,
1100	struct seccomp_knotif *n)
1101	{
1102	return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
1103	}
1104
1105	static int seccomp_do_user_notification(int this_syscall,
1106	struct seccomp_filter *match,
1107	const struct seccomp_data *sd)
1108	{
1109	int err;
1110	u32 flags = `0`;
1111	long ret = `0`;
1112	struct seccomp_knotif n = {};
1113	struct seccomp_kaddfd addfd, tmp;
1114
1115	mutex_lock(&match->notify_lock);
1116	err = -ENOSYS;
1117	if (!match->notif)
1118	goto out;
1119
1120	n.task = current;
1121	n.state = SECCOMP_NOTIFY_INIT;
1122	n.data = sd;
1123	n.id = seccomp_next_notify_id(filter: match);
1124	init_completion(x: &n.ready);
1125	list_add_tail(new: &n.list, head: &match->notif->notifications);
1126	INIT_LIST_HEAD(list: &n.addfd);
1127
1128	atomic_inc(v: &match->notif->requests);
1129	if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
1130	wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN \| EPOLLRDNORM);
1131	else
1132	wake_up_poll(&match->wqh, EPOLLIN \| EPOLLRDNORM);
1133
1134	/*
1135	* This is where we wait for a reply from userspace.
1136	*/
1137	do {
1138	bool wait_killable = should_sleep_killable(match, n: &n);
1139
1140	mutex_unlock(lock: &match->notify_lock);
1141	if (wait_killable)
1142	err = wait_for_completion_killable(x: &n.ready);
1143	else
1144	err = wait_for_completion_interruptible(x: &n.ready);
1145	mutex_lock(&match->notify_lock);
1146
1147	if (err != `0`) {
1148	/*
1149	* Check to see if the notifcation got picked up and
1150	* whether we should switch to wait killable.
1151	*/
1152	if (!wait_killable && should_sleep_killable(match, n: &n))
1153	continue;
1154
1155	goto interrupted;
1156	}
1157
1158	addfd = list_first_entry_or_null(&n.addfd,
1159	struct seccomp_kaddfd, list);
1160	/ Check if we were woken up by a addfd message /
1161	if (addfd)
1162	seccomp_handle_addfd(addfd, n: &n);
1163
1164	} while (n.state != SECCOMP_NOTIFY_REPLIED);
1165
1166	ret = n.val;
1167	err = n.error;
1168	flags = n.flags;
1169
1170	interrupted:
1171	/ If there were any pending addfd calls, clear them out /
1172	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
1173	/ The process went away before we got a chance to handle it /
1174	addfd->ret = -ESRCH;
1175	list_del_init(entry: &addfd->list);
1176	complete(&addfd->completion);
1177	}
1178
1179	/*
1180	* Note that it's possible the listener died in between the time when
1181	* we were notified of a response (or a signal) and when we were able to
1182	* re-acquire the lock, so only delete from the list if the
1183	* notification actually exists.
1184	*
1185	* Also note that this test is only valid because there's no way to
1186	* reattach to a notifier right now. If one is added, we'll need to
1187	* keep track of the notif itself and make sure they match here.
1188	*/
1189	if (match->notif)
1190	list_del(entry: &n.list);
1191	out:
1192	mutex_unlock(lock: &match->notify_lock);
1193
1194	/ Userspace requests to continue the syscall. /
1195	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1196	return `0`;
1197
1198	syscall_set_return_value(current, current_pt_regs(),
1199	error: err, val: ret);
1200	return -`1`;
1201	}
1202
1203	static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
1204	const bool recheck_after_trace)
1205	{
1206	u32 filter_ret, action;
1207	struct seccomp_filter *match = NULL;
1208	int data;
1209	struct seccomp_data sd_local;
1210
1211	/*
1212	* Make sure that any changes to mode from another thread have
1213	* been seen after SYSCALL_WORK_SECCOMP was seen.
1214	*/
1215	smp_rmb();
1216
1217	if (!sd) {
1218	populate_seccomp_data(sd: &sd_local);
1219	sd = &sd_local;
1220	}
1221
1222	filter_ret = seccomp_run_filters(sd, match: &match);
1223	data = filter_ret & SECCOMP_RET_DATA;
1224	action = filter_ret & SECCOMP_RET_ACTION_FULL;
1225
1226	switch (action) {
1227	case SECCOMP_RET_ERRNO:
1228	/ Set low-order bits as an errno, capped at MAX_ERRNO. /
1229	if (data > MAX_ERRNO)
1230	data = MAX_ERRNO;
1231	syscall_set_return_value(current, current_pt_regs(),
1232	error: -data, val: `0`);
1233	goto skip;
1234
1235	case SECCOMP_RET_TRAP:
1236	/ Show the handler the original registers. /
1237	syscall_rollback(current, current_pt_regs());
1238	/ Let the filter pass back 16 bits of data. /
1239	force_sig_seccomp(syscall: this_syscall, reason: data, force_coredump: false);
1240	goto skip;
1241
1242	case SECCOMP_RET_TRACE:
1243	/ We've been put in this state by the ptracer already. /
1244	if (recheck_after_trace)
1245	return `0`;
1246
1247	/ ENOSYS these calls if there is no tracer attached. /
1248	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
1249	syscall_set_return_value(current,
1250	current_pt_regs(),
1251	error: -ENOSYS, val: `0`);
1252	goto skip;
1253	}
1254
1255	/ Allow the BPF to provide the event message /
1256	ptrace_event(PTRACE_EVENT_SECCOMP, message: data);
1257	/*
1258	* The delivery of a fatal signal during event
1259	* notification may silently skip tracer notification,
1260	* which could leave us with a potentially unmodified
1261	* syscall that the tracer would have liked to have
1262	* changed. Since the process is about to die, we just
1263	* force the syscall to be skipped and let the signal
1264	* kill the process and correctly handle any tracer exit
1265	* notifications.
1266	*/
1267	if (fatal_signal_pending(current))
1268	goto skip;
1269	/ Check if the tracer forced the syscall to be skipped. /
1270	this_syscall = syscall_get_nr(current, current_pt_regs());
1271	if (this_syscall < `0`)
1272	goto skip;
1273
1274	/*
1275	* Recheck the syscall, since it may have changed. This
1276	* intentionally uses a NULL struct seccomp_data to force
1277	* a reload of all registers. This does not goto skip since
1278	* a skip would have already been reported.
1279	*/
1280	if (__seccomp_filter(this_syscall, NULL, recheck_after_trace: true))
1281	return -`1`;
1282
1283	return `0`;
1284
1285	case SECCOMP_RET_USER_NOTIF:
1286	if (seccomp_do_user_notification(this_syscall, match, sd))
1287	goto skip;
1288
1289	return `0`;
1290
1291	case SECCOMP_RET_LOG:
1292	seccomp_log(syscall: this_syscall, signr: `0`, action, requested: true);
1293	return `0`;
1294
1295	case SECCOMP_RET_ALLOW:
1296	/*
1297	* Note that the "match" filter will always be NULL for
1298	* this action since SECCOMP_RET_ALLOW is the starting
1299	* state in seccomp_run_filters().
1300	*/
1301	return `0`;
1302
1303	case SECCOMP_RET_KILL_THREAD:
1304	case SECCOMP_RET_KILL_PROCESS:
1305	default:
1306	current->seccomp.mode = SECCOMP_MODE_DEAD;
1307	seccomp_log(syscall: this_syscall, SIGSYS, action, requested: true);
1308	/ Dump core only if this is the last remaining thread. /
1309	if (action != SECCOMP_RET_KILL_THREAD \|\|
1310	(atomic_read(v: &current->signal->live) == `1`)) {
1311	/ Show the original registers in the dump. /
1312	syscall_rollback(current, current_pt_regs());
1313	/ Trigger a coredump with SIGSYS /
1314	force_sig_seccomp(syscall: this_syscall, reason: data, force_coredump: true);
1315	} else {
1316	do_exit(SIGSYS);
1317	}
1318	return -`1`; / skip the syscall go directly to signal handling /
1319	}
1320
1321	unreachable();
1322
1323	skip:
1324	seccomp_log(syscall: this_syscall, signr: `0`, action, requested: match ? match->log : false);
1325	return -`1`;
1326	}
1327	#else
1328	static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
1329	const bool recheck_after_trace)
1330	{
1331	BUG();
1332
1333	return -`1`;
1334	}
1335	#endif
1336
1337	int __secure_computing(const struct seccomp_data *sd)
1338	{
1339	int mode = current->seccomp.mode;
1340	int this_syscall;
1341
1342	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
1343	unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
1344	return `0`;
1345
1346	this_syscall = sd ? sd->nr :
1347	syscall_get_nr(current, current_pt_regs());
1348
1349	switch (mode) {
1350	case SECCOMP_MODE_STRICT:
1351	__secure_computing_strict(this_syscall); / may call do_exit /
1352	return `0`;
1353	case SECCOMP_MODE_FILTER:
1354	return __seccomp_filter(this_syscall, sd, recheck_after_trace: false);
1355	/ Surviving SECCOMP_RET_KILL_* must be proactively impossible. /
1356	case SECCOMP_MODE_DEAD:
1357	WARN_ON_ONCE(`1`);
1358	do_exit(SIGKILL);
1359	return -`1`;
1360	default:
1361	BUG();
1362	}
1363	}
1364	#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
1365
1366	long prctl_get_seccomp(void)
1367	{
1368	return current->seccomp.mode;
1369	}
1370
1371	/**
1372	* seccomp_set_mode_strict: internal function for setting strict seccomp
1373	*
1374	* Once current->seccomp.mode is non-zero, it may not be changed.
1375	*
1376	* Returns 0 on success or -EINVAL on failure.
1377	*/
1378	static long seccomp_set_mode_strict(void)
1379	{
1380	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
1381	long ret = -EINVAL;
1382
1383	spin_lock_irq(lock: &current->sighand->siglock);
1384
1385	if (!seccomp_may_assign_mode(seccomp_mode))
1386	goto out;
1387
1388	#ifdef TIF_NOTSC
1389	disable_TSC();
1390	#endif
1391	seccomp_assign_mode(current, seccomp_mode, flags: `0`);
1392	ret = `0`;
1393
1394	out:
1395	spin_unlock_irq(lock: &current->sighand->siglock);
1396
1397	return ret;
1398	}
1399
1400	#ifdef CONFIG_SECCOMP_FILTER
1401	static void seccomp_notify_free(struct seccomp_filter *filter)
1402	{
1403	kfree(objp: filter->notif);
1404	filter->notif = NULL;
1405	}
1406
1407	static void seccomp_notify_detach(struct seccomp_filter *filter)
1408	{
1409	struct seccomp_knotif *knotif;
1410
1411	if (!filter)
1412	return;
1413
1414	mutex_lock(&filter->notify_lock);
1415
1416	/*
1417	* If this file is being closed because e.g. the task who owned it
1418	* died, let's wake everyone up who was waiting on us.
1419	*/
1420	list_for_each_entry(knotif, &filter->notif->notifications, list) {
1421	if (knotif->state == SECCOMP_NOTIFY_REPLIED)
1422	continue;
1423
1424	knotif->state = SECCOMP_NOTIFY_REPLIED;
1425	knotif->error = -ENOSYS;
1426	knotif->val = `0`;
1427
1428	/*
1429	* We do not need to wake up any pending addfd messages, as
1430	* the notifier will do that for us, as this just looks
1431	* like a standard reply.
1432	*/
1433	complete(&knotif->ready);
1434	}
1435
1436	seccomp_notify_free(filter);
1437	mutex_unlock(lock: &filter->notify_lock);
1438	}
1439
1440	static int seccomp_notify_release(struct inode inode, struct* file *file)
1441	{
1442	struct seccomp_filter *filter = file->private_data;
1443
1444	seccomp_notify_detach(filter);
1445	__put_seccomp_filter(orig: filter);
1446	return `0`;
1447	}
1448
1449	/ must be called with notif_lock held /
1450	static inline struct seccomp_knotif *
1451	find_notification(struct seccomp_filter *filter, u64 id)
1452	{
1453	struct seccomp_knotif *cur;
1454
1455	lockdep_assert_held(&filter->notify_lock);
1456
1457	list_for_each_entry(cur, &filter->notif->notifications, list) {
1458	if (cur->id == id)
1459	return cur;
1460	}
1461
1462	return NULL;
1463	}
1464
1465	static int recv_wake_function(wait_queue_entry_t wait, unsigned* int mode, int sync,
1466	void *key)
1467	{
1468	/ Avoid a wakeup if event not interesting for us. /
1469	if (key && !(key_to_poll(key) & (EPOLLIN \| EPOLLERR)))
1470	return `0`;
1471	return autoremove_wake_function(wq_entry: wait, mode, sync, key);
1472	}
1473
1474	static int recv_wait_event(struct seccomp_filter *filter)
1475	{
1476	DEFINE_WAIT_FUNC(wait, recv_wake_function);
1477	int ret;
1478
1479	if (atomic_dec_if_positive(v: &filter->notif->requests) >= `0`)
1480	return `0`;
1481
1482	for (;;) {
1483	ret = prepare_to_wait_event(wq_head: &filter->wqh, wq_entry: &wait, TASK_INTERRUPTIBLE);
1484
1485	if (atomic_dec_if_positive(v: &filter->notif->requests) >= `0`)
1486	break;
1487
1488	if (ret)
1489	return ret;
1490
1491	schedule();
1492	}
1493	finish_wait(wq_head: &filter->wqh, wq_entry: &wait);
1494	return `0`;
1495	}
1496
1497	static long seccomp_notify_recv(struct seccomp_filter *filter,
1498	void __user *buf)
1499	{
1500	struct seccomp_knotif knotif = NULL, cur;
1501	struct seccomp_notif unotif;
1502	ssize_t ret;
1503
1504	/ Verify that we're not given garbage to keep struct extensible. /
1505	ret = check_zeroed_user(from: buf, size: sizeof(unotif));
1506	if (ret < `0`)
1507	return ret;
1508	if (!ret)
1509	return -EINVAL;
1510
1511	memset(&unotif, `0`, sizeof(unotif));
1512
1513	ret = recv_wait_event(filter);
1514	if (ret < `0`)
1515	return ret;
1516
1517	mutex_lock(&filter->notify_lock);
1518	list_for_each_entry(cur, &filter->notif->notifications, list) {
1519	if (cur->state == SECCOMP_NOTIFY_INIT) {
1520	knotif = cur;
1521	break;
1522	}
1523	}
1524
1525	/*
1526	* If we didn't find a notification, it could be that the task was
1527	* interrupted by a fatal signal between the time we were woken and
1528	* when we were able to acquire the rw lock.
1529	*/
1530	if (!knotif) {
1531	ret = -ENOENT;
1532	goto out;
1533	}
1534
1535	unotif.id = knotif->id;
1536	unotif.pid = task_pid_vnr(tsk: knotif->task);
1537	unotif.data = *(knotif->data);
1538
1539	knotif->state = SECCOMP_NOTIFY_SENT;
1540	wake_up_poll(&filter->wqh, EPOLLOUT \| EPOLLWRNORM);
1541	ret = `0`;
1542	out:
1543	mutex_unlock(lock: &filter->notify_lock);
1544
1545	if (ret == `0` && copy_to_user(to: buf, from: &unotif, n: sizeof(unotif))) {
1546	ret = -EFAULT;
1547
1548	/*
1549	* Userspace screwed up. To make sure that we keep this
1550	* notification alive, let's reset it back to INIT. It
1551	* may have died when we released the lock, so we need to make
1552	* sure it's still around.
1553	*/
1554	mutex_lock(&filter->notify_lock);
1555	knotif = find_notification(filter, id: unotif.id);
1556	if (knotif) {
1557	/ Reset the process to make sure it's not stuck /
1558	if (should_sleep_killable(match: filter, n: knotif))
1559	complete(&knotif->ready);
1560	knotif->state = SECCOMP_NOTIFY_INIT;
1561	atomic_inc(v: &filter->notif->requests);
1562	wake_up_poll(&filter->wqh, EPOLLIN \| EPOLLRDNORM);
1563	}
1564	mutex_unlock(lock: &filter->notify_lock);
1565	}
1566
1567	return ret;
1568	}
1569
1570	static long seccomp_notify_send(struct seccomp_filter *filter,
1571	void __user *buf)
1572	{
1573	struct seccomp_notif_resp resp = {};
1574	struct seccomp_knotif *knotif;
1575	long ret;
1576
1577	if (copy_from_user(to: &resp, from: buf, n: sizeof(resp)))
1578	return -EFAULT;
1579
1580	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
1581	return -EINVAL;
1582
1583	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
1584	(resp.error \|\| resp.val))
1585	return -EINVAL;
1586
1587	ret = mutex_lock_interruptible(&filter->notify_lock);
1588	if (ret < `0`)
1589	return ret;
1590
1591	knotif = find_notification(filter, id: resp.id);
1592	if (!knotif) {
1593	ret = -ENOENT;
1594	goto out;
1595	}
1596
1597	/ Allow exactly one reply. /
1598	if (knotif->state != SECCOMP_NOTIFY_SENT) {
1599	ret = -EINPROGRESS;
1600	goto out;
1601	}
1602
1603	ret = `0`;
1604	knotif->state = SECCOMP_NOTIFY_REPLIED;
1605	knotif->error = resp.error;
1606	knotif->val = resp.val;
1607	knotif->flags = resp.flags;
1608	if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
1609	complete_on_current_cpu(x: &knotif->ready);
1610	else
1611	complete(&knotif->ready);
1612	out:
1613	mutex_unlock(lock: &filter->notify_lock);
1614	return ret;
1615	}
1616
1617	static long seccomp_notify_id_valid(struct seccomp_filter *filter,
1618	void __user *buf)
1619	{
1620	struct seccomp_knotif *knotif;
1621	u64 id;
1622	long ret;
1623
1624	if (copy_from_user(to: &id, from: buf, n: sizeof(id)))
1625	return -EFAULT;
1626
1627	ret = mutex_lock_interruptible(&filter->notify_lock);
1628	if (ret < `0`)
1629	return ret;
1630
1631	knotif = find_notification(filter, id);
1632	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
1633	ret = `0`;
1634	else
1635	ret = -ENOENT;
1636
1637	mutex_unlock(lock: &filter->notify_lock);
1638	return ret;
1639	}
1640
1641	static long seccomp_notify_set_flags(struct seccomp_filter *filter,
1642	unsigned long flags)
1643	{
1644	long ret;
1645
1646	if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
1647	return -EINVAL;
1648
1649	ret = mutex_lock_interruptible(&filter->notify_lock);
1650	if (ret < `0`)
1651	return ret;
1652	filter->notif->flags = flags;
1653	mutex_unlock(lock: &filter->notify_lock);
1654	return `0`;
1655	}
1656
1657	static long seccomp_notify_addfd(struct seccomp_filter *filter,
1658	struct seccomp_notif_addfd __user *uaddfd,
1659	unsigned int size)
1660	{
1661	struct seccomp_notif_addfd addfd;
1662	struct seccomp_knotif *knotif;
1663	struct seccomp_kaddfd kaddfd;
1664	int ret;
1665
1666	BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
1667	BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
1668
1669	if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 \|\| size >= PAGE_SIZE)
1670	return -EINVAL;
1671
1672	ret = copy_struct_from_user(dst: &addfd, ksize: sizeof(addfd), src: uaddfd, usize: size);
1673	if (ret)
1674	return ret;
1675
1676	if (addfd.newfd_flags & ~O_CLOEXEC)
1677	return -EINVAL;
1678
1679	if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD \| SECCOMP_ADDFD_FLAG_SEND))
1680	return -EINVAL;
1681
1682	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
1683	return -EINVAL;
1684
1685	kaddfd.file = fget(fd: addfd.srcfd);
1686	if (!kaddfd.file)
1687	return -EBADF;
1688
1689	kaddfd.ioctl_flags = addfd.flags;
1690	kaddfd.flags = addfd.newfd_flags;
1691	kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
1692	kaddfd.fd = addfd.newfd;
1693	init_completion(x: &kaddfd.completion);
1694
1695	ret = mutex_lock_interruptible(&filter->notify_lock);
1696	if (ret < `0`)
1697	goto out;
1698
1699	knotif = find_notification(filter, id: addfd.id);
1700	if (!knotif) {
1701	ret = -ENOENT;
1702	goto out_unlock;
1703	}
1704
1705	/*
1706	* We do not want to allow for FD injection to occur before the
1707	* notification has been picked up by a userspace handler, or after
1708	* the notification has been replied to.
1709	*/
1710	if (knotif->state != SECCOMP_NOTIFY_SENT) {
1711	ret = -EINPROGRESS;
1712	goto out_unlock;
1713	}
1714
1715	if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) {
1716	/*
1717	* Disallow queuing an atomic addfd + send reply while there are
1718	* some addfd requests still to process.
1719	*
1720	* There is no clear reason to support it and allows us to keep
1721	* the loop on the other side straight-forward.
1722	*/
1723	if (!list_empty(head: &knotif->addfd)) {
1724	ret = -EBUSY;
1725	goto out_unlock;
1726	}
1727
1728	/ Allow exactly only one reply /
1729	knotif->state = SECCOMP_NOTIFY_REPLIED;
1730	}
1731
1732	list_add(new: &kaddfd.list, head: &knotif->addfd);
1733	complete(&knotif->ready);
1734	mutex_unlock(lock: &filter->notify_lock);
1735
1736	/ Now we wait for it to be processed or be interrupted /
1737	ret = wait_for_completion_interruptible(x: &kaddfd.completion);
1738	if (ret == `0`) {
1739	/*
1740	* We had a successful completion. The other side has already
1741	* removed us from the addfd queue, and
1742	* wait_for_completion_interruptible has a memory barrier upon
1743	* success that lets us read this value directly without
1744	* locking.
1745	*/
1746	ret = kaddfd.ret;
1747	goto out;
1748	}
1749
1750	mutex_lock(&filter->notify_lock);
1751	/*
1752	* Even though we were woken up by a signal and not a successful
1753	* completion, a completion may have happened in the mean time.
1754	*
1755	* We need to check again if the addfd request has been handled,
1756	* and if not, we will remove it from the queue.
1757	*/
1758	if (list_empty(head: &kaddfd.list))
1759	ret = kaddfd.ret;
1760	else
1761	list_del(entry: &kaddfd.list);
1762
1763	out_unlock:
1764	mutex_unlock(lock: &filter->notify_lock);
1765	out:
1766	fput(kaddfd.file);
1767
1768	return ret;
1769	}
1770
1771	static long seccomp_notify_ioctl(struct file file, unsigned* int cmd,
1772	unsigned long arg)
1773	{
1774	struct seccomp_filter *filter = file->private_data;
1775	void __user buf = (void* __user *)arg;
1776
1777	/ Fixed-size ioctls /
1778	switch (cmd) {
1779	case SECCOMP_IOCTL_NOTIF_RECV:
1780	return seccomp_notify_recv(filter, buf);
1781	case SECCOMP_IOCTL_NOTIF_SEND:
1782	return seccomp_notify_send(filter, buf);
1783	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
1784	case SECCOMP_IOCTL_NOTIF_ID_VALID:
1785	return seccomp_notify_id_valid(filter, buf);
1786	case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
1787	return seccomp_notify_set_flags(filter, flags: arg);
1788	}
1789
1790	/ Extensible Argument ioctls /
1791	#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT \| IOCSIZE_MASK))
1792	switch (EA_IOCTL(cmd)) {
1793	case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
1794	return seccomp_notify_addfd(filter, uaddfd: buf, _IOC_SIZE(cmd));
1795	default:
1796	return -EINVAL;
1797	}
1798	}
1799
1800	static __poll_t seccomp_notify_poll(struct file *file,
1801	struct poll_table_struct *poll_tab)
1802	{
1803	struct seccomp_filter *filter = file->private_data;
1804	__poll_t ret = `0`;
1805	struct seccomp_knotif *cur;
1806
1807	poll_wait(filp: file, wait_address: &filter->wqh, p: poll_tab);
1808
1809	if (mutex_lock_interruptible(&filter->notify_lock) < `0`)
1810	return EPOLLERR;
1811
1812	list_for_each_entry(cur, &filter->notif->notifications, list) {
1813	if (cur->state == SECCOMP_NOTIFY_INIT)
1814	ret \|= EPOLLIN \| EPOLLRDNORM;
1815	if (cur->state == SECCOMP_NOTIFY_SENT)
1816	ret \|= EPOLLOUT \| EPOLLWRNORM;
1817	if ((ret & EPOLLIN) && (ret & EPOLLOUT))
1818	break;
1819	}
1820
1821	mutex_unlock(lock: &filter->notify_lock);
1822
1823	if (refcount_read(r: &filter->users) == `0`)
1824	ret \|= EPOLLHUP;
1825
1826	return ret;
1827	}
1828
1829	static const struct file_operations seccomp_notify_ops = {
1830	.poll = seccomp_notify_poll,
1831	.release = seccomp_notify_release,
1832	.unlocked_ioctl = seccomp_notify_ioctl,
1833	.compat_ioctl = seccomp_notify_ioctl,
1834	};
1835
1836	static struct file init_listener(struct* seccomp_filter *filter)
1837	{
1838	struct file *ret;
1839
1840	ret = ERR_PTR(error: -ENOMEM);
1841	filter->notif = kzalloc(size: sizeof(*(filter->notif)), GFP_KERNEL);
1842	if (!filter->notif)
1843	goto out;
1844
1845	filter->notif->next_id = get_random_u64();
1846	INIT_LIST_HEAD(list: &filter->notif->notifications);
1847
1848	ret = anon_inode_getfile(name: "seccomp notify", fops: &seccomp_notify_ops,
1849	priv: filter, O_RDWR);
1850	if (IS_ERR(ptr: ret))
1851	goto out_notif;
1852
1853	/ The file has a reference to it now /
1854	__get_seccomp_filter(filter);
1855
1856	out_notif:
1857	if (IS_ERR(ptr: ret))
1858	seccomp_notify_free(filter);
1859	out:
1860	return ret;
1861	}
1862
1863	/*
1864	* Does @new_child have a listener while an ancestor also has a listener?
1865	* If so, we'll want to reject this filter.
1866	* This only has to be tested for the current process, even in the TSYNC case,
1867	* because TSYNC installs @child with the same parent on all threads.
1868	* Note that @new_child is not hooked up to its parent at this point yet, so
1869	* we use current->seccomp.filter.
1870	*/
1871	static bool has_duplicate_listener(struct seccomp_filter *new_child)
1872	{
1873	struct seccomp_filter *cur;
1874
1875	/ must be protected against concurrent TSYNC /
1876	lockdep_assert_held(&current->sighand->siglock);
1877
1878	if (!new_child->notif)
1879	return false;
1880	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
1881	if (cur->notif)
1882	return true;
1883	}
1884
1885	return false;
1886	}
1887
1888	/**
1889	* seccomp_set_mode_filter: internal function for setting seccomp filter
1890	* @flags: flags to change filter behavior
1891	* @filter: struct sock_fprog containing filter
1892	*
1893	* This function may be called repeatedly to install additional filters.
1894	* Every filter successfully installed will be evaluated (in reverse order)
1895	* for each system call the task makes.
1896	*
1897	* Once current->seccomp.mode is non-zero, it may not be changed.
1898	*
1899	* Returns 0 on success or -EINVAL on failure.
1900	*/
1901	static long seccomp_set_mode_filter(unsigned int flags,
1902	const char __user *filter)
1903	{
1904	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
1905	struct seccomp_filter *prepared = NULL;
1906	long ret = -EINVAL;
1907	int listener = -`1`;
1908	struct file *listener_f = NULL;
1909
1910	/ Validate flags. /
1911	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
1912	return -EINVAL;
1913
1914	/*
1915	* In the successful case, NEW_LISTENER returns the new listener fd.
1916	* But in the failure case, TSYNC returns the thread that died. If you
1917	* combine these two flags, there's no way to tell whether something
1918	* succeeded or failed. So, let's disallow this combination if the user
1919	* has not explicitly requested no errors from TSYNC.
1920	*/
1921	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
1922	(flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
1923	((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == `0`))
1924	return -EINVAL;
1925
1926	/*
1927	* The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
1928	* without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
1929	*/
1930	if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
1931	((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == `0`))
1932	return -EINVAL;
1933
1934	/ Prepare the new filter before holding any locks. /
1935	prepared = seccomp_prepare_user_filter(user_filter: filter);
1936	if (IS_ERR(ptr: prepared))
1937	return PTR_ERR(ptr: prepared);
1938
1939	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1940	listener = get_unused_fd_flags(O_CLOEXEC);
1941	if (listener < `0`) {
1942	ret = listener;
1943	goto out_free;
1944	}
1945
1946	listener_f = init_listener(filter: prepared);
1947	if (IS_ERR(ptr: listener_f)) {
1948	put_unused_fd(fd: listener);
1949	ret = PTR_ERR(ptr: listener_f);
1950	goto out_free;
1951	}
1952	}
1953
1954	/*
1955	* Make sure we cannot change seccomp or nnp state via TSYNC
1956	* while another thread is in the middle of calling exec.
1957	*/
1958	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
1959	mutex_lock_killable(&current->signal->cred_guard_mutex))
1960	goto out_put_fd;
1961
1962	spin_lock_irq(lock: &current->sighand->siglock);
1963
1964	if (!seccomp_may_assign_mode(seccomp_mode))
1965	goto out;
1966
1967	if (has_duplicate_listener(new_child: prepared)) {
1968	ret = -EBUSY;
1969	goto out;
1970	}
1971
1972	ret = seccomp_attach_filter(flags, filter: prepared);
1973	if (ret)
1974	goto out;
1975	/ Do not free the successfully attached filter. /
1976	prepared = NULL;
1977
1978	seccomp_assign_mode(current, seccomp_mode, flags);
1979	out:
1980	spin_unlock_irq(lock: &current->sighand->siglock);
1981	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
1982	mutex_unlock(lock: &current->signal->cred_guard_mutex);
1983	out_put_fd:
1984	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1985	if (ret) {
1986	listener_f->private_data = NULL;
1987	fput(listener_f);
1988	put_unused_fd(fd: listener);
1989	seccomp_notify_detach(filter: prepared);
1990	} else {
1991	fd_install(fd: listener, file: listener_f);
1992	ret = listener;
1993	}
1994	}
1995	out_free:
1996	seccomp_filter_free(filter: prepared);
1997	return ret;
1998	}
1999	#else
2000	static inline long seccomp_set_mode_filter(unsigned int flags,
2001	const char __user *filter)
2002	{
2003	return -EINVAL;
2004	}
2005	#endif
2006
2007	static long seccomp_get_action_avail(const char __user *uaction)
2008	{
2009	u32 action;
2010
2011	if (copy_from_user(to: &action, from: uaction, n: sizeof(action)))
2012	return -EFAULT;
2013
2014	switch (action) {
2015	case SECCOMP_RET_KILL_PROCESS:
2016	case SECCOMP_RET_KILL_THREAD:
2017	case SECCOMP_RET_TRAP:
2018	case SECCOMP_RET_ERRNO:
2019	case SECCOMP_RET_USER_NOTIF:
2020	case SECCOMP_RET_TRACE:
2021	case SECCOMP_RET_LOG:
2022	case SECCOMP_RET_ALLOW:
2023	break;
2024	default:
2025	return -EOPNOTSUPP;
2026	}
2027
2028	return `0`;
2029	}
2030
2031	static long seccomp_get_notif_sizes(void __user *usizes)
2032	{
2033	struct seccomp_notif_sizes sizes = {
2034	.seccomp_notif = sizeof(struct seccomp_notif),
2035	.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
2036	.seccomp_data = sizeof(struct seccomp_data),
2037	};
2038
2039	if (copy_to_user(to: usizes, from: &sizes, n: sizeof(sizes)))
2040	return -EFAULT;
2041
2042	return `0`;
2043	}
2044
2045	/ Common entry point for both prctl and syscall. /
2046	static long do_seccomp(unsigned int op, unsigned int flags,
2047	void __user *uargs)
2048	{
2049	switch (op) {
2050	case SECCOMP_SET_MODE_STRICT:
2051	if (flags != `0` \|\| uargs != NULL)
2052	return -EINVAL;
2053	return seccomp_set_mode_strict();
2054	case SECCOMP_SET_MODE_FILTER:
2055	return seccomp_set_mode_filter(flags, filter: uargs);
2056	case SECCOMP_GET_ACTION_AVAIL:
2057	if (flags != `0`)
2058	return -EINVAL;
2059
2060	return seccomp_get_action_avail(uaction: uargs);
2061	case SECCOMP_GET_NOTIF_SIZES:
2062	if (flags != `0`)
2063	return -EINVAL;
2064
2065	return seccomp_get_notif_sizes(usizes: uargs);
2066	default:
2067	return -EINVAL;
2068	}
2069	}
2070
2071	SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
2072	void __user *, uargs)
2073	{
2074	return do_seccomp(op, flags, uargs);
2075	}
2076
2077	/**
2078	* prctl_set_seccomp: configures current->seccomp.mode
2079	* @seccomp_mode: requested mode to use
2080	* @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
2081	*
2082	* Returns 0 on success or -EINVAL on failure.
2083	*/
2084	long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
2085	{
2086	unsigned int op;
2087	void __user *uargs;
2088
2089	switch (seccomp_mode) {
2090	case SECCOMP_MODE_STRICT:
2091	op = SECCOMP_SET_MODE_STRICT;
2092	/*
2093	* Setting strict mode through prctl always ignored filter,
2094	* so make sure it is always NULL here to pass the internal
2095	* check in do_seccomp().
2096	*/
2097	uargs = NULL;
2098	break;
2099	case SECCOMP_MODE_FILTER:
2100	op = SECCOMP_SET_MODE_FILTER;
2101	uargs = filter;
2102	break;
2103	default:
2104	return -EINVAL;
2105	}
2106
2107	/ prctl interface doesn't have flags, so they are always zero. /
2108	return do_seccomp(op, flags: `0`, uargs);
2109	}
2110
2111	#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
2112	static struct seccomp_filter get_nth_filter(struct* task_struct *task,
2113	unsigned long filter_off)
2114	{
2115	struct seccomp_filter orig, filter;
2116	unsigned long count;
2117
2118	/*
2119	* Note: this is only correct because the caller should be the (ptrace)
2120	* tracer of the task, otherwise lock_task_sighand is needed.
2121	*/
2122	spin_lock_irq(lock: &task->sighand->siglock);
2123
2124	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
2125	spin_unlock_irq(lock: &task->sighand->siglock);
2126	return ERR_PTR(error: -EINVAL);
2127	}
2128
2129	orig = task->seccomp.filter;
2130	__get_seccomp_filter(filter: orig);
2131	spin_unlock_irq(lock: &task->sighand->siglock);
2132
2133	count = `0`;
2134	for (filter = orig; filter; filter = filter->prev)
2135	count++;
2136
2137	if (filter_off >= count) {
2138	filter = ERR_PTR(error: -ENOENT);
2139	goto out;
2140	}
2141
2142	count -= filter_off;
2143	for (filter = orig; filter && count > `1`; filter = filter->prev)
2144	count--;
2145
2146	if (WARN_ON(count != `1` \|\| !filter)) {
2147	filter = ERR_PTR(error: -ENOENT);
2148	goto out;
2149	}
2150
2151	__get_seccomp_filter(filter);
2152
2153	out:
2154	__put_seccomp_filter(orig);
2155	return filter;
2156	}
2157
2158	long seccomp_get_filter(struct task_struct task, unsigned* long filter_off,
2159	void __user *data)
2160	{
2161	struct seccomp_filter *filter;
2162	struct sock_fprog_kern *fprog;
2163	long ret;
2164
2165	if (!capable(CAP_SYS_ADMIN) \|\|
2166	current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2167	return -EACCES;
2168	}
2169
2170	filter = get_nth_filter(task, filter_off);
2171	if (IS_ERR(ptr: filter))
2172	return PTR_ERR(ptr: filter);
2173
2174	fprog = filter->prog->orig_prog;
2175	if (!fprog) {
2176	/ This must be a new non-cBPF filter, since we save*
2177	* every cBPF filter's orig_prog above when
2178	* CONFIG_CHECKPOINT_RESTORE is enabled.
2179	*/
2180	ret = -EMEDIUMTYPE;
2181	goto out;
2182	}
2183
2184	ret = fprog->len;
2185	if (!data)
2186	goto out;
2187
2188	if (copy_to_user(to: data, from: fprog->filter, bpf_classic_proglen(fprog)))
2189	ret = -EFAULT;
2190
2191	out:
2192	__put_seccomp_filter(orig: filter);
2193	return ret;
2194	}
2195
2196	long seccomp_get_metadata(struct task_struct *task,
2197	unsigned long size, void __user *data)
2198	{
2199	long ret;
2200	struct seccomp_filter *filter;
2201	struct seccomp_metadata kmd = {};
2202
2203	if (!capable(CAP_SYS_ADMIN) \|\|
2204	current->seccomp.mode != SECCOMP_MODE_DISABLED) {
2205	return -EACCES;
2206	}
2207
2208	size = min_t(unsigned long, size, sizeof(kmd));
2209
2210	if (size < sizeof(kmd.filter_off))
2211	return -EINVAL;
2212
2213	if (copy_from_user(to: &kmd.filter_off, from: data, n: sizeof(kmd.filter_off)))
2214	return -EFAULT;
2215
2216	filter = get_nth_filter(task, filter_off: kmd.filter_off);
2217	if (IS_ERR(ptr: filter))
2218	return PTR_ERR(ptr: filter);
2219
2220	if (filter->log)
2221	kmd.flags \|= SECCOMP_FILTER_FLAG_LOG;
2222
2223	ret = size;
2224	if (copy_to_user(to: data, from: &kmd, n: size))
2225	ret = -EFAULT;
2226
2227	__put_seccomp_filter(orig: filter);
2228	return ret;
2229	}
2230	#endif
2231
2232	#ifdef CONFIG_SYSCTL
2233
/* Human readable action names for friendly sysctl interaction */
#define SECCOMP_RET_KILL_PROCESS_NAME	"kill_process"
#define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
#define SECCOMP_RET_TRAP_NAME		"trap"
#define SECCOMP_RET_ERRNO_NAME		"errno"
#define SECCOMP_RET_USER_NOTIF_NAME	"user_notif"
#define SECCOMP_RET_TRACE_NAME		"trace"
#define SECCOMP_RET_LOG_NAME		"log"
#define SECCOMP_RET_ALLOW_NAME		"allow"

/*
 * Space-separated list of every action name. Its size is also used to
 * dimension the scratch buffers of the actions_logged sysctl handlers.
 */
static const char seccomp_actions_avail[] =
				SECCOMP_RET_KILL_PROCESS_NAME	" "
				SECCOMP_RET_KILL_THREAD_NAME	" "
				SECCOMP_RET_TRAP_NAME		" "
				SECCOMP_RET_ERRNO_NAME		" "
				SECCOMP_RET_USER_NOTIF_NAME	" "
				SECCOMP_RET_TRACE_NAME		" "
				SECCOMP_RET_LOG_NAME		" "
				SECCOMP_RET_ALLOW_NAME;
2253
2254	struct seccomp_log_name {
2255	u32 log;
2256	const char *name;
2257	};
2258
2259	static const struct seccomp_log_name seccomp_log_names[] = {
2260	{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
2261	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
2262	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
2263	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
2264	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
2265	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
2266	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
2267	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
2268	{ }
2269	};
2270
2271	static bool seccomp_names_from_actions_logged(char *names, size_t size,
2272	u32 actions_logged,
2273	const char *sep)
2274	{
2275	const struct seccomp_log_name *cur;
2276	bool append_sep = false;
2277
2278	for (cur = seccomp_log_names; cur->name && size; cur++) {
2279	ssize_t ret;
2280
2281	if (!(actions_logged & cur->log))
2282	continue;
2283
2284	if (append_sep) {
2285	ret = strscpy(p: names, q: sep, size);
2286	if (ret < `0`)
2287	return false;
2288
2289	names += ret;
2290	size -= ret;
2291	} else
2292	append_sep = true;
2293
2294	ret = strscpy(p: names, q: cur->name, size);
2295	if (ret < `0`)
2296	return false;
2297
2298	names += ret;
2299	size -= ret;
2300	}
2301
2302	return true;
2303	}
2304
2305	static bool seccomp_action_logged_from_name(u32 *action_logged,
2306	const char *name)
2307	{
2308	const struct seccomp_log_name *cur;
2309
2310	for (cur = seccomp_log_names; cur->name; cur++) {
2311	if (!strcmp(cur->name, name)) {
2312	*action_logged = cur->log;
2313	return true;
2314	}
2315	}
2316
2317	return false;
2318	}
2319
2320	static bool seccomp_actions_logged_from_names(u32 actions_logged, char* *names)
2321	{
2322	char *name;
2323
2324	*actions_logged = `0`;
2325	while ((name = strsep(&names, " ")) && *name) {
2326	u32 action_logged = `0`;
2327
2328	if (!seccomp_action_logged_from_name(action_logged: &action_logged, name))
2329	return false;
2330
2331	*actions_logged \|= action_logged;
2332	}
2333
2334	return true;
2335	}
2336
2337	static int read_actions_logged(struct ctl_table ro_table, void* *buffer,
2338	size_t lenp, loff_t ppos)
2339	{
2340	char names[sizeof(seccomp_actions_avail)];
2341	struct ctl_table table;
2342
2343	memset(names, `0`, sizeof(names));
2344
2345	if (!seccomp_names_from_actions_logged(names, size: sizeof(names),
2346	actions_logged: seccomp_actions_logged, sep: " "))
2347	return -EINVAL;
2348
2349	table = *ro_table;
2350	table.data = names;
2351	table.maxlen = sizeof(names);
2352	return proc_dostring(&table, `0`, buffer, lenp, ppos);
2353	}
2354
2355	static int write_actions_logged(struct ctl_table ro_table, void* *buffer,
2356	size_t lenp, loff_t ppos, u32 *actions_logged)
2357	{
2358	char names[sizeof(seccomp_actions_avail)];
2359	struct ctl_table table;
2360	int ret;
2361
2362	if (!capable(CAP_SYS_ADMIN))
2363	return -EPERM;
2364
2365	memset(names, `0`, sizeof(names));
2366
2367	table = *ro_table;
2368	table.data = names;
2369	table.maxlen = sizeof(names);
2370	ret = proc_dostring(&table, `1`, buffer, lenp, ppos);
2371	if (ret)
2372	return ret;
2373
2374	if (!seccomp_actions_logged_from_names(actions_logged, names: table.data))
2375	return -EINVAL;
2376
2377	if (*actions_logged & SECCOMP_LOG_ALLOW)
2378	return -EINVAL;
2379
2380	seccomp_actions_logged = *actions_logged;
2381	return `0`;
2382	}
2383
2384	static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
2385	int ret)
2386	{
2387	char names[sizeof(seccomp_actions_avail)];
2388	char old_names[sizeof(seccomp_actions_avail)];
2389	const char *new = names;
2390	const char *old = old_names;
2391
2392	if (!audit_enabled)
2393	return;
2394
2395	memset(names, `0`, sizeof(names));
2396	memset(old_names, `0`, sizeof(old_names));
2397
2398	if (ret)
2399	new = "?";
2400	else if (!actions_logged)
2401	new = "(none)";
2402	else if (!seccomp_names_from_actions_logged(names, size: sizeof(names),
2403	actions_logged, sep: ","))
2404	new = "?";
2405
2406	if (!old_actions_logged)
2407	old = "(none)";
2408	else if (!seccomp_names_from_actions_logged(names: old_names,
2409	size: sizeof(old_names),
2410	actions_logged: old_actions_logged, sep: ","))
2411	old = "?";
2412
2413	return audit_seccomp_actions_logged(names: new, old_names: old, res: !ret);
2414	}
2415
2416	static int seccomp_actions_logged_handler(struct ctl_table ro_table, int* write,
2417	void buffer, size_t lenp,
2418	loff_t *ppos)
2419	{
2420	int ret;
2421
2422	if (write) {
2423	u32 actions_logged = `0`;
2424	u32 old_actions_logged = seccomp_actions_logged;
2425
2426	ret = write_actions_logged(ro_table, buffer, lenp, ppos,
2427	actions_logged: &actions_logged);
2428	audit_actions_logged(actions_logged, old_actions_logged, ret);
2429	} else
2430	ret = read_actions_logged(ro_table, buffer, lenp, ppos);
2431
2432	return ret;
2433	}
2434
2435	static struct ctl_table seccomp_sysctl_table[] = {
2436	{
2437	.procname = "actions_avail",
2438	.data = (void *) &seccomp_actions_avail,
2439	.maxlen = sizeof(seccomp_actions_avail),
2440	.mode = `0444`,
2441	.proc_handler = proc_dostring,
2442	},
2443	{
2444	.procname = "actions_logged",
2445	.mode = `0644`,
2446	.proc_handler = seccomp_actions_logged_handler,
2447	},
2448	{ }
2449	};
2450
2451	static int __init seccomp_sysctl_init(void)
2452	{
2453	register_sysctl_init("kernel/seccomp", seccomp_sysctl_table);
2454	return `0`;
2455	}
2456
2457	device_initcall(seccomp_sysctl_init)
2458
2459	#endif /* CONFIG_SYSCTL */
2460
2461	#ifdef CONFIG_SECCOMP_CACHE_DEBUG
/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */

/*
 * Print one line per bit of a cache bitmap for a single architecture.
 *
 * @m:           seq_file to print into.
 * @name:        architecture name printed at the start of each line.
 * @bitmap:      bitmap to dump; a set bit prints "ALLOW", a clear bit
 *               prints "FILTER".
 * @bitmap_size: number of bits to dump.
 *
 * Each line has the form "<name> <nr> <status>".
 */
static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
					const void *bitmap, size_t bitmap_size)
{
	int nr;

	for (nr = 0; nr < bitmap_size; nr++) {
		bool cached = test_bit(nr, bitmap);
		const char *status = cached ? "ALLOW" : "FILTER";

		seq_printf(m, "%s %d %s\n", name, nr, status);
	}
}
2475
2476	int proc_pid_seccomp_cache(struct seq_file m, struct* pid_namespace *ns,
2477	struct pid pid, struct* task_struct *task)
2478	{
2479	struct seccomp_filter *f;
2480	unsigned long flags;
2481
2482	/*
2483	* We don't want some sandboxed process to know what their seccomp
2484	* filters consist of.
2485	*/
2486	if (!file_ns_capable(file: m->file, ns: &init_user_ns, CAP_SYS_ADMIN))
2487	return -EACCES;
2488
2489	if (!lock_task_sighand(task, flags: &flags))
2490	return -ESRCH;
2491
2492	f = READ_ONCE(task->seccomp.filter);
2493	if (!f) {
2494	unlock_task_sighand(task, flags: &flags);
2495	return `0`;
2496	}
2497
2498	/ prevent filter from being freed while we are printing it /
2499	__get_seccomp_filter(filter: f);
2500	unlock_task_sighand(task, flags: &flags);
2501
2502	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
2503	bitmap: f->cache.allow_native,
2504	SECCOMP_ARCH_NATIVE_NR);
2505
2506	#ifdef SECCOMP_ARCH_COMPAT
2507	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
2508	bitmap: f->cache.allow_compat,
2509	SECCOMP_ARCH_COMPAT_NR);
2510	#endif /* SECCOMP_ARCH_COMPAT */
2511
2512	__put_seccomp_filter(orig: f);
2513	return `0`;
2514	}
2515	#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
2516

source code of linux/kernel/seccomp.c