user-trap.c source code [linux/samples/seccomp/user-trap.c]

1	#include <signal.h>
2	#include <stdio.h>
3	#include <stdlib.h>
4	#include <unistd.h>
5	#include <errno.h>
6	#include <fcntl.h>
7	#include <string.h>
8	#include <stddef.h>
9	#include <sys/sysmacros.h>
10	#include <sys/types.h>
11	#include <sys/wait.h>
12	#include <sys/socket.h>
13	#include <sys/stat.h>
14	#include <sys/mman.h>
15	#include <sys/syscall.h>
16	#include <sys/user.h>
17	#include <sys/ioctl.h>
18	#include <sys/ptrace.h>
19	#include <sys/mount.h>
20	#include <linux/limits.h>
21	#include <linux/filter.h>
22	#include <linux/seccomp.h>
23
/*
 * Number of elements in the array x.
 *
 * Only meaningful for objects of true array type: when x is a pointer
 * (including an array-typed function parameter) the result is unrelated to
 * the number of elements it points at.
 */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
25
/*
 * Invoke the seccomp system call through syscall().
 *
 * op, flags and args are forwarded unchanged. errno is cleared before the
 * call so that a value left over from an earlier failure cannot be mistaken
 * for one set here.
 *
 * Returns the value produced by syscall(), converted to int.
 */
static int seccomp(unsigned int op, unsigned int flags, void *args)
{
	errno = 0;
	return syscall(__NR_seccomp, op, flags, args);
}
31
/*
 * Pass the file descriptor fd to the peer of socket sock as SCM_RIGHTS
 * ancillary data, accompanied by a single payload byte ('c').
 *
 * Returns 0 on success. If sendmsg() fails, a diagnostic is printed with
 * perror() and -1 is returned.
 */
static int send_fd(int sock, int fd)
{
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	/*
	 * The union guarantees that the control buffer is suitably aligned
	 * for struct cmsghdr; a bare char array gives no such guarantee.
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control = { .buf = {0} };
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = control.buf;
	msg.msg_controllen = sizeof(control.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* Copy instead of storing through a cast pointer into the payload. */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	msg.msg_controllen = cmsg->cmsg_len;

	if (sendmsg(sock, &msg, 0) < 0) {
		perror("sendmsg");
		return -1;
	}

	return 0;
}
62
/*
 * Receive one file descriptor sent as SCM_RIGHTS ancillary data on socket
 * sock (the counterpart of send_fd()).
 *
 * Returns the received descriptor on success. Returns -1 if recvmsg() fails
 * (after printing a diagnostic with perror()) or if the received message does
 * not carry exactly the expected SCM_RIGHTS control message, e.g. because the
 * peer closed the socket or sent plain data.
 */
static int recv_fd(int sock)
{
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	int fd;
	/* Union keeps the control buffer aligned for struct cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control = { .buf = {0} };
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = control.buf;
	msg.msg_controllen = sizeof(control.buf);

	if (recvmsg(sock, &msg, 0) < 0) {
		perror("recvmsg");
		return -1;
	}

	/*
	 * CMSG_FIRSTHDR() yields NULL when no control data arrived, so the
	 * header must be validated before its payload is touched.
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || (msg.msg_flags & MSG_CTRUNC) ||
	    cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
		fprintf(stderr, "recvmsg: no file descriptor received\n");
		return -1;
	}

	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}
89
/*
 * Install a seccomp filter that hands system call number nr to a user-space
 * notification listener and allows every other system call.
 *
 * flags is passed straight through to seccomp(SECCOMP_SET_MODE_FILTER).
 * The filter inspects only seccomp_data.nr; it does not look at the
 * architecture field.
 *
 * Returns the result of seccomp(). The caller in this file passes
 * SECCOMP_FILTER_FLAG_NEW_LISTENER and treats a non-negative result as the
 * listener file descriptor.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter filter[] = {
		/* Load the system call number. */
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		/* Match: fall through to USER_NOTIF; otherwise skip to ALLOW. */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
	};

	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}
107
/*
 * Read up to len - 1 bytes from the open /proc/<pid>/mem descriptor mem,
 * starting at address addr, into buf and NUL-terminate the result so it can
 * be used as a C string.
 *
 * Returns 0 on success, -1 (after perror()) if seeking or reading fails.
 */
static int read_task_path(int mem, off_t addr, char *buf, size_t len)
{
	ssize_t n;

	if (lseek(mem, addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	n = read(mem, buf, len - 1);
	if (n < 0) {
		perror("read");
		return -1;
	}

	/* A short read is fine; whatever arrived is terminated here. */
	buf[n] = '\0';
	return 0;
}

/*
 * Decide how to answer one seccomp user notification.
 *
 * req is the notification received from the listener; resp is filled in with
 * the reply to send back; listener is the notification file descriptor, used
 * to confirm that the request is still alive.
 *
 * The reply defaults to failing the system call with EPERM. Only a bind mount
 * whose source and target both start with "/tmp/" is performed on the task's
 * behalf, in which case the reply reports success.
 *
 * Returns 0 when resp holds a reply that should be sent (including the
 * "denied by policy" case), and -1 when the request could not be processed:
 * an unexpected system call, failure to open or read the task's memory, a
 * request that is no longer valid, or a failing mount().
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
	 * the pid that made the syscall may have died, we need to confirm that
	 * the pid is still valid after we open its /proc/pid/mem file. We can
	 * ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we should read all of the pointer args
	 * before we decide to allow the syscall.
	 *
	 * Both strings are NUL-terminated by the helper, so they are safe to
	 * hand to strncmp() and mount() below.
	 */
	if (read_task_path(mem, req->data.args[0], source, sizeof(source)) < 0)
		goto out;

	if (read_task_path(mem, req->data.args[1], target, sizeof(target)) < 0)
		goto out;

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
	 * interesting, because we could do unprivileged bind mounts with user
	 * namespaces already, but you get the idea.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}
204
/*
 * Demonstrate seccomp user notification.
 *
 * Three processes cooperate:
 *  - the worker (first child) installs a filter that traps mount(), drops
 *    to uid 1000, sends the listener fd to the parent and then attempts one
 *    mount that must be refused and one bind mount inside /tmp that must
 *    succeed;
 *  - the tracer (second child) receives notifications on the listener and
 *    answers them via handle_req();
 *  - the parent waits for the worker, cleans up /tmp/foo and kills the
 *    tracer.
 *
 * Returns 0 if the worker exited with status 0 and cleanup succeeded,
 * 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0) {
		listener = user_trap_syscall(__NR_mount,
					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0) {
			perror("seccomp");
			exit(1);
		}

		/*
		 * Drop privileges. We definitely can't mount as uid 1000.
		 */
		if (setuid(1000) < 0) {
			perror("setuid");
			exit(1);
		}

		/*
		 * Send the listener to the parent; also serves as
		 * synchronization.
		 */
		if (send_fd(sk_pair[1], listener) < 0)
			exit(1);
		close(listener);

		if (mkdir("/tmp/foo", 0755) < 0) {
			perror("mkdir");
			exit(1);
		}

		/*
		 * Try a bad mount just for grins.
		 */
		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
			fprintf(stderr, "huh? mounted /dev/sda?\n");
			exit(1);
		}

		if (errno != EPERM) {
			perror("bad error from mount");
			exit(1);
		}

		/*
		 * Ok, we expect this one to succeed.
		 */
		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
			perror("mount");
			exit(1);
		}

		exit(0);
	}

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		goto out_kill;
	}

	if (tracer == 0) {
		struct seccomp_notif *req;
		struct seccomp_notif_resp *resp;
		struct seccomp_notif_sizes sizes;

		/* Ask the kernel how large the notification structs are. */
		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
			perror("seccomp(GET_NOTIF_SIZES)");
			goto out_close;
		}

		req = malloc(sizes.seccomp_notif);
		if (!req)
			goto out_close;

		resp = malloc(sizes.seccomp_notif_resp);
		if (!resp)
			goto out_req;
		memset(resp, 0, sizes.seccomp_notif_resp);

		while (1) {
			memset(req, 0, sizes.seccomp_notif);
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
				perror("ioctl recv");
				goto out_resp;
			}

			if (handle_req(req, resp, listener) < 0)
				goto out_resp;

			/*
			 * ENOENT here means that the task may have gotten a
			 * signal and restarted the syscall. It's up to the
			 * handler to decide what to do in this case, but for
			 * the sample code, we just ignore it. Probably
			 * something better should happen, like undoing the
			 * mount, or keeping track of the args to make sure we
			 * don't do it again.
			 */
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
			    errno != ENOENT) {
				perror("ioctl send");
				goto out_resp;
			}
		}
out_resp:
		free(resp);
out_req:
		free(req);
out_close:
		close(listener);
		exit(1);
	}

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}

	/*
	 * The worker has been reaped; forget its pid so the cleanup path
	 * cannot signal an unrelated process that later reuses the number.
	 */
	worker = 0;

	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	/* Go through out_kill on failure so the tracer is not left behind. */
	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	if (tracer > 0)
		kill(tracer, SIGKILL);
	if (worker > 0)
		kill(worker, SIGKILL);

close_pair:
	close(sk_pair[0]);
	close(sk_pair[1]);
	return ret;
}
380

source code of linux/samples/seccomp/user-trap.c