user-trap.c source code [linux/samples/seccomp/user-trap.c]

1	#include <signal.h>
2	#include <stdio.h>
3	#include <stdlib.h>
4	#include <unistd.h>
5	#include <errno.h>
6	#include <fcntl.h>
7	#include <string.h>
8	#include <stddef.h>
9	#include <sys/sysmacros.h>
10	#include <sys/types.h>
11	#include <sys/wait.h>
12	#include <sys/socket.h>
13	#include <sys/stat.h>
14	#include <sys/mman.h>
15	#include <sys/syscall.h>
16	#include <sys/user.h>
17	#include <sys/ioctl.h>
18	#include <sys/ptrace.h>
19	#include <sys/mount.h>
20	#include <linux/limits.h>
21	#include <linux/filter.h>
22	#include <linux/seccomp.h>
23
/* Element count of a true array (not a pointer): total size over one element. */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
25
/*
 * Invoke the seccomp system call through syscall().
 *
 * @op:    seccomp operation (e.g. SECCOMP_SET_MODE_FILTER).
 * @flags: operation-specific flags.
 * @args:  operation-specific argument pointer.
 *
 * errno is cleared before the call so that a caller inspecting errno
 * afterwards never sees a stale value from an earlier failure.
 *
 * Returns whatever the system call returns (truncated to int).
 */
static int seccomp(unsigned int op, unsigned int flags, void *args)
{
	errno = 0;
	return syscall(__NR_seccomp, op, flags, args);
}
31
/*
 * Pass the file descriptor @fd to the peer of the socket @sock as
 * SCM_RIGHTS ancillary data, alongside a single dummy payload byte 'c'.
 *
 * Returns 0 on success, or -1 after printing a diagnostic if sendmsg()
 * fails.
 */
static int send_fd(int sock, int fd)
{
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	/* The union guarantees the control buffer is aligned for a cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control = {0};
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = control.buf;
	msg.msg_controllen = sizeof(control.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* memcpy instead of a cast-and-store: no alignment/aliasing assumptions. */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	msg.msg_controllen = cmsg->cmsg_len;

	if (sendmsg(sock, &msg, 0) < 0) {
		perror("sendmsg");
		return -1;
	}

	return 0;
}
60
/*
 * Receive one message from the socket @sock and return the file
 * descriptor carried in its SCM_RIGHTS ancillary data.
 *
 * Returns the received descriptor, or -1 after printing a diagnostic if
 * recvmsg() fails or the message carries no usable SCM_RIGHTS control
 * header (for example because the peer sent plain data or closed the
 * socket without sending anything).
 */
static int recv_fd(int sock)
{
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	/* The union guarantees the control buffer is aligned for a cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control = {0};
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	int fd;

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = control.buf;
	msg.msg_controllen = sizeof(control.buf);

	if (recvmsg(sock, &msg, 0) < 0) {
		perror("recvmsg");
		return -1;
	}

	/*
	 * CMSG_FIRSTHDR() yields NULL when no ancillary data arrived, so
	 * check the header before reading a descriptor out of it.
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
		fprintf(stderr, "recvmsg: no file descriptor received\n");
		return -1;
	}

	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}
85
/*
 * Install a seccomp filter on the calling task that answers
 * SECCOMP_RET_USER_NOTIF for system call number @nr and
 * SECCOMP_RET_ALLOW for every other system call.
 *
 * @nr:    system call number to trap.
 * @flags: flags forwarded unchanged to seccomp(SECCOMP_SET_MODE_FILTER).
 *
 * Returns the result of seccomp(); main() passes
 * SECCOMP_FILTER_FLAG_NEW_LISTENER and treats a non-negative result as
 * the notification listener fd.
 *
 * NOTE(review): the filter compares only seccomp_data.nr and never looks
 * at seccomp_data.arch; acceptable for a sample, confirm before reuse.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter filter[] = {
		/* Load the system call number into the accumulator. */
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		/* On a match fall through to USER_NOTIF, otherwise skip to ALLOW. */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
	};

	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}
103
/*
 * Read a string out of another task's memory.
 *
 * Seeks the already-open /proc/<pid>/mem descriptor @mem to @addr, reads
 * at most @size - 1 bytes into @buf and always NUL-terminates the result,
 * so the buffer is safe to hand to string functions even when the read is
 * short or the remote data contains no terminator.
 *
 * Returns 0 on success, -1 after printing a diagnostic on failure.
 */
static int read_task_string(int mem, __u64 addr, char *buf, size_t size)
{
	ssize_t len;

	if (lseek(mem, (off_t)addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	len = read(mem, buf, size - 1);
	if (len < 0) {
		perror("read");
		return -1;
	}

	buf[len] = '\0';
	return 0;
}

/*
 * Decide how to answer one seccomp notification and fill in @resp.
 *
 * @req:      notification received from the listener.
 * @resp:     response to fill in; defaults to failing the call with EPERM.
 * @listener: seccomp notification fd, used to re-validate @req->id.
 *
 * Only mount requests with MS_BIND set whose source and target both start
 * with "/tmp/" are honoured: the mount is performed here on the task's
 * behalf and @resp->error is cleared. Every other mount keeps the EPERM
 * default.
 *
 * Returns 0 when @resp is ready to be sent (whether it grants or denies
 * the request), or -1 on an unexpected system call or any failure while
 * inspecting the task or performing the mount.
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%d/mem", (int)req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to the task by its pid, but
	 * since the task that made the syscall may have died, we need to
	 * confirm that the pid is still valid after we open its
	 * /proc/pid/mem file. We can ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we should read all of the pointer
	 * args before we decide to allow the syscall.
	 */
	if (read_task_string(mem, req->data.args[0], source, sizeof(source)) < 0)
		goto out;

	if (read_task_string(mem, req->data.args[1], target, sizeof(target)) < 0)
		goto out;

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
	 * interesting, because we could do unprivileged bind mounts with user
	 * namespaces already, but you get the idea.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			ret = -1;
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}
200
/*
 * Demonstrate seccomp user notifications with three processes:
 *
 *  - a worker that installs a filter trapping mount, drops to uid 1000,
 *    hands the listener fd to the parent and then attempts two mounts;
 *  - a tracer that receives the notifications from the listener and
 *    answers them via handle_req();
 *  - the parent, which waits for the worker, cleans up /tmp/foo and
 *    kills the tracer.
 *
 * Returns 0 if the worker exited with status 0 and cleanup succeeded,
 * 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0) {
		listener = user_trap_syscall(__NR_mount,
					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0) {
			perror("seccomp");
			exit(1);
		}

		/*
		 * Drop privileges. We definitely can't mount as uid 1000.
		 */
		if (setuid(1000) < 0) {
			perror("setuid");
			exit(1);
		}

		/*
		 * Send the listener to the parent; also serves as
		 * synchronization.
		 */
		if (send_fd(sk_pair[1], listener) < 0)
			exit(1);
		close(listener);

		if (mkdir("/tmp/foo", 0755) < 0) {
			perror("mkdir");
			exit(1);
		}

		/*
		 * Try a bad mount just for grins.
		 */
		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
			fprintf(stderr, "huh? mounted /dev/sda?\n");
			exit(1);
		}

		if (errno != EPERM) {
			perror("bad error from mount");
			exit(1);
		}

		/*
		 * Ok, we expect this one to succeed.
		 */
		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
			perror("mount");
			exit(1);
		}

		exit(0);
	}

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		goto out_kill;
	}

	if (tracer == 0) {
		struct seccomp_notif *req;
		struct seccomp_notif_resp *resp;
		struct seccomp_notif_sizes sizes;

		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
			perror("seccomp(GET_NOTIF_SIZES)");
			goto out_close;
		}

		req = malloc(sizes.seccomp_notif);
		if (!req)
			goto out_close;

		resp = malloc(sizes.seccomp_notif_resp);
		if (!resp)
			goto out_req;
		memset(resp, 0, sizes.seccomp_notif_resp);

		while (1) {
			memset(req, 0, sizes.seccomp_notif);
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
				perror("ioctl recv");
				goto out_resp;
			}

			if (handle_req(req, resp, listener) < 0)
				goto out_resp;

			/*
			 * ENOENT here means that the task may have gotten a
			 * signal and restarted the syscall. It's up to the
			 * handler to decide what to do in this case, but for
			 * the sample code, we just ignore it. Probably
			 * something better should happen, like undoing the
			 * mount, or keeping track of the args to make sure we
			 * don't do it again.
			 */
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
			    errno != ENOENT) {
				perror("ioctl send");
				goto out_resp;
			}
		}
out_resp:
		free(resp);
out_req:
		free(req);
out_close:
		close(listener);
		exit(1);
	}

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}
	/*
	 * The worker has been reaped; forget its pid so the cleanup below
	 * cannot signal an unrelated process that reused it.
	 */
	worker = 0;

	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	/* Fail through out_kill so the tracer is not left running. */
	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	if (tracer > 0)
		kill(tracer, SIGKILL);
	if (worker > 0)
		kill(worker, SIGKILL);

close_pair:
	close(sk_pair[0]);
	close(sk_pair[1]);
	return ret;
}
376

source code of linux/samples/seccomp/user-trap.c