1 | #include <signal.h> |
2 | #include <stdio.h> |
3 | #include <stdlib.h> |
4 | #include <unistd.h> |
5 | #include <errno.h> |
6 | #include <fcntl.h> |
7 | #include <string.h> |
8 | #include <stddef.h> |
9 | #include <sys/sysmacros.h> |
10 | #include <sys/types.h> |
11 | #include <sys/wait.h> |
12 | #include <sys/socket.h> |
13 | #include <sys/stat.h> |
14 | #include <sys/mman.h> |
15 | #include <sys/syscall.h> |
16 | #include <sys/user.h> |
17 | #include <sys/ioctl.h> |
18 | #include <sys/ptrace.h> |
19 | #include <sys/mount.h> |
20 | #include <linux/limits.h> |
21 | #include <linux/filter.h> |
22 | #include <linux/seccomp.h> |
23 | |
/* Element count of a real array; meaningless when handed a pointer. */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
25 | |
/*
 * Thin wrapper that issues the raw seccomp system call through syscall().
 *
 * errno is reset to 0 before the call is made. The value handed back by
 * syscall() is returned to the caller narrowed to int.
 */
static int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return (int)rc;
}
31 | |
/*
 * Pass the file descriptor @fd over the socket @sock as SCM_RIGHTS ancillary
 * data, accompanied by a single byte ('c') of ordinary payload.
 *
 * Returns 0 on success. If sendmsg() fails, the error is reported with
 * perror() and -1 is returned.
 */
static int send_fd(int sock, int fd)
{
	/*
	 * The union gives the control buffer the alignment struct cmsghdr
	 * needs; a bare char array carries no such guarantee.
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctl;
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;

	memset(&ctl, 0, sizeof(ctl));

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = ctl.buf;
	msg.msg_controllen = sizeof(ctl.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* CMSG_DATA() is not guaranteed to be int-aligned, so copy bytes. */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	msg.msg_controllen = cmsg->cmsg_len;

	if (sendmsg(sock, &msg, 0) < 0) {
		perror("sendmsg");
		return -1;
	}

	return 0;
}
62 | |
/*
 * Receive one message from @sock and return the file descriptor carried in
 * its SCM_RIGHTS ancillary data.
 *
 * Returns the received descriptor on success. Returns -1 if recvmsg() fails
 * (reported with perror()) or if the message does not carry a descriptor,
 * for example because the peer sent plain data or closed the socket.
 */
static int recv_fd(int sock)
{
	/* Union so the control buffer is aligned for struct cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctl;
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	int fd;

	memset(&ctl, 0, sizeof(ctl));

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = ctl.buf;
	msg.msg_controllen = sizeof(ctl.buf);

	if (recvmsg(sock, &msg, 0) < 0) {
		perror("recvmsg");
		return -1;
	}

	/*
	 * CMSG_FIRSTHDR() yields NULL when no ancillary data arrived; never
	 * read a descriptor out of a header that is absent or of another kind.
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
		fprintf(stderr, "recvmsg: no file descriptor received\n");
		return -1;
	}

	/* CMSG_DATA() is not guaranteed to be int-aligned, so copy bytes. */
	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));

	return fd;
}
89 | |
/*
 * Install a seccomp filter that answers SECCOMP_RET_USER_NOTIF for system
 * call number @nr and SECCOMP_RET_ALLOW for every other system call.
 *
 * @flags is handed unchanged to seccomp(SECCOMP_SET_MODE_FILTER, ...), and
 * the result of that call is returned as-is.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter insns[] = {
		/* Load the number of the system call being made. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Match: fall through to the next insn. No match: skip it. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {0};

	prog.len = (unsigned short)ARRAY_SIZE(insns);
	prog.filter = insns;

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}
107 | |
/*
 * Copy a NUL-terminated string found at offset @addr of the open
 * /proc/<pid>/mem descriptor @mem into @buf, which holds @size bytes.
 *
 * Returns 1 when the bytes read contain a terminator, 0 when they do not
 * (short read or over-long string), and -1 after perror() if lseek() or
 * read() fails.
 */
static int read_tracee_path(int mem, unsigned long long addr, char *buf,
			    size_t size)
{
	ssize_t len;

	if (lseek(mem, (off_t)addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	len = read(mem, buf, size);
	if (len < 0) {
		perror("read");
		return -1;
	}

	return memchr(buf, '\0', (size_t)len) != NULL;
}

/*
 * Build the response for one trapped mount() notification.
 *
 * @req:      notification received from the listener
 * @resp:     response to fill in: id is copied from @req, val is set to 0
 *            and error starts out as -EPERM
 * @listener: seccomp notification fd, used to re-check that @req->id is
 *            still valid
 *
 * Only MS_BIND requests whose source and target both begin with "/tmp/" are
 * carried out, by calling mount() from this process; resp->error is then
 * cleared. Every other request keeps the -EPERM answer.
 *
 * Returns 0 when @resp is ready to be sent (whether the mount was allowed
 * or denied), and -1 when the request could not be processed.
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem, have_source, have_target;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%d/mem", (int)req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
	 * the pid that made the syscall may have died, we need to confirm that
	 * the pid is still valid after we open its /proc/pid/mem file. We can
	 * ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we should read all of the pointer args
	 * before we decide to allow the syscall.
	 */
	have_source = read_tracee_path(mem, req->data.args[0], source,
				       sizeof(source));
	if (have_source < 0)
		goto out;

	have_target = read_tracee_path(mem, req->data.args[1], target,
				       sizeof(target));
	if (have_target < 0)
		goto out;

	/*
	 * The buffers are only usable as C strings if a terminator was
	 * actually read from the task. Otherwise comparing them or handing
	 * them to mount() would run into uninitialized or out-of-bounds
	 * memory, so answer with the default -EPERM instead.
	 */
	if (!have_source || !have_target) {
		ret = 0;
		goto out;
	}

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
	 * interesting, because we could do unprivileged bind mounts with user
	 * namespaces already, but you get the idea.
	 *
	 * NOTE(review): this is a plain prefix comparison; paths are not
	 * canonicalized, so "/tmp/../x" also matches. Acceptable for a
	 * sample, not for a real policy.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			ret = -1;
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}
204 | |
/*
 * Body of the worker child: install the mount() trap, drop privileges, hand
 * the listener fd to the parent over @sock, then attempt one mount that must
 * be refused and one bind mount that must succeed.
 *
 * Never returns; exits with status 0 on success and 1 on any failure.
 */
static void run_worker(int sock)
{
	int listener;

	listener = user_trap_syscall(__NR_mount,
				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
	if (listener < 0) {
		perror("seccomp");
		exit(1);
	}

	/*
	 * Drop privileges. We definitely can't mount as uid 1000.
	 */
	if (setuid(1000) < 0) {
		perror("setuid");
		exit(1);
	}

	/*
	 * Send the listener to the parent; also serves as
	 * synchronization.
	 */
	if (send_fd(sock, listener) < 0)
		exit(1);
	close(listener);

	if (mkdir("/tmp/foo", 0755) < 0) {
		perror("mkdir");
		exit(1);
	}

	/*
	 * Try a bad mount just for grins.
	 */
	if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
		fprintf(stderr, "huh? mounted /dev/sda?\n");
		exit(1);
	}

	if (errno != EPERM) {
		perror("bad error from mount");
		exit(1);
	}

	/*
	 * Ok, we expect this one to succeed.
	 */
	if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
		perror("mount");
		exit(1);
	}

	exit(0);
}

/*
 * Body of the tracer child: receive notifications from @listener, let
 * handle_req() build each answer and send it back, in an endless loop.
 *
 * Never returns. The loop is only left on error, after which the buffers
 * and @listener are released and the process exits with status 1.
 */
static void run_tracer(int listener)
{
	struct seccomp_notif *req;
	struct seccomp_notif_resp *resp;
	struct seccomp_notif_sizes sizes;

	if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
		perror("seccomp(GET_NOTIF_SIZES)");
		goto out_close;
	}

	req = malloc(sizes.seccomp_notif);
	if (!req)
		goto out_close;

	resp = malloc(sizes.seccomp_notif_resp);
	if (!resp)
		goto out_req;
	memset(resp, 0, sizes.seccomp_notif_resp);

	while (1) {
		memset(req, 0, sizes.seccomp_notif);
		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
			perror("ioctl recv");
			goto out_resp;
		}

		if (handle_req(req, resp, listener) < 0)
			goto out_resp;

		/*
		 * ENOENT here means that the task may have gotten a
		 * signal and restarted the syscall. It's up to the
		 * handler to decide what to do in this case, but for
		 * the sample code, we just ignore it. Probably
		 * something better should happen, like undoing the
		 * mount, or keeping track of the args to make sure we
		 * don't do it again.
		 */
		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
		    errno != ENOENT) {
			perror("ioctl send");
			goto out_resp;
		}
	}
out_resp:
	free(resp);
out_req:
	free(req);
out_close:
	close(listener);
	exit(1);
}

/*
 * Demo driver: fork a worker that traps its own mount() calls, receive the
 * worker's listener fd, fork a tracer that services the notifications, then
 * wait for the worker and clean up /tmp/foo.
 *
 * Returns 0 if the worker exited with status 0 and cleanup succeeded,
 * 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0)
		run_worker(sk_pair[1]);	/* does not return */

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		goto out_kill;
	}

	if (tracer == 0)
		run_tracer(listener);	/* does not return */

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}

	/*
	 * The worker has been reaped, so its pid may be reused by an
	 * unrelated process; make sure the cleanup below never signals it.
	 */
	worker = 0;

	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		/* Go through the common cleanup so the tracer is killed too. */
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	if (tracer > 0)
		kill(tracer, SIGKILL);
	if (worker > 0)
		kill(worker, SIGKILL);

close_pair:
	close(sk_pair[0]);
	close(sk_pair[1]);
	return ret;
}
380 | |