1 | #include <signal.h> |
2 | #include <stdio.h> |
3 | #include <stdlib.h> |
4 | #include <unistd.h> |
5 | #include <errno.h> |
6 | #include <fcntl.h> |
7 | #include <string.h> |
8 | #include <stddef.h> |
9 | #include <sys/sysmacros.h> |
10 | #include <sys/types.h> |
11 | #include <sys/wait.h> |
12 | #include <sys/socket.h> |
13 | #include <sys/stat.h> |
14 | #include <sys/mman.h> |
15 | #include <sys/syscall.h> |
16 | #include <sys/user.h> |
17 | #include <sys/ioctl.h> |
18 | #include <sys/ptrace.h> |
19 | #include <sys/mount.h> |
20 | #include <linux/limits.h> |
21 | #include <linux/filter.h> |
22 | #include <linux/seccomp.h> |
23 | |
/* Element count of a true array; must not be applied to a pointer. */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
25 | |
/*
 * Invoke the raw seccomp system call.
 *
 * errno is cleared before the call so that a stale value from an earlier
 * failure cannot be mistaken for the outcome of this one. The syscall's
 * result is returned unchanged (narrowed to int).
 */
static int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return (int)rc;
}
31 | |
/*
 * send_fd - pass an open file descriptor over a UNIX domain socket.
 * @sock: socket to send on
 * @fd:   descriptor to transfer as SCM_RIGHTS ancillary data
 *
 * One dummy payload byte ('c') accompanies the control message.
 *
 * Returns 0 on success, or -1 after reporting the sendmsg() failure with
 * perror().
 */
static int send_fd(int sock, int fd)
{
	/*
	 * The union guarantees that the control buffer is suitably aligned
	 * for struct cmsghdr; a bare char array carries no such guarantee.
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl = {0};
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	struct msghdr msg = {
		.msg_iov = &io,
		.msg_iovlen = 1,
		.msg_control = ctrl.buf,
		.msg_controllen = sizeof(ctrl.buf),
	};
	struct cmsghdr *cmsg;

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* Copy rather than store through a cast pointer (alignment/aliasing). */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	msg.msg_controllen = cmsg->cmsg_len;

	if (sendmsg(sock, &msg, 0) < 0) {
		perror("sendmsg");
		return -1;
	}

	return 0;
}
60 | |
/*
 * recv_fd - receive a file descriptor sent as SCM_RIGHTS ancillary data.
 * @sock: socket to receive from
 *
 * Returns the received descriptor, or -1 if recvmsg() fails or if the
 * message carries no usable SCM_RIGHTS control message (for example
 * because the peer closed the socket or sent plain data only).
 */
static int recv_fd(int sock)
{
	/* Union keeps the control buffer aligned for struct cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl = {0};
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	struct msghdr msg = {
		.msg_iov = &io,
		.msg_iovlen = 1,
		.msg_control = ctrl.buf,
		.msg_controllen = sizeof(ctrl.buf),
	};
	struct cmsghdr *cmsg;
	int fd;

	if (recvmsg(sock, &msg, 0) < 0) {
		perror("recvmsg");
		return -1;
	}

	/*
	 * CMSG_FIRSTHDR() yields NULL when no control data arrived, so the
	 * header must be validated before its payload is touched.
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
		fprintf(stderr, "recvmsg: no file descriptor received\n");
		return -1;
	}

	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}
85 | |
/*
 * Install a seccomp filter that returns SECCOMP_RET_USER_NOTIF for the
 * system call number @nr and SECCOMP_RET_ALLOW for everything else.
 *
 * @flags is handed to seccomp(SECCOMP_SET_MODE_FILTER) unchanged and the
 * result of that call is returned as-is.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter insns[] = {
		/* Load the system call number. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Matches @nr: fall through; otherwise skip one instruction. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog program = {0};

	program.len = (unsigned short)ARRAY_SIZE(insns);
	program.filter = insns;

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &program);
}
103 | |
/*
 * read_tracee_path - copy a NUL-terminated string out of a task's memory.
 * @mem:  open descriptor for the task's /proc/<pid>/mem
 * @addr: address of the string inside the task
 * @buf:  destination buffer
 * @size: capacity of @buf
 *
 * Returns 0 when a complete, NUL-terminated string was copied into @buf.
 * Returns -1 (after printing a diagnostic) when the seek or read fails or
 * when the bytes that were read contain no terminator.
 */
static int read_tracee_path(int mem, unsigned long long addr, char *buf,
			    size_t size)
{
	ssize_t n;

	if (lseek(mem, (off_t)addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	n = read(mem, buf, size);
	if (n < 0) {
		perror("read");
		return -1;
	}

	/*
	 * A short or unterminated read must not reach strncmp()/mount():
	 * both would run past the bytes that were actually filled in.
	 */
	if (!memchr(buf, '\0', (size_t)n)) {
		fprintf(stderr, "path in task memory is not NUL-terminated\n");
		return -1;
	}

	return 0;
}

/*
 * handle_req - decide on one trapped mount request and fill in the reply.
 * @req:      notification received from the listener
 * @resp:     response to fill in; defaults to error -EPERM, val 0
 * @listener: seccomp notification fd, used to re-validate @req->id
 *
 * Non-bind mounts are denied. Bind mounts whose source and target both
 * start with "/tmp/" are performed on the task's behalf and reported as
 * success; all other bind mounts are denied.
 *
 * Returns 0 when @resp holds a reply that should be sent (allow or deny),
 * -1 when the request could not be processed at all.
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to the task by its pid, but
	 * since the task that made the syscall may have died, we need to
	 * confirm that the pid is still valid after we open its
	 * /proc/pid/mem file. We can ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we should read all of the pointer
	 * args before we decide to allow the syscall.
	 */
	if (read_tracee_path(mem, req->data.args[0], source, sizeof(source)) < 0 ||
	    read_tracee_path(mem, req->data.args[1], target, sizeof(target)) < 0)
		goto out;

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't
	 * very interesting, because we could do unprivileged bind mounts
	 * with user namespaces already, but you get the idea.
	 *
	 * NOTE: this is a plain prefix comparison, so a path such as
	 * "/tmp/../etc" also passes it.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			ret = -1;
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}
200 | |
/*
 * run_worker - body of the sandboxed child process; never returns.
 * @sock: socket over which the seccomp listener fd is sent to the parent
 *
 * Installs the mount-trapping filter, drops privileges, hands the listener
 * to the parent and then issues one mount that must be refused and one
 * bind mount that must succeed. Exits 0 on success, 1 on any failure.
 */
static void run_worker(int sock)
{
	int listener;

	listener = user_trap_syscall(__NR_mount,
				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
	if (listener < 0) {
		perror("seccomp");
		exit(1);
	}

	/*
	 * Drop privileges. We definitely can't mount as uid 1000.
	 */
	if (setuid(1000) < 0) {
		perror("setuid");
		exit(1);
	}

	/*
	 * Send the listener to the parent; also serves as
	 * synchronization.
	 */
	if (send_fd(sock, listener) < 0)
		exit(1);
	close(listener);

	if (mkdir("/tmp/foo", 0755) < 0) {
		perror("mkdir");
		exit(1);
	}

	/*
	 * Try a bad mount just for grins.
	 */
	if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
		fprintf(stderr, "huh? mounted /dev/sda?\n");
		exit(1);
	}

	if (errno != EPERM) {
		perror("bad error from mount");
		exit(1);
	}

	/*
	 * Ok, we expect this one to succeed.
	 */
	if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
		perror("mount");
		exit(1);
	}

	exit(0);
}

/*
 * run_tracer - body of the request-handling child process; never returns.
 * @listener: seccomp notification fd received from the worker
 *
 * Receives notifications in a loop, lets handle_req() decide on each one
 * and sends the reply. The loop only ends on an error, so this always
 * exits with status 1; the parent terminates it with SIGKILL when done.
 */
static void run_tracer(int listener)
{
	struct seccomp_notif *req;
	struct seccomp_notif_resp *resp;
	struct seccomp_notif_sizes sizes;

	if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
		perror("seccomp(GET_NOTIF_SIZES)");
		goto out_close;
	}

	req = malloc(sizes.seccomp_notif);
	if (!req)
		goto out_close;

	resp = calloc(1, sizes.seccomp_notif_resp);
	if (!resp)
		goto out_req;

	while (1) {
		memset(req, 0, sizes.seccomp_notif);
		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
			perror("ioctl recv");
			goto out_resp;
		}

		if (handle_req(req, resp, listener) < 0)
			goto out_resp;

		/*
		 * ENOENT here means that the task may have gotten a
		 * signal and restarted the syscall. It's up to the
		 * handler to decide what to do in this case, but for
		 * the sample code, we just ignore it. Probably
		 * something better should happen, like undoing the
		 * mount, or keeping track of the args to make sure we
		 * don't do it again.
		 */
		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
		    errno != ENOENT) {
			perror("ioctl send");
			goto out_resp;
		}
	}

out_resp:
	free(resp);
out_req:
	free(req);
out_close:
	close(listener);
	exit(1);
}

/*
 * Forks a worker that traps its own mount() calls and a tracer that
 * services them, waits for the worker, cleans up /tmp/foo and reports the
 * result.
 *
 * Returns 0 when the worker exited with status 0 and cleanup succeeded,
 * 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0)
		run_worker(sk_pair[1]);	/* does not return */

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		close(listener);
		goto out_kill;
	}

	if (tracer == 0)
		run_tracer(listener);	/* does not return */

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}
	/* The worker is reaped; its pid may be recycled, so never signal it. */
	worker = 0;

	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		/* Go through the common exit path so the tracer is not leaked. */
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	/* Terminate and reap whichever children are still around. */
	if (tracer > 0) {
		kill(tracer, SIGKILL);
		waitpid(tracer, NULL, 0);
	}
	if (worker > 0) {
		kill(worker, SIGKILL);
		waitpid(worker, NULL, 0);
	}

close_pair:
	close(sk_pair[0]);
	close(sk_pair[1]);
	return ret;
}
376 | |