1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * shstk.c - Intel shadow stack support |
4 | * |
5 | * Copyright (c) 2021, Intel Corporation. |
6 | * Yu-cheng Yu <yu-cheng.yu@intel.com> |
7 | */ |
8 | |
9 | #include <linux/sched.h> |
10 | #include <linux/bitops.h> |
11 | #include <linux/types.h> |
12 | #include <linux/mm.h> |
13 | #include <linux/mman.h> |
14 | #include <linux/slab.h> |
15 | #include <linux/uaccess.h> |
16 | #include <linux/sched/signal.h> |
17 | #include <linux/compat.h> |
18 | #include <linux/sizes.h> |
19 | #include <linux/user.h> |
20 | #include <linux/syscalls.h> |
21 | #include <asm/msr.h> |
22 | #include <asm/fpu/xstate.h> |
23 | #include <asm/fpu/types.h> |
24 | #include <asm/shstk.h> |
25 | #include <asm/special_insns.h> |
26 | #include <asm/fpu/api.h> |
27 | #include <asm/prctl.h> |
28 | |
29 | #define SS_FRAME_SIZE 8 |
30 | |
31 | static bool features_enabled(unsigned long features) |
32 | { |
33 | return current->thread.features & features; |
34 | } |
35 | |
36 | static void features_set(unsigned long features) |
37 | { |
38 | current->thread.features |= features; |
39 | } |
40 | |
41 | static void features_clr(unsigned long features) |
42 | { |
43 | current->thread.features &= ~features; |
44 | } |
45 | |
46 | /* |
47 | * Create a restore token on the shadow stack. A token is always 8-byte |
48 | * and aligned to 8. |
49 | */ |
50 | static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) |
51 | { |
52 | unsigned long addr; |
53 | |
54 | /* Token must be aligned */ |
55 | if (!IS_ALIGNED(ssp, 8)) |
56 | return -EINVAL; |
57 | |
58 | addr = ssp - SS_FRAME_SIZE; |
59 | |
60 | /* |
61 | * SSP is aligned, so reserved bits and mode bit are a zero, just mark |
62 | * the token 64-bit. |
63 | */ |
64 | ssp |= BIT(0); |
65 | |
66 | if (write_user_shstk_64(addr: (u64 __user *)addr, val: (u64)ssp)) |
67 | return -EFAULT; |
68 | |
69 | if (token_addr) |
70 | *token_addr = addr; |
71 | |
72 | return 0; |
73 | } |
74 | |
75 | /* |
76 | * VM_SHADOW_STACK will have a guard page. This helps userspace protect |
77 | * itself from attacks. The reasoning is as follows: |
78 | * |
79 | * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The |
80 | * INCSSP instruction can increment the shadow stack pointer. It is the |
81 | * shadow stack analog of an instruction like: |
82 | * |
83 | * addq $0x80, %rsp |
84 | * |
85 | * However, there is one important difference between an ADD on %rsp |
86 | * and INCSSP. In addition to modifying SSP, INCSSP also reads from the |
87 | * memory of the first and last elements that were "popped". It can be |
88 | * thought of as acting like this: |
89 | * |
90 | * READ_ONCE(ssp); // read+discard top element on stack |
91 | * ssp += nr_to_pop * 8; // move the shadow stack |
92 | * READ_ONCE(ssp-8); // read+discard last popped stack element |
93 | * |
94 | * The maximum distance INCSSP can move the SSP is 2040 bytes, before |
95 | * it would read the memory. Therefore a single page gap will be enough |
96 | * to prevent any operation from shifting the SSP to an adjacent stack, |
97 | * since it would have to land in the gap at least once, causing a |
98 | * fault. |
99 | */ |
100 | static unsigned long alloc_shstk(unsigned long addr, unsigned long size, |
101 | unsigned long token_offset, bool set_res_tok) |
102 | { |
103 | int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; |
104 | struct mm_struct *mm = current->mm; |
105 | unsigned long mapped_addr, unused; |
106 | |
107 | if (addr) |
108 | flags |= MAP_FIXED_NOREPLACE; |
109 | |
110 | mmap_write_lock(mm); |
111 | mapped_addr = do_mmap(NULL, addr, len: size, PROT_READ, flags, |
112 | VM_SHADOW_STACK | VM_WRITE, pgoff: 0, populate: &unused, NULL); |
113 | mmap_write_unlock(mm); |
114 | |
115 | if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) |
116 | goto out; |
117 | |
118 | if (create_rstor_token(ssp: mapped_addr + token_offset, NULL)) { |
119 | vm_munmap(mapped_addr, size); |
120 | return -EINVAL; |
121 | } |
122 | |
123 | out: |
124 | return mapped_addr; |
125 | } |
126 | |
127 | static unsigned long adjust_shstk_size(unsigned long size) |
128 | { |
129 | if (size) |
130 | return PAGE_ALIGN(size); |
131 | |
132 | return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); |
133 | } |
134 | |
135 | static void unmap_shadow_stack(u64 base, u64 size) |
136 | { |
137 | int r; |
138 | |
139 | r = vm_munmap(base, size); |
140 | |
141 | /* |
142 | * mmap_write_lock_killable() failed with -EINTR. This means |
143 | * the process is about to die and have it's MM cleaned up. |
144 | * This task shouldn't ever make it back to userspace. In this |
145 | * case it is ok to leak a shadow stack, so just exit out. |
146 | */ |
147 | if (r == -EINTR) |
148 | return; |
149 | |
150 | /* |
151 | * For all other types of vm_munmap() failure, either the |
152 | * system is out of memory or there is bug. |
153 | */ |
154 | WARN_ON_ONCE(r); |
155 | } |
156 | |
157 | static int shstk_setup(void) |
158 | { |
159 | struct thread_shstk *shstk = ¤t->thread.shstk; |
160 | unsigned long addr, size; |
161 | |
162 | /* Already enabled */ |
163 | if (features_enabled(ARCH_SHSTK_SHSTK)) |
164 | return 0; |
165 | |
166 | /* Also not supported for 32 bit and x32 */ |
167 | if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) |
168 | return -EOPNOTSUPP; |
169 | |
170 | size = adjust_shstk_size(size: 0); |
171 | addr = alloc_shstk(addr: 0, size, token_offset: 0, set_res_tok: false); |
172 | if (IS_ERR_VALUE(addr)) |
173 | return PTR_ERR(ptr: (void *)addr); |
174 | |
175 | fpregs_lock_and_load(); |
176 | wrmsrl(MSR_IA32_PL3_SSP, val: addr + size); |
177 | wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); |
178 | fpregs_unlock(); |
179 | |
180 | shstk->base = addr; |
181 | shstk->size = size; |
182 | features_set(ARCH_SHSTK_SHSTK); |
183 | |
184 | return 0; |
185 | } |
186 | |
187 | void reset_thread_features(void) |
188 | { |
189 | memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk)); |
190 | current->thread.features = 0; |
191 | current->thread.features_locked = 0; |
192 | } |
193 | |
194 | unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, |
195 | unsigned long stack_size) |
196 | { |
197 | struct thread_shstk *shstk = &tsk->thread.shstk; |
198 | unsigned long addr, size; |
199 | |
200 | /* |
201 | * If shadow stack is not enabled on the new thread, skip any |
202 | * switch to a new shadow stack. |
203 | */ |
204 | if (!features_enabled(ARCH_SHSTK_SHSTK)) |
205 | return 0; |
206 | |
207 | /* |
208 | * For CLONE_VFORK the child will share the parents shadow stack. |
209 | * Make sure to clear the internal tracking of the thread shadow |
210 | * stack so the freeing logic run for child knows to leave it alone. |
211 | */ |
212 | if (clone_flags & CLONE_VFORK) { |
213 | shstk->base = 0; |
214 | shstk->size = 0; |
215 | return 0; |
216 | } |
217 | |
218 | /* |
219 | * For !CLONE_VM the child will use a copy of the parents shadow |
220 | * stack. |
221 | */ |
222 | if (!(clone_flags & CLONE_VM)) |
223 | return 0; |
224 | |
225 | size = adjust_shstk_size(size: stack_size); |
226 | addr = alloc_shstk(addr: 0, size, token_offset: 0, set_res_tok: false); |
227 | if (IS_ERR_VALUE(addr)) |
228 | return addr; |
229 | |
230 | shstk->base = addr; |
231 | shstk->size = size; |
232 | |
233 | return addr + size; |
234 | } |
235 | |
236 | static unsigned long get_user_shstk_addr(void) |
237 | { |
238 | unsigned long long ssp; |
239 | |
240 | fpregs_lock_and_load(); |
241 | |
242 | rdmsrl(MSR_IA32_PL3_SSP, ssp); |
243 | |
244 | fpregs_unlock(); |
245 | |
246 | return ssp; |
247 | } |
248 | |
249 | #define SHSTK_DATA_BIT BIT(63) |
250 | |
251 | static int put_shstk_data(u64 __user *addr, u64 data) |
252 | { |
253 | if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) |
254 | return -EINVAL; |
255 | |
256 | /* |
257 | * Mark the high bit so that the sigframe can't be processed as a |
258 | * return address. |
259 | */ |
260 | if (write_user_shstk_64(addr, val: data | SHSTK_DATA_BIT)) |
261 | return -EFAULT; |
262 | return 0; |
263 | } |
264 | |
265 | static int get_shstk_data(unsigned long *data, unsigned long __user *addr) |
266 | { |
267 | unsigned long ldata; |
268 | |
269 | if (unlikely(get_user(ldata, addr))) |
270 | return -EFAULT; |
271 | |
272 | if (!(ldata & SHSTK_DATA_BIT)) |
273 | return -EINVAL; |
274 | |
275 | *data = ldata & ~SHSTK_DATA_BIT; |
276 | |
277 | return 0; |
278 | } |
279 | |
280 | static int shstk_push_sigframe(unsigned long *ssp) |
281 | { |
282 | unsigned long target_ssp = *ssp; |
283 | |
284 | /* Token must be aligned */ |
285 | if (!IS_ALIGNED(target_ssp, 8)) |
286 | return -EINVAL; |
287 | |
288 | *ssp -= SS_FRAME_SIZE; |
289 | if (put_shstk_data(addr: (void __user *)*ssp, data: target_ssp)) |
290 | return -EFAULT; |
291 | |
292 | return 0; |
293 | } |
294 | |
295 | static int shstk_pop_sigframe(unsigned long *ssp) |
296 | { |
297 | struct vm_area_struct *vma; |
298 | unsigned long token_addr; |
299 | bool need_to_check_vma; |
300 | int err = 1; |
301 | |
302 | /* |
303 | * It is possible for the SSP to be off the end of a shadow stack by 4 |
304 | * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes |
305 | * before it, it might be this case, so check that the address being |
306 | * read is actually shadow stack. |
307 | */ |
308 | if (!IS_ALIGNED(*ssp, 8)) |
309 | return -EINVAL; |
310 | |
311 | need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; |
312 | |
313 | if (need_to_check_vma) |
314 | mmap_read_lock_killable(current->mm); |
315 | |
316 | err = get_shstk_data(data: &token_addr, addr: (unsigned long __user *)*ssp); |
317 | if (unlikely(err)) |
318 | goto out_err; |
319 | |
320 | if (need_to_check_vma) { |
321 | vma = find_vma(current->mm, addr: *ssp); |
322 | if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { |
323 | err = -EFAULT; |
324 | goto out_err; |
325 | } |
326 | |
327 | mmap_read_unlock(current->mm); |
328 | } |
329 | |
330 | /* Restore SSP aligned? */ |
331 | if (unlikely(!IS_ALIGNED(token_addr, 8))) |
332 | return -EINVAL; |
333 | |
334 | /* SSP in userspace? */ |
335 | if (unlikely(token_addr >= TASK_SIZE_MAX)) |
336 | return -EINVAL; |
337 | |
338 | *ssp = token_addr; |
339 | |
340 | return 0; |
341 | out_err: |
342 | if (need_to_check_vma) |
343 | mmap_read_unlock(current->mm); |
344 | return err; |
345 | } |
346 | |
347 | int setup_signal_shadow_stack(struct ksignal *ksig) |
348 | { |
349 | void __user *restorer = ksig->ka.sa.sa_restorer; |
350 | unsigned long ssp; |
351 | int err; |
352 | |
353 | if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || |
354 | !features_enabled(ARCH_SHSTK_SHSTK)) |
355 | return 0; |
356 | |
357 | if (!restorer) |
358 | return -EINVAL; |
359 | |
360 | ssp = get_user_shstk_addr(); |
361 | if (unlikely(!ssp)) |
362 | return -EINVAL; |
363 | |
364 | err = shstk_push_sigframe(ssp: &ssp); |
365 | if (unlikely(err)) |
366 | return err; |
367 | |
368 | /* Push restorer address */ |
369 | ssp -= SS_FRAME_SIZE; |
370 | err = write_user_shstk_64(addr: (u64 __user *)ssp, val: (u64)restorer); |
371 | if (unlikely(err)) |
372 | return -EFAULT; |
373 | |
374 | fpregs_lock_and_load(); |
375 | wrmsrl(MSR_IA32_PL3_SSP, val: ssp); |
376 | fpregs_unlock(); |
377 | |
378 | return 0; |
379 | } |
380 | |
381 | int restore_signal_shadow_stack(void) |
382 | { |
383 | unsigned long ssp; |
384 | int err; |
385 | |
386 | if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || |
387 | !features_enabled(ARCH_SHSTK_SHSTK)) |
388 | return 0; |
389 | |
390 | ssp = get_user_shstk_addr(); |
391 | if (unlikely(!ssp)) |
392 | return -EINVAL; |
393 | |
394 | err = shstk_pop_sigframe(ssp: &ssp); |
395 | if (unlikely(err)) |
396 | return err; |
397 | |
398 | fpregs_lock_and_load(); |
399 | wrmsrl(MSR_IA32_PL3_SSP, val: ssp); |
400 | fpregs_unlock(); |
401 | |
402 | return 0; |
403 | } |
404 | |
405 | void shstk_free(struct task_struct *tsk) |
406 | { |
407 | struct thread_shstk *shstk = &tsk->thread.shstk; |
408 | |
409 | if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || |
410 | !features_enabled(ARCH_SHSTK_SHSTK)) |
411 | return; |
412 | |
413 | /* |
414 | * When fork() with CLONE_VM fails, the child (tsk) already has a |
415 | * shadow stack allocated, and exit_thread() calls this function to |
416 | * free it. In this case the parent (current) and the child share |
417 | * the same mm struct. |
418 | */ |
419 | if (!tsk->mm || tsk->mm != current->mm) |
420 | return; |
421 | |
422 | /* |
423 | * If shstk->base is NULL, then this task is not managing its |
424 | * own shadow stack (CLONE_VFORK). So skip freeing it. |
425 | */ |
426 | if (!shstk->base) |
427 | return; |
428 | |
429 | /* |
430 | * shstk->base is NULL for CLONE_VFORK child tasks, and so is |
431 | * normal. But size = 0 on a shstk->base is not normal and |
432 | * indicated an attempt to free the thread shadow stack twice. |
433 | * Warn about it. |
434 | */ |
435 | if (WARN_ON(!shstk->size)) |
436 | return; |
437 | |
438 | unmap_shadow_stack(base: shstk->base, size: shstk->size); |
439 | |
440 | shstk->size = 0; |
441 | } |
442 | |
443 | static int (bool enable) |
444 | { |
445 | u64 msrval; |
446 | |
447 | if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) |
448 | return -EOPNOTSUPP; |
449 | |
450 | /* |
451 | * Only enable WRSS if shadow stack is enabled. If shadow stack is not |
452 | * enabled, WRSS will already be disabled, so don't bother clearing it |
453 | * when disabling. |
454 | */ |
455 | if (!features_enabled(ARCH_SHSTK_SHSTK)) |
456 | return -EPERM; |
457 | |
458 | /* Already enabled/disabled? */ |
459 | if (features_enabled(ARCH_SHSTK_WRSS) == enable) |
460 | return 0; |
461 | |
462 | fpregs_lock_and_load(); |
463 | rdmsrl(MSR_IA32_U_CET, msrval); |
464 | |
465 | if (enable) { |
466 | features_set(ARCH_SHSTK_WRSS); |
467 | msrval |= CET_WRSS_EN; |
468 | } else { |
469 | features_clr(ARCH_SHSTK_WRSS); |
470 | if (!(msrval & CET_WRSS_EN)) |
471 | goto unlock; |
472 | |
473 | msrval &= ~CET_WRSS_EN; |
474 | } |
475 | |
476 | wrmsrl(MSR_IA32_U_CET, val: msrval); |
477 | |
478 | unlock: |
479 | fpregs_unlock(); |
480 | |
481 | return 0; |
482 | } |
483 | |
484 | static int shstk_disable(void) |
485 | { |
486 | if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) |
487 | return -EOPNOTSUPP; |
488 | |
489 | /* Already disabled? */ |
490 | if (!features_enabled(ARCH_SHSTK_SHSTK)) |
491 | return 0; |
492 | |
493 | fpregs_lock_and_load(); |
494 | /* Disable WRSS too when disabling shadow stack */ |
495 | wrmsrl(MSR_IA32_U_CET, val: 0); |
496 | wrmsrl(MSR_IA32_PL3_SSP, val: 0); |
497 | fpregs_unlock(); |
498 | |
499 | shstk_free(current); |
500 | features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); |
501 | |
502 | return 0; |
503 | } |
504 | |
505 | SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) |
506 | { |
507 | bool set_tok = flags & SHADOW_STACK_SET_TOKEN; |
508 | unsigned long aligned_size; |
509 | |
510 | if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) |
511 | return -EOPNOTSUPP; |
512 | |
513 | if (flags & ~SHADOW_STACK_SET_TOKEN) |
514 | return -EINVAL; |
515 | |
516 | /* If there isn't space for a token */ |
517 | if (set_tok && size < 8) |
518 | return -ENOSPC; |
519 | |
520 | if (addr && addr < SZ_4G) |
521 | return -ERANGE; |
522 | |
523 | /* |
524 | * An overflow would result in attempting to write the restore token |
525 | * to the wrong location. Not catastrophic, but just return the right |
526 | * error code and block it. |
527 | */ |
528 | aligned_size = PAGE_ALIGN(size); |
529 | if (aligned_size < size) |
530 | return -EOVERFLOW; |
531 | |
532 | return alloc_shstk(addr, size: aligned_size, token_offset: size, set_res_tok: set_tok); |
533 | } |
534 | |
535 | long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) |
536 | { |
537 | unsigned long features = arg2; |
538 | |
539 | if (option == ARCH_SHSTK_STATUS) { |
540 | return put_user(task->thread.features, (unsigned long __user *)arg2); |
541 | } |
542 | |
543 | if (option == ARCH_SHSTK_LOCK) { |
544 | task->thread.features_locked |= features; |
545 | return 0; |
546 | } |
547 | |
548 | /* Only allow via ptrace */ |
549 | if (task != current) { |
550 | if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { |
551 | task->thread.features_locked &= ~features; |
552 | return 0; |
553 | } |
554 | return -EINVAL; |
555 | } |
556 | |
557 | /* Do not allow to change locked features */ |
558 | if (features & task->thread.features_locked) |
559 | return -EPERM; |
560 | |
561 | /* Only support enabling/disabling one feature at a time. */ |
562 | if (hweight_long(w: features) > 1) |
563 | return -EINVAL; |
564 | |
565 | if (option == ARCH_SHSTK_DISABLE) { |
566 | if (features & ARCH_SHSTK_WRSS) |
567 | return wrss_control(enable: false); |
568 | if (features & ARCH_SHSTK_SHSTK) |
569 | return shstk_disable(); |
570 | return -EINVAL; |
571 | } |
572 | |
573 | /* Handle ARCH_SHSTK_ENABLE */ |
574 | if (features & ARCH_SHSTK_SHSTK) |
575 | return shstk_setup(); |
576 | if (features & ARCH_SHSTK_WRSS) |
577 | return wrss_control(enable: true); |
578 | return -EINVAL; |
579 | } |
580 | |