1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> |
4 | * |
5 | * Based on the original implementation which is: |
6 | * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE |
7 | * Copyright 2003 Andi Kleen, SuSE Labs. |
8 | * |
9 | * Parts of the original code have been moved to arch/x86/vdso/vma.c |
10 | * |
11 | * This file implements vsyscall emulation. vsyscalls are a legacy ABI: |
12 | * Userspace can request certain kernel services by calling fixed |
13 | * addresses. This concept is problematic: |
14 | * |
15 | * - It interferes with ASLR. |
16 | * - It's awkward to write code that lives in kernel addresses but is |
17 | * callable by userspace at fixed addresses. |
18 | * - The whole concept is impossible for 32-bit compat userspace. |
19 | * - UML cannot easily virtualize a vsyscall. |
20 | * |
21 | * As of mid-2014, I believe that there is no new userspace code that |
22 | * will use a vsyscall if the vDSO is present. I hope that there will |
23 | * soon be no new userspace code that will ever use a vsyscall. |
24 | * |
25 | * The code in this file emulates vsyscalls when notified of a page |
26 | * fault to a vsyscall address. |
27 | */ |
28 | |
29 | #include <linux/kernel.h> |
30 | #include <linux/timer.h> |
31 | #include <linux/sched/signal.h> |
32 | #include <linux/mm_types.h> |
33 | #include <linux/syscalls.h> |
34 | #include <linux/ratelimit.h> |
35 | |
36 | #include <asm/vsyscall.h> |
37 | #include <asm/unistd.h> |
38 | #include <asm/fixmap.h> |
39 | #include <asm/traps.h> |
40 | #include <asm/paravirt.h> |
41 | |
42 | #define CREATE_TRACE_POINTS |
43 | #include "vsyscall_trace.h" |
44 | |
45 | static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = |
46 | #ifdef CONFIG_LEGACY_VSYSCALL_NONE |
47 | NONE; |
48 | #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) |
49 | XONLY; |
50 | #else |
51 | #error VSYSCALL config is broken |
52 | #endif |
53 | |
54 | static int __init vsyscall_setup(char *str) |
55 | { |
56 | if (str) { |
57 | if (!strcmp("emulate" , str)) |
58 | vsyscall_mode = EMULATE; |
59 | else if (!strcmp("xonly" , str)) |
60 | vsyscall_mode = XONLY; |
61 | else if (!strcmp("none" , str)) |
62 | vsyscall_mode = NONE; |
63 | else |
64 | return -EINVAL; |
65 | |
66 | return 0; |
67 | } |
68 | |
69 | return -EINVAL; |
70 | } |
71 | early_param("vsyscall" , vsyscall_setup); |
72 | |
73 | static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, |
74 | const char *message) |
75 | { |
76 | if (!show_unhandled_signals) |
77 | return; |
78 | |
79 | printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n" , |
80 | level, current->comm, task_pid_nr(current), |
81 | message, regs->ip, regs->cs, |
82 | regs->sp, regs->ax, regs->si, regs->di); |
83 | } |
84 | |
85 | static int addr_to_vsyscall_nr(unsigned long addr) |
86 | { |
87 | int nr; |
88 | |
89 | if ((addr & ~0xC00UL) != VSYSCALL_ADDR) |
90 | return -EINVAL; |
91 | |
92 | nr = (addr & 0xC00UL) >> 10; |
93 | if (nr >= 3) |
94 | return -EINVAL; |
95 | |
96 | return nr; |
97 | } |
98 | |
99 | static bool write_ok_or_segv(unsigned long ptr, size_t size) |
100 | { |
101 | /* |
102 | * XXX: if access_ok, get_user, and put_user handled |
103 | * sig_on_uaccess_err, this could go away. |
104 | */ |
105 | |
106 | if (!access_ok((void __user *)ptr, size)) { |
107 | struct thread_struct *thread = ¤t->thread; |
108 | |
109 | thread->error_code = X86_PF_USER | X86_PF_WRITE; |
110 | thread->cr2 = ptr; |
111 | thread->trap_nr = X86_TRAP_PF; |
112 | |
113 | force_sig_fault(SIGSEGV, SEGV_MAPERR, addr: (void __user *)ptr); |
114 | return false; |
115 | } else { |
116 | return true; |
117 | } |
118 | } |
119 | |
120 | bool emulate_vsyscall(unsigned long error_code, |
121 | struct pt_regs *regs, unsigned long address) |
122 | { |
123 | struct task_struct *tsk; |
124 | unsigned long caller; |
125 | int vsyscall_nr, syscall_nr, tmp; |
126 | int prev_sig_on_uaccess_err; |
127 | long ret; |
128 | unsigned long orig_dx; |
129 | |
130 | /* Write faults or kernel-privilege faults never get fixed up. */ |
131 | if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) |
132 | return false; |
133 | |
134 | if (!(error_code & X86_PF_INSTR)) { |
135 | /* Failed vsyscall read */ |
136 | if (vsyscall_mode == EMULATE) |
137 | return false; |
138 | |
139 | /* |
140 | * User code tried and failed to read the vsyscall page. |
141 | */ |
142 | warn_bad_vsyscall(KERN_INFO, regs, message: "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround" ); |
143 | return false; |
144 | } |
145 | |
146 | /* |
147 | * No point in checking CS -- the only way to get here is a user mode |
148 | * trap to a high address, which means that we're in 64-bit user code. |
149 | */ |
150 | |
151 | WARN_ON_ONCE(address != regs->ip); |
152 | |
153 | if (vsyscall_mode == NONE) { |
154 | warn_bad_vsyscall(KERN_INFO, regs, |
155 | message: "vsyscall attempted with vsyscall=none" ); |
156 | return false; |
157 | } |
158 | |
159 | vsyscall_nr = addr_to_vsyscall_nr(addr: address); |
160 | |
161 | trace_emulate_vsyscall(nr: vsyscall_nr); |
162 | |
163 | if (vsyscall_nr < 0) { |
164 | warn_bad_vsyscall(KERN_WARNING, regs, |
165 | message: "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround" ); |
166 | goto sigsegv; |
167 | } |
168 | |
169 | if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { |
170 | warn_bad_vsyscall(KERN_WARNING, regs, |
171 | message: "vsyscall with bad stack (exploit attempt?)" ); |
172 | goto sigsegv; |
173 | } |
174 | |
175 | tsk = current; |
176 | |
177 | /* |
178 | * Check for access_ok violations and find the syscall nr. |
179 | * |
180 | * NULL is a valid user pointer (in the access_ok sense) on 32-bit and |
181 | * 64-bit, so we don't need to special-case it here. For all the |
182 | * vsyscalls, NULL means "don't write anything" not "write it at |
183 | * address 0". |
184 | */ |
185 | switch (vsyscall_nr) { |
186 | case 0: |
187 | if (!write_ok_or_segv(ptr: regs->di, size: sizeof(struct __kernel_old_timeval)) || |
188 | !write_ok_or_segv(ptr: regs->si, size: sizeof(struct timezone))) { |
189 | ret = -EFAULT; |
190 | goto check_fault; |
191 | } |
192 | |
193 | syscall_nr = __NR_gettimeofday; |
194 | break; |
195 | |
196 | case 1: |
197 | if (!write_ok_or_segv(ptr: regs->di, size: sizeof(__kernel_old_time_t))) { |
198 | ret = -EFAULT; |
199 | goto check_fault; |
200 | } |
201 | |
202 | syscall_nr = __NR_time; |
203 | break; |
204 | |
205 | case 2: |
206 | if (!write_ok_or_segv(ptr: regs->di, size: sizeof(unsigned)) || |
207 | !write_ok_or_segv(ptr: regs->si, size: sizeof(unsigned))) { |
208 | ret = -EFAULT; |
209 | goto check_fault; |
210 | } |
211 | |
212 | syscall_nr = __NR_getcpu; |
213 | break; |
214 | } |
215 | |
216 | /* |
217 | * Handle seccomp. regs->ip must be the original value. |
218 | * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. |
219 | * |
220 | * We could optimize the seccomp disabled case, but performance |
221 | * here doesn't matter. |
222 | */ |
223 | regs->orig_ax = syscall_nr; |
224 | regs->ax = -ENOSYS; |
225 | tmp = secure_computing(); |
226 | if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { |
227 | warn_bad_vsyscall(KERN_DEBUG, regs, |
228 | message: "seccomp tried to change syscall nr or ip" ); |
229 | force_exit_sig(SIGSYS); |
230 | return true; |
231 | } |
232 | regs->orig_ax = -1; |
233 | if (tmp) |
234 | goto do_ret; /* skip requested */ |
235 | |
236 | /* |
237 | * With a real vsyscall, page faults cause SIGSEGV. We want to |
238 | * preserve that behavior to make writing exploits harder. |
239 | */ |
240 | prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; |
241 | current->thread.sig_on_uaccess_err = 1; |
242 | |
243 | ret = -EFAULT; |
244 | switch (vsyscall_nr) { |
245 | case 0: |
246 | /* this decodes regs->di and regs->si on its own */ |
247 | ret = __x64_sys_gettimeofday(regs); |
248 | break; |
249 | |
250 | case 1: |
251 | /* this decodes regs->di on its own */ |
252 | ret = __x64_sys_time(regs); |
253 | break; |
254 | |
255 | case 2: |
256 | /* while we could clobber regs->dx, we didn't in the past... */ |
257 | orig_dx = regs->dx; |
258 | regs->dx = 0; |
259 | /* this decodes regs->di, regs->si and regs->dx on its own */ |
260 | ret = __x64_sys_getcpu(regs); |
261 | regs->dx = orig_dx; |
262 | break; |
263 | } |
264 | |
265 | current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; |
266 | |
267 | check_fault: |
268 | if (ret == -EFAULT) { |
269 | /* Bad news -- userspace fed a bad pointer to a vsyscall. */ |
270 | warn_bad_vsyscall(KERN_INFO, regs, |
271 | message: "vsyscall fault (exploit attempt?)" ); |
272 | |
273 | /* |
274 | * If we failed to generate a signal for any reason, |
275 | * generate one here. (This should be impossible.) |
276 | */ |
277 | if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && |
278 | !sigismember(&tsk->pending.signal, SIGSEGV))) |
279 | goto sigsegv; |
280 | |
281 | return true; /* Don't emulate the ret. */ |
282 | } |
283 | |
284 | regs->ax = ret; |
285 | |
286 | do_ret: |
287 | /* Emulate a ret instruction. */ |
288 | regs->ip = caller; |
289 | regs->sp += 8; |
290 | return true; |
291 | |
292 | sigsegv: |
293 | force_sig(SIGSEGV); |
294 | return true; |
295 | } |
296 | |
297 | /* |
298 | * A pseudo VMA to allow ptrace access for the vsyscall page. This only |
299 | * covers the 64bit vsyscall page now. 32bit has a real VMA now and does |
300 | * not need special handling anymore: |
301 | */ |
302 | static const char *gate_vma_name(struct vm_area_struct *vma) |
303 | { |
304 | return "[vsyscall]" ; |
305 | } |
306 | static const struct vm_operations_struct gate_vma_ops = { |
307 | .name = gate_vma_name, |
308 | }; |
309 | static struct vm_area_struct gate_vma __ro_after_init = { |
310 | .vm_start = VSYSCALL_ADDR, |
311 | .vm_end = VSYSCALL_ADDR + PAGE_SIZE, |
312 | .vm_page_prot = PAGE_READONLY_EXEC, |
313 | .vm_flags = VM_READ | VM_EXEC, |
314 | .vm_ops = &gate_vma_ops, |
315 | }; |
316 | |
317 | struct vm_area_struct *get_gate_vma(struct mm_struct *mm) |
318 | { |
319 | #ifdef CONFIG_COMPAT |
320 | if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) |
321 | return NULL; |
322 | #endif |
323 | if (vsyscall_mode == NONE) |
324 | return NULL; |
325 | return &gate_vma; |
326 | } |
327 | |
328 | int in_gate_area(struct mm_struct *mm, unsigned long addr) |
329 | { |
330 | struct vm_area_struct *vma = get_gate_vma(mm); |
331 | |
332 | if (!vma) |
333 | return 0; |
334 | |
335 | return (addr >= vma->vm_start) && (addr < vma->vm_end); |
336 | } |
337 | |
338 | /* |
339 | * Use this when you have no reliable mm, typically from interrupt |
340 | * context. It is less reliable than using a task's mm and may give |
341 | * false positives. |
342 | */ |
343 | int in_gate_area_no_mm(unsigned long addr) |
344 | { |
345 | return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; |
346 | } |
347 | |
348 | /* |
349 | * The VSYSCALL page is the only user-accessible page in the kernel address |
350 | * range. Normally, the kernel page tables can have _PAGE_USER clear, but |
351 | * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls |
352 | * are enabled. |
353 | * |
354 | * Some day we may create a "minimal" vsyscall mode in which we emulate |
355 | * vsyscalls but leave the page not present. If so, we skip calling |
356 | * this. |
357 | */ |
358 | void __init set_vsyscall_pgtable_user_bits(pgd_t *root) |
359 | { |
360 | pgd_t *pgd; |
361 | p4d_t *p4d; |
362 | pud_t *pud; |
363 | pmd_t *pmd; |
364 | |
365 | pgd = pgd_offset_pgd(pgd: root, VSYSCALL_ADDR); |
366 | set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); |
367 | p4d = p4d_offset(pgd, VSYSCALL_ADDR); |
368 | #if CONFIG_PGTABLE_LEVELS >= 5 |
369 | set_p4d(p4dp: p4d, p4d: __p4d(val: p4d_val(p4d: *p4d) | _PAGE_USER)); |
370 | #endif |
371 | pud = pud_offset(p4d, VSYSCALL_ADDR); |
372 | set_pud(pudp: pud, pud: __pud(val: pud_val(pud: *pud) | _PAGE_USER)); |
373 | pmd = pmd_offset(pud, VSYSCALL_ADDR); |
374 | set_pmd(pmdp: pmd, pmd: __pmd(val: pmd_val(pmd: *pmd) | _PAGE_USER)); |
375 | } |
376 | |
377 | void __init map_vsyscall(void) |
378 | { |
379 | extern char __vsyscall_page; |
380 | unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); |
381 | |
382 | /* |
383 | * For full emulation, the page needs to exist for real. In |
384 | * execute-only mode, there is no PTE at all backing the vsyscall |
385 | * page. |
386 | */ |
387 | if (vsyscall_mode == EMULATE) { |
388 | __set_fixmap(idx: VSYSCALL_PAGE, phys: physaddr_vsyscall, |
389 | PAGE_KERNEL_VVAR); |
390 | set_vsyscall_pgtable_user_bits(swapper_pg_dir); |
391 | } |
392 | |
393 | if (vsyscall_mode == XONLY) |
394 | vm_flags_init(vma: &gate_vma, VM_EXEC); |
395 | |
396 | BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != |
397 | (unsigned long)VSYSCALL_ADDR); |
398 | } |
399 | |