// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <linux/time_namespace.h>

#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/tlb.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
#include <clocksource/hyperv_timer.h>

#undef _ASM_X86_VVAR_H
#define EMIT_VVAR(name, offset)	\
	const size_t name ## _offset = offset;
#include <asm/vvar.h>

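/*
 * Resolve the generic vdso_data inside a vvar page, using the
 * _vdso_data_offset constant emitted above from <asm/vvar.h>.
 */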
struct vdso_data *arch_get_vdso_data(void *vvar_page)
{
	return (struct vdso_data *)(vvar_page + _vdso_data_offset);
}
#undef EMIT_VVAR

unsigned int vclocks_used __read_mostly;

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

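/*
 * Sanity-check a vDSO image and patch its alternative instructions
 * for the CPU we are running on.
 */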
int __init init_vdso_image(const struct vdso_image *image)
{
	BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
	BUG_ON(image->size % PAGE_SIZE != 0);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));

	return 0;
}

static const struct vm_special_mapping vvar_mapping;
struct linux_binprm;

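/*
 * Fault handler for the [vdso] mapping: back the faulting offset with
 * the corresponding page of the vDSO image text.
 */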
static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;

	if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
		return VM_FAULT_SIGBUS;

	vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
	get_page(vmf->page);
	return 0;
}

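/*
 * If the 32-bit vDSO was moved while a task is sitting on the int80
 * landing pad, rewrite the saved IP so the task resumes inside the
 * relocated mapping.
 */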
static void vdso_fix_landing(const struct vdso_image *image,
		struct vm_area_struct *new_vma)
{
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	if (in_ia32_syscall() && image == &vdso_image_32) {
		struct pt_regs *regs = current_pt_regs();
		unsigned long vdso_land = image->sym_int80_landing_pad;
		unsigned long old_land_addr = vdso_land +
			(unsigned long)current->mm->context.vdso;

		/* Fixing userspace landing - look at do_fast_syscall_32 */
		if (regs->ip == old_land_addr)
			regs->ip = new_vma->vm_start + vdso_land;
	}
#endif
}

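/*
 * mremap handler for the [vdso] mapping: keep context.vdso pointing at
 * the relocated text and fix up any in-flight 32-bit syscall landing.
 */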
static int vdso_mremap(const struct vm_special_mapping *sm,
		struct vm_area_struct *new_vma)
{
	const struct vdso_image *image = current->mm->context.vdso_image;

	vdso_fix_landing(image, new_vma);
	current->mm->context.vdso = (void __user *)new_vma->vm_start;

	return 0;
}

#ifdef CONFIG_TIME_NS
/*
 * The vvar page layout depends on whether a task belongs to the root or
 * non-root time namespace. Whenever a task changes its namespace, the VVAR
 * page tables are cleared and then they will be re-faulted with a
 * corresponding layout.
 * See also the comment near timens_setup_vdso_data() for details.
 */
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
	struct mm_struct *mm = task->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_read_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma_is_special_mapping(vma, &vvar_mapping))
			zap_vma_pages(vma);
	}
	mmap_read_unlock(mm);

	return 0;
}
#endif

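/*
 * Fault handler for the [vvar] mapping: depending on which data page the
 * symbol offset selects (vvar, pvclock, hvclock or timens), insert the
 * matching PFN into the page tables.
 */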
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
	unsigned long pfn;
	long sym_offset;

	if (!image)
		return VM_FAULT_SIGBUS;

	sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
		image->sym_vvar_start;

	/*
	 * Sanity check: a symbol offset of zero means that the page
	 * does not exist for this vdso image, not that the page is at
	 * offset zero relative to the text mapping. This should be
	 * impossible here, because sym_offset should only be zero for
	 * the page past the end of the vvar mapping.
	 */
	if (sym_offset == 0)
		return VM_FAULT_SIGBUS;

	if (sym_offset == image->sym_vvar_page) {
		struct page *timens_page = find_timens_vvar_page(vma);

		pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;

		/*
		 * If a task belongs to a time namespace then a namespace
		 * specific VVAR is mapped with the sym_vvar_page offset and
		 * the real VVAR page is mapped with the sym_timens_page
		 * offset.
		 * See also the comment near timens_setup_vdso_data().
		 */
		if (timens_page) {
			unsigned long addr;
			vm_fault_t err;

			/*
			 * Optimization: inside a time namespace, pre-fault
			 * the real VVAR page as well. The timens page holds
			 * only clock offsets relative to the VVAR page, so
			 * vDSO code will fault it in shortly anyway.
			 */
			addr = vmf->address + (image->sym_timens_page - sym_offset);
			err = vmf_insert_pfn(vma, addr, pfn);
			if (unlikely(err & VM_FAULT_ERROR))
				return err;

			pfn = page_to_pfn(timens_page);
		}

		return vmf_insert_pfn(vma, vmf->address, pfn);
	} else if (sym_offset == image->sym_pvclock_page) {
		struct pvclock_vsyscall_time_info *pvti =
			pvclock_get_pvti_cpu0_va();
		if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
			return vmf_insert_pfn_prot(vma, vmf->address,
					__pa(pvti) >> PAGE_SHIFT,
					pgprot_decrypted(vma->vm_page_prot));
		}
	} else if (sym_offset == image->sym_hvclock_page) {
		pfn = hv_get_tsc_pfn();

		if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
			return vmf_insert_pfn(vma, vmf->address, pfn);
	} else if (sym_offset == image->sym_timens_page) {
		struct page *timens_page = find_timens_vvar_page(vma);

		if (!timens_page)
			return VM_FAULT_SIGBUS;

		pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
		return vmf_insert_pfn(vma, vmf->address, pfn);
	}

	return VM_FAULT_SIGBUS;
}

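/*
 * The special mappings backing the vDSO text and the vvar data pages.
 * Their names show up as [vdso] and [vvar] in /proc/<pid>/maps.
 */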
static const struct vm_special_mapping vdso_mapping = {
	.name = "[vdso]",
	.fault = vdso_fault,
	.mremap = vdso_mremap,
};
static const struct vm_special_mapping vvar_mapping = {
	.name = "[vvar]",
	.fault = vvar_fault,
};

/*
 * Add vdso and vvar mappings to current process.
 * @image - blob to map
 * @addr - request a specific address (zero to map at free addr)
 */
static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long text_start;
	int ret = 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	text_start = addr - image->sym_vvar_start;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &vdso_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
				       VM_PFNMAP,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		do_munmap(mm, text_start, image->size, NULL);
	} else {
		current->mm->context.vdso = (void __user *)text_start;
		current->mm->context.vdso_image = image;
	}

up_fail:
	mmap_write_unlock(mm);
	return ret;
}

#ifdef CONFIG_X86_64
/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset. This way there is no hole in the middle of address space.
 * To save memory make sure it is still in the same PTE as the stack
 * top. This doesn't give that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address. It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= DEFAULT_MAP_WINDOW)
		end = DEFAULT_MAP_WINDOW;
	end -= len;

	if (end > start) {
		offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
}

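/*
 * Map the vDSO at a randomized address above the stack, as computed by
 * vdso_addr().
 */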
static int map_vdso_randomized(const struct vdso_image *image)
{
	unsigned long addr = vdso_addr(current->mm->start_stack, image->size - image->sym_vvar_start);

	return map_vdso(image, addr);
}
#endif

int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	/*
	 * Check if we have already mapped the vdso blob - fail to prevent
	 * userspace from abusing install_special_mapping, which may not do
	 * accounting and rlimits right.
	 * We could search the VMA near context.vdso, but it's a slowpath,
	 * so let's explicitly check all VMAs to be completely sure.
	 */
	for_each_vma(vmi, vma) {
		if (vma_is_special_mapping(vma, &vdso_mapping) ||
				vma_is_special_mapping(vma, &vvar_mapping)) {
			mmap_write_unlock(mm);
			return -EEXIST;
		}
	}
	mmap_write_unlock(mm);

	return map_vdso(image, addr);
}

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	return map_vdso(&vdso_image_32, 0);
}
#endif

#ifdef CONFIG_X86_64
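/*
 * Map the 64-bit vDSO into a new process image unless it has been
 * disabled via the vdso= boot parameter (vdso64_enabled).
 */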
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso_randomized(&vdso_image_64);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp, bool x32)
{
#ifdef CONFIG_X86_X32_ABI
	if (x32) {
		if (!vdso64_enabled)
			return 0;
		return map_vdso_randomized(&vdso_image_x32);
	}
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif

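/*
 * Report whether the interrupted user IP sits on one of the 32-bit vDSO
 * sigreturn landing pads.
 */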
bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
{
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
	const struct vdso_image *image = current->mm->context.vdso_image;
	unsigned long vdso = (unsigned long) current->mm->context.vdso;

	if (in_ia32_syscall() && image == &vdso_image_32) {
		if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad ||
		    regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad)
			return true;
	}
#endif
	return false;
}

#ifdef CONFIG_X86_64
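/*
 * Parse the "vdso=" boot parameter: any non-zero value enables the
 * 64-bit vDSO, zero disables it.
 */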
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 1;
}
__setup("vdso=", vdso_setup);
#endif /* CONFIG_X86_64 */