// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors. The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/init.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#ifdef CONFIG_X86_64

static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;
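	/*
	 * For example, nr == -1 wraps to UINT_MAX here, which fails the
	 * range check below, so negative numbers fall through to the
	 * ENOSYS handling in do_syscall_64().
	 */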

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}

static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;
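	/*
	 * For example, with __X32_SYSCALL_BIT == 0x40000000, an x32 nr of
	 * 0x40000001 yields xnr == 1, while a plain x64 nr underflows to a
	 * huge value and fails the range check below.
	 */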

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call_table[xnr](regs);
		return true;
	}
	return false;
}

/* Returns true to return using SYSRET, or false to use IRET */
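/*
 * The returned bool is consumed by the assembly caller (entry_SYSCALL_64
 * in entry_64.S, which receives it in %al per the calling convention) to
 * pick the SYSRET or IRET exit path.
 */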
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);

	/*
	 * Check that the register state is valid for using SYSRET to exit
	 * to userspace. Otherwise use the slower but fully capable IRET
	 * exit path.
	 */

	/* XEN PV guests always use the IRET path */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
		return false;

	/* CS and SS must match the values set in MSR_STAR */
	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
		return false;

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * TASK_SIZE_MAX covers all user-accessible addresses other than
	 * the deprecated vsyscall page.
	 */
	if (unlikely(regs->ip >= TASK_SIZE_MAX))
		return false;

	/*
	 * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.
	 */
	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
		return false;

	/* Use SYSRET to exit to userspace */
	return true;
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
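	/*
	 * Mark the task as being in a compat syscall so that, e.g.,
	 * in_32bit_syscall() and the ptrace/seccomp machinery see a
	 * 32-bit syscall.
	 */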
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (int)regs->orig_ax;
}

#ifdef CONFIG_IA32_EMULATION
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);

static int ia32_emulation_override_cmdline(char *arg)
{
	return kstrtobool(arg, &__ia32_enabled);
}
early_param("ia32_emulation", ia32_emulation_override_cmdline);
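/*
 * For example, booting with "ia32_emulation=0" (or any other string
 * kstrtobool() accepts as false) disables the 32-bit syscall interface
 * even when CONFIG_IA32_EMULATION is built in.
 */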
#endif

/*
 * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < IA32_NR_syscalls)) {
		unr = array_index_nospec(unr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[unr](regs);
	} else if (nr != -1) {
		regs->ax = __ia32_sys_ni_syscall(regs);
	}
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);

	add_random_kstack_offset();
	/*
	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
	 * orig_ax, the int return value truncates it. This matches
	 * the semantics of syscall_get_nr().
	 */
	nr = syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}

static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();
	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
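	/*
	 * The 32-bit vDSO's __kernel_vsyscall pushes the user's EBP just
	 * before SYSENTER/SYSCALL32, so the saved value sits at the top of
	 * the user stack, i.e. at (u32)regs->sp.
	 */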
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
				 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
			       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	nr = syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}

/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention. Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return false;

	/*
	 * Check that the register state is valid for using SYSRETL/SYSEXIT
	 * to exit to userspace. Otherwise use the slower but fully capable
	 * IRET exit path.
	 */

	/* XEN PV guests always use the IRET path */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

	/* EIP must point to the VDSO landing pad */
	if (unlikely(regs->ip != landing_pad))
		return false;

	/* CS and SS must match the values set in MSR_STAR */
	if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
		return false;

	/* If the TF, RF, or VM flags are set, use IRET */
	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
		return false;

	/* Use SYSRETL/SYSEXIT to exit to userspace */
	return true;
}

/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif

SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}
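
/*
 * Userspace sees this as a plain -1 return with errno == ENOSYS, e.g.
 * (illustrative only, 2048 being a hypothetical out-of-range number):
 *
 *	if (syscall(2048) == -1 && errno == ENOSYS)
 *		...
 */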

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
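
/*
 * Sketch of how the privcmd hypercall path brackets a preemptible
 * hypercall (see drivers/xen/privcmd.c):
 *
 *	xen_preemptible_hcall_begin();
 *	ret = privcmd_call(...);
 *	xen_preemptible_hcall_end();
 */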

/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	inc_irq_stat(irq_hv_callback_count);

	xen_evtchn_do_upcall();

	set_irq_regs(old_regs);
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);
	bool inhcall;

	instrumentation_begin();
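	/* Run the upcall handler on the hard IRQ stack unless already on it. */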
	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		instrumentation_end();
		irqentry_exit(regs, state);
	}
}
#endif /* CONFIG_XEN_PV */