1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * linux/arch/x86_64/entry.S |
4 | * |
5 | * Copyright (C) 1991, 1992 Linus Torvalds |
6 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs |
7 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
8 | * |
9 | * entry.S contains the system-call and fault low-level handling routines. |
10 | * |
11 | * Some of this is documented in Documentation/arch/x86/entry_64.rst |
12 | * |
13 | * A note on terminology: |
14 | * - iret frame: Architecture defined interrupt frame from SS to RIP |
15 | * at the top of the kernel process stack. |
16 | * |
17 | * Some macro usage: |
18 | * - SYM_FUNC_START/END:Define functions in the symbol table. |
19 | * - idtentry: Define exception entry points. |
20 | */ |
21 | #include <linux/export.h> |
22 | #include <linux/linkage.h> |
23 | #include <asm/segment.h> |
24 | #include <asm/cache.h> |
25 | #include <asm/errno.h> |
26 | #include <asm/asm-offsets.h> |
27 | #include <asm/msr.h> |
28 | #include <asm/unistd.h> |
29 | #include <asm/thread_info.h> |
30 | #include <asm/hw_irq.h> |
31 | #include <asm/page_types.h> |
32 | #include <asm/irqflags.h> |
33 | #include <asm/paravirt.h> |
34 | #include <asm/percpu.h> |
35 | #include <asm/asm.h> |
36 | #include <asm/smap.h> |
37 | #include <asm/pgtable_types.h> |
38 | #include <asm/frame.h> |
39 | #include <asm/trapnr.h> |
40 | #include <asm/nospec-branch.h> |
41 | #include <asm/fsgsbase.h> |
42 | #include <linux/err.h> |
43 | |
44 | #include "calling.h" |
45 | |
46 | .code64 |
47 | .section .entry.text, "ax" |
48 | |
49 | /* |
50 | * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. |
51 | * |
52 | * This is the only entry point used for 64-bit system calls. The |
53 | * hardware interface is reasonably well designed and the register to |
54 | * argument mapping Linux uses fits well with the registers that are |
55 | * available when SYSCALL is used. |
56 | * |
57 | * SYSCALL instructions can be found inlined in libc implementations as |
58 | * well as some other programs and libraries. There are also a handful |
59 | * of SYSCALL instructions in the vDSO used, for example, as a |
60 | * clock_gettimeofday fallback. |
61 | * |
62 | * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
63 | * then loads new ss, cs, and rip from previously programmed MSRs. |
64 | * rflags gets masked by a value from another MSR (so CLD and CLAC |
65 | * are not needed). SYSCALL does not save anything on the stack |
66 | * and does not change rsp. |
67 | * |
68 | * Registers on entry: |
69 | * rax system call number |
70 | * rcx return address |
71 | * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) |
72 | * rdi arg0 |
73 | * rsi arg1 |
74 | * rdx arg2 |
75 | * r10 arg3 (needs to be moved to rcx to conform to C ABI) |
76 | * r8 arg4 |
77 | * r9 arg5 |
78 | * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) |
79 | * |
80 | * Only called from user space. |
81 | * |
82 | * When user can change pt_regs->foo always force IRET. That is because |
83 | * it deals with uncanonical addresses better. SYSRET has trouble |
84 | * with them due to bugs in both AMD and Intel CPUs. |
85 | */ |
86 | |
87 | SYM_CODE_START(entry_SYSCALL_64) /* 64-bit SYSCALL entry: builds pt_regs, calls do_syscall_64, then returns via SYSRET or the IRET path */ |
88 | UNWIND_HINT_ENTRY |
89 | ENDBR |
90 | |
91 | swapgs |
92 | /* tss.sp2 is scratch space: the user RSP is parked there until it is pushed as pt_regs->sp below. */ |
93 | movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) |
94 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* %rsp was saved on the previous line, so it can serve as scratch */ |
95 | movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* %rsp = per-CPU top_of_stack value */ |
96 | |
97 | SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) |
98 | ANNOTATE_NOENDBR |
99 | |
100 | /* Construct struct pt_regs on stack, beginning with a hand-built ss/sp/flags/cs/ip frame */ |
101 | pushq $__USER_DS /* pt_regs->ss */ |
102 | pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp (user RSP stashed in tss.sp2 above) */ |
103 | pushq %r11 /* pt_regs->flags (SYSCALL saved rflags in r11) */ |
104 | pushq $__USER_CS /* pt_regs->cs */ |
105 | pushq %rcx /* pt_regs->ip (SYSCALL saved the return rip in rcx) */ |
106 | SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) |
107 | pushq %rax /* pt_regs->orig_ax (system call number) */ |
108 | |
109 | PUSH_AND_CLEAR_REGS rax=$-ENOSYS /* NOTE(review): presumably stores -ENOSYS in the pt_regs->ax slot; the macro is not defined in this file - confirm */ |
110 | |
111 | /* IRQs are off. */ |
112 | movq %rsp, %rdi /* 1st C argument: pointer to the pt_regs built above */ |
113 | /* Sign extend the lower 32bit as syscall numbers are treated as int */ |
114 | movslq %eax, %rsi /* 2nd C argument: system call number */ |
115 | |
116 | /* clobbers %rax, make sure it is after saving the syscall nr */ |
117 | IBRS_ENTER |
118 | UNTRAIN_RET |
119 | |
120 | call do_syscall_64 /* returns with IRQs disabled */ |
121 | |
122 | /* |
123 | * Try to use SYSRET instead of IRET if we're returning to |
124 | * a completely clean 64-bit userspace context. If we're not, |
125 | * go to the slow exit path. The test below takes the slow path |
126 | * when do_syscall_64 returned zero in %al. In the Xen PV case we must use iret anyway. |
127 | */ |
128 | |
129 | ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode" , \ |
130 | "jmp swapgs_restore_regs_and_return_to_usermode" , X86_FEATURE_XENPV |
131 | |
132 | /* |
133 | * We win! This label is here just for ease of understanding |
134 | * perf profiles. Nothing jumps here. |
135 | */ |
136 | syscall_return_via_sysret: |
137 | IBRS_EXIT |
138 | POP_REGS pop_rdi=0 |
139 | |
140 | /* |
141 | * Now all regs are restored except RSP and RDI. |
142 | * Save old stack pointer and switch to trampoline stack. |
143 | */ |
144 | movq %rsp, %rdi /* %rdi = old stack pointer, used to reach the values still stored there */ |
145 | movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp /* trampoline stack pointer comes from tss.sp0 */ |
146 | UNWIND_HINT_END_OF_STACK |
147 | |
148 | pushq RSP-RDI(%rdi) /* user RSP, copied from the old stack */ |
149 | pushq (%rdi) /* user RDI, copied from the old stack */ |
150 | |
151 | /* |
152 | * We are on the trampoline stack. All regs except RDI are live. |
153 | * We can do future final exit work right here. |
154 | */ |
155 | STACKLEAK_ERASE_NOCLOBBER |
156 | |
157 | SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi |
158 | |
159 | popq %rdi /* restore user RDI pushed above */ |
160 | popq %rsp /* restore user RSP pushed above; no stack accesses follow */ |
161 | SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) |
162 | ANNOTATE_NOENDBR |
163 | swapgs |
164 | sysretq |
165 | SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) |
166 | ANNOTATE_NOENDBR |
167 | int3 |
168 | SYM_CODE_END(entry_SYSCALL_64) |
169 | |
170 | /* |
171 | * %rdi: prev task |
172 | * %rsi: next task |
173 | */ |
174 | .pushsection .text, "ax" |
175 | SYM_FUNC_START(__switch_to_asm) /* saves prev's callee-saved registers, switches %rsp to next's stack, then tail-jumps to __switch_to */ |
176 | /* |
177 | * Save callee-saved registers |
178 | * This must match the order in inactive_task_frame |
179 | */ |
180 | pushq %rbp |
181 | pushq %rbx |
182 | pushq %r12 |
183 | pushq %r13 |
184 | pushq %r14 |
185 | pushq %r15 |
186 | |
187 | /* switch stack: %rdi is the prev task, %rsi is the next task */ |
188 | movq %rsp, TASK_threadsp(%rdi) /* record the current stack pointer in prev */ |
189 | movq TASK_threadsp(%rsi), %rsp /* continue on the stack pointer recorded in next */ |
190 | |
191 | #ifdef CONFIG_STACKPROTECTOR |
192 | movq TASK_stack_canary(%rsi), %rbx /* %rbx is free here: it was pushed above and is reloaded by the pops below */ |
193 | movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary /* copy next's canary into the per-CPU slot */ |
194 | #endif |
195 | |
196 | /* |
197 | * When switching from a shallower to a deeper call stack |
198 | * the RSB may either underflow or use entries populated |
199 | * with userspace addresses. On CPUs where those concerns |
200 | * exist, overwrite the RSB with entries which capture |
201 | * speculative execution to prevent attack. |
202 | */ |
203 | FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW /* %r12 is handed to the macro; it is reloaded by the pops below */ |
204 | |
205 | /* restore callee-saved registers from the new stack, in reverse push order */ |
206 | popq %r15 |
207 | popq %r14 |
208 | popq %r13 |
209 | popq %r12 |
210 | popq %rbx |
211 | popq %rbp |
212 | |
213 | jmp __switch_to /* tail jump; no instruction visible in this block writes %rdi or %rsi */ |
214 | SYM_FUNC_END(__switch_to_asm) |
215 | .popsection |
216 | |
217 | /* |
218 | * A newly forked process directly context switches into this address. |
219 | * |
220 | * rax: prev task we switched from |
221 | * rbx: kernel thread func (NULL for user thread) |
222 | * r12: kernel thread arg |
223 | */ |
224 | .pushsection .text, "ax" |
225 | SYM_CODE_START(ret_from_fork_asm) /* first code run by a newly forked task: calls ret_from_fork, then takes the return-to-usermode path */ |
226 | /* |
227 | * This is the start of the kernel stack; even though there's a |
228 | * register set at the top, the regset isn't necessarily coherent |
229 | * (consider kthreads) and one cannot unwind further. |
230 | * |
231 | * This ensures stack unwinds of kernel threads terminate in a known |
232 | * good state. |
233 | */ |
234 | UNWIND_HINT_END_OF_STACK |
235 | ANNOTATE_NOENDBR // copy_thread |
236 | CALL_DEPTH_ACCOUNT |
237 | |
238 | movq %rax, %rdi /* prev (1st argument of ret_from_fork) */ |
239 | movq %rsp, %rsi /* regs (2nd argument: current stack pointer) */ |
240 | movq %rbx, %rdx /* fn (3rd argument: kernel thread func, NULL for user thread) */ |
241 | movq %r12, %rcx /* fn_arg (4th argument: kernel thread arg) */ |
242 | call ret_from_fork |
243 | |
244 | /* |
245 | * Set the stack state to what is expected for the target function |
246 | * -- at this point the register set should be a valid user set |
247 | * and unwind should work normally. |
248 | */ |
249 | UNWIND_HINT_REGS |
250 | jmp swapgs_restore_regs_and_return_to_usermode |
251 | SYM_CODE_END(ret_from_fork_asm) |
252 | .popsection |
253 | |
254 | .macro DEBUG_ENTRY_ASSERT_IRQS_OFF /* debug-only assertion: executes ud2 if the interrupt flag is set; expands to nothing without CONFIG_DEBUG_ENTRY */ |
255 | #ifdef CONFIG_DEBUG_ENTRY |
256 | pushq %rax /* preserve %rax across the check */ |
257 | SAVE_FLAGS /* NOTE(review): presumably leaves the flags in %eax, as the test below reads it - confirm */ |
258 | testl $X86_EFLAGS_IF, %eax |
259 | jz .Lokay_\@ /* IF clear: assertion holds */ |
260 | ud2 |
261 | .Lokay_\@: |
262 | popq %rax |
263 | #endif |
264 | .endm |
265 | |
266 | SYM_CODE_START(xen_error_entry) /* called by idtentry_body in place of error_entry when X86_FEATURE_XENPV is set */ |
267 | ANNOTATE_NOENDBR |
268 | UNWIND_HINT_FUNC |
269 | PUSH_AND_CLEAR_REGS save_ret=1 /* NOTE(review): save_ret=1 presumably keeps this function's return address usable after the pushes - confirm against the macro definition */ |
270 | ENCODE_FRAME_POINTER 8 |
271 | UNTRAIN_RET_FROM_CALL |
272 | RET |
273 | SYM_CODE_END(xen_error_entry) |
274 | |
275 | /** |
276 | * idtentry_body - Macro to emit code calling the C function |
277 | * @cfunc: C function to be called |
278 | * @has_error_code: Hardware pushed error code on stack |
279 | */ |
280 | .macro idtentry_body cfunc has_error_code:req |
281 | |
282 | /* |
283 | * Call error_entry() and switch to the task stack if from userspace. |
284 | * The stack pointer to continue on is taken from error_entry()'s %rax. |
285 | * When in XENPV, it is already in the task stack, and it can't fault |
286 | * for native_iret() nor native_load_gs_index() since XENPV uses its |
287 | * own pvops for IRET and load_gs_index(). And it doesn't need to |
288 | * switch the CR3. So it can skip invoking error_entry(). |
289 | */ |
290 | ALTERNATIVE "call error_entry; movq %rax, %rsp" , \ |
291 | "call xen_error_entry" , X86_FEATURE_XENPV |
292 | |
293 | ENCODE_FRAME_POINTER |
294 | UNWIND_HINT_REGS |
295 | |
296 | movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ |
297 | |
298 | .if \has_error_code == 1 |
299 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
300 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
301 | .endif |
302 | |
303 | call \cfunc |
304 | |
305 | /* For some configurations \cfunc ends up being a noreturn. */ |
306 | REACHABLE |
307 | |
308 | jmp error_return /* common exit path (defined outside this view) */ |
309 | .endm |
310 | |
311 | /** |
312 | * idtentry - Macro to generate entry stubs for simple IDT entries |
313 | * @vector: Vector number |
314 | * @asmsym: ASM symbol for the entry point |
315 | * @cfunc: C function to be called |
316 | * @has_error_code: Hardware pushed error code on stack |
317 | * |
318 | * The macro emits code to set up the kernel context for straight forward |
319 | * and simple IDT entries. No IST stack, no paranoid entry checks. |
320 | */ |
321 | .macro idtentry vector asmsym cfunc has_error_code:req |
322 | SYM_CODE_START(\asmsym) |
323 | |
324 | .if \vector == X86_TRAP_BP |
325 | /* #BP advances %rip to the next instruction */ |
326 | UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0 |
327 | .else |
328 | UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 |
329 | .endif |
330 | |
331 | ENDBR |
332 | ASM_CLAC |
333 | cld |
334 | |
335 | .if \has_error_code == 0 |
336 | pushq $-1 /* ORIG_RAX: no syscall to restart; fills the slot an error code would otherwise occupy */ |
337 | .endif |
338 | |
339 | .if \vector == X86_TRAP_BP |
340 | /* |
341 | * If coming from kernel space, create a 6-word gap to allow the |
342 | * int3 handler to emulate a call instruction. |
343 | */ |
344 | testb $3, CS-ORIG_RAX(%rsp) /* low two bits of the saved CS are non-zero for user mode */ |
345 | jnz .Lfrom_usermode_no_gap_\@ |
346 | .rept 6 |
347 | pushq 5*8(%rsp) /* %rsp drops one slot per push, so six pushes duplicate the six words ORIG_RAX..SS below the originals */ |
348 | .endr |
349 | UNWIND_HINT_IRET_REGS offset=8 |
350 | .Lfrom_usermode_no_gap_\@: |
351 | .endif |
352 | |
353 | idtentry_body \cfunc \has_error_code |
354 | |
355 | _ASM_NOKPROBE(\asmsym) |
356 | SYM_CODE_END(\asmsym) |
357 | .endm |
358 | |
359 | /* |
360 | * Interrupt entry/exit. |
361 | * |
362 | * The interrupt stubs push (vector) onto the stack, which is the error_code |
363 | * position of idtentry exceptions, and jump to one of the two idtentry points |
364 | * (common/spurious). |
365 | * |
366 | * common_interrupt is a hotpath, align it to a cache line |
367 | */ |
368 | .macro idtentry_irq vector cfunc /* emits entry point asm_\cfunc that calls \cfunc */ |
369 | .p2align CONFIG_X86_L1_CACHE_SHIFT |
370 | idtentry \vector asm_\cfunc \cfunc has_error_code=1 /* has_error_code=1: the interrupt stub already pushed the vector into the error_code slot */ |
371 | .endm |
372 | |
373 | /* |
374 | * System vectors which invoke their handlers directly and are not |
375 | * going through the regular common device interrupt handling code. |
376 | */ |
377 | .macro idtentry_sysvec vector cfunc /* emits entry point asm_\cfunc that calls \cfunc */ |
378 | idtentry \vector asm_\cfunc \cfunc has_error_code=0 /* has_error_code=0: idtentry pushes -1 as ORIG_RAX itself */ |
379 | .endm |
380 | |
381 | /** |
382 | * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB |
383 | * @vector: Vector number |
384 | * @asmsym: ASM symbol for the entry point |
385 | * @cfunc: C function to be called |
386 | * |
387 | * The macro emits code to set up the kernel context for #MC and #DB |
388 | * |
389 | * If the entry comes from user space it uses the normal entry path |
390 | * including the return to user space work and preemption checks on |
391 | * exit. |
392 | * |
393 | * If it hits in kernel mode then it needs to go through the paranoid |
394 | * entry as the exception can hit any random state. No preemption |
395 | * check on exit to keep the paranoid path simple. |
396 | */ |
397 | .macro idtentry_mce_db vector asmsym cfunc |
398 | SYM_CODE_START(\asmsym) |
399 | UNWIND_HINT_IRET_ENTRY |
400 | ENDBR |
401 | ASM_CLAC |
402 | cld |
403 | |
404 | pushq $-1 /* ORIG_RAX: no syscall to restart */ |
405 | |
406 | /* |
407 | * If the entry is from userspace, switch stacks and treat it as |
408 | * a normal entry. |
409 | */ |
410 | testb $3, CS-ORIG_RAX(%rsp) /* low two bits of the saved CS are non-zero for user mode */ |
411 | jnz .Lfrom_usermode_switch_stack_\@ |
412 | |
413 | /* paranoid_entry returns GS information for paranoid_exit in EBX. */ |
414 | call paranoid_entry |
415 | |
416 | UNWIND_HINT_REGS |
417 | |
418 | movq %rsp, %rdi /* pt_regs pointer (only argument passed to \cfunc on this path) */ |
419 | |
420 | call \cfunc |
421 | |
422 | jmp paranoid_exit |
423 | |
424 | /* Switch to the regular task stack and use the noist entry point */ |
425 | .Lfrom_usermode_switch_stack_\@: |
426 | idtentry_body noist_\cfunc, has_error_code=0 |
427 | |
428 | _ASM_NOKPROBE(\asmsym) |
429 | SYM_CODE_END(\asmsym) |
430 | .endm |
431 | |
432 | #ifdef CONFIG_AMD_MEM_ENCRYPT |
433 | /** |
434 | * idtentry_vc - Macro to generate entry stub for #VC |
435 | * @vector: Vector number |
436 | * @asmsym: ASM symbol for the entry point |
437 | * @cfunc: C function to be called |
438 | * |
439 | * The macro emits code to set up the kernel context for #VC. The #VC handler |
440 | * runs on an IST stack and needs to be able to cause nested #VC exceptions. |
441 | * |
442 | * To make this work the #VC entry code tries its best to pretend it doesn't use |
443 | * an IST stack by switching to the task stack if coming from user-space (which |
444 | * includes early SYSCALL entry path) or back to the stack in the IRET frame if |
445 | * entered from kernel-mode. |
446 | * |
447 | * If entered from kernel-mode the return stack is validated first, and if it is |
448 | * not safe to use (e.g. because it points to the entry stack) the #VC handler |
449 | * will switch to a fall-back stack (VC2) and call a special handler function. |
450 | * |
451 | * The macro is only used for one vector, but it is planned to be extended in |
452 | * the future for the #HV exception. |
453 | */ |
454 | .macro idtentry_vc vector asmsym cfunc |
455 | SYM_CODE_START(\asmsym) |
456 | UNWIND_HINT_IRET_ENTRY |
457 | ENDBR |
458 | ASM_CLAC |
459 | cld |
460 | |
461 | /* |
462 | * If the entry is from userspace, switch stacks and treat it as |
463 | * a normal entry. |
464 | */ |
465 | testb $3, CS-ORIG_RAX(%rsp) /* low two bits of the saved CS are non-zero for user mode */ |
466 | jnz .Lfrom_usermode_switch_stack_\@ |
467 | |
468 | /* |
469 | * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. |
470 | * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS |
471 | */ |
472 | call paranoid_entry |
473 | |
474 | UNWIND_HINT_REGS |
475 | |
476 | /* |
477 | * Switch off the IST stack to make it free for nested exceptions. The |
478 | * vc_switch_off_ist() function will switch back to the interrupted |
479 | * stack if it is safe to do so. If not it switches to the VC fall-back |
480 | * stack. |
481 | */ |
482 | movq %rsp, %rdi /* pt_regs pointer */ |
483 | call vc_switch_off_ist |
484 | movq %rax, %rsp /* Switch to new stack: the value vc_switch_off_ist returned in %rax */ |
485 | |
486 | ENCODE_FRAME_POINTER |
487 | UNWIND_HINT_REGS |
488 | |
489 | /* Update pt_regs: move the error code out of the ORIG_RAX slot */ |
490 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
491 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
492 | |
493 | movq %rsp, %rdi /* pt_regs pointer (on the new stack) into 1st argument */ |
494 | |
495 | call kernel_\cfunc |
496 | |
497 | /* |
498 | * No need to switch back to the IST stack. The current stack is either |
499 | * identical to the stack in the IRET frame or the VC fall-back stack, |
500 | * so it is definitely mapped even with PTI enabled. |
501 | */ |
502 | jmp paranoid_exit |
503 | |
504 | /* Switch to the regular task stack */ |
505 | .Lfrom_usermode_switch_stack_\@: |
506 | idtentry_body user_\cfunc, has_error_code=1 |
507 | |
508 | _ASM_NOKPROBE(\asmsym) |
509 | SYM_CODE_END(\asmsym) |
510 | .endm |
511 | #endif |
512 | |
513 | /* |
514 | * Double fault entry. Straight paranoid. No checks from which context |
515 | * this comes because for the espfix induced #DF this would do the wrong |
516 | * thing. |
517 | */ |
518 | .macro idtentry_df vector asmsym cfunc |
519 | SYM_CODE_START(\asmsym) |
520 | UNWIND_HINT_IRET_ENTRY offset=8 |
521 | ENDBR |
522 | ASM_CLAC |
523 | cld |
524 | |
525 | /* paranoid_entry returns GS information for paranoid_exit in EBX. */ |
526 | call paranoid_entry |
527 | UNWIND_HINT_REGS |
528 | |
529 | movq %rsp, %rdi /* pt_regs pointer into first argument */ |
530 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
531 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
532 | call \cfunc |
533 | |
534 | /* For some configurations \cfunc ends up being a noreturn. */ |
535 | REACHABLE |
536 | |
537 | jmp paranoid_exit /* always the paranoid exit: no user/kernel distinction is made in this macro */ |
538 | |
539 | _ASM_NOKPROBE(\asmsym) |
540 | SYM_CODE_END(\asmsym) |
541 | .endm |
542 | |
543 | /* |
544 | * Include the defines which emit the idt entries which are shared |
545 | * between 32 and 64 bit and emit the __irqentry_text_* markers |
546 | * so the stacktrace boundary checks work. |
547 | */ |
548 | __ALIGN |
549 | .globl __irqentry_text_start |
550 | __irqentry_text_start: /* start marker: everything emitted by the include below lies between the two markers */ |
551 | |
552 | #include <asm/idtentry.h> |
553 | |
554 | __ALIGN |
555 | .globl __irqentry_text_end |
556 | __irqentry_text_end: /* end marker */ |
557 | ANNOTATE_NOENDBR |
558 | |
559 | SYM_CODE_START_LOCAL(common_interrupt_return) /* shared exit code: return-to-usermode path, return-to-kernel path, the final IRET and the ESPFIX64 variant */ |
560 | SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) |
561 | IBRS_EXIT |
562 | #ifdef CONFIG_DEBUG_ENTRY |
563 | /* Assert that pt_regs indicates user mode. */ |
564 | testb $3, CS(%rsp) |
565 | jnz 1f |
566 | ud2 |
567 | 1: |
568 | #endif |
569 | #ifdef CONFIG_XEN_PV |
570 | ALTERNATIVE "" , "jmp xenpv_restore_regs_and_return_to_usermode" , X86_FEATURE_XENPV |
571 | #endif |
572 | |
573 | POP_REGS pop_rdi=0 |
574 | |
575 | /* |
576 | * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. |
577 | * Save old stack pointer and switch to trampoline stack. |
578 | */ |
579 | movq %rsp, %rdi /* %rdi = old stack pointer, base for the copies below */ |
580 | movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp /* trampoline stack pointer comes from tss.sp0 */ |
581 | UNWIND_HINT_END_OF_STACK |
582 | |
583 | /* Copy the IRET frame to the trampoline stack; orig_ax at 1*8(%rdi) is not copied. */ |
584 | pushq 6*8(%rdi) /* SS */ |
585 | pushq 5*8(%rdi) /* RSP */ |
586 | pushq 4*8(%rdi) /* EFLAGS */ |
587 | pushq 3*8(%rdi) /* CS */ |
588 | pushq 2*8(%rdi) /* RIP */ |
589 | |
590 | /* Push user RDI on the trampoline stack. */ |
591 | pushq (%rdi) |
592 | |
593 | /* |
594 | * We are on the trampoline stack. All regs except RDI are live. |
595 | * We can do future final exit work right here. |
596 | */ |
597 | STACKLEAK_ERASE_NOCLOBBER |
598 | |
599 | SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi |
600 | |
601 | /* Restore RDI; %rsp then points at the copied IRET frame. */ |
602 | popq %rdi |
603 | swapgs |
604 | jmp .Lnative_iret |
605 | |
606 | |
607 | SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) |
608 | #ifdef CONFIG_DEBUG_ENTRY |
609 | /* Assert that pt_regs indicates kernel mode. */ |
610 | testb $3, CS(%rsp) |
611 | jz 1f |
612 | ud2 |
613 | 1: |
614 | #endif |
615 | POP_REGS |
616 | addq $8, %rsp /* skip regs->orig_ax */ |
617 | /* |
618 | * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization |
619 | * when returning from IPI handler. |
620 | */ |
621 | #ifdef CONFIG_XEN_PV |
622 | SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL) |
623 | ANNOTATE_NOENDBR |
624 | .byte 0xe9 /* hand-encoded opcode of a jmp with a 32-bit relative displacement */ |
625 | .long .Lnative_iret - (. + 4) /* displacement to .Lnative_iret, measured from the end of this 4-byte field */ |
626 | #endif |
627 | |
628 | .Lnative_iret: |
629 | UNWIND_HINT_IRET_REGS |
630 | /* |
631 | * Are we returning to a stack segment from the LDT? Note: in |
632 | * 64-bit mode SS:RSP on the exception stack is always valid. |
633 | */ |
634 | #ifdef CONFIG_X86_ESPFIX64 |
635 | testb $4, (SS-RIP)(%rsp) /* bit 2 of the saved SS selector set => selector refers to the LDT */ |
636 | jnz native_irq_return_ldt |
637 | #endif |
638 | |
639 | SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) |
640 | ANNOTATE_NOENDBR // exc_double_fault |
641 | /* |
642 | * This may fault. Non-paranoid faults on return to userspace are |
643 | * handled by fixup_bad_iret. These include #SS, #GP, and #NP. |
644 | * Double-faults due to espfix64 are handled in exc_double_fault. |
645 | * Other faults here are fatal. |
646 | */ |
647 | iretq |
648 | |
649 | #ifdef CONFIG_X86_ESPFIX64 |
650 | native_irq_return_ldt: |
651 | /* |
652 | * We are running with user GSBASE. All GPRs contain their user |
653 | * values. We have a percpu ESPFIX stack that is eight slots |
654 | * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom |
655 | * of the ESPFIX stack. |
656 | * |
657 | * We clobber RAX and RDI in this code. We stash RDI on the |
658 | * normal stack and RAX on the ESPFIX stack. |
659 | * |
660 | * The ESPFIX stack layout we set up looks like this: |
661 | * |
662 | * --- top of ESPFIX stack --- |
663 | * SS |
664 | * RSP |
665 | * RFLAGS |
666 | * CS |
667 | * RIP <-- RSP points here when we're done |
668 | * RAX <-- espfix_waddr points here |
669 | * --- bottom of ESPFIX stack --- |
670 | */ |
671 | |
672 | pushq %rdi /* Stash user RDI; the IRET frame now starts at 1*8(%rsp) */ |
673 | swapgs /* to kernel GS */ |
674 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ |
675 | |
676 | movq PER_CPU_VAR(espfix_waddr), %rdi /* %rdi = writable address of the ESPFIX stack bottom */ |
677 | movq %rax, (0*8)(%rdi) /* user RAX */ |
678 | movq (1*8)(%rsp), %rax /* user RIP */ |
679 | movq %rax, (1*8)(%rdi) |
680 | movq (2*8)(%rsp), %rax /* user CS */ |
681 | movq %rax, (2*8)(%rdi) |
682 | movq (3*8)(%rsp), %rax /* user RFLAGS */ |
683 | movq %rax, (3*8)(%rdi) |
684 | movq (5*8)(%rsp), %rax /* user SS */ |
685 | movq %rax, (5*8)(%rdi) |
686 | movq (4*8)(%rsp), %rax /* user RSP (copied last so that it remains in RAX) */ |
687 | movq %rax, (4*8)(%rdi) |
688 | /* Now RAX == user RSP. */ |
689 | |
690 | andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000); the 32-bit operation also zeroes RAX[63:32] */ |
691 | |
692 | /* |
693 | * espfix_stack[31:16] == 0. The page tables are set up such that |
694 | * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of |
695 | * espfix_waddr for any X. That is, there are 65536 RO aliases of |
696 | * the same page. Set up RSP so that RSP[31:16] contains the |
697 | * respective 16 bits of the /userspace/ RSP and RSP nonetheless |
698 | * still points to an RO alias of the ESPFIX stack. |
699 | */ |
700 | orq PER_CPU_VAR(espfix_stack), %rax |
701 | |
702 | SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi |
703 | swapgs /* to user GS */ |
704 | popq %rdi /* Restore user RDI */ |
705 | |
706 | movq %rax, %rsp /* switch to the read-only alias; it points at the RAX slot */ |
707 | UNWIND_HINT_IRET_REGS offset=8 |
708 | |
709 | /* |
710 | * At this point, we cannot write to the stack any more, but we can |
711 | * still read. |
712 | */ |
713 | popq %rax /* Restore user RAX */ |
714 | |
715 | /* |
716 | * RSP now points to an ordinary IRET frame, except that the page |
717 | * is read-only and RSP[31:16] are preloaded with the userspace |
718 | * values. We can now IRET back to userspace. |
719 | */ |
720 | jmp native_irq_return_iret |
721 | #endif |
722 | SYM_CODE_END(common_interrupt_return) |
723 | _ASM_NOKPROBE(common_interrupt_return) |
724 | |
725 | /* |
726 | * Reload gs selector with exception handling |
727 | * di: new selector |
728 | * |
729 | * Is in entry.text as it shouldn't be instrumented. |
730 | */ |
731 | SYM_FUNC_START(asm_load_gs_index) /* loads the selector in %edi into %gs between two swapgs; a faulting load is redirected to .Lbad_gs */ |
732 | FRAME_BEGIN |
733 | swapgs |
734 | .Lgs_change: |
735 | ANNOTATE_NOENDBR // error_entry |
736 | movl %edi, %gs /* the instruction registered in the exception table below */ |
737 | 2: ALTERNATIVE "" , "mfence" , X86_BUG_SWAPGS_FENCE |
738 | swapgs |
739 | FRAME_END |
740 | RET |
741 | |
742 | /* running with kernelgs */ |
743 | .Lbad_gs: |
744 | swapgs /* switch back to user gs */ |
745 | .macro ZAP_GS |
746 | /* This can't be a string because the preprocessor needs to see it. */ |
747 | movl $__USER_DS, %eax |
748 | movl %eax, %gs |
749 | .endm |
750 | ALTERNATIVE "" , "ZAP_GS" , X86_BUG_NULL_SEG |
751 | xorl %eax, %eax |
752 | movl %eax, %gs /* load the null selector (0) in place of the one that faulted */ |
753 | jmp 2b /* rejoin the normal path at the second swapgs */ |
754 | |
755 | _ASM_EXTABLE(.Lgs_change, .Lbad_gs) /* NOTE(review): presumably makes a fault at .Lgs_change resume at .Lbad_gs - confirm against the macro definition */ |
756 | |
757 | SYM_FUNC_END(asm_load_gs_index) |
758 | EXPORT_SYMBOL(asm_load_gs_index) |
759 | |
760 | #ifdef CONFIG_XEN_PV |
761 | /* |
762 | * A note on the "critical region" in our callback handler. |
763 | * We want to avoid stacking callback handlers due to events occurring |
764 | * during handling of the last event. To do this, we keep events disabled |
765 | * until we've done all processing. HOWEVER, we must enable events before |
766 | * popping the stack frame (can't be done atomically) and so it would still |
767 | * be possible to get enough handler activations to overflow the stack. |
768 | * Although unlikely, bugs of that kind are hard to track down, so we'd |
769 | * like to avoid the possibility. |
770 | * So, on entry to the handler we detect whether we interrupted an |
771 | * existing activation in its critical region -- if so, we pop the current |
772 | * activation and restart the handler using the previous one. |
773 | * |
774 | * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *) |
775 | */ |
776 | __FUNC_ALIGN |
777 | SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback) |
778 | |
779 | /* |
780 | * Since we don't modify %rdi, xen_pv_evtchn_do_upcall(struct pt_regs *) will |
781 | * see the correct pointer to the pt_regs |
782 | */ |
783 | UNWIND_HINT_FUNC |
784 | movq %rdi, %rsp /* we don't return, adjust the stack frame: %rsp now points at the pt_regs passed in %rdi */ |
785 | UNWIND_HINT_REGS |
786 | |
787 | call xen_pv_evtchn_do_upcall |
788 | |
789 | jmp error_return |
790 | SYM_CODE_END(exc_xen_hypervisor_callback) |
791 | |
792 | /* |
793 | * Hypervisor uses this for application faults while it executes. |
794 | * We get here for two reasons: |
795 | * 1. Fault while reloading DS, ES, FS or GS |
796 | * 2. Fault while executing IRET |
797 | * Category 1 we do not need to fix up as Xen has already reloaded all segment |
798 | * registers that could be reloaded and zeroed the others. |
799 | * Category 2 we fix up by killing the current process. We cannot use the |
800 | * normal Linux return path in this case because if we use the IRET hypercall |
801 | * to pop the stack frame we end up in an infinite loop of failsafe callbacks. |
802 | * We distinguish between categories by comparing each saved segment register |
803 | * with its current contents: any discrepancy means we are in category 1. |
804 | */ |
805 | __FUNC_ALIGN |
806 | SYM_CODE_START_NOALIGN(xen_failsafe_callback) |
807 | UNWIND_HINT_UNDEFINED |
808 | ENDBR |
809 | movl %ds, %ecx /* compare each live segment selector with the 16-bit value saved on the stack */ |
810 | cmpw %cx, 0x10(%rsp) /* saved DS */ |
811 | jne 1f |
812 | movl %es, %ecx |
813 | cmpw %cx, 0x18(%rsp) /* saved ES */ |
814 | jne 1f |
815 | movl %fs, %ecx |
816 | cmpw %cx, 0x20(%rsp) /* saved FS */ |
817 | jne 1f |
818 | movl %gs, %ecx |
819 | cmpw %cx, 0x28(%rsp) /* saved GS */ |
820 | jne 1f |
821 | /* All segments match their saved values => Category 2 (Bad IRET). */ |
822 | movq (%rsp), %rcx /* reload %rcx and %r11 from the first two stack slots */ |
823 | movq 8(%rsp), %r11 |
824 | addq $0x30, %rsp /* drop the six slots read above (rcx, r11, ds, es, fs, gs) */ |
825 | pushq $0 /* RIP */ |
826 | UNWIND_HINT_IRET_REGS offset=8 |
827 | jmp asm_exc_general_protection |
828 | 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ |
829 | movq (%rsp), %rcx /* reload %rcx and %r11 from the first two stack slots */ |
830 | movq 8(%rsp), %r11 |
831 | addq $0x30, %rsp /* drop the six slots read above (rcx, r11, ds, es, fs, gs) */ |
832 | UNWIND_HINT_IRET_REGS |
833 | pushq $-1 /* orig_ax = -1 => not a system call */ |
834 | PUSH_AND_CLEAR_REGS |
835 | ENCODE_FRAME_POINTER |
836 | jmp error_return |
837 | SYM_CODE_END(xen_failsafe_callback) |
838 | #endif /* CONFIG_XEN_PV */ |
839 | |
840 | /* |
841 | * Save all registers in pt_regs. Return GSBASE related information |
842 | * in EBX depending on the availability of the FSGSBASE instructions: |
843 | * |
844 | * FSGSBASE R/EBX |
845 | * N 0 -> SWAPGS on exit |
846 | * 1 -> no SWAPGS on exit |
847 | * |
848 | * Y GSBASE value at entry, must be restored in paranoid_exit |
849 | * |
850 | * R14 - old CR3 |
851 | * R15 - old SPEC_CTRL |
852 | */ |
853 | SYM_CODE_START(paranoid_entry) /* saves registers, then switches CR3, GSBASE and SPEC_CTRL to kernel values, keeping the old ones in %r14, %rbx and %r15 */ |
854 | ANNOTATE_NOENDBR |
855 | UNWIND_HINT_FUNC |
856 | PUSH_AND_CLEAR_REGS save_ret=1 |
857 | ENCODE_FRAME_POINTER 8 |
858 | |
859 | /* |
860 | * Always stash CR3 in %r14. This value will be restored, |
861 | * verbatim, at exit. Needed if paranoid_entry interrupted |
862 | * another entry that already switched to the user CR3 value |
863 | * but has not yet returned to userspace. |
864 | * |
865 | * This is also why CS (stashed in the "iret frame" by the |
866 | * hardware at entry) can not be used: this may be a return |
867 | * to kernel code, but with a user CR3 value. |
868 | * |
869 | * Switching CR3 does not depend on kernel GSBASE so it can |
870 | * be done before switching to the kernel GSBASE. This is |
871 | * required for FSGSBASE because the kernel GSBASE has to |
872 | * be retrieved from a kernel internal table. |
873 | */ |
874 | SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 |
875 | |
876 | /* |
877 | * Handling GSBASE depends on the availability of FSGSBASE. |
878 | * |
879 | * Without FSGSBASE the kernel enforces that negative GSBASE |
880 | * values indicate kernel GSBASE. With FSGSBASE no assumptions |
881 | * can be made about the GSBASE value when entering from user |
882 | * space. |
883 | */ |
884 | ALTERNATIVE "jmp .Lparanoid_entry_checkgs" , "" , X86_FEATURE_FSGSBASE |
885 | |
886 | /* |
887 | * Read the current GSBASE and store it in %rbx unconditionally, |
888 | * retrieve and set the current CPUs kernel GSBASE. The stored value |
889 | * has to be restored in paranoid_exit unconditionally. |
890 | * |
891 | * The unconditional write to GS base below ensures that no subsequent |
892 | * loads based on a mispredicted GS base can happen, therefore no LFENCE |
893 | * is needed here. |
894 | */ |
895 | SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx |
896 | jmp .Lparanoid_gsbase_done |
897 | |
898 | .Lparanoid_entry_checkgs: |
899 | /* EBX = 1 -> kernel GSBASE active, no restore required */ |
900 | movl $1, %ebx |
901 | |
902 | /* |
903 | * The kernel-enforced convention is a negative GSBASE indicates |
904 | * a kernel value. No SWAPGS needed on entry and exit. |
905 | */ |
906 | movl $MSR_GS_BASE, %ecx |
907 | rdmsr /* reads the MSR selected by %ecx into %edx:%eax */ |
908 | testl %edx, %edx /* sign of the upper half decides whether GSBASE is negative */ |
909 | js .Lparanoid_kernel_gsbase |
910 | |
911 | /* EBX = 0 -> SWAPGS required on exit */ |
912 | xorl %ebx, %ebx |
913 | swapgs |
914 | .Lparanoid_kernel_gsbase: |
915 | FENCE_SWAPGS_KERNEL_ENTRY |
916 | .Lparanoid_gsbase_done: |
917 | |
918 | /* |
919 | * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like |
920 | * CR3 above, keep the old value in a callee saved register. |
921 | */ |
922 | IBRS_ENTER save_reg=%r15 |
923 | UNTRAIN_RET_FROM_CALL |
924 | |
925 | RET |
926 | SYM_CODE_END(paranoid_entry) |
927 | |
/*
 * "Paranoid" exit path from exception stack. This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated. Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * R/EBX contains the GSBASE related information depending on the
 * availability of the FSGSBASE instructions:
 *
 * FSGSBASE	R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        User space GSBASE, must be restored unconditionally
 *
 * R14 - old CR3
 * R15 - old SPEC_CTRL
 *
 * Sequence: restore SPEC_CTRL (IBRS_EXIT), then CR3 (RESTORE_CR3, using
 * %rax as scratch), then GSBASE. Every path through this routine ends in
 * a jump to restore_regs_and_return_to_kernel; there is no RET here.
 */
SYM_CODE_START_LOCAL(paranoid_exit)
	UNWIND_HINT_REGS

	/*
	 * Must restore IBRS state before both CR3 and %GS since we need access
	 * to the per-CPU x86_spec_ctrl_shadow variable.
	 */
	IBRS_EXIT save_reg=%r15

	/*
	 * The order of operations is important. RESTORE_CR3 requires
	 * kernel GSBASE, so it has to run before GSBASE is switched back
	 * below.
	 *
	 * NB to anyone trying to optimize this code: this code does
	 * not execute at all for exceptions from user mode. Those
	 * exceptions go through error_return instead.
	 */
	RESTORE_CR3 scratch_reg=%rax save_reg=%r14

	/*
	 * Handle the three GSBASE cases. Without FSGSBASE the jump below is
	 * taken; with FSGSBASE it is patched out and execution falls through
	 * to the WRGSBASE.
	 */
	ALTERNATIVE "jmp .Lparanoid_exit_checkgs" , "" , X86_FEATURE_FSGSBASE

	/* With FSGSBASE enabled, unconditionally restore GSBASE */
	wrgsbase %rbx
	jmp restore_regs_and_return_to_kernel

.Lparanoid_exit_checkgs:
	/*
	 * On non-FSGSBASE systems, conditionally do SWAPGS:
	 * EBX != 0 -> no SWAPGS on exit, go straight to the register restore.
	 */
	testl %ebx, %ebx
	jnz restore_regs_and_return_to_kernel

	/* EBX == 0: we are returning to a context with user GSBASE */
	swapgs
	jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
985 | |
/*
 * Switch GS and CR3 if needed.
 *
 * Saves the general purpose registers (PUSH_AND_CLEAR_REGS save_ret=1) and
 * then dispatches on where the exception came from. Throughout this routine
 * pt_regs is addressed at 8(%rsp); the slot at (%rsp) presumably holds the
 * return address consumed by the RET below (confirm against
 * PUSH_AND_CLEAR_REGS in calling.h).
 *
 * Paths:
 *  - Low two bits of the saved CS non-zero (user mode): SWAPGS, switch to
 *    the kernel CR3, IBRS_ENTER, then tail-jump to sync_regs with
 *    %rdi = pt_regs pointer.
 *  - Kernel mode, saved RIP == native_irq_return_iret, or equal to its
 *    zero-extended low 32 bits (which is first repaired): handled as a
 *    fault on IRET to user mode; fixup_bad_iret is called and its result
 *    is handed to sync_regs.
 *  - Kernel mode, saved RIP == .Lgs_change: SWAPGS, then the normal kernel
 *    return below.
 *  - Any other kernel RIP: return with %rax = pt_regs pointer.
 *
 * Clobbers visible here: %rax, %rcx, %rdi (in addition to whatever the
 * invoked macros and called functions clobber).
 */
SYM_CODE_START(error_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC

	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8

	/* Low two bits of the saved CS clear -> the exception hit kernel code */
	testb $3, CS+8(%rsp)
	jz .Lerror_kernelspace

	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	swapgs
	FENCE_SWAPGS_USER_ENTRY
	/* We have user CR3. Change to kernel CR3. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	IBRS_ENTER
	UNTRAIN_RET_FROM_CALL

	leaq 8(%rsp), %rdi		/* arg0 = pt_regs pointer */
	/* Put us onto the real thread stack. */
	jmp sync_regs

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	/* %rcx = full address of the IRET instruction */
	leaq native_irq_return_iret(%rip), %rcx
	cmpq %rcx, RIP+8(%rsp)
	je .Lerror_bad_iret
	/* %rax = the same address truncated to its low 32 bits */
	movl %ecx, %eax			/* zero extend */
	cmpq %rax, RIP+8(%rsp)
	je .Lbstep_iret
	cmpq $.Lgs_change, RIP+8(%rsp)
	jne .Lerror_entry_done_lfence

	/*
	 * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
	 * gsbase and proceed. We'll fix up the exception and land in
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	swapgs

	/*
	 * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
	 * kernel or user gsbase.
	 */
.Lerror_entry_done_lfence:
	FENCE_SWAPGS_KERNEL_ENTRY
	CALL_DEPTH_ACCOUNT
	leaq 8(%rsp), %rax		/* return pt_regs pointer */
	VALIDATE_UNRET_END
	RET

.Lbstep_iret:
	/* Fix truncated RIP: store the full IRET address still held in %rcx */
	movq %rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user
	 * gsbase and CR3. Switch to kernel gsbase and CR3:
	 */
	swapgs
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	IBRS_ENTER
	UNTRAIN_RET_FROM_CALL

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET.
	 */
	leaq 8(%rsp), %rdi		/* arg0 = pt_regs pointer */
	call fixup_bad_iret
	/* The value returned by fixup_bad_iret becomes arg0 of sync_regs */
	mov %rax, %rdi
	jmp sync_regs
SYM_CODE_END(error_entry)
1073 | |
/*
 * Common exit for exceptions: pick the return path from the privilege
 * level recorded in the saved CS. Expects %rsp to point at pt_regs
 * (CS is read at CS(%rsp)) and, per the debug assertion, IRQs to be off.
 */
SYM_CODE_START_LOCAL(error_return)
	UNWIND_HINT_REGS
	DEBUG_ENTRY_ASSERT_IRQS_OFF
	/* Low two bits of the saved CS clear -> returning to kernel mode */
	testb $3, CS(%rsp)
	jz restore_regs_and_return_to_kernel
	/* Otherwise return to user mode, which also needs the SWAPGS */
	jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(error_return)
1081 | |
/*
 * Runs on exception stack. Xen PV does not go through this path at all,
 * so we can use real assembly here.
 *
 * Registers:
 *	%r14:	 Used to save/restore the CR3 of the interrupted context
 *		 when PAGE_TABLE_ISOLATION is in use. Do not clobber.
 *
 * Overview of the paths below:
 *	- NMI from user mode: switch to the thread stack, build pt_regs
 *	  there, call exc_nmi() and leave through
 *	  swapgs_restore_regs_and_return_to_usermode.
 *	- NMI from kernel mode, not nested (first_nmi): build the
 *	  "NMI executing" / "iret" / "outermost" layout, run the code from
 *	  repeat_nmi onwards and return with IRETQ.
 *	- Nested NMI (nested_nmi / nested_nmi_out): optionally redirect the
 *	  outer NMI's "iret" frame to repeat_nmi, then IRETQ straight back.
 */
SYM_CODE_START(asm_exc_nmi)
	UNWIND_HINT_IRET_ENTRY
	ENDBR

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains a
	 *  variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into an "outermost" location on the
	 *      stack
	 *    o Copy the interrupt frame into an "iret" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "iret" location to jump to the repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction. Similarly, IRET to user mode
	 * can fault. We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	ASM_CLAC
	cld

	/* Use %rdx as our temp variable throughout */
	pushq %rdx

	/*
	 * +8 skips the %rdx slot pushed above; CS-RIP is the offset of CS
	 * from RIP within the hardware frame. Low two bits of the saved CS
	 * clear -> the NMI hit kernel code.
	 */
	testb $3, CS-RIP+8(%rsp)
	jz .Lnmi_from_kernel

	/*
	 * NMI from user mode. We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */

	swapgs
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
	/* %rdx = NMI stack pointer; 0(%rdx) is the saved %rdx slot */
	movq %rsp, %rdx
	/* Switch to the top of the thread stack */
	movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
	UNWIND_HINT_IRET_REGS base=%rdx offset=8
	/* Copy the hardware frame, which starts at 1*8(%rdx), to the new stack */
	pushq 5*8(%rdx)		/* pt_regs->ss */
	pushq 4*8(%rdx)		/* pt_regs->rsp */
	pushq 3*8(%rdx)		/* pt_regs->flags */
	pushq 2*8(%rdx)		/* pt_regs->cs */
	pushq 1*8(%rdx)		/* pt_regs->rip */
	UNWIND_HINT_IRET_REGS
	pushq $-1		/* pt_regs->orig_ax */
	/*
	 * %rdx is in use as a pointer, so its original value is supplied
	 * from the slot saved on the NMI stack (presumably stored as
	 * pt_regs->dx; see PUSH_AND_CLEAR_REGS in calling.h).
	 */
	PUSH_AND_CLEAR_REGS rdx=(%rdx)
	ENCODE_FRAME_POINTER

	IBRS_ENTER
	UNTRAIN_RET

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq %rsp, %rdi		/* arg0 = pt_regs pointer */
	call exc_nmi

	/*
	 * Return back to user mode. We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.
	 */
	jmp swapgs_restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                             |
	 * | original Return RSP                                     |
	 * | original RFLAGS                                         |
	 * | original CS                                             |
	 * | original RIP                                            |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                    |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame        |
	 * | iret Return RSP  } on each loop iteration; overwritten  |
	 * | iret RFLAGS      } by a nested NMI to force another     |
	 * | iret CS          } iteration if needed.                 |
	 * | iret RIP         }                                      |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;       |
	 * | outermost Return RSP  } will not be changed before      |
	 * | outermost RFLAGS      } NMI processing is done.         |
	 * | outermost CS          } Copied to "iret" frame on each  |
	 * | outermost RIP         } iteration.                      |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                 |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware. Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 *
	 * On arrival here RSP points at "temp storage for rdx", so relative
	 * to RSP: 8 = original RIP, 3*8 = original RFLAGS, 4*8 = original
	 * Return RSP, and -8 = "NMI executing".
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI. We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI. That's okay; the outer NMI handler is
	 * about to call exc_nmi() anyway, so we can just resume
	 * the outer NMI.
	 */

	movq $repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	/* repeat_nmi > interrupted RIP: outside the range, keep checking */
	ja 1f
	movq $end_repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	/* repeat_nmi <= interrupted RIP < end_repeat_nmi: just resume */
	ja nested_nmi_out
1:

	/*
	 * Now check "NMI executing". If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl $1, -8(%rsp)
	je nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack. This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET. We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets. We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP. We set DF before we clear
	 * "NMI executing".
	 */
	/* %rdx = address just above the "original" frame (rdx slot + 5 words) */
	lea 6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq %rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja first_nmi

	/* %rdx = that address minus the exception stack size */
	subq $EXCEPTION_STKSZ, %rdx
	cmpq %rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb first_nmi

	/* Ah, it is within the NMI stack. */

	/* Test DF in the second byte of the saved RFLAGS */
	testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz first_nmi		/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	/* Step over "NMI executing" so the pushes below land in the "iret" frame */
	subq $8, %rsp
	/* %rdx = address of "outermost RIP", the RSP that repeat_nmi expects */
	leaq -10*8(%rsp), %rdx
	pushq $__KERNEL_DS	/* iret SS */
	pushq %rdx		/* iret Return RSP */
	pushfq			/* iret RFLAGS */
	pushq $__KERNEL_CS	/* iret CS */
	pushq $repeat_nmi	/* iret RIP */

	/* Put stack back: 5 pushes plus the subq above, RSP is at the rdx slot again */
	addq $(6*8), %rsp

nested_nmi_out:
	popq %rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	iretq

first_nmi:
	/* Restore rdx. */
	movq (%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq $0

	/* Leave room for the "iret" frame */
	subq $(5*8), %rsp

	/*
	 * Copy the "original" frame to the "outermost" frame. The source
	 * offset stays constant because each push lowers RSP by one slot.
	 */
	.rept 5
	pushq 11*8(%rsp)
	.endr
	UNWIND_HINT_IRET_REGS

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away. Disabled by
	 * default because IRET is very expensive.
	 */
	pushq $0		/* SS */
	pushq %rsp		/* RSP (minus 8 because of the previous push) */
	addq $8, (%rsp)		/* Fix up RSP */
	pushfq			/* RFLAGS */
	pushq $__KERNEL_CS	/* CS */
	pushq $1f		/* RIP */
	iretq			/* continues at repeat_nmi below */
	UNWIND_HINT_IRET_REGS
1:
#endif

repeat_nmi:
	ANNOTATE_NOENDBR // this code
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration. paranoid_entry will load the kernel
	 * gsbase if needed before we call exc_nmi(). "NMI executing"
	 * is zero.
	 */
	/* 10 slots above "outermost RIP": 5 "outermost" + 5 "iret" words */
	movq $1, 10*8(%rsp)		/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame. NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	/* Point RSP at "NMI executing" so the pushes fill the "iret" frame */
	addq $(10*8), %rsp
	.rept 5
	pushq -6*8(%rsp)
	.endr
	/* Move RSP back down to "outermost RIP" */
	subq $(5*8), %rsp
end_repeat_nmi:
	ANNOTATE_NOENDBR // this code

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq $-1			/* ORIG_RAX: no syscall to restart */

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call paranoid_entry
	UNWIND_HINT_REGS

	movq %rsp, %rdi			/* arg0 = pt_regs pointer */
	call exc_nmi

	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
	IBRS_EXIT save_reg=%r15

	/*
	 * Always restore stashed CR3 value (see paranoid_entry).
	 * %r15 is passed as the scratch register; its stashed SPEC_CTRL
	 * value was already consumed by IBRS_EXIT above.
	 */
	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

	/*
	 * The above invocation of paranoid_entry stored the GSBASE
	 * related information in R/EBX depending on the availability
	 * of FSGSBASE.
	 *
	 * If FSGSBASE is enabled, restore the saved GSBASE value
	 * unconditionally, otherwise take the conditional SWAPGS path.
	 */
	ALTERNATIVE "jmp nmi_no_fsgsbase" , "" , X86_FEATURE_FSGSBASE

	wrgsbase %rbx
	jmp nmi_restore

nmi_no_fsgsbase:
	/* EBX == 0 -> invoke SWAPGS */
	testl %ebx, %ebx
	jnz nmi_restore

nmi_swapgs:
	swapgs

nmi_restore:
	POP_REGS

	/*
	 * Skip orig_ax and the "outermost" frame to point RSP at the
	 * "iret" frame.
	 */
	addq $6*8, %rsp

	/*
	 * Clear "NMI executing". Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths.
	 *
	 * We arguably should just inspect RIP instead, but I (Andy) wrote
	 * this code when I had the misapprehension that Xen PV supported
	 * NMIs, and Xen PV would break that approach.
	 */
	std
	/* "NMI executing" sits directly above the five-word "iret" frame */
	movq $0, 5*8(%rsp)		/* clear "NMI executing" */

	/*
	 * iretq reads the "iret" frame and exits the NMI stack in a
	 * single instruction. We are returning to kernel mode, so this
	 * cannot result in a fault. Similarly, we don't need to worry
	 * about espfix64 on the way back to kernel mode.
	 */
	iretq
SYM_CODE_END(asm_exc_nmi)
1449 | |
/*
 * This handles SYSCALL from 32-bit code. There is no way to program
 * MSRs to fully disable 32-bit SYSCALL.
 *
 * The call is not dispatched anywhere: %eax is loaded with -ENOSYS and
 * control goes straight back to the caller with SYSRETL.
 */
SYM_CODE_START(entry_SYSCALL32_ignore)
	UNWIND_HINT_END_OF_STACK
	ENDBR
	mov $-ENOSYS, %eax		/* result seen by the caller */
	sysretl
SYM_CODE_END(entry_SYSCALL32_ignore)
1460 | |
/*
 * rewind_stack_and_make_dead: reset the stack pointer to just below the
 * top of the current stack (leaving room for one pt_regs) and call
 * make_task_dead.
 *
 * Placed in .text rather than the surrounding section. No register
 * argument is set up here, so whatever the caller left in the argument
 * registers is what make_task_dead receives. Nothing follows the call,
 * so make_task_dead is presumably not expected to return (confirm at its
 * definition).
 */
.pushsection .text, "ax"
	__FUNC_ALIGN
SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
	UNWIND_HINT_FUNC
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl %ebp, %ebp

	/* %rax = per-CPU top-of-stack value; RSP = that minus PTREGS_SIZE */
	movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax
	leaq -PTREGS_SIZE(%rax), %rsp
	UNWIND_HINT_REGS

	call make_task_dead
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
1475 | |