// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <linux/bitfield.h>
#include <xen/xen.h>

#include <asm/fpu/api.h>
#include <asm/fred.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
#include <asm/insn-eval.h>
#include <asm/sgx.h>

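/*
 * Return a pointer to the saved copy of register 'nr' inside 'regs'.
 * An out-of-range register number warns once and yields a static dummy
 * slot, so callers may dereference the result unconditionally.
 */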
static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
{
	int reg_offset = pt_regs_offset(regs, nr);
	static unsigned long __dummy;

	if (WARN_ON_ONCE(reg_offset < 0))
		return &__dummy;

	return (unsigned long *)((unsigned long)regs + reg_offset);
}

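/*
 * Exception table entries store the fixup target as an offset relative
 * to the 'fixup' field itself; convert it back to an absolute address.
 */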
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}

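/*
 * Default fixup: optionally clear %rax/%rdx as requested by the entry's
 * flag bits, then resume execution at the fixup address.
 */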
static bool ex_handler_default(const struct exception_table_entry *e,
			       struct pt_regs *regs)
{
	if (e->data & EX_FLAG_CLEAR_AX)
		regs->ax = 0;
	if (e->data & EX_FLAG_CLEAR_DX)
		regs->dx = 0;

	regs->ip = ex_fixup_addr(e);
	return true;
}

/*
 * This is the *very* rare case where we do a "load_unaligned_zeropad()"
 * and it's a page crosser into a non-existent page.
 *
 * This happens when we optimistically load a pathname a word-at-a-time
 * and the name is less than the full word and the next page is not
 * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
 *
 * NOTE! The faulting address is always a 'mov mem,reg' type instruction
 * of size 'long', and the exception fixup must always point to right
 * after the instruction.
 */
static bool ex_handler_zeropad(const struct exception_table_entry *e,
			       struct pt_regs *regs,
			       unsigned long fault_addr)
{
	struct insn insn;
	const unsigned long mask = sizeof(long) - 1;
	unsigned long offset, addr, next_ip, len;
	unsigned long *reg;

	next_ip = ex_fixup_addr(e);
	len = next_ip - regs->ip;
	if (len > MAX_INSN_SIZE)
		return false;

	if (insn_decode(&insn, (void *)regs->ip, len, INSN_MODE_KERN))
		return false;
	if (insn.length != len)
		return false;

	if (insn.opcode.bytes[0] != 0x8b)
		return false;
	if (insn.opnd_bytes != sizeof(long))
		return false;

	addr = (unsigned long)insn_get_addr_ref(&insn, regs);
	if (addr == ~0ul)
		return false;

	offset = addr & mask;
	addr = addr & ~mask;
	if (fault_addr != addr + sizeof(long))
		return false;

	reg = insn_get_modrm_reg_ptr(&insn, regs);
	if (!reg)
		return false;

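	/*
	 * Emulate the load: fetch the aligned word from the mapped page
	 * and shift the wanted bytes down, so that the bytes which would
	 * have come from the unmapped page read as zeroes.
	 */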
	*reg = *(unsigned long *)addr >> (offset * 8);
	return ex_handler_default(e, regs);
}

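/* Like the default fixup, but also report the trap number in %rax. */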
static bool ex_handler_fault(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, int trapnr)
{
	regs->ax = trapnr;
	return ex_handler_default(fixup, regs);
}

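/*
 * ENCLS faults are reported to the caller via %rax, with a flag bit set
 * to distinguish a hardware fault from a regular ENCLS error code.
 */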
static bool ex_handler_sgx(const struct exception_table_entry *fixup,
			   struct pt_regs *regs, int trapnr)
{
	regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
	return ex_handler_default(fixup, regs);
}

/*
 * Handler for when we fail to restore a task's FPU state. We should never get
 * here because the FPU state of a task using the FPU (task->thread.fpu.state)
 * should always be valid. However, past bugs have allowed userspace to set
 * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
 * These caused XRSTOR to fail when switching to the task, leaking the FPU
 * registers of the task previously executing on the CPU. Mitigate this class
 * of vulnerability by restoring from the initial state (essentially, zeroing
 * out all the FPU registers) if we can't restore from the task's FPU state.
 */
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
				 struct pt_regs *regs)
{
	regs->ip = ex_fixup_addr(fixup);

	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
		  (void *)instruction_pointer(regs));

	fpu_reset_from_exception_fixup();
	return true;
}

/*
 * On x86-64, we end up being imprecise with 'access_ok()', and allow
 * non-canonical user addresses to make the range comparisons simpler,
 * and to not have to worry about LAM being enabled.
 *
 * In fact, we allow up to one page of "slop" at the sign boundary,
 * which means that we can do access_ok() by just checking the sign
 * of the pointer for the common case of having a small access size.
 */
static bool gp_fault_address_ok(unsigned long fault_address)
{
#ifdef CONFIG_X86_64
	/* Is it in the "user space" part of the non-canonical space? */
	if (valid_user_address(fault_address))
		return true;

	/* .. or just above it? */
	fault_address -= PAGE_SIZE;
	if (valid_user_address(fault_address))
		return true;
#endif
	return false;
}

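/*
 * User-access fixup: a #GP here usually means the access used a
 * non-canonical address, which the WARN_ONCE below points out unless
 * the address falls within the slop accepted by access_ok().
 */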
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
			       struct pt_regs *regs, int trapnr,
			       unsigned long fault_address)
{
	WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
		  "General protection fault in user access. Non-canonical address?");
	return ex_handler_default(fixup, regs);
}

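/* As above, but additionally report the trap number back in %rax. */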
static bool ex_handler_copy(const struct exception_table_entry *fixup,
			    struct pt_regs *regs, int trapnr)
{
	WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
	return ex_handler_fault(fixup, regs, trapnr);
}

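/*
 * MSR fixups: an unchecked (non-_safe) RDMSR/WRMSR that faults is
 * loudly reported once; the _safe variants instead hand -EIO back to
 * the caller via the register named in the extable entry.  A failed
 * read pretends the MSR contained zero.
 */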
static bool ex_handler_msr(const struct exception_table_entry *fixup,
			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
	if (__ONCE_LITE_IF(!safe && wrmsr)) {
		pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, (unsigned int)regs->dx,
			(unsigned int)regs->ax, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	if (__ONCE_LITE_IF(!safe && !wrmsr)) {
		pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	if (!wrmsr) {
		/* Pretend that the read succeeded and returned 0. */
		regs->ax = 0;
		regs->dx = 0;
	}

	if (safe)
		*pt_regs_nr(regs, reg) = -EIO;

	return ex_handler_default(fixup, regs);
}

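/*
 * On CPUs with X86_BUG_NULL_SEG, writing a NULL selector does not clear
 * the cached segment base; load __USER_DS first there so the stale
 * state is replaced before the NULL selector is loaded.
 */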
static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
				struct pt_regs *regs)
{
	if (static_cpu_has(X86_BUG_NULL_SEG))
		asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
	asm volatile ("mov %0, %%fs" : : "rm" (0));
	return ex_handler_default(fixup, regs);
}

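/* Place the sign-extended immediate from the extable entry into 'reg'. */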
static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
			       struct pt_regs *regs, int reg, int imm)
{
	*pt_regs_nr(regs, reg) = (long)imm;
	return ex_handler_default(fixup, regs);
}

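/*
 * Fix up a faulting user-copy loop: reconstruct the remaining length in
 * %rcx from the loop counter scaled by the per-iteration size 'imm'
 * plus the remainder held in 'reg', then apply the normal user-access
 * fixup.
 */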
static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
				 struct pt_regs *regs, int trapnr,
				 unsigned long fault_address,
				 int reg, int imm)
{
	regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
	return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
}

#ifdef CONFIG_X86_FRED
static bool ex_handler_eretu(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, unsigned long error_code)
{
	struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
	unsigned short ss = uregs->ss;
	unsigned short cs = uregs->cs;

	/*
	 * Move the NMI bit from the invalid stack frame, which caused ERETU
	 * to fault, to the fault handler's stack frame, thus to unblock NMI
	 * with the fault handler's ERETS instruction ASAP if NMI is blocked.
	 */
	regs->fred_ss.nmi = uregs->fred_ss.nmi;

	/*
	 * Sync event information to uregs, i.e., the ERETU return frame, but
	 * is it safe to write to the ERETU return frame which is just above
	 * current event stack frame?
	 *
	 * The RSP used by FRED to push a stack frame is not the value in %rsp,
	 * it is calculated from %rsp with the following 2 steps:
	 * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0)	// Reserve N*64 bytes
	 * 2) RSP = RSP & ~0x3f				// Align to a 64-byte cache line
	 * when an event delivery doesn't trigger a stack level change.
	 *
	 * Here is an example with N*64 (N=1) bytes reserved:
	 *
	 *  64-byte cache line ==> ______________
	 *                        |___Reserved___|
	 *                        |__Event_data__|
	 *                        |_____SS_______|
	 *                        |_____RSP______|
	 *                        |_____FLAGS____|
	 *                        |_____CS_______|
	 *                        |_____IP_______|
	 *  64-byte cache line ==>|__Error_code__| <== ERETU return frame
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *  64-byte cache line ==>|______________| <== RSP after step 1) and 2)
	 *                        |___Reserved___|
	 *                        |__Event_data__|
	 *                        |_____SS_______|
	 *                        |_____RSP______|
	 *                        |_____FLAGS____|
	 *                        |_____CS_______|
	 *                        |_____IP_______|
	 *  64-byte cache line ==>|__Error_code__| <== ERETS return frame
	 *
	 * Thus a new FRED stack frame will always be pushed below a previous
	 * FRED stack frame ((N*64) bytes may be reserved between), and it is
	 * safe to write to a previous FRED stack frame as they never overlap.
	 */
	fred_info(uregs)->edata = fred_event_data(regs);
	uregs->ssx = regs->ssx;
	uregs->fred_ss.ss = ss;
	/* The NMI bit was moved away above */
	uregs->fred_ss.nmi = 0;
	uregs->csx = regs->csx;
	uregs->fred_cs.sl = 0;
	uregs->fred_cs.wfe = 0;
	uregs->cs = cs;
	uregs->orig_ax = error_code;

	return ex_handler_default(fixup, regs);
}
#endif

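/* Return the handler type of the extable entry covering 'ip', if any. */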
int ex_get_fixup_type(unsigned long ip)
{
	const struct exception_table_entry *e = search_exception_tables(ip);

	return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
}

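/*
 * Look up the extable entry covering the faulting instruction and
 * dispatch to the matching handler.  Returns non-zero if the exception
 * was fixed up.
 */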
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
		    unsigned long fault_addr)
{
	const struct exception_table_entry *e;
	int type, reg, imm;

#ifdef CONFIG_PNPBIOS
	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
		extern u32 pnp_bios_is_utter_crap;
		pnp_bios_is_utter_crap = 1;
		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
		__asm__ volatile(
			"movl %0, %%esp\n\t"
			"jmp *%1\n\t"
			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
		panic("do_trap: can't hit this");
	}
#endif

	e = search_exception_tables(regs->ip);
	if (!e)
		return 0;

	type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
	reg  = FIELD_GET(EX_DATA_REG_MASK,  e->data);
	imm  = FIELD_GET(EX_DATA_IMM_MASK,  e->data);

	switch (type) {
	case EX_TYPE_DEFAULT:
	case EX_TYPE_DEFAULT_MCE_SAFE:
		return ex_handler_default(e, regs);
	case EX_TYPE_FAULT:
	case EX_TYPE_FAULT_MCE_SAFE:
		return ex_handler_fault(e, regs, trapnr);
	case EX_TYPE_UACCESS:
		return ex_handler_uaccess(e, regs, trapnr, fault_addr);
	case EX_TYPE_COPY:
		return ex_handler_copy(e, regs, trapnr);
	case EX_TYPE_CLEAR_FS:
		return ex_handler_clear_fs(e, regs);
	case EX_TYPE_FPU_RESTORE:
		return ex_handler_fprestore(e, regs);
	case EX_TYPE_BPF:
		return ex_handler_bpf(e, regs);
	case EX_TYPE_WRMSR:
		return ex_handler_msr(e, regs, true, false, reg);
	case EX_TYPE_RDMSR:
		return ex_handler_msr(e, regs, false, false, reg);
	case EX_TYPE_WRMSR_SAFE:
		return ex_handler_msr(e, regs, true, true, reg);
	case EX_TYPE_RDMSR_SAFE:
		return ex_handler_msr(e, regs, false, true, reg);
	case EX_TYPE_WRMSR_IN_MCE:
		ex_handler_msr_mce(regs, true);
		break;
	case EX_TYPE_RDMSR_IN_MCE:
		ex_handler_msr_mce(regs, false);
		break;
	case EX_TYPE_POP_REG:
		regs->sp += sizeof(long);
		fallthrough;
	case EX_TYPE_IMM_REG:
		return ex_handler_imm_reg(e, regs, reg, imm);
	case EX_TYPE_FAULT_SGX:
		return ex_handler_sgx(e, regs, trapnr);
	case EX_TYPE_UCOPY_LEN:
		return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
	case EX_TYPE_ZEROPAD:
		return ex_handler_zeropad(e, regs, fault_addr);
#ifdef CONFIG_X86_FRED
	case EX_TYPE_ERETU:
		return ex_handler_eretu(e, regs, error_code);
#endif
	}
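	/*
	 * An unknown extable type, or a fall-through from the MCE MSR
	 * cases (whose handlers are not expected to return), dies here.
	 */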
	BUG();
}

extern unsigned int early_recursion_flag;

/* Restricted version used during very early boot */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
	/* Ignore early NMIs. */
	if (trapnr == X86_TRAP_NMI)
		return;

	if (early_recursion_flag > 2)
		goto halt_loop;

	/*
	 * Old CPUs leave the high bits of CS on the stack
	 * undefined. I'm not sure which CPUs do this, but at least
	 * the 486 DX works this way.
	 * Xen pv domains are not using the default __KERNEL_CS.
	 */
	if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
		goto fail;

	/*
	 * The full exception fixup machinery is available as soon as
	 * the early IDT is loaded. This means that it is the
	 * responsibility of extable users to either function correctly
	 * when handlers are invoked early or to simply avoid causing
	 * exceptions before they're ready to handle them.
	 *
	 * This is better than filtering which handlers can be used,
	 * because refusing to call a handler here is guaranteed to
	 * result in a hard-to-debug panic.
	 *
	 * Keep in mind that not all vectors actually get here. Early
	 * page faults, for example, are special.
	 */
	if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
		return;

	if (trapnr == X86_TRAP_UD) {
		if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
			/* Skip the ud2. */
			regs->ip += LEN_UD2;
			return;
		}

		/*
		 * If this was a BUG and report_bug returns or if this
		 * was just a normal #UD, we want to continue onward and
		 * crash.
		 */
	}

fail:
	early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
		     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
		     regs->orig_ax, read_cr2());

	show_regs(regs);

halt_loop:
	while (true)
		halt();
}