// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Copyright (C) 2007 Alan Stern
 * Copyright (C) 2009 IBM Corporation
 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
 *
 * Authors: Alan Stern <stern@rowland.harvard.edu>
 *          K.Prasad <prasad@linux.vnet.ibm.com>
 *          Frederic Weisbecker <fweisbec@gmail.com>
 */

/*
 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
 * using the CPU's debug registers.
 */

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>

#include <asm/hw_breakpoint.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/user.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>

/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
EXPORT_PER_CPU_SYMBOL(cpu_dr7);
/* Per cpu debug address register values */
static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);

/*
 * Stores the breakpoints currently in use on each breakpoint address
 * register for each CPU
 */
static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);


static inline unsigned long
__encode_dr7(int drnum, unsigned int len, unsigned int type)
{
	unsigned long bp_info;

	bp_info = (len | type) & 0xf;
	bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
	bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));

	return bp_info;
}

/*
 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
 * as stored in debug register 7.
 */
unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
{
	return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
}

/*
 * Decode the length and type bits for a particular breakpoint as
 * stored in debug register 7.  Return the "enabled" status.
 */
int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
{
	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);

	*len = (bp_info & 0xc) | 0x40;
	*type = (bp_info & 0x3) | 0x80;

	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
}
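
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a minimal check showing how encode_dr7() and decode_dr7() round-trip a
 * 4-byte write breakpoint in slot 2.  The function name and the idea of
 * keeping such a check here are assumptions.
 */
static __maybe_unused void dr7_codec_example(void)
{
	unsigned int len, type;
	unsigned long dr7 = encode_dr7(2, X86_BREAKPOINT_LEN_4,
				       X86_BREAKPOINT_WRITE);

	/* decode_dr7() returns the enable bits for the slot: nonzero here. */
	WARN_ON(!decode_dr7(dr7, 2, &len, &type));
	WARN_ON(len != X86_BREAKPOINT_LEN_4);
	WARN_ON(type != X86_BREAKPOINT_WRITE);
}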

/*
 * Install a perf counter breakpoint.
 *
 * We seek a free debug address register and use it for this
 * breakpoint.  Eventually we enable it in the debug control register.
 *
 * Atomic: we hold the counter->ctx->lock and we only handle variables
 * and registers local to this cpu.
 */
int arch_install_hw_breakpoint(struct perf_event *bp)
{
	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
	unsigned long *dr7;
	int i;

	lockdep_assert_irqs_disabled();

	for (i = 0; i < HBP_NUM; i++) {
		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);

		if (!*slot) {
			*slot = bp;
			break;
		}
	}

	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
		return -EBUSY;

	set_debugreg(info->address, i);
	__this_cpu_write(cpu_debugreg[i], info->address);

	dr7 = this_cpu_ptr(&cpu_dr7);
	*dr7 |= encode_dr7(i, info->len, info->type);

	/*
	 * Ensure we first write cpu_dr7 before we set the DR7 register.
	 * This ensures an NMI never sees cpu_dr7 as zero when DR7 is not.
	 */
	barrier();

	set_debugreg(*dr7, 7);
	if (info->mask)
		amd_set_dr_addr_mask(info->mask, i);

	return 0;
}
134 | |
135 | /* |
136 | * Uninstall the breakpoint contained in the given counter. |
137 | * |
138 | * First we search the debug address register it uses and then we disable |
139 | * it. |
140 | * |
141 | * Atomic: we hold the counter->ctx->lock and we only handle variables |
142 | * and registers local to this cpu. |
143 | */ |
144 | void arch_uninstall_hw_breakpoint(struct perf_event *bp) |
145 | { |
146 | struct arch_hw_breakpoint *info = counter_arch_bp(bp); |
147 | unsigned long dr7; |
148 | int i; |
149 | |
150 | lockdep_assert_irqs_disabled(); |
151 | |
152 | for (i = 0; i < HBP_NUM; i++) { |
153 | struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); |
154 | |
155 | if (*slot == bp) { |
156 | *slot = NULL; |
157 | break; |
158 | } |
159 | } |
160 | |
161 | if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot" )) |
162 | return; |
163 | |
164 | dr7 = this_cpu_read(cpu_dr7); |
165 | dr7 &= ~__encode_dr7(drnum: i, len: info->len, type: info->type); |
166 | |
167 | set_debugreg(val: dr7, reg: 7); |
168 | if (info->mask) |
169 | amd_set_dr_addr_mask(mask: 0, dr: i); |
170 | |
171 | /* |
172 | * Ensure the write to cpu_dr7 is after we've set the DR7 register. |
173 | * This ensures an NMI never see cpu_dr7 0 when DR7 is not. |
174 | */ |
175 | barrier(); |
176 | |
177 | this_cpu_write(cpu_dr7, dr7); |
178 | } |

static int arch_bp_generic_len(int x86_len)
{
	switch (x86_len) {
	case X86_BREAKPOINT_LEN_1:
		return HW_BREAKPOINT_LEN_1;
	case X86_BREAKPOINT_LEN_2:
		return HW_BREAKPOINT_LEN_2;
	case X86_BREAKPOINT_LEN_4:
		return HW_BREAKPOINT_LEN_4;
#ifdef CONFIG_X86_64
	case X86_BREAKPOINT_LEN_8:
		return HW_BREAKPOINT_LEN_8;
#endif
	default:
		return -EINVAL;
	}
}

int arch_bp_generic_fields(int x86_len, int x86_type,
			   int *gen_len, int *gen_type)
{
	int len;

	/* Type */
	switch (x86_type) {
	case X86_BREAKPOINT_EXECUTE:
		if (x86_len != X86_BREAKPOINT_LEN_X)
			return -EINVAL;

		*gen_type = HW_BREAKPOINT_X;
		*gen_len = sizeof(long);
		return 0;
	case X86_BREAKPOINT_WRITE:
		*gen_type = HW_BREAKPOINT_W;
		break;
	case X86_BREAKPOINT_RW:
		*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
		break;
	default:
		return -EINVAL;
	}

	/* Len */
	len = arch_bp_generic_len(x86_len);
	if (len < 0)
		return -EINVAL;
	*gen_len = len;

	return 0;
}
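
/*
 * Illustrative sketch (editor's addition): converting the x86 encoding of
 * an installed breakpoint back to the generic perf len/type, e.g. for
 * logging.  The helper name and the pr_debug() reporting are assumptions,
 * not part of the original file.
 */
static __maybe_unused void bp_report_generic_example(struct arch_hw_breakpoint *hw)
{
	int gen_len, gen_type;

	if (!arch_bp_generic_fields(hw->len, hw->type, &gen_len, &gen_type))
		pr_debug("bp at 0x%lx: generic len=%d type=%d\n",
			 hw->address, gen_len, gen_type);
}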

/*
 * Check for virtual address in kernel space.
 */
int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
{
	unsigned long va;
	int len;

	va = hw->address;
	len = arch_bp_generic_len(hw->len);
	WARN_ON_ONCE(len < 0);

	/*
	 * We don't need to worry about va + len - 1 overflowing:
	 * we already require that va is aligned to a multiple of len.
	 */
	return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
}

/*
 * Checks whether the range [addr, end] overlaps the area [base, base + size).
 */
static inline bool within_area(unsigned long addr, unsigned long end,
			       unsigned long base, unsigned long size)
{
	return end >= base && addr < (base + size);
}

/*
 * Checks whether the range from addr to end, inclusive, overlaps the fixed
 * mapped CPU entry area range or other ranges used for CPU entry.
 */
static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
{
	int cpu;

	/* The CPU entry area is always used for CPU entry */
	if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
			CPU_ENTRY_AREA_MAP_SIZE))
		return true;

	/*
	 * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
	 * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
	 */
#ifdef CONFIG_SMP
	if (within_area(addr, end, (unsigned long)__per_cpu_offset,
			sizeof(unsigned long) * nr_cpu_ids))
		return true;
#else
	if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
			sizeof(pcpu_unit_offsets)))
		return true;
#endif

	for_each_possible_cpu(cpu) {
		/* The original rw GDT is being used after load_direct_gdt() */
		if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
				GDT_SIZE))
			return true;

		/*
		 * cpu_tss_rw is not directly referenced by hardware, but
		 * it is also used in CPU entry code.
		 */
		if (within_area(addr, end,
				(unsigned long)&per_cpu(cpu_tss_rw, cpu),
				sizeof(struct tss_struct)))
			return true;

		/*
		 * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
		 * A data breakpoint on it will cause an unwanted #DB.
		 * Protect the full cpu_tlbstate structure to be sure.
		 */
		if (within_area(addr, end,
				(unsigned long)&per_cpu(cpu_tlbstate, cpu),
				sizeof(struct tlb_state)))
			return true;

		/*
		 * When running as a guest (X86_FEATURE_HYPERVISOR),
		 * local_db_save() reads the per-CPU cpu_dr7 before clearing
		 * the DR7 register.
		 */
		if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
				sizeof(cpu_dr7)))
			return true;
	}

	return false;
}

static int arch_build_bp_info(struct perf_event *bp,
			      const struct perf_event_attr *attr,
			      struct arch_hw_breakpoint *hw)
{
	unsigned long bp_end;

	bp_end = attr->bp_addr + attr->bp_len - 1;
	if (bp_end < attr->bp_addr)
		return -EINVAL;

	/*
	 * Prevent any breakpoint of any type that overlaps the CPU
	 * entry area and data.  This protects the IST stacks and also
	 * reduces the chance that we ever find out what happens if
	 * there's a data breakpoint on the GDT, IDT, or TSS.
	 */
	if (within_cpu_entry(attr->bp_addr, bp_end))
		return -EINVAL;

	hw->address = attr->bp_addr;
	hw->mask = 0;

	/* Type */
	switch (attr->bp_type) {
	case HW_BREAKPOINT_W:
		hw->type = X86_BREAKPOINT_WRITE;
		break;
	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
		hw->type = X86_BREAKPOINT_RW;
		break;
	case HW_BREAKPOINT_X:
		/*
		 * We don't allow kernel breakpoints in places that are not
		 * acceptable for kprobes.  On non-kprobes kernels, we don't
		 * allow kernel breakpoints at all.
		 */
		if (attr->bp_addr >= TASK_SIZE_MAX) {
			if (within_kprobe_blacklist(attr->bp_addr))
				return -EINVAL;
		}

		hw->type = X86_BREAKPOINT_EXECUTE;
		/*
		 * x86 instruction breakpoints need to have a specific
		 * undefined len.  But we still need to check that userspace
		 * is not trying to set up an unsupported length, to get a
		 * range breakpoint for example.
		 */
		if (attr->bp_len == sizeof(long)) {
			hw->len = X86_BREAKPOINT_LEN_X;
			return 0;
		}
		fallthrough;
	default:
		return -EINVAL;
	}

	/* Len */
	switch (attr->bp_len) {
	case HW_BREAKPOINT_LEN_1:
		hw->len = X86_BREAKPOINT_LEN_1;
		break;
	case HW_BREAKPOINT_LEN_2:
		hw->len = X86_BREAKPOINT_LEN_2;
		break;
	case HW_BREAKPOINT_LEN_4:
		hw->len = X86_BREAKPOINT_LEN_4;
		break;
#ifdef CONFIG_X86_64
	case HW_BREAKPOINT_LEN_8:
		hw->len = X86_BREAKPOINT_LEN_8;
		break;
#endif
	default:
		/* AMD range breakpoint */
		if (!is_power_of_2(attr->bp_len))
			return -EINVAL;
		if (attr->bp_addr & (attr->bp_len - 1))
			return -EINVAL;

		if (!boot_cpu_has(X86_FEATURE_BPEXT))
			return -EOPNOTSUPP;

		/*
		 * It's impossible to use a range breakpoint to fake out
		 * user vs kernel detection because bp_len - 1 can't
		 * have the high bit set.  If we ever allow range instruction
		 * breakpoints, then we'll have to check for kprobe-blacklisted
		 * addresses anywhere in the range.
		 */
		hw->mask = attr->bp_len - 1;
		hw->len = X86_BREAKPOINT_LEN_1;
	}

	return 0;
}
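
/*
 * Worked example (editor's addition): on an AMD CPU with X86_FEATURE_BPEXT,
 * a request with bp_len == 16 falls through to the range-breakpoint path
 * above, so a 16-byte-aligned bp_addr yields:
 *
 *	hw->mask = 16 - 1 = 0xf;
 *	hw->len  = X86_BREAKPOINT_LEN_1;
 *
 * i.e. the debug register itself watches a single byte, and the mask widens
 * the match to the whole 16-byte region via amd_set_dr_addr_mask().
 */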

/*
 * Validate the arch-specific HW Breakpoint register settings
 */
int hw_breakpoint_arch_parse(struct perf_event *bp,
			     const struct perf_event_attr *attr,
			     struct arch_hw_breakpoint *hw)
{
	unsigned int align;
	int ret;

	ret = arch_build_bp_info(bp, attr, hw);
	if (ret)
		return ret;

	switch (hw->len) {
	case X86_BREAKPOINT_LEN_1:
		align = 0;
		if (hw->mask)
			align = hw->mask;
		break;
	case X86_BREAKPOINT_LEN_2:
		align = 1;
		break;
	case X86_BREAKPOINT_LEN_4:
		align = 3;
		break;
#ifdef CONFIG_X86_64
	case X86_BREAKPOINT_LEN_8:
		align = 7;
		break;
#endif
	default:
		WARN_ON_ONCE(1);
		return -EINVAL;
	}

	/*
	 * Check that the low-order bits of the address are appropriate
	 * for the alignment implied by len.
	 */
	if (hw->address & align)
		return -EINVAL;

	return 0;
}
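
/*
 * Usage sketch (editor's addition): how a client might request a breakpoint
 * that the parser above validates, using the generic hw_breakpoint API from
 * <linux/hw_breakpoint.h>.  Modeled on samples/hw_breakpoint/data_breakpoint.c;
 * the symbol "watched" and the handler body are assumptions.
 *
 *	static void on_write(struct perf_event *bp,
 *			     struct perf_sample_data *data,
 *			     struct pt_regs *regs)
 *	{
 *		pr_info("watched variable was written\n");
 *	}
 *
 *	struct perf_event * __percpu *wide_bp;
 *	struct perf_event_attr attr;
 *
 *	hw_breakpoint_init(&attr);
 *	attr.bp_addr = (unsigned long)&watched;
 *	attr.bp_len  = HW_BREAKPOINT_LEN_4;
 *	attr.bp_type = HW_BREAKPOINT_W;
 *	wide_bp = register_wide_hw_breakpoint(&attr, on_write, NULL);
 */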

/*
 * Release the user breakpoints used by ptrace
 */
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
{
	int i;
	struct thread_struct *t = &tsk->thread;

	for (i = 0; i < HBP_NUM; i++) {
		unregister_hw_breakpoint(t->ptrace_bps[i]);
		t->ptrace_bps[i] = NULL;
	}

	t->virtual_dr6 = 0;
	t->ptrace_dr7 = 0;
}

void hw_breakpoint_restore(void)
{
	set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
	set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
	set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
	set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
	set_debugreg(DR6_RESERVED, 6);
	set_debugreg(__this_cpu_read(cpu_dr7), 7);
}
EXPORT_SYMBOL_GPL(hw_breakpoint_restore);

/*
 * Handle debug exception notifications.
 *
 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
 *
 * NOTIFY_DONE is returned if one of the following conditions is true.
 * i) When the causative address is from user-space and the exception
 * is a valid one, i.e. not triggered as a result of lazy debug register
 * switching
 * ii) When there are more bits than trap<n> set in the DR6 register (such
 * as BD, BS or BT) indicating that more than one debug condition is
 * met and requires some more action in do_debug().
 *
 * NOTIFY_STOP is returned for all other cases
 *
 */
static int hw_breakpoint_handler(struct die_args *args)
{
	int i, rc = NOTIFY_STOP;
	struct perf_event *bp;
	unsigned long *dr6_p;
	unsigned long dr6;
	bool bpx;

	/* args->err points at the DR6 value */
	dr6_p = (unsigned long *)ERR_PTR(args->err);
	dr6 = *dr6_p;

	/* Do an early return if no trap bits are set in DR6 */
	if ((dr6 & DR_TRAP_BITS) == 0)
		return NOTIFY_DONE;

	/* Handle all the breakpoints that were triggered */
	for (i = 0; i < HBP_NUM; ++i) {
		if (likely(!(dr6 & (DR_TRAP0 << i))))
			continue;

		bp = this_cpu_read(bp_per_reg[i]);
		if (!bp)
			continue;

		bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;

		/*
		 * TF and data breakpoints are traps and can be merged, however
		 * instruction breakpoints are faults and will be raised
		 * separately.
		 *
		 * However DR6 can indicate both TF and instruction
		 * breakpoints.  In that case take TF as that has precedence
		 * and delay the instruction breakpoint for the next exception.
		 */
		if (bpx && (dr6 & DR_STEP))
			continue;

		/*
		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
		 * exception handling
		 */
		(*dr6_p) &= ~(DR_TRAP0 << i);

		perf_bp_event(bp, args->regs);

		/*
		 * Set the resume flag to avoid breakpoint recursion when
		 * returning to the origin.
		 */
		if (bpx)
			args->regs->flags |= X86_EFLAGS_RF;
	}

	/*
	 * Further processing in do_debug() is needed for a) user-space
	 * breakpoints (to generate signals) and b) when the system has
	 * taken an exception due to multiple causes
	 */
	if ((current->thread.virtual_dr6 & DR_TRAP_BITS) ||
	    (dr6 & (~DR_TRAP_BITS)))
		rc = NOTIFY_DONE;

	return rc;
}

/*
 * Handle debug exception notifications.
 */
int hw_breakpoint_exceptions_notify(
		struct notifier_block *unused, unsigned long val, void *data)
{
	if (val != DIE_DEBUG)
		return NOTIFY_DONE;

	return hw_breakpoint_handler(data);
}

void hw_breakpoint_pmu_read(struct perf_event *bp)
{
	/* TODO */
}