1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * BTS PMU driver for perf |
4 | * Copyright (c) 2013-2014, Intel Corporation. |
5 | */ |
6 | |
7 | #undef DEBUG |
8 | |
9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
10 | |
11 | #include <linux/bitops.h> |
12 | #include <linux/types.h> |
13 | #include <linux/slab.h> |
14 | #include <linux/debugfs.h> |
15 | #include <linux/device.h> |
16 | #include <linux/coredump.h> |
17 | |
18 | #include <linux/sizes.h> |
19 | #include <asm/perf_event.h> |
20 | |
21 | #include "../perf_event.h" |
22 | |
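/*
 * Per-CPU BTS state: the perf AUX output handle, a backup of the
 * debug_store fields we clobber while tracing, and the current
 * BTS_STATE_* value.
 */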
23 | struct bts_ctx { |
24 | struct perf_output_handle handle; |
25 | struct debug_store ds_back; |
26 | int state; |
27 | }; |
28 | |
29 | /* BTS context states: */ |
30 | enum { |
31 | /* no ongoing AUX transactions */ |
32 | BTS_STATE_STOPPED = 0, |
33 | /* AUX transaction is on, BTS tracing is disabled */ |
34 | BTS_STATE_INACTIVE, |
35 | /* AUX transaction is on, BTS tracing is running */ |
36 | BTS_STATE_ACTIVE, |
37 | }; |
38 | |
39 | static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); |
40 | |
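/*
 * A BTS record is three u64 fields (branch-from, branch-to, flags),
 * hence 24 bytes; the safety margin keeps the interrupt threshold well
 * short of the absolute maximum, so the PMI fires before the hardware
 * runs out of buffer space.
 */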
41 | #define BTS_RECORD_SIZE 24 |
42 | #define BTS_SAFETY_MARGIN 4080 |
43 | |
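/*
 * A physically contiguous chunk of the AUX buffer: offset is the
 * chunk's byte offset within the AUX buffer, displacement is the number
 * of bytes skipped at its start to keep records BTS_RECORD_SIZE-aligned
 * across chunks, and size is the usable length, trimmed to a record
 * multiple.
 */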
44 | struct bts_phys { |
45 | struct page *page; |
46 | unsigned long size; |
47 | unsigned long offset; |
48 | unsigned long displacement; |
49 | }; |
50 | |
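/*
 * Per-event view of the AUX buffer: buf[] describes the physical
 * chunks, head and data_size track software progress, and end bounds
 * the region currently handed to the hardware (see bts_buffer_reset()).
 */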
51 | struct bts_buffer { |
52 | size_t real_size; /* multiple of BTS_RECORD_SIZE */ |
53 | unsigned int nr_pages; |
54 | unsigned int nr_bufs; |
55 | unsigned int cur_buf; |
56 | bool snapshot; |
57 | local_t data_size; |
58 | local_t head; |
59 | unsigned long end; |
60 | void **data_pages; |
61 | struct bts_phys buf[]; |
62 | }; |
63 | |
64 | static struct pmu bts_pmu; |
65 | |
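/*
 * High-order AUX allocations record their order in page_private() of
 * the first page; plain pages have PagePrivate() clear.
 */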
66 | static int buf_nr_pages(struct page *page) |
67 | { |
68 | if (!PagePrivate(page)) |
69 | return 1; |
70 | |
71 | return 1 << page_private(page); |
72 | } |
73 | |
74 | static size_t buf_size(struct page *page) |
75 | { |
76 | return buf_nr_pages(page) * PAGE_SIZE; |
77 | } |
78 | |
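/*
 * Set up the AUX buffer for BTS: group the page array into physically
 * contiguous chunks and precompute each chunk's offset, displacement
 * and usable size.
 */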
79 | static void * |
80 | bts_buffer_setup_aux(struct perf_event *event, void **pages, |
81 | int nr_pages, bool overwrite) |
82 | { |
83 | struct bts_buffer *buf; |
84 | struct page *page; |
85 | int cpu = event->cpu; |
86 | int node = (cpu == -1) ? cpu : cpu_to_node(cpu); |
87 | unsigned long offset; |
88 | size_t size = nr_pages << PAGE_SHIFT; |
89 | int pg, nbuf, pad; |
90 | |
91 | /* count all the high order buffers */ |
92 | for (pg = 0, nbuf = 0; pg < nr_pages;) { |
93 | page = virt_to_page(pages[pg]); |
94 | pg += buf_nr_pages(page); |
95 | nbuf++; |
96 | } |
97 | |
98 | /* |
99 | * to avoid interrupts in overwrite mode, only allow one physical |
100 | */ |
101 | if (overwrite && nbuf > 1) |
102 | return NULL; |
103 | |
104 | buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); |
105 | if (!buf) |
106 | return NULL; |
107 | |
108 | buf->nr_pages = nr_pages; |
109 | buf->nr_bufs = nbuf; |
110 | buf->snapshot = overwrite; |
111 | buf->data_pages = pages; |
112 | buf->real_size = size - size % BTS_RECORD_SIZE; |
113 | |
114 | for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { |
115 | unsigned int __nr_pages; |
116 | |
117 | page = virt_to_page(pages[pg]); |
118 | __nr_pages = buf_nr_pages(page); |
119 | buf->buf[nbuf].page = page; |
120 | buf->buf[nbuf].offset = offset; |
121 | buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); |
122 | buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; |
123 | pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; |
124 | buf->buf[nbuf].size -= pad; |
125 | |
126 | pg += __nr_pages; |
127 | offset += __nr_pages << PAGE_SHIFT; |
128 | } |
129 | |
130 | return buf; |
131 | } |
132 | |
133 | static void bts_buffer_free_aux(void *data) |
134 | { |
	kfree(data);
136 | } |
137 | |
138 | static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) |
139 | { |
140 | return buf->buf[idx].offset + buf->buf[idx].displacement; |
141 | } |
142 | |
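/*
 * Program the DS area for the current chunk: buffer base, write pointer
 * (index), absolute maximum and interrupt threshold. In snapshot mode
 * the threshold is placed past the absolute maximum, so no PMI is ever
 * raised.
 */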
143 | static void |
144 | bts_config_buffer(struct bts_buffer *buf) |
145 | { |
146 | int cpu = raw_smp_processor_id(); |
147 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; |
148 | struct bts_phys *phys = &buf->buf[buf->cur_buf]; |
149 | unsigned long index, thresh = 0, end = phys->size; |
150 | struct page *page = phys->page; |
151 | |
152 | index = local_read(&buf->head); |
153 | |
154 | if (!buf->snapshot) { |
155 | if (buf->end < phys->offset + buf_size(page)) |
156 | end = buf->end - phys->offset - phys->displacement; |
157 | |
158 | index -= phys->offset + phys->displacement; |
159 | |
160 | if (end - index > BTS_SAFETY_MARGIN) |
161 | thresh = end - BTS_SAFETY_MARGIN; |
162 | else if (end - index > BTS_RECORD_SIZE) |
163 | thresh = end - BTS_RECORD_SIZE; |
164 | else |
165 | thresh = end; |
166 | } |
167 | |
168 | ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; |
169 | ds->bts_index = ds->bts_buffer_base + index; |
170 | ds->bts_absolute_maximum = ds->bts_buffer_base + end; |
171 | ds->bts_interrupt_threshold = !buf->snapshot |
172 | ? ds->bts_buffer_base + thresh |
173 | : ds->bts_absolute_maximum + BTS_RECORD_SIZE; |
174 | } |
175 | |
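/* Zero-fill the unused tail of the current chunk before we skip it. */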
176 | static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) |
177 | { |
178 | unsigned long index = head - phys->offset; |
179 | |
180 | memset(page_address(phys->page) + index, 0, phys->size - index); |
181 | } |
182 | |
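/*
 * Fold the hardware write pointer back into buf::head and account the
 * newly written bytes in buf::data_size (or, in snapshot mode, record
 * the absolute head).
 */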
183 | static void bts_update(struct bts_ctx *bts) |
184 | { |
185 | int cpu = raw_smp_processor_id(); |
186 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; |
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
188 | unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; |
189 | |
190 | if (!buf) |
191 | return; |
192 | |
	head = index + bts_buffer_offset(buf, buf->cur_buf);
194 | old = local_xchg(&buf->head, head); |
195 | |
196 | if (!buf->snapshot) { |
197 | if (old == head) |
198 | return; |
199 | |
200 | if (ds->bts_index >= ds->bts_absolute_maximum) |
			perf_aux_output_flag(&bts->handle,
					     PERF_AUX_FLAG_TRUNCATED);
203 | |
204 | /* |
205 | * old and head are always in the same physical buffer, so we |
206 | * can subtract them to get the data size. |
207 | */ |
		local_add(head - old, &buf->data_size);
209 | } else { |
210 | local_set(&buf->data_size, head); |
211 | } |
212 | |
213 | /* |
	 * Since BTS is coherent, just add a compiler barrier to ensure
215 | * BTS updating is ordered against bts::handle::event. |
216 | */ |
217 | barrier(); |
218 | } |
219 | |
220 | static int |
221 | bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); |
222 | |
223 | /* |
224 | * Ordering PMU callbacks wrt themselves and the PMI is done by means |
225 | * of bts::state, which: |
226 | * - is set when bts::handle::event is valid, that is, between |
227 | * perf_aux_output_begin() and perf_aux_output_end(); |
228 | * - is zero otherwise; |
229 | * - is ordered against bts::handle::event with a compiler barrier. |
230 | */ |
231 | |
232 | static void __bts_event_start(struct perf_event *event) |
233 | { |
234 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
236 | u64 config = 0; |
237 | |
238 | if (!buf->snapshot) |
239 | config |= ARCH_PERFMON_EVENTSEL_INT; |
240 | if (!event->attr.exclude_kernel) |
241 | config |= ARCH_PERFMON_EVENTSEL_OS; |
242 | if (!event->attr.exclude_user) |
243 | config |= ARCH_PERFMON_EVENTSEL_USR; |
244 | |
245 | bts_config_buffer(buf); |
246 | |
247 | /* |
248 | * local barrier to make sure that ds configuration made it |
249 | * before we enable BTS and bts::state goes ACTIVE |
250 | */ |
251 | wmb(); |
252 | |
253 | /* INACTIVE/STOPPED -> ACTIVE */ |
254 | WRITE_ONCE(bts->state, BTS_STATE_ACTIVE); |
255 | |
	intel_pmu_enable_bts(config);
}
259 | |
260 | static void bts_event_start(struct perf_event *event, int flags) |
261 | { |
262 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
263 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
264 | struct bts_buffer *buf; |
265 | |
	buf = perf_aux_output_begin(&bts->handle, event);
	if (!buf)
		goto fail_stop;

	if (bts_buffer_reset(buf, &bts->handle))
271 | goto fail_end_stop; |
272 | |
273 | bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; |
274 | bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; |
275 | bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; |
276 | |
277 | perf_event_itrace_started(event); |
278 | event->hw.state = 0; |
279 | |
280 | __bts_event_start(event); |
281 | |
282 | return; |
283 | |
284 | fail_end_stop: |
	perf_aux_output_end(&bts->handle, 0);
286 | |
287 | fail_stop: |
288 | event->hw.state = PERF_HES_STOPPED; |
289 | } |
290 | |
291 | static void __bts_event_stop(struct perf_event *event, int state) |
292 | { |
293 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
294 | |
295 | /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */ |
296 | WRITE_ONCE(bts->state, state); |
297 | |
298 | /* |
299 | * No extra synchronization is mandated by the documentation to have |
300 | * BTS data stores globally visible. |
301 | */ |
302 | intel_pmu_disable_bts(); |
303 | } |
304 | |
305 | static void bts_event_stop(struct perf_event *event, int flags) |
306 | { |
307 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
308 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
309 | struct bts_buffer *buf = NULL; |
310 | int state = READ_ONCE(bts->state); |
311 | |
312 | if (state == BTS_STATE_ACTIVE) |
		__bts_event_stop(event, BTS_STATE_STOPPED);

	if (state != BTS_STATE_STOPPED)
		buf = perf_get_aux(&bts->handle);
317 | |
318 | event->hw.state |= PERF_HES_STOPPED; |
319 | |
320 | if (flags & PERF_EF_UPDATE) { |
321 | bts_update(bts); |
322 | |
323 | if (buf) { |
324 | if (buf->snapshot) |
325 | bts->handle.head = |
326 | local_xchg(&buf->data_size, |
327 | buf->nr_pages << PAGE_SHIFT); |
			perf_aux_output_end(&bts->handle,
					    local_xchg(&buf->data_size, 0));
330 | } |
331 | |
332 | cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; |
333 | cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; |
334 | cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; |
335 | cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; |
336 | } |
337 | } |
338 | |
339 | void intel_bts_enable_local(void) |
340 | { |
341 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
342 | int state = READ_ONCE(bts->state); |
343 | |
344 | /* |
345 | * Here we transition from INACTIVE to ACTIVE; |
346 | * if we instead are STOPPED from the interrupt handler, |
347 | * stay that way. Can't be ACTIVE here though. |
348 | */ |
349 | if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE)) |
350 | return; |
351 | |
352 | if (state == BTS_STATE_STOPPED) |
353 | return; |
354 | |
355 | if (bts->handle.event) |
		__bts_event_start(bts->handle.event);
357 | } |
358 | |
359 | void intel_bts_disable_local(void) |
360 | { |
361 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
362 | |
363 | /* |
364 | * Here we transition from ACTIVE to INACTIVE; |
365 | * do nothing for STOPPED or INACTIVE. |
366 | */ |
367 | if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE) |
368 | return; |
369 | |
370 | if (bts->handle.event) |
		__bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
372 | } |
373 | |
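/*
 * Carve the next record-aligned region out of the AUX space granted by
 * @handle, advancing to the next physical chunk (padding out the
 * current one) when the remaining space drops to the safety margin and
 * the next chunk offers more room.
 */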
374 | static int |
375 | bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) |
376 | { |
377 | unsigned long head, space, next_space, pad, gap, skip, wakeup; |
378 | unsigned int next_buf; |
379 | struct bts_phys *phys, *next_phys; |
380 | int ret; |
381 | |
382 | if (buf->snapshot) |
383 | return 0; |
384 | |
385 | head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); |
386 | |
387 | phys = &buf->buf[buf->cur_buf]; |
388 | space = phys->offset + phys->displacement + phys->size - head; |
389 | pad = space; |
390 | if (space > handle->size) { |
391 | space = handle->size; |
392 | space -= space % BTS_RECORD_SIZE; |
393 | } |
394 | if (space <= BTS_SAFETY_MARGIN) { |
395 | /* See if next phys buffer has more space */ |
396 | next_buf = buf->cur_buf + 1; |
397 | if (next_buf >= buf->nr_bufs) |
398 | next_buf = 0; |
399 | next_phys = &buf->buf[next_buf]; |
		gap = buf_size(phys->page) - phys->displacement - phys->size +
401 | next_phys->displacement; |
402 | skip = pad + gap; |
403 | if (handle->size >= skip) { |
404 | next_space = next_phys->size; |
405 | if (next_space + skip > handle->size) { |
406 | next_space = handle->size - skip; |
407 | next_space -= next_space % BTS_RECORD_SIZE; |
408 | } |
409 | if (next_space > space || !space) { |
410 | if (pad) |
411 | bts_buffer_pad_out(phys, head); |
				ret = perf_aux_output_skip(handle, skip);
413 | if (ret) |
414 | return ret; |
415 | /* Advance to next phys buffer */ |
416 | phys = next_phys; |
417 | space = next_space; |
418 | head = phys->offset + phys->displacement; |
419 | /* |
420 | * After this, cur_buf and head won't match ds |
421 | * anymore, so we must not be racing with |
422 | * bts_update(). |
423 | */ |
424 | buf->cur_buf = next_buf; |
425 | local_set(&buf->head, head); |
426 | } |
427 | } |
428 | } |
429 | |
430 | /* Don't go far beyond wakeup watermark */ |
431 | wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - |
432 | handle->head; |
433 | if (space > wakeup) { |
434 | space = wakeup; |
435 | space -= space % BTS_RECORD_SIZE; |
436 | } |
437 | |
438 | buf->end = head + space; |
439 | |
440 | /* |
441 | * If we have no space, the lost notification would have been sent when |
442 | * we hit absolute_maximum - see bts_update() |
443 | */ |
444 | if (!space) |
445 | return -ENOSPC; |
446 | |
447 | return 0; |
448 | } |
449 | |
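/*
 * PMI handler: drain the completed AUX data and re-arm the buffer;
 * returns nonzero if this NMI was plausibly ours.
 */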
450 | int intel_bts_interrupt(void) |
451 | { |
452 | struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; |
453 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
454 | struct perf_event *event = bts->handle.event; |
455 | struct bts_buffer *buf; |
456 | s64 old_head; |
457 | int err = -ENOSPC, handled = 0; |
458 | |
459 | /* |
460 | * The only surefire way of knowing if this NMI is ours is by checking |
461 | * the write ptr against the PMI threshold. |
462 | */ |
463 | if (ds && (ds->bts_index >= ds->bts_interrupt_threshold)) |
464 | handled = 1; |
465 | |
466 | /* |
467 | * this is wrapped in intel_bts_enable_local/intel_bts_disable_local, |
468 | * so we can only be INACTIVE or STOPPED |
469 | */ |
470 | if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) |
471 | return handled; |
472 | |
	buf = perf_get_aux(&bts->handle);
474 | if (!buf) |
475 | return handled; |
476 | |
477 | /* |
478 | * Skip snapshot counters: they don't use the interrupt, but |
479 | * there's no other way of telling, because the pointer will |
480 | * keep moving |
481 | */ |
482 | if (buf->snapshot) |
483 | return 0; |
484 | |
485 | old_head = local_read(&buf->head); |
486 | bts_update(bts); |
487 | |
488 | /* no new data */ |
489 | if (old_head == local_read(&buf->head)) |
490 | return handled; |
491 | |
	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));

	buf = perf_aux_output_begin(&bts->handle, event);
	if (buf)
		err = bts_buffer_reset(buf, &bts->handle);
497 | |
498 | if (err) { |
499 | WRITE_ONCE(bts->state, BTS_STATE_STOPPED); |
500 | |
501 | if (buf) { |
502 | /* |
			 * BTS_STATE_STOPPED should be visible before
			 * handle::event is cleared
505 | */ |
506 | barrier(); |
			perf_aux_output_end(&bts->handle, 0);
508 | } |
509 | } |
510 | |
511 | return 1; |
512 | } |
513 | |
514 | static void bts_event_del(struct perf_event *event, int mode) |
515 | { |
516 | bts_event_stop(event, PERF_EF_UPDATE); |
517 | } |
518 | |
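/*
 * Only one BTS event may be scheduled per CPU: both the fixed BTS
 * counter slot and bts::handle::event are checked before starting.
 */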
519 | static int bts_event_add(struct perf_event *event, int mode) |
520 | { |
521 | struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); |
522 | struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); |
523 | struct hw_perf_event *hwc = &event->hw; |
524 | |
525 | event->hw.state = PERF_HES_STOPPED; |
526 | |
527 | if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) |
528 | return -EBUSY; |
529 | |
530 | if (bts->handle.event) |
531 | return -EBUSY; |
532 | |
533 | if (mode & PERF_EF_START) { |
		bts_event_start(event, 0);
535 | if (hwc->state & PERF_HES_STOPPED) |
536 | return -EINVAL; |
537 | } |
538 | |
539 | return 0; |
540 | } |
541 | |
542 | static void bts_event_destroy(struct perf_event *event) |
543 | { |
544 | x86_release_hardware(); |
	x86_del_exclusive(x86_lbr_exclusive_bts);
546 | } |
547 | |
548 | static int bts_event_init(struct perf_event *event) |
549 | { |
550 | int ret; |
551 | |
552 | if (event->attr.type != bts_pmu.type) |
553 | return -ENOENT; |
554 | |
555 | /* |
556 | * BTS leaks kernel addresses even when CPL0 tracing is |
557 | * disabled, so disallow intel_bts driver for unprivileged |
558 | * users on paranoid systems since it provides trace data |
559 | * to the user in a zero-copy fashion. |
560 | * |
561 | * Note that the default paranoia setting permits unprivileged |
562 | * users to profile the kernel. |
563 | */ |
564 | if (event->attr.exclude_kernel) { |
		ret = perf_allow_kernel(&event->attr);
566 | if (ret) |
567 | return ret; |
568 | } |
569 | |
	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	ret = x86_reserve_hardware();
	if (ret) {
		x86_del_exclusive(x86_lbr_exclusive_bts);
576 | return ret; |
577 | } |
578 | |
579 | event->destroy = bts_event_destroy; |
580 | |
581 | return 0; |
582 | } |
583 | |
584 | static void bts_event_read(struct perf_event *event) |
585 | { |
586 | } |
587 | |
588 | static __init int bts_init(void) |
589 | { |
590 | if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) |
591 | return -ENODEV; |
592 | |
593 | if (boot_cpu_has(X86_FEATURE_PTI)) { |
594 | /* |
595 | * BTS hardware writes through a virtual memory map we must |
596 | * either use the kernel physical map, or the user mapping of |
597 | * the AUX buffer. |
598 | * |
599 | * However, since this driver supports per-CPU and per-task inherit |
600 | * we cannot use the user mapping since it will not be available |
601 | * if we're not running the owning process. |
602 | * |
603 | * With PTI we can't use the kernel map either, because its not |
604 | * there when we run userspace. |
605 | * |
606 | * For now, disable this driver when using PTI. |
607 | */ |
608 | return -ENODEV; |
609 | } |
610 | |
611 | bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | |
612 | PERF_PMU_CAP_EXCLUSIVE; |
613 | bts_pmu.task_ctx_nr = perf_sw_context; |
614 | bts_pmu.event_init = bts_event_init; |
615 | bts_pmu.add = bts_event_add; |
616 | bts_pmu.del = bts_event_del; |
617 | bts_pmu.start = bts_event_start; |
618 | bts_pmu.stop = bts_event_stop; |
619 | bts_pmu.read = bts_event_read; |
620 | bts_pmu.setup_aux = bts_buffer_setup_aux; |
621 | bts_pmu.free_aux = bts_buffer_free_aux; |
622 | |
	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
624 | } |
625 | arch_initcall(bts_init); |
626 | |