1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Performance events ring-buffer code: |
4 | * |
5 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
7 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra |
8 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
9 | */ |
10 | |
11 | #include <linux/perf_event.h> |
12 | #include <linux/vmalloc.h> |
13 | #include <linux/slab.h> |
14 | #include <linux/circ_buf.h> |
15 | #include <linux/poll.h> |
16 | #include <linux/nospec.h> |
17 | |
18 | #include "internal.h" |
19 | |
20 | static void perf_output_wakeup(struct perf_output_handle *handle) |
21 | { |
	atomic_set(&handle->rb->poll, EPOLLIN);
23 | |
24 | handle->event->pending_wakeup = 1; |
	irq_work_queue(&handle->event->pending_irq);
26 | } |
27 | |
28 | /* |
29 | * We need to ensure a later event_id doesn't publish a head when a former |
 * event isn't done writing. However, since we need to deal with NMIs, we
31 | * cannot fully serialize things. |
32 | * |
33 | * We only publish the head (and generate a wakeup) when the outer-most |
34 | * event completes. |
35 | */ |
36 | static void perf_output_get_handle(struct perf_output_handle *handle) |
37 | { |
38 | struct perf_buffer *rb = handle->rb; |
39 | |
40 | preempt_disable(); |
41 | |
42 | /* |
43 | * Avoid an explicit LOAD/STORE such that architectures with memops |
44 | * can use them. |
45 | */ |
46 | (*(volatile unsigned int *)&rb->nest)++; |
47 | handle->wakeup = local_read(&rb->wakeup); |
48 | } |
49 | |
50 | static void perf_output_put_handle(struct perf_output_handle *handle) |
51 | { |
52 | struct perf_buffer *rb = handle->rb; |
53 | unsigned long head; |
54 | unsigned int nest; |
55 | |
56 | /* |
57 | * If this isn't the outermost nesting, we don't have to update |
58 | * @rb->user_page->data_head. |
59 | */ |
60 | nest = READ_ONCE(rb->nest); |
61 | if (nest > 1) { |
62 | WRITE_ONCE(rb->nest, nest - 1); |
63 | goto out; |
64 | } |
65 | |
66 | again: |
67 | /* |
68 | * In order to avoid publishing a head value that goes backwards, |
69 | * we must ensure the load of @rb->head happens after we've |
70 | * incremented @rb->nest. |
71 | * |
72 | * Otherwise we can observe a @rb->head value before one published |
73 | * by an IRQ/NMI happening between the load and the increment. |
74 | */ |
75 | barrier(); |
76 | head = local_read(&rb->head); |
77 | |
78 | /* |
79 | * IRQ/NMI can happen here and advance @rb->head, causing our |
80 | * load above to be stale. |
81 | */ |
82 | |
83 | /* |
84 | * Since the mmap() consumer (userspace) can run on a different CPU: |
85 | * |
86 | * kernel user |
87 | * |
88 | * if (LOAD ->data_tail) { LOAD ->data_head |
89 | * (A) smp_rmb() (C) |
90 | * STORE $data LOAD $data |
91 | * smp_wmb() (B) smp_mb() (D) |
92 | * STORE ->data_head STORE ->data_tail |
93 | * } |
94 | * |
95 | * Where A pairs with D, and B pairs with C. |
96 | * |
	 * In our case (A) is a control dependency that separates the load of
	 * the ->data_tail and the stores of $data: if ->data_tail indicates
	 * there is no room in the buffer to store $data, we do not store it.
100 | * |
101 | * D needs to be a full barrier since it separates the data READ |
102 | * from the tail WRITE. |
103 | * |
104 | * For B a WMB is sufficient since it separates two WRITEs, and for C |
105 | * an RMB is sufficient since it separates two READs. |
106 | * |
107 | * See perf_output_begin(). |
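	 *
	 * As an illustrative sketch only (not the canonical tools/perf
	 * reader; 'up' is the mapped struct perf_event_mmap_page, 'mask' is
	 * the data size minus one, and read_record() is a hypothetical
	 * helper returning the record size it consumed):
	 *
	 *	head = __atomic_load_n(&up->data_head, __ATOMIC_ACQUIRE);  (C)
	 *	while (tail != head)
	 *		tail += read_record(data + (tail & mask));
	 *	__atomic_thread_fence(__ATOMIC_SEQ_CST);                    (D)
	 *	__atomic_store_n(&up->data_tail, tail, __ATOMIC_RELAXED);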
108 | */ |
109 | smp_wmb(); /* B, matches C */ |
110 | WRITE_ONCE(rb->user_page->data_head, head); |
111 | |
112 | /* |
113 | * We must publish the head before decrementing the nest count, |
114 | * otherwise an IRQ/NMI can publish a more recent head value and our |
115 | * write will (temporarily) publish a stale value. |
116 | */ |
117 | barrier(); |
118 | WRITE_ONCE(rb->nest, 0); |
119 | |
120 | /* |
121 | * Ensure we decrement @rb->nest before we validate the @rb->head. |
122 | * Otherwise we cannot be sure we caught the 'last' nested update. |
123 | */ |
124 | barrier(); |
125 | if (unlikely(head != local_read(&rb->head))) { |
126 | WRITE_ONCE(rb->nest, 1); |
127 | goto again; |
128 | } |
129 | |
130 | if (handle->wakeup != local_read(&rb->wakeup)) |
131 | perf_output_wakeup(handle); |
132 | |
133 | out: |
134 | preempt_enable(); |
135 | } |
136 | |
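/*
 * For backward (write_backward) buffers the head moves towards lower
 * offsets, so the roles of @head and @tail in the CIRC_SPACE() check
 * below are swapped.
 */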
137 | static __always_inline bool |
138 | ring_buffer_has_space(unsigned long head, unsigned long tail, |
139 | unsigned long data_size, unsigned int size, |
140 | bool backward) |
141 | { |
142 | if (!backward) |
143 | return CIRC_SPACE(head, tail, data_size) >= size; |
144 | else |
145 | return CIRC_SPACE(tail, head, data_size) >= size; |
146 | } |
147 | |
148 | static __always_inline int |
149 | __perf_output_begin(struct perf_output_handle *handle, |
150 | struct perf_sample_data *data, |
151 | struct perf_event *event, unsigned int size, |
152 | bool backward) |
153 | { |
154 | struct perf_buffer *rb; |
155 | unsigned long tail, offset, head; |
156 | int have_lost, page_shift; |
157 | struct { |
		struct perf_event_header header;
159 | u64 id; |
160 | u64 lost; |
161 | } lost_event; |
162 | |
163 | rcu_read_lock(); |
164 | /* |
165 | * For inherited events we send all the output towards the parent. |
166 | */ |
167 | if (event->parent) |
168 | event = event->parent; |
169 | |
170 | rb = rcu_dereference(event->rb); |
171 | if (unlikely(!rb)) |
172 | goto out; |
173 | |
174 | if (unlikely(rb->paused)) { |
175 | if (rb->nr_pages) { |
			local_inc(&rb->lost);
			atomic64_inc(&event->lost_samples);
178 | } |
179 | goto out; |
180 | } |
181 | |
182 | handle->rb = rb; |
183 | handle->event = event; |
184 | |
185 | have_lost = local_read(&rb->lost); |
186 | if (unlikely(have_lost)) { |
187 | size += sizeof(lost_event); |
188 | if (event->attr.sample_id_all) |
189 | size += event->id_header_size; |
190 | } |
191 | |
192 | perf_output_get_handle(handle); |
193 | |
194 | offset = local_read(&rb->head); |
195 | do { |
196 | head = offset; |
197 | tail = READ_ONCE(rb->user_page->data_tail); |
198 | if (!rb->overwrite) { |
199 | if (unlikely(!ring_buffer_has_space(head, tail, |
200 | perf_data_size(rb), |
201 | size, backward))) |
202 | goto fail; |
203 | } |
204 | |
205 | /* |
		 * The above forms a control dependency barrier separating the
		 * @tail load above from the data stores below, since the @tail
		 * load is required to compute the branch to the fail label below.
209 | * |
210 | * A, matches D; the full memory barrier userspace SHOULD issue |
211 | * after reading the data and before storing the new tail |
212 | * position. |
213 | * |
214 | * See perf_output_put_handle(). |
215 | */ |
216 | |
217 | if (!backward) |
218 | head += size; |
219 | else |
220 | head -= size; |
	} while (!local_try_cmpxchg(&rb->head, &offset, head));
222 | |
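	/*
	 * For a backward buffer @rb->head just moved further down (it is the
	 * negated count of bytes written), so the record we reserved starts
	 * at the new head. Negate it so that the wakeup/watermark arithmetic
	 * and the page/offset computation below work the same as in the
	 * forward case; the masking relies on the buffer size being a power
	 * of two.
	 */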
223 | if (backward) { |
224 | offset = head; |
225 | head = (u64)(-head); |
226 | } |
227 | |
228 | /* |
229 | * We rely on the implied barrier() by local_cmpxchg() to ensure |
230 | * none of the data stores below can be lifted up by the compiler. |
231 | */ |
232 | |
233 | if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) |
		local_add(rb->watermark, &rb->wakeup);
235 | |
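	/*
	 * The data area may be backed by high-order "pages" (see the
	 * CONFIG_PERF_USE_VMALLOC rb_alloc() below), hence page_shift
	 * includes page_order(). The nr_pages mask relies on the number of
	 * data pages being a power of two.
	 */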
236 | page_shift = PAGE_SHIFT + page_order(rb); |
237 | |
238 | handle->page = (offset >> page_shift) & (rb->nr_pages - 1); |
239 | offset &= (1UL << page_shift) - 1; |
240 | handle->addr = rb->data_pages[handle->page] + offset; |
241 | handle->size = (1UL << page_shift) - offset; |
242 | |
243 | if (unlikely(have_lost)) { |
244 | lost_event.header.size = sizeof(lost_event); |
245 | lost_event.header.type = PERF_RECORD_LOST; |
246 | lost_event.header.misc = 0; |
247 | lost_event.id = event->id; |
248 | lost_event.lost = local_xchg(&rb->lost, 0); |
249 | |
		/* XXX mostly redundant; @data is already fully initialized */
		perf_event_header__init_id(&lost_event.header, data, event);
		perf_output_put(handle, lost_event);
		perf_event__output_id_sample(event, handle, data);
254 | } |
255 | |
256 | return 0; |
257 | |
258 | fail: |
	local_inc(&rb->lost);
	atomic64_inc(&event->lost_samples);
261 | perf_output_put_handle(handle); |
262 | out: |
263 | rcu_read_unlock(); |
264 | |
265 | return -ENOSPC; |
266 | } |
267 | |
268 | int perf_output_begin_forward(struct perf_output_handle *handle, |
269 | struct perf_sample_data *data, |
270 | struct perf_event *event, unsigned int size) |
271 | { |
	return __perf_output_begin(handle, data, event, size, false);
273 | } |
274 | |
275 | int perf_output_begin_backward(struct perf_output_handle *handle, |
276 | struct perf_sample_data *data, |
277 | struct perf_event *event, unsigned int size) |
278 | { |
	return __perf_output_begin(handle, data, event, size, true);
280 | } |
281 | |
282 | int perf_output_begin(struct perf_output_handle *handle, |
283 | struct perf_sample_data *data, |
284 | struct perf_event *event, unsigned int size) |
285 | { |
286 | |
287 | return __perf_output_begin(handle, data, event, size, |
288 | unlikely(is_write_backward(event))); |
289 | } |
290 | |
291 | unsigned int perf_output_copy(struct perf_output_handle *handle, |
292 | const void *buf, unsigned int len) |
293 | { |
294 | return __output_copy(handle, buf, len); |
295 | } |
296 | |
297 | unsigned int perf_output_skip(struct perf_output_handle *handle, |
298 | unsigned int len) |
299 | { |
300 | return __output_skip(handle, NULL, len); |
301 | } |
302 | |
303 | void perf_output_end(struct perf_output_handle *handle) |
304 | { |
305 | perf_output_put_handle(handle); |
306 | rcu_read_unlock(); |
307 | } |
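
/*
 * Typical usage of the output path above, as a rough sketch only (real
 * callers build the struct perf_event_header themselves; 'header' here is
 * assumed to be such a header, see e.g. the PERF_RECORD_LOST path above):
 *
 *	struct perf_output_handle handle;
 *	struct perf_sample_data sample;
 *
 *	perf_sample_data_init(&sample, 0, 0);
 *	perf_event_header__init_id(&header, &sample, event);
 *	if (perf_output_begin(&handle, &sample, event, header.size))
 *		return;
 *	perf_output_put(&handle, header);
 *	perf_event__output_id_sample(event, &handle, &sample);
 *	perf_output_end(&handle);
 */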
308 | |
309 | static void |
310 | ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) |
311 | { |
312 | long max_size = perf_data_size(rb); |
313 | |
314 | if (watermark) |
315 | rb->watermark = min(max_size, watermark); |
316 | |
317 | if (!rb->watermark) |
318 | rb->watermark = max_size / 2; |
319 | |
320 | if (flags & RING_BUFFER_WRITABLE) |
321 | rb->overwrite = 0; |
322 | else |
323 | rb->overwrite = 1; |
324 | |
	refcount_set(&rb->refcount, 1);
326 | |
	INIT_LIST_HEAD(&rb->event_list);
328 | spin_lock_init(&rb->event_lock); |
329 | |
330 | /* |
331 | * perf_output_begin() only checks rb->paused, therefore |
332 | * rb->paused must be true if we have no pages for output. |
333 | */ |
334 | if (!rb->nr_pages) |
335 | rb->paused = 1; |
336 | } |
337 | |
338 | void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) |
339 | { |
340 | /* |
341 | * OVERWRITE is determined by perf_aux_output_end() and can't |
342 | * be passed in directly. |
343 | */ |
344 | if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) |
345 | return; |
346 | |
347 | handle->aux_flags |= flags; |
348 | } |
349 | EXPORT_SYMBOL_GPL(perf_aux_output_flag); |
350 | |
351 | /* |
352 | * This is called before hardware starts writing to the AUX area to |
353 | * obtain an output handle and make sure there's room in the buffer. |
354 | * When the capture completes, call perf_aux_output_end() to commit |
355 | * the recorded data to the buffer. |
356 | * |
357 | * The ordering is similar to that of perf_output_{begin,end}, with |
358 | * the exception of (B), which should be taken care of by the pmu |
359 | * driver, since ordering rules will differ depending on hardware. |
360 | * |
361 | * Call this from pmu::start(); see the comment in perf_aux_output_end() |
362 | * about its use in pmu callbacks. Both can also be called from the PMI |
363 | * handler if needed. |
364 | */ |
365 | void *perf_aux_output_begin(struct perf_output_handle *handle, |
366 | struct perf_event *event) |
367 | { |
368 | struct perf_event *output_event = event; |
369 | unsigned long aux_head, aux_tail; |
370 | struct perf_buffer *rb; |
371 | unsigned int nest; |
372 | |
373 | if (output_event->parent) |
374 | output_event = output_event->parent; |
375 | |
376 | /* |
377 | * Since this will typically be open across pmu::add/pmu::del, we |
378 | * grab ring_buffer's refcount instead of holding rcu read lock |
379 | * to make sure it doesn't disappear under us. |
380 | */ |
	rb = ring_buffer_get(output_event);
382 | if (!rb) |
383 | return NULL; |
384 | |
385 | if (!rb_has_aux(rb)) |
386 | goto err; |
387 | |
388 | /* |
389 | * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), |
390 | * about to get freed, so we leave immediately. |
391 | * |
392 | * Checking rb::aux_mmap_count and rb::refcount has to be done in |
393 | * the same order, see perf_mmap_close. Otherwise we end up freeing |
394 | * aux pages in this path, which is a bug, because in_atomic(). |
395 | */ |
	if (!atomic_read(&rb->aux_mmap_count))
397 | goto err; |
398 | |
	if (!refcount_inc_not_zero(&rb->aux_refcount))
400 | goto err; |
401 | |
402 | nest = READ_ONCE(rb->aux_nest); |
403 | /* |
	 * Nesting is not supported for the AUX area; make sure nested
	 * writers are caught early.
406 | */ |
407 | if (WARN_ON_ONCE(nest)) |
408 | goto err_put; |
409 | |
410 | WRITE_ONCE(rb->aux_nest, nest + 1); |
411 | |
412 | aux_head = rb->aux_head; |
413 | |
414 | handle->rb = rb; |
415 | handle->event = event; |
416 | handle->head = aux_head; |
417 | handle->size = 0; |
418 | handle->aux_flags = 0; |
419 | |
420 | /* |
421 | * In overwrite mode, AUX data stores do not depend on aux_tail, |
422 | * therefore (A) control dependency barrier does not exist. The |
423 | * (B) <-> (C) ordering is still observed by the pmu driver. |
424 | */ |
425 | if (!rb->aux_overwrite) { |
426 | aux_tail = READ_ONCE(rb->user_page->aux_tail); |
427 | handle->wakeup = rb->aux_wakeup + rb->aux_watermark; |
428 | if (aux_head - aux_tail < perf_aux_size(rb)) |
429 | handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); |
430 | |
431 | /* |
432 | * handle->size computation depends on aux_tail load; this forms a |
433 | * control dependency barrier separating aux_tail load from aux data |
434 | * store that will be enabled on successful return |
435 | */ |
436 | if (!handle->size) { /* A, matches D */ |
437 | event->pending_disable = smp_processor_id(); |
438 | perf_output_wakeup(handle); |
439 | WRITE_ONCE(rb->aux_nest, 0); |
440 | goto err_put; |
441 | } |
442 | } |
443 | |
444 | return handle->rb->aux_priv; |
445 | |
446 | err_put: |
447 | /* can't be last */ |
448 | rb_free_aux(rb); |
449 | |
450 | err: |
451 | ring_buffer_put(rb); |
452 | handle->event = NULL; |
453 | |
454 | return NULL; |
455 | } |
456 | EXPORT_SYMBOL_GPL(perf_aux_output_begin); |
457 | |
458 | static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb) |
459 | { |
460 | if (rb->aux_overwrite) |
461 | return false; |
462 | |
463 | if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { |
464 | rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); |
465 | return true; |
466 | } |
467 | |
468 | return false; |
469 | } |
470 | |
471 | /* |
472 | * Commit the data written by hardware into the ring buffer by adjusting |
473 | * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the |
474 | * pmu driver's responsibility to observe ordering rules of the hardware, |
475 | * so that all the data is externally visible before this is called. |
476 | * |
477 | * Note: this has to be called from pmu::stop() callback, as the assumption |
478 | * of the AUX buffer management code is that after pmu::stop(), the AUX |
479 | * transaction must be stopped and therefore drop the AUX reference count. |
480 | */ |
481 | void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) |
482 | { |
483 | bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); |
484 | struct perf_buffer *rb = handle->rb; |
485 | unsigned long aux_head; |
486 | |
487 | /* in overwrite mode, driver provides aux_head via handle */ |
488 | if (rb->aux_overwrite) { |
489 | handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; |
490 | |
491 | aux_head = handle->head; |
492 | rb->aux_head = aux_head; |
493 | } else { |
494 | handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; |
495 | |
496 | aux_head = rb->aux_head; |
497 | rb->aux_head += size; |
498 | } |
499 | |
500 | /* |
501 | * Only send RECORD_AUX if we have something useful to communicate |
502 | * |
503 | * Note: the OVERWRITE records by themselves are not considered |
504 | * useful, as they don't communicate any *new* information, |
	 * aside from the short-lived offset, which becomes history at
506 | * the next event sched-in and therefore isn't useful. |
507 | * The userspace that needs to copy out AUX data in overwrite |
508 | * mode should know to use user_page::aux_head for the actual |
509 | * offset. So, from now on we don't output AUX records that |
510 | * have *only* OVERWRITE flag set. |
511 | */ |
512 | if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) |
		perf_event_aux_event(handle->event, aux_head, size,
				     handle->aux_flags);
515 | |
516 | WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); |
517 | if (rb_need_aux_wakeup(rb)) |
518 | wakeup = true; |
519 | |
520 | if (wakeup) { |
521 | if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) |
522 | handle->event->pending_disable = smp_processor_id(); |
523 | perf_output_wakeup(handle); |
524 | } |
525 | |
526 | handle->event = NULL; |
527 | |
528 | WRITE_ONCE(rb->aux_nest, 0); |
529 | /* can't be last */ |
530 | rb_free_aux(rb); |
531 | ring_buffer_put(rb); |
532 | } |
533 | EXPORT_SYMBOL_GPL(perf_aux_output_end); |
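
/*
 * Rough sketch of how a PMU driver is expected to pair the AUX handle
 * calls above ('drv' is a hypothetical driver-private structure holding
 * the perf_output_handle):
 *
 *	pmu::start():
 *		buf = perf_aux_output_begin(&drv->handle, event);
 *		if (buf)
 *			program the hardware to write into buf,
 *			at most drv->handle.size bytes;
 *
 *	pmu::stop() (or the PMI handler):
 *		stop the hardware and determine how much was written;
 *		perf_aux_output_end(&drv->handle, size);
 */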
534 | |
535 | /* |
536 | * Skip over a given number of bytes in the AUX buffer, due to, for example, |
537 | * hardware's alignment constraints. |
538 | */ |
539 | int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) |
540 | { |
541 | struct perf_buffer *rb = handle->rb; |
542 | |
543 | if (size > handle->size) |
544 | return -ENOSPC; |
545 | |
546 | rb->aux_head += size; |
547 | |
548 | WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); |
549 | if (rb_need_aux_wakeup(rb)) { |
550 | perf_output_wakeup(handle); |
551 | handle->wakeup = rb->aux_wakeup + rb->aux_watermark; |
552 | } |
553 | |
554 | handle->head = rb->aux_head; |
555 | handle->size -= size; |
556 | |
557 | return 0; |
558 | } |
559 | EXPORT_SYMBOL_GPL(perf_aux_output_skip); |
560 | |
561 | void *perf_get_aux(struct perf_output_handle *handle) |
562 | { |
563 | /* this is only valid between perf_aux_output_begin and *_end */ |
564 | if (!handle->event) |
565 | return NULL; |
566 | |
567 | return handle->rb->aux_priv; |
568 | } |
569 | EXPORT_SYMBOL_GPL(perf_get_aux); |
570 | |
571 | /* |
572 | * Copy out AUX data from an AUX handle. |
573 | */ |
574 | long perf_output_copy_aux(struct perf_output_handle *aux_handle, |
575 | struct perf_output_handle *handle, |
576 | unsigned long from, unsigned long to) |
577 | { |
578 | struct perf_buffer *rb = aux_handle->rb; |
579 | unsigned long tocopy, remainder, len = 0; |
580 | void *addr; |
581 | |
582 | from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; |
583 | to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; |
584 | |
585 | do { |
586 | tocopy = PAGE_SIZE - offset_in_page(from); |
587 | if (to > from) |
588 | tocopy = min(tocopy, to - from); |
589 | if (!tocopy) |
590 | break; |
591 | |
592 | addr = rb->aux_pages[from >> PAGE_SHIFT]; |
593 | addr += offset_in_page(from); |
594 | |
		remainder = perf_output_copy(handle, addr, tocopy);
596 | if (remainder) |
597 | return -EFAULT; |
598 | |
599 | len += tocopy; |
600 | from += tocopy; |
601 | from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; |
602 | } while (to != from); |
603 | |
604 | return len; |
605 | } |
606 | |
607 | #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) |
608 | |
609 | static struct page *rb_alloc_aux_page(int node, int order) |
610 | { |
611 | struct page *page; |
612 | |
613 | if (order > MAX_ORDER) |
614 | order = MAX_ORDER; |
615 | |
616 | do { |
		page = alloc_pages_node(node, PERF_AUX_GFP, order);
618 | } while (!page && order--); |
619 | |
620 | if (page && order) { |
621 | /* |
622 | * Communicate the allocation size to the driver: |
623 | * if we managed to secure a high-order allocation, |
624 | * set its first page's private to this order; |
625 | * !PagePrivate(page) means it's just a normal page. |
626 | */ |
627 | split_page(page, order); |
628 | SetPagePrivate(page); |
		set_page_private(page, order);
630 | } |
631 | |
632 | return page; |
633 | } |
634 | |
635 | static void rb_free_aux_page(struct perf_buffer *rb, int idx) |
636 | { |
637 | struct page *page = virt_to_page(rb->aux_pages[idx]); |
638 | |
639 | ClearPagePrivate(page); |
640 | page->mapping = NULL; |
641 | __free_page(page); |
642 | } |
643 | |
644 | static void __rb_free_aux(struct perf_buffer *rb) |
645 | { |
646 | int pg; |
647 | |
648 | /* |
649 | * Should never happen, the last reference should be dropped from |
650 | * perf_mmap_close() path, which first stops aux transactions (which |
651 | * in turn are the atomic holders of aux_refcount) and then does the |
652 | * last rb_free_aux(). |
653 | */ |
654 | WARN_ON_ONCE(in_atomic()); |
655 | |
656 | if (rb->aux_priv) { |
657 | rb->free_aux(rb->aux_priv); |
658 | rb->free_aux = NULL; |
659 | rb->aux_priv = NULL; |
660 | } |
661 | |
662 | if (rb->aux_nr_pages) { |
663 | for (pg = 0; pg < rb->aux_nr_pages; pg++) |
			rb_free_aux_page(rb, pg);
665 | |
		kfree(rb->aux_pages);
667 | rb->aux_nr_pages = 0; |
668 | } |
669 | } |
670 | |
671 | int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, |
672 | pgoff_t pgoff, int nr_pages, long watermark, int flags) |
673 | { |
674 | bool overwrite = !(flags & RING_BUFFER_WRITABLE); |
	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
676 | int ret = -ENOMEM, max_order; |
677 | |
678 | if (!has_aux(event)) |
679 | return -EOPNOTSUPP; |
680 | |
681 | if (!overwrite) { |
682 | /* |
683 | * Watermark defaults to half the buffer, and so does the |
684 | * max_order, to aid PMU drivers in double buffering. |
685 | */ |
686 | if (!watermark) |
687 | watermark = nr_pages << (PAGE_SHIFT - 1); |
688 | |
689 | /* |
690 | * Use aux_watermark as the basis for chunking to |
691 | * help PMU drivers honor the watermark. |
692 | */ |
		max_order = get_order(watermark);
694 | } else { |
695 | /* |
696 | * We need to start with the max_order that fits in nr_pages, |
697 | * not the other way around, hence ilog2() and not get_order. |
698 | */ |
699 | max_order = ilog2(nr_pages); |
700 | watermark = 0; |
701 | } |
702 | |
703 | /* |
704 | * kcalloc_node() is unable to allocate buffer if the size is larger |
705 | * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case. |
706 | */ |
	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
708 | return -ENOMEM; |
	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
				     node);
711 | if (!rb->aux_pages) |
712 | return -ENOMEM; |
713 | |
714 | rb->free_aux = event->pmu->free_aux; |
715 | for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { |
716 | struct page *page; |
717 | int last, order; |
718 | |
719 | order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); |
720 | page = rb_alloc_aux_page(node, order); |
721 | if (!page) |
722 | goto out; |
723 | |
724 | for (last = rb->aux_nr_pages + (1 << page_private(page)); |
725 | last > rb->aux_nr_pages; rb->aux_nr_pages++) |
726 | rb->aux_pages[rb->aux_nr_pages] = page_address(page++); |
727 | } |
728 | |
729 | /* |
730 | * In overwrite mode, PMUs that don't support SG may not handle more |
731 | * than one contiguous allocation, since they rely on PMI to do double |
732 | * buffering. In this case, the entire buffer has to be one contiguous |
733 | * chunk. |
734 | */ |
735 | if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && |
736 | overwrite) { |
737 | struct page *page = virt_to_page(rb->aux_pages[0]); |
738 | |
739 | if (page_private(page) != max_order) |
740 | goto out; |
741 | } |
742 | |
743 | rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, |
744 | overwrite); |
745 | if (!rb->aux_priv) |
746 | goto out; |
747 | |
748 | ret = 0; |
749 | |
750 | /* |
751 | * aux_pages (and pmu driver's private data, aux_priv) will be |
752 | * referenced in both producer's and consumer's contexts, thus |
753 | * we keep a refcount here to make sure either of the two can |
754 | * reference them safely. |
755 | */ |
	refcount_set(&rb->aux_refcount, 1);
757 | |
758 | rb->aux_overwrite = overwrite; |
759 | rb->aux_watermark = watermark; |
760 | |
761 | out: |
762 | if (!ret) |
763 | rb->aux_pgoff = pgoff; |
764 | else |
765 | __rb_free_aux(rb); |
766 | |
767 | return ret; |
768 | } |
769 | |
770 | void rb_free_aux(struct perf_buffer *rb) |
771 | { |
772 | if (refcount_dec_and_test(r: &rb->aux_refcount)) |
773 | __rb_free_aux(rb); |
774 | } |
775 | |
776 | #ifndef CONFIG_PERF_USE_VMALLOC |
777 | |
778 | /* |
779 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. |
780 | */ |
781 | |
782 | static struct page * |
783 | __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) |
784 | { |
785 | if (pgoff > rb->nr_pages) |
786 | return NULL; |
787 | |
788 | if (pgoff == 0) |
789 | return virt_to_page(rb->user_page); |
790 | |
791 | return virt_to_page(rb->data_pages[pgoff - 1]); |
792 | } |
793 | |
794 | static void *perf_mmap_alloc_page(int cpu) |
795 | { |
796 | struct page *page; |
797 | int node; |
798 | |
799 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); |
800 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); |
801 | if (!page) |
802 | return NULL; |
803 | |
804 | return page_address(page); |
805 | } |
806 | |
807 | static void perf_mmap_free_page(void *addr) |
808 | { |
809 | struct page *page = virt_to_page(addr); |
810 | |
811 | page->mapping = NULL; |
812 | __free_page(page); |
813 | } |
814 | |
815 | struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) |
816 | { |
817 | struct perf_buffer *rb; |
818 | unsigned long size; |
819 | int i, node; |
820 | |
821 | size = sizeof(struct perf_buffer); |
822 | size += nr_pages * sizeof(void *); |
823 | |
824 | if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER) |
825 | goto fail; |
826 | |
827 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); |
828 | rb = kzalloc_node(size, GFP_KERNEL, node); |
829 | if (!rb) |
830 | goto fail; |
831 | |
832 | rb->user_page = perf_mmap_alloc_page(cpu); |
833 | if (!rb->user_page) |
834 | goto fail_user_page; |
835 | |
836 | for (i = 0; i < nr_pages; i++) { |
837 | rb->data_pages[i] = perf_mmap_alloc_page(cpu); |
838 | if (!rb->data_pages[i]) |
839 | goto fail_data_pages; |
840 | } |
841 | |
842 | rb->nr_pages = nr_pages; |
843 | |
844 | ring_buffer_init(rb, watermark, flags); |
845 | |
846 | return rb; |
847 | |
848 | fail_data_pages: |
849 | for (i--; i >= 0; i--) |
850 | perf_mmap_free_page(rb->data_pages[i]); |
851 | |
852 | perf_mmap_free_page(rb->user_page); |
853 | |
854 | fail_user_page: |
855 | kfree(rb); |
856 | |
857 | fail: |
858 | return NULL; |
859 | } |
860 | |
861 | void rb_free(struct perf_buffer *rb) |
862 | { |
863 | int i; |
864 | |
865 | perf_mmap_free_page(rb->user_page); |
866 | for (i = 0; i < rb->nr_pages; i++) |
867 | perf_mmap_free_page(rb->data_pages[i]); |
868 | kfree(rb); |
869 | } |
870 | |
871 | #else |
872 | static struct page * |
873 | __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) |
874 | { |
875 | /* The '>' counts in the user page. */ |
876 | if (pgoff > data_page_nr(rb)) |
877 | return NULL; |
878 | |
	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
880 | } |
881 | |
882 | static void perf_mmap_unmark_page(void *addr) |
883 | { |
884 | struct page *page = vmalloc_to_page(addr); |
885 | |
886 | page->mapping = NULL; |
887 | } |
888 | |
889 | static void rb_free_work(struct work_struct *work) |
890 | { |
891 | struct perf_buffer *rb; |
892 | void *base; |
893 | int i, nr; |
894 | |
895 | rb = container_of(work, struct perf_buffer, work); |
896 | nr = data_page_nr(rb); |
897 | |
898 | base = rb->user_page; |
899 | /* The '<=' counts in the user page. */ |
900 | for (i = 0; i <= nr; i++) |
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
902 | |
	vfree(base);
	kfree(rb);
905 | } |
906 | |
907 | void rb_free(struct perf_buffer *rb) |
908 | { |
	schedule_work(&rb->work);
910 | } |
911 | |
912 | struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) |
913 | { |
914 | struct perf_buffer *rb; |
915 | unsigned long size; |
916 | void *all_buf; |
917 | int node; |
918 | |
919 | size = sizeof(struct perf_buffer); |
920 | size += sizeof(void *); |
921 | |
922 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); |
923 | rb = kzalloc_node(size, GFP_KERNEL, node); |
924 | if (!rb) |
925 | goto fail; |
926 | |
927 | INIT_WORK(&rb->work, rb_free_work); |
928 | |
	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
930 | if (!all_buf) |
931 | goto fail_all_buf; |
932 | |
933 | rb->user_page = all_buf; |
934 | rb->data_pages[0] = all_buf + PAGE_SIZE; |
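	/*
	 * The entire data area is one virtually contiguous vmalloc chunk, so
	 * it is represented as a single data "page" whose order covers all
	 * nr_pages; page_order() feeds the page_shift computation in
	 * __perf_output_begin().
	 */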
935 | if (nr_pages) { |
936 | rb->nr_pages = 1; |
937 | rb->page_order = ilog2(nr_pages); |
938 | } |
939 | |
940 | ring_buffer_init(rb, watermark, flags); |
941 | |
942 | return rb; |
943 | |
944 | fail_all_buf: |
	kfree(rb);
946 | |
947 | fail: |
948 | return NULL; |
949 | } |
950 | |
951 | #endif |
952 | |
953 | struct page * |
954 | perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) |
955 | { |
956 | if (rb->aux_nr_pages) { |
957 | /* above AUX space */ |
958 | if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) |
959 | return NULL; |
960 | |
961 | /* AUX space */ |
962 | if (pgoff >= rb->aux_pgoff) { |
963 | int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); |
964 | return virt_to_page(rb->aux_pages[aux_pgoff]); |
965 | } |
966 | } |
967 | |
968 | return __perf_mmap_to_page(rb, pgoff); |
969 | } |
970 | |