ring_buffer.c source code [linux/kernel/events/ring_buffer.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Performance events ring-buffer code:
4	*
5	* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6	* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7	* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
8	* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9	*/
10
11	#include <linux/perf_event.h>
12	#include <linux/vmalloc.h>
13	#include <linux/slab.h>
14	#include <linux/circ_buf.h>
15	#include <linux/poll.h>
16	#include <linux/nospec.h>
17
18	#include "internal.h"
19
20	static void perf_output_wakeup(struct perf_output_handle *handle)
21	{
22	atomic_set(v: &handle->rb->poll, EPOLLIN);
23
24	handle->event->pending_wakeup = `1`;
25	irq_work_queue(work: &handle->event->pending_irq);
26	}
27
28	/*
29	* We need to ensure a later event_id doesn't publish a head when a former
30	* event isn't done writing. However since we need to deal with NMIs we
31	* cannot fully serialize things.
32	*
33	* We only publish the head (and generate a wakeup) when the outer-most
34	* event completes.
35	*/
36	static void perf_output_get_handle(struct perf_output_handle *handle)
37	{
38	struct perf_buffer *rb = handle->rb;
39
40	preempt_disable();
41
42	/*
43	* Avoid an explicit LOAD/STORE such that architectures with memops
44	* can use them.
45	*/
46	((volatile* unsigned int *)&rb->nest)++;
47	handle->wakeup = local_read(&rb->wakeup);
48	}
49
50	static void perf_output_put_handle(struct perf_output_handle *handle)
51	{
52	struct perf_buffer *rb = handle->rb;
53	unsigned long head;
54	unsigned int nest;
55
56	/*
57	* If this isn't the outermost nesting, we don't have to update
58	* @rb->user_page->data_head.
59	*/
60	nest = READ_ONCE(rb->nest);
61	if (nest > `1`) {
62	WRITE_ONCE(rb->nest, nest - `1`);
63	goto out;
64	}
65
66	again:
67	/*
68	* In order to avoid publishing a head value that goes backwards,
69	* we must ensure the load of @rb->head happens after we've
70	* incremented @rb->nest.
71	*
72	* Otherwise we can observe a @rb->head value before one published
73	* by an IRQ/NMI happening between the load and the increment.
74	*/
75	barrier();
76	head = local_read(&rb->head);
77
78	/*
79	* IRQ/NMI can happen here and advance @rb->head, causing our
80	* load above to be stale.
81	*/
82
83	/*
84	* Since the mmap() consumer (userspace) can run on a different CPU:
85	*
86	* kernel user
87	*
88	* if (LOAD ->data_tail) { LOAD ->data_head
89	* (A) smp_rmb() (C)
90	* STORE $data LOAD $data
91	* smp_wmb() (B) smp_mb() (D)
92	* STORE ->data_head STORE ->data_tail
93	* }
94	*
95	* Where A pairs with D, and B pairs with C.
96	*
97	* In our case (A) is a control dependency that separates the load of
98	* the ->data_tail and the stores of $data. In case ->data_tail
99	* indicates there is no room in the buffer to store $data we do not.
100	*
101	* D needs to be a full barrier since it separates the data READ
102	* from the tail WRITE.
103	*
104	* For B a WMB is sufficient since it separates two WRITEs, and for C
105	* an RMB is sufficient since it separates two READs.
106	*
107	* See perf_output_begin().
108	*/
109	smp_wmb(); / B, matches C /
110	WRITE_ONCE(rb->user_page->data_head, head);
111
112	/*
113	* We must publish the head before decrementing the nest count,
114	* otherwise an IRQ/NMI can publish a more recent head value and our
115	* write will (temporarily) publish a stale value.
116	*/
117	barrier();
118	WRITE_ONCE(rb->nest, `0`);
119
120	/*
121	* Ensure we decrement @rb->nest before we validate the @rb->head.
122	* Otherwise we cannot be sure we caught the 'last' nested update.
123	*/
124	barrier();
125	if (unlikely(head != local_read(&rb->head))) {
126	WRITE_ONCE(rb->nest, `1`);
127	goto again;
128	}
129
130	if (handle->wakeup != local_read(&rb->wakeup))
131	perf_output_wakeup(handle);
132
133	out:
134	preempt_enable();
135	}
136
137	static __always_inline bool
138	ring_buffer_has_space(unsigned long head, unsigned long tail,
139	unsigned long data_size, unsigned int size,
140	bool backward)
141	{
142	if (!backward)
143	return CIRC_SPACE(head, tail, data_size) >= size;
144	else
145	return CIRC_SPACE(tail, head, data_size) >= size;
146	}
147
148	static __always_inline int
149	__perf_output_begin(struct perf_output_handle *handle,
150	struct perf_sample_data *data,
151	struct perf_event event, unsigned* int size,
152	bool backward)
153	{
154	struct perf_buffer *rb;
155	unsigned long tail, offset, head;
156	int have_lost, page_shift;
157	struct {
158	struct perf_event_header header;
159	u64 id;
160	u64 lost;
161	} lost_event;
162
163	rcu_read_lock();
164	/*
165	* For inherited events we send all the output towards the parent.
166	*/
167	if (event->parent)
168	event = event->parent;
169
170	rb = rcu_dereference(event->rb);
171	if (unlikely(!rb))
172	goto out;
173
174	if (unlikely(rb->paused)) {
175	if (rb->nr_pages) {
176	local_inc(l: &rb->lost);
177	atomic64_inc(v: &event->lost_samples);
178	}
179	goto out;
180	}
181
182	handle->rb = rb;
183	handle->event = event;
184
185	have_lost = local_read(&rb->lost);
186	if (unlikely(have_lost)) {
187	size += sizeof(lost_event);
188	if (event->attr.sample_id_all)
189	size += event->id_header_size;
190	}
191
192	perf_output_get_handle(handle);
193
194	offset = local_read(&rb->head);
195	do {
196	head = offset;
197	tail = READ_ONCE(rb->user_page->data_tail);
198	if (!rb->overwrite) {
199	if (unlikely(!ring_buffer_has_space(head, tail,
200	perf_data_size(rb),
201	size, backward)))
202	goto fail;
203	}
204
205	/*
206	* The above forms a control dependency barrier separating the
207	* @tail load above from the data stores below. Since the @tail
208	* load is required to compute the branch to fail below.
209	*
210	* A, matches D; the full memory barrier userspace SHOULD issue
211	* after reading the data and before storing the new tail
212	* position.
213	*
214	* See perf_output_put_handle().
215	*/
216
217	if (!backward)
218	head += size;
219	else
220	head -= size;
221	} while (!local_try_cmpxchg(l: &rb->head, old: &offset, new: head));
222
223	if (backward) {
224	offset = head;
225	head = (u64)(-head);
226	}
227
228	/*
229	* We rely on the implied barrier() by local_cmpxchg() to ensure
230	* none of the data stores below can be lifted up by the compiler.
231	*/
232
233	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
234	local_add(i: rb->watermark, l: &rb->wakeup);
235
236	page_shift = PAGE_SHIFT + page_order(rb);
237
238	handle->page = (offset >> page_shift) & (rb->nr_pages - `1`);
239	offset &= (`1UL` << page_shift) - `1`;
240	handle->addr = rb->data_pages[handle->page] + offset;
241	handle->size = (`1UL` << page_shift) - offset;
242
243	if (unlikely(have_lost)) {
244	lost_event.header.size = sizeof(lost_event);
245	lost_event.header.type = PERF_RECORD_LOST;
246	lost_event.header.misc = `0`;
247	lost_event.id = event->id;
248	lost_event.lost = local_xchg(&rb->lost, `0`);
249
250	/ XXX mostly redundant; @data is already fully initializes /
251	perf_event_header__init_id(header: &lost_event.header, data, event);
252	perf_output_put(handle, lost_event);
253	perf_event__output_id_sample(event, handle, sample: data);
254	}
255
256	return `0`;
257
258	fail:
259	local_inc(l: &rb->lost);
260	atomic64_inc(v: &event->lost_samples);
261	perf_output_put_handle(handle);
262	out:
263	rcu_read_unlock();
264
265	return -ENOSPC;
266	}
267
268	int perf_output_begin_forward(struct perf_output_handle *handle,
269	struct perf_sample_data *data,
270	struct perf_event event, unsigned* int size)
271	{
272	return __perf_output_begin(handle, data, event, size, backward: false);
273	}
274
275	int perf_output_begin_backward(struct perf_output_handle *handle,
276	struct perf_sample_data *data,
277	struct perf_event event, unsigned* int size)
278	{
279	return __perf_output_begin(handle, data, event, size, backward: true);
280	}
281
/*
 * Reserve @size bytes for @event, picking the write direction from
 * is_write_backward(event). Returns the result of __perf_output_begin()
 * (0 or -ENOSPC).
 */
int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_sample_data *data,
		      struct perf_event *event, unsigned int size)
{

	return __perf_output_begin(handle, data, event, size,
				   unlikely(is_write_backward(event)));
}
290
/*
 * Copy @len bytes from @buf into the space described by @handle.
 * Returns whatever __output_copy() returns; perf_output_copy_aux() in this
 * file treats a non-zero result as a failed (partial) copy.
 */
unsigned int perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
{
	return __output_copy(handle, buf, len);
}
296
297	unsigned int perf_output_skip(struct perf_output_handle *handle,
298	unsigned int len)
299	{
300	return __output_skip(handle, NULL, len);
301	}
302
/*
 * Finish an output transaction started by a successful
 * perf_output_begin*(): release the handle first (which may publish the
 * head and issue a wakeup), then drop the RCU read lock taken in
 * __perf_output_begin().
 */
void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}
308
309	static void
310	ring_buffer_init(struct perf_buffer rb, long* watermark, int flags)
311	{
312	long max_size = perf_data_size(rb);
313
314	if (watermark)
315	rb->watermark = min(max_size, watermark);
316
317	if (!rb->watermark)
318	rb->watermark = max_size / `2`;
319
320	if (flags & RING_BUFFER_WRITABLE)
321	rb->overwrite = `0`;
322	else
323	rb->overwrite = `1`;
324
325	refcount_set(r: &rb->refcount, n: `1`);
326
327	INIT_LIST_HEAD(list: &rb->event_list);
328	spin_lock_init(&rb->event_lock);
329
330	/*
331	* perf_output_begin() only checks rb->paused, therefore
332	* rb->paused must be true if we have no pages for output.
333	*/
334	if (!rb->nr_pages)
335	rb->paused = `1`;
336	}
337
338	void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
339	{
340	/*
341	* OVERWRITE is determined by perf_aux_output_end() and can't
342	* be passed in directly.
343	*/
344	if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
345	return;
346
347	handle->aux_flags \|= flags;
348	}
349	EXPORT_SYMBOL_GPL(perf_aux_output_flag);
350
351	/*
352	* This is called before hardware starts writing to the AUX area to
353	* obtain an output handle and make sure there's room in the buffer.
354	* When the capture completes, call perf_aux_output_end() to commit
355	* the recorded data to the buffer.
356	*
357	* The ordering is similar to that of perf_output_{begin,end}, with
358	* the exception of (B), which should be taken care of by the pmu
359	* driver, since ordering rules will differ depending on hardware.
360	*
361	* Call this from pmu::start(); see the comment in perf_aux_output_end()
362	* about its use in pmu callbacks. Both can also be called from the PMI
363	* handler if needed.
364	*/
365	void perf_aux_output_begin(struct* perf_output_handle *handle,
366	struct perf_event *event)
367	{
368	struct perf_event *output_event = event;
369	unsigned long aux_head, aux_tail;
370	struct perf_buffer *rb;
371	unsigned int nest;
372
373	if (output_event->parent)
374	output_event = output_event->parent;
375
376	/*
377	* Since this will typically be open across pmu::add/pmu::del, we
378	* grab ring_buffer's refcount instead of holding rcu read lock
379	* to make sure it doesn't disappear under us.
380	*/
381	rb = ring_buffer_get(event: output_event);
382	if (!rb)
383	return NULL;
384
385	if (!rb_has_aux(rb))
386	goto err;
387
388	/*
389	* If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
390	* about to get freed, so we leave immediately.
391	*
392	* Checking rb::aux_mmap_count and rb::refcount has to be done in
393	* the same order, see perf_mmap_close. Otherwise we end up freeing
394	* aux pages in this path, which is a bug, because in_atomic().
395	*/
396	if (!atomic_read(v: &rb->aux_mmap_count))
397	goto err;
398
399	if (!refcount_inc_not_zero(r: &rb->aux_refcount))
400	goto err;
401
402	nest = READ_ONCE(rb->aux_nest);
403	/*
404	* Nesting is not supported for AUX area, make sure nested
405	* writers are caught early
406	*/
407	if (WARN_ON_ONCE(nest))
408	goto err_put;
409
410	WRITE_ONCE(rb->aux_nest, nest + `1`);
411
412	aux_head = rb->aux_head;
413
414	handle->rb = rb;
415	handle->event = event;
416	handle->head = aux_head;
417	handle->size = `0`;
418	handle->aux_flags = `0`;
419
420	/*
421	* In overwrite mode, AUX data stores do not depend on aux_tail,
422	* therefore (A) control dependency barrier does not exist. The
423	* (B) <-> (C) ordering is still observed by the pmu driver.
424	*/
425	if (!rb->aux_overwrite) {
426	aux_tail = READ_ONCE(rb->user_page->aux_tail);
427	handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
428	if (aux_head - aux_tail < perf_aux_size(rb))
429	handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
430
431	/*
432	* handle->size computation depends on aux_tail load; this forms a
433	* control dependency barrier separating aux_tail load from aux data
434	* store that will be enabled on successful return
435	*/
436	if (!handle->size) { / A, matches D /
437	event->pending_disable = smp_processor_id();
438	perf_output_wakeup(handle);
439	WRITE_ONCE(rb->aux_nest, `0`);
440	goto err_put;
441	}
442	}
443
444	return handle->rb->aux_priv;
445
446	err_put:
447	/ can't be last /
448	rb_free_aux(rb);
449
450	err:
451	ring_buffer_put(rb);
452	handle->event = NULL;
453
454	return NULL;
455	}
456	EXPORT_SYMBOL_GPL(perf_aux_output_begin);
457
458	static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
459	{
460	if (rb->aux_overwrite)
461	return false;
462
463	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
464	rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
465	return true;
466	}
467
468	return false;
469	}
470
471	/*
472	* Commit the data written by hardware into the ring buffer by adjusting
473	* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
474	* pmu driver's responsibility to observe ordering rules of the hardware,
475	* so that all the data is externally visible before this is called.
476	*
477	* Note: this has to be called from pmu::stop() callback, as the assumption
478	* of the AUX buffer management code is that after pmu::stop(), the AUX
479	* transaction must be stopped and therefore drop the AUX reference count.
480	*/
481	void perf_aux_output_end(struct perf_output_handle handle, unsigned* long size)
482	{
483	bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
484	struct perf_buffer *rb = handle->rb;
485	unsigned long aux_head;
486
487	/ in overwrite mode, driver provides aux_head via handle /
488	if (rb->aux_overwrite) {
489	handle->aux_flags \|= PERF_AUX_FLAG_OVERWRITE;
490
491	aux_head = handle->head;
492	rb->aux_head = aux_head;
493	} else {
494	handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
495
496	aux_head = rb->aux_head;
497	rb->aux_head += size;
498	}
499
500	/*
501	* Only send RECORD_AUX if we have something useful to communicate
502	*
503	* Note: the OVERWRITE records by themselves are not considered
504	* useful, as they don't communicate any new information,
505	* aside from the short-lived offset, that becomes history at
506	* the next event sched-in and therefore isn't useful.
507	* The userspace that needs to copy out AUX data in overwrite
508	* mode should know to use user_page::aux_head for the actual
509	* offset. So, from now on we don't output AUX records that
510	* have only OVERWRITE flag set.
511	*/
512	if (size \|\| (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
513	perf_event_aux_event(event: handle->event, head: aux_head, size,
514	flags: handle->aux_flags);
515
516	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
517	if (rb_need_aux_wakeup(rb))
518	wakeup = true;
519
520	if (wakeup) {
521	if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
522	handle->event->pending_disable = smp_processor_id();
523	perf_output_wakeup(handle);
524	}
525
526	handle->event = NULL;
527
528	WRITE_ONCE(rb->aux_nest, `0`);
529	/ can't be last /
530	rb_free_aux(rb);
531	ring_buffer_put(rb);
532	}
533	EXPORT_SYMBOL_GPL(perf_aux_output_end);
534
535	/*
536	* Skip over a given number of bytes in the AUX buffer, due to, for example,
537	* hardware's alignment constraints.
538	*/
539	int perf_aux_output_skip(struct perf_output_handle handle, unsigned* long size)
540	{
541	struct perf_buffer *rb = handle->rb;
542
543	if (size > handle->size)
544	return -ENOSPC;
545
546	rb->aux_head += size;
547
548	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
549	if (rb_need_aux_wakeup(rb)) {
550	perf_output_wakeup(handle);
551	handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
552	}
553
554	handle->head = rb->aux_head;
555	handle->size -= size;
556
557	return `0`;
558	}
559	EXPORT_SYMBOL_GPL(perf_aux_output_skip);
560
561	void perf_get_aux(struct* perf_output_handle *handle)
562	{
563	/ this is only valid between perf_aux_output_begin and _end /*
564	if (!handle->event)
565	return NULL;
566
567	return handle->rb->aux_priv;
568	}
569	EXPORT_SYMBOL_GPL(perf_get_aux);
570
571	/*
572	* Copy out AUX data from an AUX handle.
573	*/
574	long perf_output_copy_aux(struct perf_output_handle *aux_handle,
575	struct perf_output_handle *handle,
576	unsigned long from, unsigned long to)
577	{
578	struct perf_buffer *rb = aux_handle->rb;
579	unsigned long tocopy, remainder, len = `0`;
580	void *addr;
581
582	from &= (rb->aux_nr_pages << PAGE_SHIFT) - `1`;
583	to &= (rb->aux_nr_pages << PAGE_SHIFT) - `1`;
584
585	do {
586	tocopy = PAGE_SIZE - offset_in_page(from);
587	if (to > from)
588	tocopy = min(tocopy, to - from);
589	if (!tocopy)
590	break;
591
592	addr = rb->aux_pages[from >> PAGE_SHIFT];
593	addr += offset_in_page(from);
594
595	remainder = perf_output_copy(handle, buf: addr, len: tocopy);
596	if (remainder)
597	return -EFAULT;
598
599	len += tocopy;
600	from += tocopy;
601	from &= (rb->aux_nr_pages << PAGE_SHIFT) - `1`;
602	} while (to != from);
603
604	return len;
605	}
606
/* GFP flags used by rb_alloc_aux_page() for AUX page allocations. */
#define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
608
609	static struct page rb_alloc_aux_page(int* node, int order)
610	{
611	struct page *page;
612
613	if (order > MAX_ORDER)
614	order = MAX_ORDER;
615
616	do {
617	page = alloc_pages_node(nid: node, PERF_AUX_GFP, order);
618	} while (!page && order--);
619
620	if (page && order) {
621	/*
622	* Communicate the allocation size to the driver:
623	* if we managed to secure a high-order allocation,
624	* set its first page's private to this order;
625	* !PagePrivate(page) means it's just a normal page.
626	*/
627	split_page(page, order);
628	SetPagePrivate(page);
629	set_page_private(page, private: order);
630	}
631
632	return page;
633	}
634
635	static void rb_free_aux_page(struct perf_buffer rb, int* idx)
636	{
637	struct page *page = virt_to_page(rb->aux_pages[idx]);
638
639	ClearPagePrivate(page);
640	page->mapping = NULL;
641	__free_page(page);
642	}
643
644	static void __rb_free_aux(struct perf_buffer *rb)
645	{
646	int pg;
647
648	/*
649	* Should never happen, the last reference should be dropped from
650	* perf_mmap_close() path, which first stops aux transactions (which
651	* in turn are the atomic holders of aux_refcount) and then does the
652	* last rb_free_aux().
653	*/
654	WARN_ON_ONCE(in_atomic());
655
656	if (rb->aux_priv) {
657	rb->free_aux(rb->aux_priv);
658	rb->free_aux = NULL;
659	rb->aux_priv = NULL;
660	}
661
662	if (rb->aux_nr_pages) {
663	for (pg = `0`; pg < rb->aux_nr_pages; pg++)
664	rb_free_aux_page(rb, idx: pg);
665
666	kfree(objp: rb->aux_pages);
667	rb->aux_nr_pages = `0`;
668	}
669	}
670
671	int rb_alloc_aux(struct perf_buffer rb, struct* perf_event *event,
672	pgoff_t pgoff, int nr_pages, long watermark, int flags)
673	{
674	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
675	int node = (event->cpu == -`1`) ? -`1` : cpu_to_node(cpu: event->cpu);
676	int ret = -ENOMEM, max_order;
677
678	if (!has_aux(event))
679	return -EOPNOTSUPP;
680
681	if (!overwrite) {
682	/*
683	* Watermark defaults to half the buffer, and so does the
684	* max_order, to aid PMU drivers in double buffering.
685	*/
686	if (!watermark)
687	watermark = nr_pages << (PAGE_SHIFT - `1`);
688
689	/*
690	* Use aux_watermark as the basis for chunking to
691	* help PMU drivers honor the watermark.
692	*/
693	max_order = get_order(size: watermark);
694	} else {
695	/*
696	* We need to start with the max_order that fits in nr_pages,
697	* not the other way around, hence ilog2() and not get_order.
698	*/
699	max_order = ilog2(nr_pages);
700	watermark = `0`;
701	}
702
703	/*
704	* kcalloc_node() is unable to allocate buffer if the size is larger
705	* than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
706	*/
707	if (get_order(size: (unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
708	return -ENOMEM;
709	rb->aux_pages = kcalloc_node(n: nr_pages, size: sizeof(void *), GFP_KERNEL,
710	node);
711	if (!rb->aux_pages)
712	return -ENOMEM;
713
714	rb->free_aux = event->pmu->free_aux;
715	for (rb->aux_nr_pages = `0`; rb->aux_nr_pages < nr_pages;) {
716	struct page *page;
717	int last, order;
718
719	order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
720	page = rb_alloc_aux_page(node, order);
721	if (!page)
722	goto out;
723
724	for (last = rb->aux_nr_pages + (`1` << page_private(page));
725	last > rb->aux_nr_pages; rb->aux_nr_pages++)
726	rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
727	}
728
729	/*
730	* In overwrite mode, PMUs that don't support SG may not handle more
731	* than one contiguous allocation, since they rely on PMI to do double
732	* buffering. In this case, the entire buffer has to be one contiguous
733	* chunk.
734	*/
735	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
736	overwrite) {
737	struct page *page = virt_to_page(rb->aux_pages[`0`]);
738
739	if (page_private(page) != max_order)
740	goto out;
741	}
742
743	rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
744	overwrite);
745	if (!rb->aux_priv)
746	goto out;
747
748	ret = `0`;
749
750	/*
751	* aux_pages (and pmu driver's private data, aux_priv) will be
752	* referenced in both producer's and consumer's contexts, thus
753	* we keep a refcount here to make sure either of the two can
754	* reference them safely.
755	*/
756	refcount_set(r: &rb->aux_refcount, n: `1`);
757
758	rb->aux_overwrite = overwrite;
759	rb->aux_watermark = watermark;
760
761	out:
762	if (!ret)
763	rb->aux_pgoff = pgoff;
764	else
765	__rb_free_aux(rb);
766
767	return ret;
768	}
769
770	void rb_free_aux(struct perf_buffer *rb)
771	{
772	if (refcount_dec_and_test(r: &rb->aux_refcount))
773	__rb_free_aux(rb);
774	}
775
776	#ifndef CONFIG_PERF_USE_VMALLOC
777
778	/*
779	* Back perf_mmap() with regular GFP_KERNEL-0 pages.
780	*/
781
782	static struct page *
783	__perf_mmap_to_page(struct perf_buffer rb, unsigned* long pgoff)
784	{
785	if (pgoff > rb->nr_pages)
786	return NULL;
787
788	if (pgoff == `0`)
789	return virt_to_page(rb->user_page);
790
791	return virt_to_page(rb->data_pages[pgoff - `1`]);
792	}
793
794	static void perf_mmap_alloc_page(int* cpu)
795	{
796	struct page *page;
797	int node;
798
799	node = (cpu == -`1`) ? cpu : cpu_to_node(cpu);
800	page = alloc_pages_node(node, GFP_KERNEL \| __GFP_ZERO, `0`);
801	if (!page)
802	return NULL;
803
804	return page_address(page);
805	}
806
807	static void perf_mmap_free_page(void *addr)
808	{
809	struct page *page = virt_to_page(addr);
810
811	page->mapping = NULL;
812	__free_page(page);
813	}
814
815	struct perf_buffer rb_alloc(int* nr_pages, long watermark, int cpu, int flags)
816	{
817	struct perf_buffer *rb;
818	unsigned long size;
819	int i, node;
820
821	size = sizeof(struct perf_buffer);
822	size += nr_pages * sizeof(void *);
823
824	if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
825	goto fail;
826
827	node = (cpu == -`1`) ? cpu : cpu_to_node(cpu);
828	rb = kzalloc_node(size, GFP_KERNEL, node);
829	if (!rb)
830	goto fail;
831
832	rb->user_page = perf_mmap_alloc_page(cpu);
833	if (!rb->user_page)
834	goto fail_user_page;
835
836	for (i = `0`; i < nr_pages; i++) {
837	rb->data_pages[i] = perf_mmap_alloc_page(cpu);
838	if (!rb->data_pages[i])
839	goto fail_data_pages;
840	}
841
842	rb->nr_pages = nr_pages;
843
844	ring_buffer_init(rb, watermark, flags);
845
846	return rb;
847
848	fail_data_pages:
849	for (i--; i >= `0`; i--)
850	perf_mmap_free_page(rb->data_pages[i]);
851
852	perf_mmap_free_page(rb->user_page);
853
854	fail_user_page:
855	kfree(rb);
856
857	fail:
858	return NULL;
859	}
860
861	void rb_free(struct perf_buffer *rb)
862	{
863	int i;
864
865	perf_mmap_free_page(rb->user_page);
866	for (i = `0`; i < rb->nr_pages; i++)
867	perf_mmap_free_page(rb->data_pages[i]);
868	kfree(rb);
869	}
870
871	#else
872	static struct page *
873	__perf_mmap_to_page(struct perf_buffer rb, unsigned* long pgoff)
874	{
875	/ The '>' counts in the user page. /
876	if (pgoff > data_page_nr(rb))
877	return NULL;
878
879	return vmalloc_to_page(addr: (void )rb->user_page + pgoff PAGE_SIZE);
880	}
881
882	static void perf_mmap_unmark_page(void *addr)
883	{
884	struct page *page = vmalloc_to_page(addr);
885
886	page->mapping = NULL;
887	}
888
889	static void rb_free_work(struct work_struct *work)
890	{
891	struct perf_buffer *rb;
892	void *base;
893	int i, nr;
894
895	rb = container_of(work, struct perf_buffer, work);
896	nr = data_page_nr(rb);
897
898	base = rb->user_page;
899	/ The '<=' counts in the user page. /
900	for (i = `0`; i <= nr; i++)
901	perf_mmap_unmark_page(addr: base + (i * PAGE_SIZE));
902
903	vfree(addr: base);
904	kfree(objp: rb);
905	}
906
907	void rb_free(struct perf_buffer *rb)
908	{
909	schedule_work(work: &rb->work);
910	}
911
912	struct perf_buffer rb_alloc(int* nr_pages, long watermark, int cpu, int flags)
913	{
914	struct perf_buffer *rb;
915	unsigned long size;
916	void *all_buf;
917	int node;
918
919	size = sizeof(struct perf_buffer);
920	size += sizeof(void *);
921
922	node = (cpu == -`1`) ? cpu : cpu_to_node(cpu);
923	rb = kzalloc_node(size, GFP_KERNEL, node);
924	if (!rb)
925	goto fail;
926
927	INIT_WORK(&rb->work, rb_free_work);
928
929	all_buf = vmalloc_user(size: (nr_pages + `1`) * PAGE_SIZE);
930	if (!all_buf)
931	goto fail_all_buf;
932
933	rb->user_page = all_buf;
934	rb->data_pages[`0`] = all_buf + PAGE_SIZE;
935	if (nr_pages) {
936	rb->nr_pages = `1`;
937	rb->page_order = ilog2(nr_pages);
938	}
939
940	ring_buffer_init(rb, watermark, flags);
941
942	return rb;
943
944	fail_all_buf:
945	kfree(objp: rb);
946
947	fail:
948	return NULL;
949	}
950
951	#endif
952
953	struct page *
954	perf_mmap_to_page(struct perf_buffer rb, unsigned* long pgoff)
955	{
956	if (rb->aux_nr_pages) {
957	/ above AUX space /
958	if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
959	return NULL;
960
961	/ AUX space /
962	if (pgoff >= rb->aux_pgoff) {
963	int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
964	return virt_to_page(rb->aux_pages[aux_pgoff]);
965	}
966	}
967
968	return __perf_mmap_to_page(rb, pgoff);
969	}
970

source code of linux/kernel/events/ring_buffer.c