ringbuf.c source code [linux/kernel/bpf/ringbuf.c]

1	#include <linux/bpf.h>
2	#include <linux/btf.h>
3	#include <linux/err.h>
4	#include <linux/irq_work.h>
5	#include <linux/slab.h>
6	#include <linux/filter.h>
7	#include <linux/mm.h>
8	#include <linux/vmalloc.h>
9	#include <linux/wait.h>
10	#include <linux/poll.h>
11	#include <linux/kmemleak.h>
12	#include <uapi/linux/btf.h>
13	#include <linux/btf_ids.h>
14
/* Only BPF_F_NUMA_NODE may be passed in map_flags at creation time. */
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
/* all pages that precede the data area: non-mmap()'able part + positions */
#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)

/* largest sample size __bpf_ringbuf_reserve() will accept */
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
25
26	struct bpf_ringbuf {
27	wait_queue_head_t waitq;
28	struct irq_work work;
29	u64 mask;
30	struct page **pages;
31	int nr_pages;
32	spinlock_t spinlock ____cacheline_aligned_in_smp;
33	/ For user-space producer ring buffers, an atomic_t busy bit is used*
34	* to synchronize access to the ring buffers in the kernel, rather than
35	* the spinlock that is used for kernel-producer ring buffers. This is
36	* done because the ring buffer must hold a lock across a BPF program's
37	* callback:
38	*
39	* __bpf_user_ringbuf_peek() // lock acquired
40	* -> program callback_fn()
41	* -> __bpf_user_ringbuf_sample_release() // lock released
42	*
43	* It is unsafe and incorrect to hold an IRQ spinlock across what could
44	* be a long execution window, so we instead simply disallow concurrent
45	* access to the ring buffer by kernel consumers, and return -EBUSY from
46	* __bpf_user_ringbuf_peek() if the busy bit is held by another task.
47	*/
48	atomic_t busy ____cacheline_aligned_in_smp;
49	/ Consumer and producer counters are put into separate pages to*
50	* allow each position to be mapped with different permissions.
51	* This prevents a user-space application from modifying the
52	* position and ruining in-kernel tracking. The permissions of the
53	* pages depend on who is producing samples: user-space or the
54	* kernel.
55	*
56	* Kernel-producer
57	* ---------------
58	* The producer position and data pages are mapped as r/o in
59	* userspace. For this approach, bits in the header of samples are
60	* used to signal to user-space, and to other producers, whether a
61	* sample is currently being written.
62	*
63	* User-space producer
64	* -------------------
65	* Only the page containing the consumer position is mapped r/o in
66	* user-space. User-space producers also use bits of the header to
67	* communicate to the kernel, but the kernel must carefully check and
68	* validate each sample to ensure that they're correctly formatted, and
69	* fully contained within the ring buffer.
70	*/
71	unsigned long consumer_pos __aligned(PAGE_SIZE);
72	unsigned long producer_pos __aligned(PAGE_SIZE);
73	char data[] __aligned(PAGE_SIZE);
74	};
75
76	struct bpf_ringbuf_map {
77	struct bpf_map map;
78	struct bpf_ringbuf *rb;
79	};
80
81	/ 8-byte ring buffer record header structure /
82	struct bpf_ringbuf_hdr {
83	u32 len;
84	u32 pg_off;
85	};
86
87	static struct bpf_ringbuf bpf_ringbuf_area_alloc(size_t data_sz, int* numa_node)
88	{
89	const gfp_t flags = GFP_KERNEL_ACCOUNT \| __GFP_RETRY_MAYFAIL \|
90	__GFP_NOWARN \| __GFP_ZERO;
91	int nr_meta_pages = RINGBUF_NR_META_PAGES;
92	int nr_data_pages = data_sz >> PAGE_SHIFT;
93	int nr_pages = nr_meta_pages + nr_data_pages;
94	struct page *pages, page;
95	struct bpf_ringbuf *rb;
96	size_t array_size;
97	int i;
98
99	/ Each data page is mapped twice to allow "virtual"*
100	* continuous read of samples wrapping around the end of ring
101	* buffer area:
102	* ------------------------------------------------------
103	* \| meta pages \| real data pages \| same data pages \|
104	* ------------------------------------------------------
105	* \| \| 1 2 3 4 5 6 7 8 9 \| 1 2 3 4 5 6 7 8 9 \|
106	* ------------------------------------------------------
107	* \| \| TA DA \| TA DA \|
108	* ------------------------------------------------------
109	* ^^^^^^^
110	* \|
111	* Here, no need to worry about special handling of wrapped-around
112	* data due to double-mapped data pages. This works both in kernel and
113	* when mmap()'ed in user-space, simplifying both kernel and
114	* user-space implementations significantly.
115	*/
116	array_size = (nr_meta_pages + `2` * nr_data_pages) * sizeof(*pages);
117	pages = bpf_map_area_alloc(size: array_size, numa_node);
118	if (!pages)
119	return NULL;
120
121	for (i = `0`; i < nr_pages; i++) {
122	page = alloc_pages_node(nid: numa_node, gfp_mask: flags, order: `0`);
123	if (!page) {
124	nr_pages = i;
125	goto err_free_pages;
126	}
127	pages[i] = page;
128	if (i >= nr_meta_pages)
129	pages[nr_data_pages + i] = page;
130	}
131
132	rb = vmap(pages, count: nr_meta_pages + `2` * nr_data_pages,
133	VM_MAP \| VM_USERMAP, PAGE_KERNEL);
134	if (rb) {
135	kmemleak_not_leak(ptr: pages);
136	rb->pages = pages;
137	rb->nr_pages = nr_pages;
138	return rb;
139	}
140
141	err_free_pages:
142	for (i = `0`; i < nr_pages; i++)
143	__free_page(pages[i]);
144	bpf_map_area_free(base: pages);
145	return NULL;
146	}
147
148	static void bpf_ringbuf_notify(struct irq_work *work)
149	{
150	struct bpf_ringbuf rb = container_of(work, struct* bpf_ringbuf, work);
151
152	wake_up_all(&rb->waitq);
153	}
154
155	/ Maximum size of ring buffer area is limited by 32-bit page offset within*
156	* record header, counted in pages. Reserve 8 bits for extensibility, and
157	* take into account few extra pages for consumer/producer pages and
158	* non-mmap()'able parts, the current maximum size would be:
159	*
160	* (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
161	*
162	* This gives 64GB limit, which seems plenty for single ring buffer. Now
163	* considering that the maximum value of data_sz is (4GB - 1), there
164	* will be no overflow, so just note the size limit in the comments.
165	*/
166	static struct bpf_ringbuf bpf_ringbuf_alloc(size_t data_sz, int* numa_node)
167	{
168	struct bpf_ringbuf *rb;
169
170	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
171	if (!rb)
172	return NULL;
173
174	spin_lock_init(&rb->spinlock);
175	atomic_set(v: &rb->busy, i: `0`);
176	init_waitqueue_head(&rb->waitq);
177	init_irq_work(work: &rb->work, func: bpf_ringbuf_notify);
178
179	rb->mask = data_sz - `1`;
180	rb->consumer_pos = `0`;
181	rb->producer_pos = `0`;
182
183	return rb;
184	}
185
186	static struct bpf_map ringbuf_map_alloc(union* bpf_attr *attr)
187	{
188	struct bpf_ringbuf_map *rb_map;
189
190	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
191	return ERR_PTR(error: -EINVAL);
192
193	if (attr->key_size \|\| attr->value_size \|\|
194	!is_power_of_2(n: attr->max_entries) \|\|
195	!PAGE_ALIGNED(attr->max_entries))
196	return ERR_PTR(error: -EINVAL);
197
198	rb_map = bpf_map_area_alloc(size: sizeof(*rb_map), NUMA_NO_NODE);
199	if (!rb_map)
200	return ERR_PTR(error: -ENOMEM);
201
202	bpf_map_init_from_attr(map: &rb_map->map, attr);
203
204	rb_map->rb = bpf_ringbuf_alloc(data_sz: attr->max_entries, numa_node: rb_map->map.numa_node);
205	if (!rb_map->rb) {
206	bpf_map_area_free(base: rb_map);
207	return ERR_PTR(error: -ENOMEM);
208	}
209
210	return &rb_map->map;
211	}
212
213	static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
214	{
215	/ copy pages pointer and nr_pages to local variable, as we are going*
216	* to unmap rb itself with vunmap() below
217	*/
218	struct page **pages = rb->pages;
219	int i, nr_pages = rb->nr_pages;
220
221	vunmap(addr: rb);
222	for (i = `0`; i < nr_pages; i++)
223	__free_page(pages[i]);
224	bpf_map_area_free(base: pages);
225	}
226
227	static void ringbuf_map_free(struct bpf_map *map)
228	{
229	struct bpf_ringbuf_map *rb_map;
230
231	rb_map = container_of(map, struct bpf_ringbuf_map, map);
232	bpf_ringbuf_free(rb: rb_map->rb);
233	bpf_map_area_free(base: rb_map);
234	}
235
236	static void ringbuf_map_lookup_elem(struct* bpf_map map, void* *key)
237	{
238	return ERR_PTR(error: -ENOTSUPP);
239	}
240
241	static long ringbuf_map_update_elem(struct bpf_map map, void* key, void* *value,
242	u64 flags)
243	{
244	return -ENOTSUPP;
245	}
246
247	static long ringbuf_map_delete_elem(struct bpf_map map, void* *key)
248	{
249	return -ENOTSUPP;
250	}
251
252	static int ringbuf_map_get_next_key(struct bpf_map map, void* *key,
253	void *next_key)
254	{
255	return -ENOTSUPP;
256	}
257
258	static int ringbuf_map_mmap_kern(struct bpf_map map, struct* vm_area_struct *vma)
259	{
260	struct bpf_ringbuf_map *rb_map;
261
262	rb_map = container_of(map, struct bpf_ringbuf_map, map);
263
264	if (vma->vm_flags & VM_WRITE) {
265	/ allow writable mapping for the consumer_pos only /
266	if (vma->vm_pgoff != `0` \|\| vma->vm_end - vma->vm_start != PAGE_SIZE)
267	return -EPERM;
268	} else {
269	vm_flags_clear(vma, VM_MAYWRITE);
270	}
271	/ remap_vmalloc_range() checks size and offset constraints /
272	return remap_vmalloc_range(vma, addr: rb_map->rb,
273	pgoff: vma->vm_pgoff + RINGBUF_PGOFF);
274	}
275
276	static int ringbuf_map_mmap_user(struct bpf_map map, struct* vm_area_struct *vma)
277	{
278	struct bpf_ringbuf_map *rb_map;
279
280	rb_map = container_of(map, struct bpf_ringbuf_map, map);
281
282	if (vma->vm_flags & VM_WRITE) {
283	if (vma->vm_pgoff == `0`)
284	/ Disallow writable mappings to the consumer pointer,*
285	* and allow writable mappings to both the producer
286	* position, and the ring buffer data itself.
287	*/
288	return -EPERM;
289	} else {
290	vm_flags_clear(vma, VM_MAYWRITE);
291	}
292	/ remap_vmalloc_range() checks size and offset constraints /
293	return remap_vmalloc_range(vma, addr: rb_map->rb, pgoff: vma->vm_pgoff + RINGBUF_PGOFF);
294	}
295
296	static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
297	{
298	unsigned long cons_pos, prod_pos;
299
300	cons_pos = smp_load_acquire(&rb->consumer_pos);
301	prod_pos = smp_load_acquire(&rb->producer_pos);
302	return prod_pos - cons_pos;
303	}
304
305	static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
306	{
307	return rb->mask + `1`;
308	}
309
310	static __poll_t ringbuf_map_poll_kern(struct bpf_map map, struct* file *filp,
311	struct poll_table_struct *pts)
312	{
313	struct bpf_ringbuf_map *rb_map;
314
315	rb_map = container_of(map, struct bpf_ringbuf_map, map);
316	poll_wait(filp, wait_address: &rb_map->rb->waitq, p: pts);
317
318	if (ringbuf_avail_data_sz(rb: rb_map->rb))
319	return EPOLLIN \| EPOLLRDNORM;
320	return `0`;
321	}
322
323	static __poll_t ringbuf_map_poll_user(struct bpf_map map, struct* file *filp,
324	struct poll_table_struct *pts)
325	{
326	struct bpf_ringbuf_map *rb_map;
327
328	rb_map = container_of(map, struct bpf_ringbuf_map, map);
329	poll_wait(filp, wait_address: &rb_map->rb->waitq, p: pts);
330
331	if (ringbuf_avail_data_sz(rb: rb_map->rb) < ringbuf_total_data_sz(rb: rb_map->rb))
332	return EPOLLOUT \| EPOLLWRNORM;
333	return `0`;
334	}
335
336	static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
337	{
338	struct bpf_ringbuf *rb;
339	int nr_data_pages;
340	int nr_meta_pages;
341	u64 usage = sizeof(struct bpf_ringbuf_map);
342
343	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
344	usage += (u64)rb->nr_pages << PAGE_SHIFT;
345	nr_meta_pages = RINGBUF_NR_META_PAGES;
346	nr_data_pages = map->max_entries >> PAGE_SHIFT;
347	usage += (nr_meta_pages + `2` * nr_data_pages) * sizeof(struct page *);
348	return usage;
349	}
350
351	BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
352	const struct bpf_map_ops ringbuf_map_ops = {
353	.map_meta_equal = bpf_map_meta_equal,
354	.map_alloc = ringbuf_map_alloc,
355	.map_free = ringbuf_map_free,
356	.map_mmap = ringbuf_map_mmap_kern,
357	.map_poll = ringbuf_map_poll_kern,
358	.map_lookup_elem = ringbuf_map_lookup_elem,
359	.map_update_elem = ringbuf_map_update_elem,
360	.map_delete_elem = ringbuf_map_delete_elem,
361	.map_get_next_key = ringbuf_map_get_next_key,
362	.map_mem_usage = ringbuf_map_mem_usage,
363	.map_btf_id = &ringbuf_map_btf_ids[`0`],
364	};
365
366	BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
367	const struct bpf_map_ops user_ringbuf_map_ops = {
368	.map_meta_equal = bpf_map_meta_equal,
369	.map_alloc = ringbuf_map_alloc,
370	.map_free = ringbuf_map_free,
371	.map_mmap = ringbuf_map_mmap_user,
372	.map_poll = ringbuf_map_poll_user,
373	.map_lookup_elem = ringbuf_map_lookup_elem,
374	.map_update_elem = ringbuf_map_update_elem,
375	.map_delete_elem = ringbuf_map_delete_elem,
376	.map_get_next_key = ringbuf_map_get_next_key,
377	.map_mem_usage = ringbuf_map_mem_usage,
378	.map_btf_id = &user_ringbuf_map_btf_ids[`0`],
379	};
380
381	/ Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,*
382	* calculate offset from record metadata to ring buffer in pages, rounded
383	* down. This page offset is stored as part of record metadata and allows to
384	* restore struct bpf_ringbuf * from record pointer. This page offset is
385	* stored at offset 4 of record metadata header.
386	*/
387	static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
388	struct bpf_ringbuf_hdr *hdr)
389	{
390	return ((void )hdr - (void* *)rb) >> PAGE_SHIFT;
391	}
392
393	/ Given pointer to ring buffer record header, restore pointer to struct*
394	* bpf_ringbuf itself by using page offset stored at offset 4
395	*/
396	static struct bpf_ringbuf *
397	bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
398	{
399	unsigned long addr = (unsigned long)(void *)hdr;
400	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;
401
402	return (void*)((addr & PAGE_MASK) - off);
403	}
404
405	static void __bpf_ringbuf_reserve(struct* bpf_ringbuf *rb, u64 size)
406	{
407	unsigned long cons_pos, prod_pos, new_prod_pos, flags;
408	u32 len, pg_off;
409	struct bpf_ringbuf_hdr *hdr;
410
411	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
412	return NULL;
413
414	len = round_up(size + BPF_RINGBUF_HDR_SZ, `8`);
415	if (len > ringbuf_total_data_sz(rb))
416	return NULL;
417
418	cons_pos = smp_load_acquire(&rb->consumer_pos);
419
420	if (in_nmi()) {
421	if (!spin_trylock_irqsave(&rb->spinlock, flags))
422	return NULL;
423	} else {
424	spin_lock_irqsave(&rb->spinlock, flags);
425	}
426
427	prod_pos = rb->producer_pos;
428	new_prod_pos = prod_pos + len;
429
430	/ check for out of ringbuf space by ensuring producer position*
431	* doesn't advance more than (ringbuf_size - 1) ahead
432	*/
433	if (new_prod_pos - cons_pos > rb->mask) {
434	spin_unlock_irqrestore(lock: &rb->spinlock, flags);
435	return NULL;
436	}
437
438	hdr = (void *)rb->data + (prod_pos & rb->mask);
439	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
440	hdr->len = size \| BPF_RINGBUF_BUSY_BIT;
441	hdr->pg_off = pg_off;
442
443	/ pairs with consumer's smp_load_acquire() /
444	smp_store_release(&rb->producer_pos, new_prod_pos);
445
446	spin_unlock_irqrestore(lock: &rb->spinlock, flags);
447
448	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
449	}
450
451	BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
452	{
453	struct bpf_ringbuf_map *rb_map;
454
455	if (unlikely(flags))
456	return `0`;
457
458	rb_map = container_of(map, struct bpf_ringbuf_map, map);
459	return (unsigned long)__bpf_ringbuf_reserve(rb: rb_map->rb, size);
460	}
461
462	const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
463	.func = bpf_ringbuf_reserve,
464	.ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
465	.arg1_type = ARG_CONST_MAP_PTR,
466	.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
467	.arg3_type = ARG_ANYTHING,
468	};
469
470	static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
471	{
472	unsigned long rec_pos, cons_pos;
473	struct bpf_ringbuf_hdr *hdr;
474	struct bpf_ringbuf *rb;
475	u32 new_len;
476
477	hdr = sample - BPF_RINGBUF_HDR_SZ;
478	rb = bpf_ringbuf_restore_from_rec(hdr);
479	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
480	if (discard)
481	new_len \|= BPF_RINGBUF_DISCARD_BIT;
482
483	/ update record header with correct final size prefix /
484	xchg(&hdr->len, new_len);
485
486	/ if consumer caught up and is waiting for our record, notify about*
487	* new data availability
488	*/
489	rec_pos = (void )hdr - (void* *)rb->data;
490	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
491
492	if (flags & BPF_RB_FORCE_WAKEUP)
493	irq_work_queue(work: &rb->work);
494	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
495	irq_work_queue(work: &rb->work);
496	}
497
498	BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
499	{
500	bpf_ringbuf_commit(sample, flags, discard: false / discard /);
501	return `0`;
502	}
503
504	const struct bpf_func_proto bpf_ringbuf_submit_proto = {
505	.func = bpf_ringbuf_submit,
506	.ret_type = RET_VOID,
507	.arg1_type = ARG_PTR_TO_RINGBUF_MEM \| OBJ_RELEASE,
508	.arg2_type = ARG_ANYTHING,
509	};
510
511	BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
512	{
513	bpf_ringbuf_commit(sample, flags, discard: true / discard /);
514	return `0`;
515	}
516
517	const struct bpf_func_proto bpf_ringbuf_discard_proto = {
518	.func = bpf_ringbuf_discard,
519	.ret_type = RET_VOID,
520	.arg1_type = ARG_PTR_TO_RINGBUF_MEM \| OBJ_RELEASE,
521	.arg2_type = ARG_ANYTHING,
522	};
523
524	BPF_CALL_4(bpf_ringbuf_output, struct bpf_map , map, void* *, data, u64, size,
525	u64, flags)
526	{
527	struct bpf_ringbuf_map *rb_map;
528	void *rec;
529
530	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP \| BPF_RB_FORCE_WAKEUP)))
531	return -EINVAL;
532
533	rb_map = container_of(map, struct bpf_ringbuf_map, map);
534	rec = __bpf_ringbuf_reserve(rb: rb_map->rb, size);
535	if (!rec)
536	return -EAGAIN;
537
538	memcpy(rec, data, size);
539	bpf_ringbuf_commit(sample: rec, flags, discard: false / discard /);
540	return `0`;
541	}
542
543	const struct bpf_func_proto bpf_ringbuf_output_proto = {
544	.func = bpf_ringbuf_output,
545	.ret_type = RET_INTEGER,
546	.arg1_type = ARG_CONST_MAP_PTR,
547	.arg2_type = ARG_PTR_TO_MEM \| MEM_RDONLY,
548	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
549	.arg4_type = ARG_ANYTHING,
550	};
551
552	BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
553	{
554	struct bpf_ringbuf *rb;
555
556	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
557
558	switch (flags) {
559	case BPF_RB_AVAIL_DATA:
560	return ringbuf_avail_data_sz(rb);
561	case BPF_RB_RING_SIZE:
562	return ringbuf_total_data_sz(rb);
563	case BPF_RB_CONS_POS:
564	return smp_load_acquire(&rb->consumer_pos);
565	case BPF_RB_PROD_POS:
566	return smp_load_acquire(&rb->producer_pos);
567	default:
568	return `0`;
569	}
570	}
571
572	const struct bpf_func_proto bpf_ringbuf_query_proto = {
573	.func = bpf_ringbuf_query,
574	.ret_type = RET_INTEGER,
575	.arg1_type = ARG_CONST_MAP_PTR,
576	.arg2_type = ARG_ANYTHING,
577	};
578
579	BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
580	struct bpf_dynptr_kern *, ptr)
581	{
582	struct bpf_ringbuf_map *rb_map;
583	void *sample;
584	int err;
585
586	if (unlikely(flags)) {
587	bpf_dynptr_set_null(ptr);
588	return -EINVAL;
589	}
590
591	err = bpf_dynptr_check_size(size);
592	if (err) {
593	bpf_dynptr_set_null(ptr);
594	return err;
595	}
596
597	rb_map = container_of(map, struct bpf_ringbuf_map, map);
598
599	sample = __bpf_ringbuf_reserve(rb: rb_map->rb, size);
600	if (!sample) {
601	bpf_dynptr_set_null(ptr);
602	return -EINVAL;
603	}
604
605	bpf_dynptr_init(ptr, data: sample, type: BPF_DYNPTR_TYPE_RINGBUF, offset: `0`, size);
606
607	return `0`;
608	}
609
610	const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
611	.func = bpf_ringbuf_reserve_dynptr,
612	.ret_type = RET_INTEGER,
613	.arg1_type = ARG_CONST_MAP_PTR,
614	.arg2_type = ARG_ANYTHING,
615	.arg3_type = ARG_ANYTHING,
616	.arg4_type = ARG_PTR_TO_DYNPTR \| DYNPTR_TYPE_RINGBUF \| MEM_UNINIT,
617	};
618
619	BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
620	{
621	if (!ptr->data)
622	return `0`;
623
624	bpf_ringbuf_commit(sample: ptr->data, flags, discard: false / discard /);
625
626	bpf_dynptr_set_null(ptr);
627
628	return `0`;
629	}
630
631	const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
632	.func = bpf_ringbuf_submit_dynptr,
633	.ret_type = RET_VOID,
634	.arg1_type = ARG_PTR_TO_DYNPTR \| DYNPTR_TYPE_RINGBUF \| OBJ_RELEASE,
635	.arg2_type = ARG_ANYTHING,
636	};
637
638	BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
639	{
640	if (!ptr->data)
641	return `0`;
642
643	bpf_ringbuf_commit(sample: ptr->data, flags, discard: true / discard /);
644
645	bpf_dynptr_set_null(ptr);
646
647	return `0`;
648	}
649
650	const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
651	.func = bpf_ringbuf_discard_dynptr,
652	.ret_type = RET_VOID,
653	.arg1_type = ARG_PTR_TO_DYNPTR \| DYNPTR_TYPE_RINGBUF \| OBJ_RELEASE,
654	.arg2_type = ARG_ANYTHING,
655	};
656
657	static int __bpf_user_ringbuf_peek(struct bpf_ringbuf rb, void* *sample, u32 size)
658	{
659	int err;
660	u32 hdr_len, sample_len, total_len, flags, *hdr;
661	u64 cons_pos, prod_pos;
662
663	/ Synchronizes with smp_store_release() in user-space producer. /
664	prod_pos = smp_load_acquire(&rb->producer_pos);
665	if (prod_pos % `8`)
666	return -EINVAL;
667
668	/ Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() /
669	cons_pos = smp_load_acquire(&rb->consumer_pos);
670	if (cons_pos >= prod_pos)
671	return -ENODATA;
672
673	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
674	/ Synchronizes with smp_store_release() in user-space producer. /
675	hdr_len = smp_load_acquire(hdr);
676	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT \| BPF_RINGBUF_DISCARD_BIT);
677	sample_len = hdr_len & ~flags;
678	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, `8`);
679
680	/ The sample must fit within the region advertised by the producer position. /
681	if (total_len > prod_pos - cons_pos)
682	return -EINVAL;
683
684	/ The sample must fit within the data region of the ring buffer. /
685	if (total_len > ringbuf_total_data_sz(rb))
686	return -E2BIG;
687
688	/ The sample must fit into a struct bpf_dynptr. /
689	err = bpf_dynptr_check_size(size: sample_len);
690	if (err)
691	return -E2BIG;
692
693	if (flags & BPF_RINGBUF_DISCARD_BIT) {
694	/ If the discard bit is set, the sample should be skipped.*
695	*
696	* Update the consumer pos, and return -EAGAIN so the caller
697	* knows to skip this sample and try to read the next one.
698	*/
699	smp_store_release(&rb->consumer_pos, cons_pos + total_len);
700	return -EAGAIN;
701	}
702
703	if (flags & BPF_RINGBUF_BUSY_BIT)
704	return -ENODATA;
705
706	sample = (void* *)((uintptr_t)rb->data +
707	(uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
708	*size = sample_len;
709	return `0`;
710	}
711
712	static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
713	{
714	u64 consumer_pos;
715	u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, `8`);
716
717	/ Using smp_load_acquire() is unnecessary here, as the busy-bit*
718	* prevents another task from writing to consumer_pos after it was read
719	* by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
720	*/
721	consumer_pos = rb->consumer_pos;
722	/ Synchronizes with smp_load_acquire() in user-space producer. /
723	smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
724	}
725
726	BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
727	void , callback_fn, void* *, callback_ctx, u64, flags)
728	{
729	struct bpf_ringbuf *rb;
730	long samples, discarded_samples = `0`, ret = `0`;
731	bpf_callback_t callback = (bpf_callback_t)callback_fn;
732	u64 wakeup_flags = BPF_RB_NO_WAKEUP \| BPF_RB_FORCE_WAKEUP;
733	int busy = `0`;
734
735	if (unlikely(flags & ~wakeup_flags))
736	return -EINVAL;
737
738	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
739
740	/ If another consumer is already consuming a sample, wait for them to finish. /
741	if (!atomic_try_cmpxchg(v: &rb->busy, old: &busy, new: `1`))
742	return -EBUSY;
743
744	for (samples = `0`; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == `0`; samples++) {
745	int err;
746	u32 size;
747	void *sample;
748	struct bpf_dynptr_kern dynptr;
749
750	err = __bpf_user_ringbuf_peek(rb, sample: &sample, size: &size);
751	if (err) {
752	if (err == -ENODATA) {
753	break;
754	} else if (err == -EAGAIN) {
755	discarded_samples++;
756	continue;
757	} else {
758	ret = err;
759	goto schedule_work_return;
760	}
761	}
762
763	bpf_dynptr_init(ptr: &dynptr, data: sample, type: BPF_DYNPTR_TYPE_LOCAL, offset: `0`, size);
764	ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, `0`, `0`, `0`);
765	__bpf_user_ringbuf_sample_release(rb, size, flags);
766	}
767	ret = samples - discarded_samples;
768
769	schedule_work_return:
770	/ Prevent the clearing of the busy-bit from being reordered before the*
771	* storing of any rb consumer or producer positions.
772	*/
773	atomic_set_release(v: &rb->busy, i: `0`);
774
775	if (flags & BPF_RB_FORCE_WAKEUP)
776	irq_work_queue(work: &rb->work);
777	else if (!(flags & BPF_RB_NO_WAKEUP) && samples > `0`)
778	irq_work_queue(work: &rb->work);
779	return ret;
780	}
781
782	const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
783	.func = bpf_user_ringbuf_drain,
784	.ret_type = RET_INTEGER,
785	.arg1_type = ARG_CONST_MAP_PTR,
786	.arg2_type = ARG_PTR_TO_FUNC,
787	.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
788	.arg4_type = ARG_ANYTHING,
789	};
790

source code of linux/kernel/bpf/ringbuf.c