#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

struct bpf_ringbuf {
	wait_queue_head_t waitq;
	struct irq_work work;
	u64 mask;
	struct page **pages;
	int nr_pages;
	spinlock_t spinlock ____cacheline_aligned_in_smp;
	/* For user-space producer ring buffers, an atomic_t busy bit is used
	 * to synchronize access to the ring buffers in the kernel, rather than
	 * the spinlock that is used for kernel-producer ring buffers. This is
	 * done because the ring buffer must hold a lock across a BPF program's
	 * callback:
	 *
	 *    __bpf_user_ringbuf_peek() // lock acquired
	 * -> program callback_fn()
	 * -> __bpf_user_ringbuf_sample_release() // lock released
	 *
	 * It is unsafe and incorrect to hold an IRQ spinlock across what could
	 * be a long execution window, so we instead simply disallow concurrent
	 * access to the ring buffer by kernel consumers, and return -EBUSY from
	 * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
	 */
	atomic_t busy ____cacheline_aligned_in_smp;
	/* Consumer and producer counters are put into separate pages to
	 * allow each position to be mapped with different permissions.
	 * This prevents a user-space application from modifying the
	 * position and ruining in-kernel tracking. The permissions of the
	 * pages depend on who is producing samples: user-space or the
	 * kernel.
	 *
	 * Kernel-producer
	 * ---------------
	 * The producer position and data pages are mapped as r/o in
	 * userspace. For this approach, bits in the header of samples are
	 * used to signal to user-space, and to other producers, whether a
	 * sample is currently being written.
	 *
	 * User-space producer
	 * -------------------
	 * Only the page containing the consumer position is mapped r/o in
	 * user-space. User-space producers also use bits of the header to
	 * communicate to the kernel, but the kernel must carefully check and
	 * validate each sample to ensure that they're correctly formatted, and
	 * fully contained within the ring buffer.
	 */
	unsigned long consumer_pos __aligned(PAGE_SIZE);
	unsigned long producer_pos __aligned(PAGE_SIZE);
	char data[] __aligned(PAGE_SIZE);
};

struct bpf_ringbuf_map {
	struct bpf_map map;
	struct bpf_ringbuf *rb;
};

/* 8-byte ring buffer record header structure */
struct bpf_ringbuf_hdr {
	u32 len;
	u32 pg_off;
};
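
/* The len field above doubles as a state word. A minimal sketch (illustrative
 * only, not part of this file) of how a consumer interprets it, using
 * BPF_RINGBUF_BUSY_BIT, BPF_RINGBUF_DISCARD_BIT and BPF_RINGBUF_HDR_SZ from
 * uapi/linux/bpf.h:
 *
 *	u32 len = smp_load_acquire(&hdr->len);
 *
 *	if (len & BPF_RINGBUF_BUSY_BIT)
 *		return;					// still being written
 *	if (len & BPF_RINGBUF_DISCARD_BIT)
 *		len &= ~BPF_RINGBUF_DISCARD_BIT;	// skip payload, still advance
 *	consumer_pos += round_up(len + BPF_RINGBUF_HDR_SZ, 8);
 */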

static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
			    __GFP_NOWARN | __GFP_ZERO;
	int nr_meta_pages = RINGBUF_NR_META_PAGES;
	int nr_data_pages = data_sz >> PAGE_SHIFT;
	int nr_pages = nr_meta_pages + nr_data_pages;
	struct page **pages, *page;
	struct bpf_ringbuf *rb;
	size_t array_size;
	int i;

	/* Each data page is mapped twice to allow "virtual"
	 * continuous read of samples wrapping around the end of ring
	 * buffer area:
	 * ------------------------------------------------------
	 * | meta pages |  real data pages  |  same data pages  |
	 * ------------------------------------------------------
	 * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
	 * ------------------------------------------------------
	 * |            | TA             DA | TA             DA |
	 * ------------------------------------------------------
	 *                               ^^^^^^^
	 *                                  |
	 * Here, no need to worry about special handling of wrapped-around
	 * data due to double-mapped data pages. This works both in kernel and
	 * when mmap()'ed in user-space, simplifying both kernel and
	 * user-space implementations significantly.
	 */
	array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
	pages = bpf_map_area_alloc(array_size, numa_node);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(numa_node, flags, 0);
		if (!page) {
			nr_pages = i;
			goto err_free_pages;
		}
		pages[i] = page;
		if (i >= nr_meta_pages)
			pages[nr_data_pages + i] = page;
	}

	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
		  VM_MAP | VM_USERMAP, PAGE_KERNEL);
	if (rb) {
		kmemleak_not_leak(pages);
		rb->pages = pages;
		rb->nr_pages = nr_pages;
		return rb;
	}

err_free_pages:
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
	return NULL;
}
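
/* Illustration of why the double mapping above helps (a sketch, not compiled
 * code; "buf" and "rec_len" are assumptions): a record that starts near the
 * end of the data area can be copied with one flat memcpy(), because the same
 * physical pages appear again right after data_sz:
 *
 *	off = cons_pos & rb->mask;		// e.g. data_sz - 8
 *	memcpy(buf, rb->data + off, rec_len);	// may run past data_sz and
 *						// land in the second mapping
 */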

static void bpf_ringbuf_notify(struct irq_work *work)
{
	struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

	wake_up_all(&rb->waitq);
}

/* Maximum size of ring buffer area is limited by 32-bit page offset within
 * record header, counted in pages. Reserve 8 bits for extensibility, and
 * take into account a few extra pages for consumer/producer pages and
 * non-mmap()'able parts; the current maximum size would be:
 *
 *     (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
 *
 * This gives a 64GB limit, which seems plenty for a single ring buffer. Now
 * considering that the maximum value of data_sz is (4GB - 1), there
 * will be no overflow, so just note the size limit in the comments.
 */
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
	struct bpf_ringbuf *rb;

	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
	if (!rb)
		return NULL;

	spin_lock_init(&rb->spinlock);
	atomic_set(&rb->busy, 0);
	init_waitqueue_head(&rb->waitq);
	init_irq_work(&rb->work, bpf_ringbuf_notify);

	rb->mask = data_sz - 1;
	rb->consumer_pos = 0;
	rb->producer_pos = 0;

	return rb;
}

static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
	struct bpf_ringbuf_map *rb_map;

	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->key_size || attr->value_size ||
	    !is_power_of_2(attr->max_entries) ||
	    !PAGE_ALIGNED(attr->max_entries))
		return ERR_PTR(-EINVAL);

	rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
	if (!rb_map)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&rb_map->map, attr);

	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
	if (!rb_map->rb) {
		bpf_map_area_free(rb_map);
		return ERR_PTR(-ENOMEM);
	}

	return &rb_map->map;
}
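
/* From the BPF program side, a ring buffer that passes the checks in
 * ringbuf_map_alloc() is typically declared through libbpf like this (sketch;
 * max_entries must be a power of two and a multiple of PAGE_SIZE):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_RINGBUF);
 *		__uint(max_entries, 256 * 1024);
 *	} rb SEC(".maps");
 */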

static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
	/* copy pages pointer and nr_pages to local variable, as we are going
	 * to unmap rb itself with vunmap() below
	 */
	struct page **pages = rb->pages;
	int i, nr_pages = rb->nr_pages;

	vunmap(rb);
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
}

static void ringbuf_map_free(struct bpf_map *map)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	bpf_ringbuf_free(rb_map->rb);
	bpf_map_area_free(rb_map);
}

static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-ENOTSUPP);
}

static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 flags)
{
	return -ENOTSUPP;
}

static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		/* allow writable mapping for the consumer_pos only */
		if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EPERM;
	} else {
		vm_flags_clear(vma, VM_MAYWRITE);
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb,
				   vma->vm_pgoff + RINGBUF_PGOFF);
}

static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		if (vma->vm_pgoff == 0)
			/* Disallow writable mappings to the consumer pointer,
			 * and allow writable mappings to both the producer
			 * position, and the ring buffer data itself.
			 */
			return -EPERM;
	} else {
		vm_flags_clear(vma, VM_MAYWRITE);
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}
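
/* Rough sketch of the user-space mappings these callbacks enforce (this
 * mirrors what libbpf's ring buffer consumer does; the exact mmap() calls
 * here are an assumption, not dictated by this file). For a kernel-producer
 * ring buffer:
 *
 *	// consumer_pos page: the only page user-space may map writable
 *	cons = mmap(NULL, page_sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
 *	// producer_pos page + double-mapped data area: read-only
 *	prod = mmap(NULL, page_sz + 2 * data_sz, PROT_READ, MAP_SHARED,
 *		    map_fd, page_sz);
 *
 * For a user-space-producer ring buffer the permissions are flipped: the
 * consumer_pos page is read-only, while the producer_pos page and data pages
 * may be mapped writable.
 */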

static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
	unsigned long cons_pos, prod_pos;

	cons_pos = smp_load_acquire(&rb->consumer_pos);
	prod_pos = smp_load_acquire(&rb->producer_pos);
	return prod_pos - cons_pos;
}

static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
{
	return rb->mask + 1;
}

static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb))
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}

static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
		return EPOLLOUT | EPOLLWRNORM;
	return 0;
}
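
/* The poll callbacks above let user-space block on the map fd itself: a
 * consumer of a kernel-producer ring buffer waits for EPOLLIN, while a
 * user-space producer waits for EPOLLOUT to learn that free space exists
 * again. A minimal sketch (the helper name is an assumption):
 *
 *	struct pollfd pfd = { .fd = map_fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, timeout_ms) > 0 && (pfd.revents & POLLIN))
 *		consume_available_samples();
 */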

static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_ringbuf *rb;
	int nr_data_pages;
	int nr_meta_pages;
	u64 usage = sizeof(struct bpf_ringbuf_map);

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
	usage += (u64)rb->nr_pages << PAGE_SHIFT;
	nr_meta_pages = RINGBUF_NR_META_PAGES;
	nr_data_pages = map->max_entries >> PAGE_SHIFT;
	usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
	return usage;
}

BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_kern,
	.map_poll = ringbuf_map_poll_kern,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_mem_usage = ringbuf_map_mem_usage,
	.map_btf_id = &ringbuf_map_btf_ids[0],
};

BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops user_ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_user,
	.map_poll = ringbuf_map_poll_user,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_mem_usage = ringbuf_map_mem_usage,
	.map_btf_id = &user_ringbuf_map_btf_ids[0],
};

/* Given a pointer to ring buffer record metadata and struct bpf_ringbuf itself,
 * calculate the offset from the record metadata to the ring buffer in pages,
 * rounded down. This page offset is stored as part of the record metadata and
 * allows restoring struct bpf_ringbuf * from a record pointer. The page offset
 * is stored at offset 4 of the record metadata header.
 */
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
				     struct bpf_ringbuf_hdr *hdr)
{
	return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

/* Given a pointer to a ring buffer record header, restore the pointer to
 * struct bpf_ringbuf itself by using the page offset stored at offset 4
 */
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
	unsigned long addr = (unsigned long)(void *)hdr;
	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

	return (void*)((addr & PAGE_MASK) - off);
}
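
/* Worked example of the round trip above: if a record header lands 3 pages
 * plus 120 bytes past the start of struct bpf_ringbuf, bpf_ringbuf_rec_pg_off()
 * stores pg_off = 3, and bpf_ringbuf_restore_from_rec() recovers
 *
 *	((unsigned long)hdr & PAGE_MASK) - 3 * PAGE_SIZE
 *
 * which is exactly the page-aligned start of the ring buffer, since rb itself
 * comes page-aligned from vmap().
 */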

static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
	unsigned long cons_pos, prod_pos, new_prod_pos, flags;
	u32 len, pg_off;
	struct bpf_ringbuf_hdr *hdr;

	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
		return NULL;

	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
	if (len > ringbuf_total_data_sz(rb))
		return NULL;

	cons_pos = smp_load_acquire(&rb->consumer_pos);

	if (in_nmi()) {
		if (!spin_trylock_irqsave(&rb->spinlock, flags))
			return NULL;
	} else {
		spin_lock_irqsave(&rb->spinlock, flags);
	}

	prod_pos = rb->producer_pos;
	new_prod_pos = prod_pos + len;

	/* check for out of ringbuf space by ensuring producer position
	 * doesn't advance more than (ringbuf_size - 1) ahead
	 */
	if (new_prod_pos - cons_pos > rb->mask) {
		spin_unlock_irqrestore(&rb->spinlock, flags);
		return NULL;
	}

	hdr = (void *)rb->data + (prod_pos & rb->mask);
	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
	hdr->pg_off = pg_off;

	/* pairs with consumer's smp_load_acquire() */
	smp_store_release(&rb->producer_pos, new_prod_pos);

	spin_unlock_irqrestore(&rb->spinlock, flags);

	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}

BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
	struct bpf_ringbuf_map *rb_map;

	if (unlikely(flags))
		return 0;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
	.func = bpf_ringbuf_reserve,
	.ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
	.arg3_type = ARG_ANYTHING,
};

static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
	unsigned long rec_pos, cons_pos;
	struct bpf_ringbuf_hdr *hdr;
	struct bpf_ringbuf *rb;
	u32 new_len;

	hdr = sample - BPF_RINGBUF_HDR_SZ;
	rb = bpf_ringbuf_restore_from_rec(hdr);
	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
	if (discard)
		new_len |= BPF_RINGBUF_DISCARD_BIT;

	/* update record header with correct final size prefix */
	xchg(&hdr->len, new_len);

	/* if consumer caught up and is waiting for our record, notify about
	 * new data availability
	 */
	rec_pos = (void *)hdr - (void *)rb->data;
	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
		irq_work_queue(&rb->work);
}

BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
	.func = bpf_ringbuf_submit,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, true /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
	.func = bpf_ringbuf_discard,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};
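
/* Typical BPF-program-side pairing of the three helpers above (a sketch; the
 * event layout, map name and drop check are assumptions):
 *
 *	struct event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
 *
 *	if (!e)
 *		return 0;			// ring buffer is full
 *	e->pid = bpf_get_current_pid_tgid() >> 32;
 *	if (should_drop(e))
 *		bpf_ringbuf_discard(e, 0);
 *	else
 *		bpf_ringbuf_submit(e, 0);
 *
 * Every successful reserve must be paired with exactly one submit or discard;
 * the verifier enforces this through the OBJ_RELEASE argument type.
 */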

BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
	   u64, flags)
{
	struct bpf_ringbuf_map *rb_map;
	void *rec;

	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
		return -EINVAL;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	rec = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!rec)
		return -EAGAIN;

	memcpy(rec, data, size);
	bpf_ringbuf_commit(rec, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
	.func = bpf_ringbuf_output,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
	.arg4_type = ARG_ANYTHING,
};
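
/* bpf_ringbuf_output() is the copy-based alternative to reserve/submit: the
 * record is built elsewhere and copied in with one call, at the cost of an
 * extra memcpy() and no chance to discard afterwards. Sketch (the event
 * layout is an assumption):
 *
 *	struct event e = { .pid = bpf_get_current_pid_tgid() >> 32 };
 *
 *	bpf_ringbuf_output(&rb, &e, sizeof(e), 0);
 */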

BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
	struct bpf_ringbuf *rb;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	switch (flags) {
	case BPF_RB_AVAIL_DATA:
		return ringbuf_avail_data_sz(rb);
	case BPF_RB_RING_SIZE:
		return ringbuf_total_data_sz(rb);
	case BPF_RB_CONS_POS:
		return smp_load_acquire(&rb->consumer_pos);
	case BPF_RB_PROD_POS:
		return smp_load_acquire(&rb->producer_pos);
	default:
		return 0;
	}
}

const struct bpf_func_proto bpf_ringbuf_query_proto = {
	.func = bpf_ringbuf_query,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
};
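
/* Sketch of using bpf_ringbuf_query() from a BPF program to shed load when
 * the consumer falls behind (the threshold is an arbitrary example):
 *
 *	if (bpf_ringbuf_query(&rb, BPF_RB_AVAIL_DATA) >
 *	    bpf_ringbuf_query(&rb, BPF_RB_RING_SIZE) / 2)
 *		return 0;	// more than half full, skip this event
 */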

BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
	   struct bpf_dynptr_kern *, ptr)
{
	struct bpf_ringbuf_map *rb_map;
	void *sample;
	int err;

	if (unlikely(flags)) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	err = bpf_dynptr_check_size(size);
	if (err) {
		bpf_dynptr_set_null(ptr);
		return err;
	}

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!sample) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
	.func = bpf_ringbuf_reserve_dynptr,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
};

BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
	.func = bpf_ringbuf_submit_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
	.func = bpf_ringbuf_discard_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};
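
/* The dynptr variants mirror reserve/submit/discard but hand the program a
 * bpf_dynptr instead of a raw pointer, so variable-sized writes can be
 * bounds-checked at runtime. BPF-program-side sketch (everything except the
 * helpers is an assumption):
 *
 *	struct bpf_dynptr ptr;
 *
 *	if (bpf_ringbuf_reserve_dynptr(&rb, len, 0, &ptr)) {
 *		bpf_ringbuf_discard_dynptr(&ptr, 0);	// release even on failure
 *		return 0;
 *	}
 *	bpf_dynptr_write(&ptr, 0, src, len, 0);
 *	bpf_ringbuf_submit_dynptr(&ptr, 0);
 */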

static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
{
	int err;
	u32 hdr_len, sample_len, total_len, flags, *hdr;
	u64 cons_pos, prod_pos;

	/* Synchronizes with smp_store_release() in user-space producer. */
	prod_pos = smp_load_acquire(&rb->producer_pos);
	if (prod_pos % 8)
		return -EINVAL;

	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
	cons_pos = smp_load_acquire(&rb->consumer_pos);
	if (cons_pos >= prod_pos)
		return -ENODATA;

	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
	/* Synchronizes with smp_store_release() in user-space producer. */
	hdr_len = smp_load_acquire(hdr);
	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
	sample_len = hdr_len & ~flags;
	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);

	/* The sample must fit within the region advertised by the producer position. */
	if (total_len > prod_pos - cons_pos)
		return -EINVAL;

	/* The sample must fit within the data region of the ring buffer. */
	if (total_len > ringbuf_total_data_sz(rb))
		return -E2BIG;

	/* The sample must fit into a struct bpf_dynptr. */
	err = bpf_dynptr_check_size(sample_len);
	if (err)
		return -E2BIG;

	if (flags & BPF_RINGBUF_DISCARD_BIT) {
		/* If the discard bit is set, the sample should be skipped.
		 *
		 * Update the consumer pos, and return -EAGAIN so the caller
		 * knows to skip this sample and try to read the next one.
		 */
		smp_store_release(&rb->consumer_pos, cons_pos + total_len);
		return -EAGAIN;
	}

	if (flags & BPF_RINGBUF_BUSY_BIT)
		return -ENODATA;

	*sample = (void *)((uintptr_t)rb->data +
			   (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
	*size = sample_len;
	return 0;
}

static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
{
	u64 consumer_pos;
	u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);

	/* Using smp_load_acquire() is unnecessary here, as the busy-bit
	 * prevents another task from writing to consumer_pos after it was read
	 * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
	 */
	consumer_pos = rb->consumer_pos;
	/* Synchronizes with smp_load_acquire() in user-space producer. */
	smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
}

BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
	   void *, callback_fn, void *, callback_ctx, u64, flags)
{
	struct bpf_ringbuf *rb;
	long samples, discarded_samples = 0, ret = 0;
	bpf_callback_t callback = (bpf_callback_t)callback_fn;
	u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
	int busy = 0;

	if (unlikely(flags & ~wakeup_flags))
		return -EINVAL;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	/* If another consumer is already consuming a sample, wait for them to finish. */
	if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
		return -EBUSY;

	for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
		int err;
		u32 size;
		void *sample;
		struct bpf_dynptr_kern dynptr;

		err = __bpf_user_ringbuf_peek(rb, &sample, &size);
		if (err) {
			if (err == -ENODATA) {
				break;
			} else if (err == -EAGAIN) {
				discarded_samples++;
				continue;
			} else {
				ret = err;
				goto schedule_work_return;
			}
		}

		bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
		ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
		__bpf_user_ringbuf_sample_release(rb, size, flags);
	}
	ret = samples - discarded_samples;

schedule_work_return:
	/* Prevent the clearing of the busy-bit from being reordered before the
	 * storing of any rb consumer or producer positions.
	 */
	atomic_set_release(&rb->busy, 0);

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
		irq_work_queue(&rb->work);
	return ret;
}

const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
	.func = bpf_user_ringbuf_drain,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_FUNC,
	.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type = ARG_ANYTHING,
};
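
/* End-to-end sketch of the user-space-producer path implemented above: the
 * BPF side drains with a callback, user-space produces through libbpf's
 * user_ring_buffer API (the msg layout, process() and fill() are assumptions):
 *
 * BPF program:
 *	static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
 *	{
 *		struct msg *m = bpf_dynptr_data(dynptr, 0, sizeof(*m));
 *
 *		return m ? process(m) : 0;	// non-zero stops the drain loop
 *	}
 *	...
 *	bpf_user_ringbuf_drain(&user_rb, handle_sample, NULL, 0);
 *
 * User-space:
 *	struct user_ring_buffer *urb = user_ring_buffer__new(map_fd, NULL);
 *	struct msg *m = user_ring_buffer__reserve(urb, sizeof(*m));
 *
 *	fill(m);
 *	user_ring_buffer__submit(urb, m);	// visible to the next drain
 */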