// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain
 * struct llist_node.
 */
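
/* A minimal usage sketch (hypothetical caller; real users are bpf maps such
 * as hashtab, and 'struct my_elem' is a placeholder type):
 *
 *	struct bpf_mem_alloc ma;
 *	struct my_elem *e;
 *
 *	bpf_mem_alloc_init(&ma, sizeof(struct my_elem), false);
 *	...
 *	e = bpf_mem_cache_alloc(&ma);	(any context, including NMI,
 *					 with migration disabled)
 *	...
 *	bpf_mem_cache_free(&ma, e);
 *	...
 *	bpf_mem_alloc_destroy(&ma);
 *
 * With size == 0 passed to bpf_mem_alloc_init(), use bpf_mem_alloc(&ma, size)
 * and bpf_mem_free(&ma, ptr) instead; the requested size is mapped to one of
 * the buckets above.
 */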
#define LLIST_NODE_SZ sizeof(struct llist_node)

/* similar to kmalloc's size_index table, but without the sizeof == 8 bucket */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

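/* Map an allocation size to an index into sizes[] below. For example,
 * size 20 -> size_index[2] == 4 -> idx 3 -> 32 byte bucket, and
 * size 300 -> fls(299) - 2 == 7 -> 512 byte bucket.
 * Returns -1 for size == 0 or size > 4096.
 */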
static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > 4096)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}

#define NUM_CACHES 11

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

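/* Open-coded, non-atomic counterpart of llist_del_first(). Only safe under
 * the irqs-disabled + per-cpu 'active' counter protection described in
 * struct bpf_mem_cache, when nothing else can touch the list concurrently.
 */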
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

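/* For per-cpu caches the object handed to callers is a small kmalloc-ed
 * header: slot [0] doubles as the llist_node / bpf_mem_cache back-pointer
 * and slot [1] holds the actual per-cpu data pointer. Non per-cpu caches
 * kmalloc the whole (zeroed) unit directly.
 */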
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void **obj = kmalloc_node(c->percpu_size, flags, node);
		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG_KMEM
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
#endif

#ifdef CONFIG_MEMCG
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu) {
		free_percpu(((void **)obj)[1]);
		kfree(obj);
		return;
	}

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

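/* Called from irq_work when free_cnt is above the high watermark: trim the
 * per-cpu free_llist down to the midpoint between the watermarks and drain
 * free_llist_extra, handing the removed objects to c->tgt to be kfree-d
 * after an RCU tasks trace grace period.
 */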
static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

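/* Move objects queued by unit_free_rcu() from free_by_rcu (and
 * free_llist_extra_rcu) onto waiting_for_gp and start a regular RCU grace
 * period for them. Only one grace period per cache is in flight at a time;
 * if one is already pending, just ask RCU to hurry up.
 */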
static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

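/* irq_work callback: refill the cache when it is below the low watermark,
 * trim it when it is above the high watermark, and make progress on
 * RCU-deferred frees.
 */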
static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For a typical bpf map case that uses bpf_mem_cache_alloc and a single
 * bucket the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes,
 * assuming (low_mark + high_mark) / 2 as the average number of elements per
 * bucket and that all buckets are used, the total amount of memory in
 * freelists on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~116 Kbyte using the heuristic below.
 * An initialized but unused bpf allocator (not a bpf map specific one) will
 * consume ~11 Kbyte per cpu.
 * The typical case will be between 11K and 116K, closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potentially large and
 * unnecessary memory consumption, set low_mark = 1 and high_mark = 3,
 * resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}

/* When size != 0 create a bpf_mem_cache for each cpu.
 * This is the typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is the bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	struct bpf_mem_cache *c, __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG_KMEM
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

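/* Lazily initialize the bucket matching 'size' in an allocator created with
 * bpf_mem_alloc_percpu_init(). Buckets are set up on all CPUs together, so
 * seeing unit_size already set on the first CPU means the bucket is ready
 * and the loop can stop early.
 */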
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

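/* By the time the allocator itself is freed, all per-cpu lists must be
 * empty; anything left behind indicates leaked objects.
 */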
static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of the map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_unbound_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before other task preempts current task.
	 */
	local_irq_restore(flags);

	return llnode;
}

/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember the bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when the
	 * current task is preempted by a task which does unit_alloc(),
	 * unit_alloc() may return NULL unexpectedly because irq work is
	 * already pending but cannot be triggered and free_llist cannot be
	 * refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from a BPF program or from the sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

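/* Free an object obtained from bpf_mem_alloc(). The bucket is recovered from
 * the hidden bpf_mem_cache pointer stored in the object's llist_node header.
 */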
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

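/* The fixed-size variants below operate on allocators created with
 * bpf_mem_alloc_init(ma, size != 0, ...): a single bucket per cpu is used,
 * so no size-to-bucket lookup is needed.
 */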
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}