// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain
 * struct llist_node.
 */
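
/* A minimal usage sketch (hypothetical caller; real users are bpf maps such
 * as hashtab, and 'struct my_elem' is a placeholder type):
 *
 *	struct bpf_mem_alloc ma;
 *	struct my_elem *e;
 *
 *	bpf_mem_alloc_init(&ma, sizeof(struct my_elem), false);
 *	...
 *	e = bpf_mem_cache_alloc(&ma);	(any context, including NMI,
 *					 with migration disabled)
 *	...
 *	bpf_mem_cache_free(&ma, e);
 *	...
 *	bpf_mem_alloc_destroy(&ma);
 *
 * With size == 0 passed to bpf_mem_alloc_init(), use bpf_mem_alloc(&ma, size)
 * and bpf_mem_free(&ma, ptr) instead; the requested size is mapped to one of
 * the buckets above.
 */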
#define LLIST_NODE_SZ sizeof(struct llist_node)

/* similar to kmalloc's size_index table, but without the sizeof == 8 bucket */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

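/* Map an allocation size to an index into sizes[] below. For example,
 * size 20 -> size_index[2] == 4 -> idx 3 -> 32 byte bucket, and
 * size 300 -> fls(299) - 2 == 7 -> 512 byte bucket.
 * Returns -1 for size == 0 or size > 4096.
 */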
static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > 4096)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}

#define NUM_CACHES 11

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

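/* Open-coded, non-atomic counterpart of llist_del_first(). Only safe under
 * the irqs-disabled + per-cpu 'active' counter protection described in
 * struct bpf_mem_cache, when nothing else can touch the list concurrently.
 */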
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

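/* For per-cpu caches the object handed to callers is a small kmalloc-ed
 * header: slot [0] doubles as the llist_node / bpf_mem_cache back-pointer
 * and slot [1] holds the actual per-cpu data pointer. Non per-cpu caches
 * kmalloc the whole (zeroed) unit directly.
 */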
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void **obj = kmalloc_node(c->percpu_size, flags, node);
		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG_KMEM
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
#endif

#ifdef CONFIG_MEMCG
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu) {
		free_percpu(((void **)obj)[1]);
		kfree(obj);
		return;
	}

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

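/* Called from irq_work when free_cnt is above the high watermark: trim the
 * per-cpu free_llist down to the midpoint between the watermarks and drain
 * free_llist_extra, handing the removed objects to c->tgt to be kfree-d
 * after an RCU tasks trace grace period.
 */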
static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

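/* Move objects queued by unit_free_rcu() from free_by_rcu (and
 * free_llist_extra_rcu) onto waiting_for_gp and start a regular RCU grace
 * period for them. Only one grace period per cache is in flight at a time;
 * if one is already pending, just ask RCU to hurry up.
 */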
static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

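/* irq_work callback: refill the cache when it is below the low watermark,
 * trim it when it is above the high watermark, and make progress on
 * RCU-deferred frees.
 */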
static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For a typical bpf map case that uses bpf_mem_cache_alloc and a single
 * bucket the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes,
 * assuming (low_mark + high_mark) / 2 as the average number of elements per
 * bucket and that all buckets are used, the total amount of memory in
 * freelists on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~116 Kbyte using the heuristic below.
 * An initialized but unused bpf allocator (not a bpf map specific one) will
 * consume ~11 Kbyte per cpu.
 * The typical case will be between 11K and 116K, closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potentially large and
 * unnecessary memory consumption, set low_mark = 1 and high_mark = 3,
 * resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}

/* When size != 0 create a bpf_mem_cache for each cpu.
 * This is the typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is the bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	struct bpf_mem_cache *c, __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG_KMEM
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

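/* Lazily initialize the bucket matching 'size' in an allocator created with
 * bpf_mem_alloc_percpu_init(). Buckets are set up on all CPUs together, so
 * seeing unit_size already set on the first CPU means the bucket is ready
 * and the loop can stop early.
 */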
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

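/* By the time the allocator itself is freed, all per-cpu lists must be
 * empty; anything left behind indicates leaked objects.
 */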
static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of the map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_unbound_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before other task preempts current task.
	 */
	local_irq_restore(flags);

	return llnode;
}

/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember the bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when the
	 * current task is preempted by a task which does unit_alloc(),
	 * unit_alloc() may return NULL unexpectedly because irq work is
	 * already pending but cannot be triggered and free_llist cannot be
	 * refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from a BPF program or from the sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

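/* Free an object obtained from bpf_mem_alloc(). The bucket is recovered from
 * the hidden bpf_mem_cache pointer stored in the object's llist_node header.
 */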
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

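/* The fixed-size variants below operate on allocators created with
 * bpf_mem_alloc_init(ma, size != 0, ...): a single bucket per cpu is used,
 * so no size-to-bucket lookup is needed.
 */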
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}