// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch
/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior invocation
 * from one of these contexts has completed. sys_bpf() uses the same mechanism
 * by pinning the task to the current CPU and incrementing the recursion
 * protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure the deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use a raw spinlock for a preallocated hash map on an
 * RT kernel, because there was no memory allocation within the lock held
 * sections. However, after the hash map was fully converted to use
 * bpf_mem_alloc, only non-synchronous memory allocation remains for the
 * non-preallocated hash map, so it is safe to always use a raw spinlock
 * for the bucket lock.
 */
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
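
/*
 * Illustrative note (not from the original sources): each CPU keeps one
 * recursion counter per lock "slot". A bucket's slot is derived from its
 * hash in htab_lock_bucket() below:
 *
 *	slot = hash & min(HASHTAB_MAP_LOCK_MASK, n_buckets - 1);
 *
 * e.g. with n_buckets == 1024 a bucket whose hash is 0x2f maps to slot
 * 0x2f & 7 == 7, so at most 8 counters per CPU are needed regardless of
 * the table size.
 */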
85 | |
86 | struct bpf_htab { |
87 | struct bpf_map map; |
88 | struct bpf_mem_alloc ma; |
89 | struct bpf_mem_alloc pcpu_ma; |
90 | struct bucket *buckets; |
91 | void *elems; |
92 | union { |
93 | struct pcpu_freelist freelist; |
94 | struct bpf_lru lru; |
95 | }; |
96 | struct htab_elem *__percpu *; |
97 | /* number of elements in non-preallocated hashtable are kept |
98 | * in either pcount or count |
99 | */ |
100 | struct percpu_counter pcount; |
101 | atomic_t count; |
102 | bool use_percpu_counter; |
103 | u32 n_buckets; /* number of hash buckets */ |
104 | u32 elem_size; /* size of each element in bytes */ |
105 | u32 hashrnd; |
106 | struct lock_class_key lockdep_key; |
107 | int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; |
108 | }; |
109 | |
110 | /* each htab element is struct htab_elem + key + value */ |
111 | struct htab_elem { |
112 | union { |
113 | struct hlist_nulls_node hash_node; |
114 | struct { |
115 | void *padding; |
116 | union { |
117 | struct pcpu_freelist_node fnode; |
118 | struct htab_elem *batch_flink; |
119 | }; |
120 | }; |
121 | }; |
122 | union { |
123 | /* pointer to per-cpu pointer */ |
124 | void *ptr_to_pptr; |
125 | struct bpf_lru_node lru_node; |
126 | }; |
127 | u32 hash; |
128 | char key[] __aligned(8); |
129 | }; |
130 | |
131 | static inline bool htab_is_prealloc(const struct bpf_htab *htab) |
132 | { |
133 | return !(htab->map.map_flags & BPF_F_NO_PREALLOC); |
134 | } |
135 | |
136 | static void htab_init_buckets(struct bpf_htab *htab) |
137 | { |
138 | unsigned int i; |
139 | |
140 | for (i = 0; i < htab->n_buckets; i++) { |
141 | INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); |
142 | raw_spin_lock_init(&htab->buckets[i].raw_lock); |
143 | lockdep_set_class(&htab->buckets[i].raw_lock, |
144 | &htab->lockdep_key); |
145 | cond_resched(); |
146 | } |
147 | } |
148 | |
149 | static inline int htab_lock_bucket(const struct bpf_htab *htab, |
150 | struct bucket *b, u32 hash, |
151 | unsigned long *pflags) |
152 | { |
153 | unsigned long flags; |
154 | |
155 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
156 | |
157 | preempt_disable(); |
158 | local_irq_save(flags); |
159 | if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { |
160 | __this_cpu_dec(*(htab->map_locked[hash])); |
161 | local_irq_restore(flags); |
162 | preempt_enable(); |
163 | return -EBUSY; |
164 | } |
165 | |
166 | raw_spin_lock(&b->raw_lock); |
167 | *pflags = flags; |
168 | |
169 | return 0; |
170 | } |
171 | |
172 | static inline void htab_unlock_bucket(const struct bpf_htab *htab, |
173 | struct bucket *b, u32 hash, |
174 | unsigned long flags) |
175 | { |
176 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
177 | raw_spin_unlock(&b->raw_lock); |
178 | __this_cpu_dec(*(htab->map_locked[hash])); |
179 | local_irq_restore(flags); |
180 | preempt_enable(); |
181 | } |
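
/*
 * Illustrative usage sketch (not from the original sources): every
 * update/delete path in this file pairs the two helpers above around the
 * bucket list manipulation, bailing out with -EBUSY when the current CPU
 * already holds a bucket lock of this map (a re-entrant invocation):
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = htab_lock_bucket(htab, b, hash, &flags);
 *	if (ret)
 *		return ret;	// re-entered on this CPU, give up
 *	// ... modify b->head under the raw spinlock ...
 *	htab_unlock_bucket(htab, b, hash, flags);
 */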
182 | |
183 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); |
184 | |
185 | static bool htab_is_lru(const struct bpf_htab *htab) |
186 | { |
187 | return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || |
188 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
189 | } |
190 | |
191 | static bool htab_is_percpu(const struct bpf_htab *htab) |
192 | { |
193 | return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || |
194 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
195 | } |
196 | |
197 | static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, |
198 | void __percpu *pptr) |
199 | { |
200 | *(void __percpu **)(l->key + key_size) = pptr; |
201 | } |
202 | |
203 | static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) |
204 | { |
205 | return *(void __percpu **)(l->key + key_size); |
206 | } |
207 | |
208 | static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) |
209 | { |
210 | return *(void **)(l->key + roundup(map->key_size, 8)); |
211 | } |
212 | |
213 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) |
214 | { |
215 | return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); |
216 | } |
217 | |
218 | static bool (struct bpf_htab *htab) |
219 | { |
220 | return !htab_is_percpu(htab) && !htab_is_lru(htab); |
221 | } |

static void htab_free_prealloced_timers(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
		cond_resched();
	}
}

static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (IS_ERR_OR_NULL(htab->map.record))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();
	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (htab_is_percpu(htab)) {
			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
			int cpu;

			for_each_possible_cpu(cpu) {
				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
				cond_resched();
			}
		} else {
			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
			cond_resched();
		}
		cond_resched();
	}
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code path of
 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
 * will acquire lru_lock first followed by acquiring bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, lock acquisition of
 * bucket_lock followed by lru_lock is not allowed. In such cases,
 * bucket_lock needs to be released first before acquiring lru_lock.
 */
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		bpf_map_inc_elem_count(&htab->map);
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}
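
/*
 * Illustrative note (not from the original sources): the per-CPU extra
 * element allows an in-place replacement update on a full, preallocated
 * map without touching the freelist. alloc_htab_elem() below swaps it
 * with the element being replaced:
 *
 *	pl_new = this_cpu_ptr(htab->extra_elems);
 *	l_new = *pl_new;	// take this CPU's spare element
 *	*pl_new = old_elem;	// old element becomes the new spare
 */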

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}
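
/*
 * Illustrative user-space sketch (not from the original sources): a
 * minimal map creation that passes the checks above, using only
 * attributes referenced in this file. Adding BPF_F_ZERO_SEED would
 * additionally require CAP_SYS_ADMIN:
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = 4,	// non-zero
 *		.value_size  = 8,	// non-zero
 *		.max_entries = 1024,	// non-zero
 *		.map_flags   = BPF_F_NO_PREALLOC,
 *	};
 *	int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */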

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	err = bpf_map_init_elem_count(&htab->map);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_elem_count;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
							   sizeof(int),
							   sizeof(int),
							   GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_u32();

	htab_init_buckets(htab);

/* compute_batch_value() computes batch value as num_online_cpus() * 2
 * and __percpu_counter_compare() needs
 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
 * for percpu_counter to be faster than atomic_t. In practice the average bpf
 * hash map size is 10k, which means that a system with 64 cpus will fill
 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
 * define our own batch count as 32 then 10k hash map can be filled up to 80%:
 * 10k - 8k > 32 _batch_ * 64 _cpus_
 * and __percpu_counter_compare() will still be fast. At that point hash map
 * collisions will dominate its performance anyway. Assume that hash map filled
 * to 50+% isn't going to be O(1) and use the following formula to choose
 * between percpu_counter and atomic_t.
 */
#define PERCPU_COUNTER_BATCH 32
	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
		htab->use_percpu_counter = true;

	if (htab->use_percpu_counter) {
		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
		if (err)
			goto free_map_locked;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	} else {
		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
		if (err)
			goto free_map_locked;
		if (percpu) {
			err = bpf_mem_alloc_init(&htab->pcpu_ma,
						 round_up(htab->map.value_size, 8), true);
			if (err)
				goto free_map_locked;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_map_locked:
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
free_elem_count:
	bpf_map_free_elem_count(&htab->map);
free_htab:
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	if (likely(key_len % 4 == 0))
		return jhash2(key, key_len / 4, hashrnd);
	return jhash(key, key_len, hashrnd);
}
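
/*
 * Illustrative note (not from the original sources): jhash2() consumes
 * the key as u32 words and is cheaper than the byte-wise jhash(), so it
 * is preferred whenever the key length is a multiple of 4. E.g. for an
 * 8-byte key the call becomes
 *
 *	hash = jhash2(key, 2, hashrnd);	// 8 / 4 == 2 words
 *
 * while a 6-byte key falls back to jhash(key, 6, hashrnd).
 */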

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event that elements moved from one bucket into another
 * while the linked list was being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}
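
/*
 * Illustrative note (not from the original sources): each bucket's nulls
 * list is initialized with the bucket index as its nulls value (see
 * INIT_HLIST_NULLS_HEAD in htab_init_buckets()). If a lockless walk ends
 * on a nulls marker whose value is not this bucket's index, an element
 * was freed and reused in another bucket mid-walk, so the walk retries:
 *
 *	if (get_nulls_value(n) != (hash & (n_buckets - 1)))
 *		goto again;	// ended on some other bucket's marker
 */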

/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}
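
/*
 * Illustrative note (not from the original sources): the verifier patches
 * the three instructions above in place of the helper call, roughly:
 *
 *	call __htab_map_lookup_elem	// r0 = htab_elem or NULL
 *	if r0 == 0 goto +1		// miss: skip the adjustment
 *	r0 += offsetof(htab_elem, key) + round_up(key_size, 8)
 *					// hit: r0 now points at the value
 */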

static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static int htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static void check_and_free_fields(struct bpf_htab *htab,
				  struct htab_elem *elem)
{
	if (htab_is_percpu(htab)) {
		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
		int cpu;

		for_each_possible_cpu(cpu)
			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
	} else {
		void *map_value = elem->key + round_up(htab->map.key_size, 8);

		bpf_obj_free_fields(htab->map.record, map_value);
	}
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;
	int ret;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
	if (ret)
		return false;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			check_and_free_fields(htab, l);
			bpf_map_dec_elem_count(&htab->map);
			break;
		}

	htab_unlock_bucket(htab, b, tgt_l->hash, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	check_and_free_fields(htab, l);
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
	bpf_mem_cache_free(&htab->ma, l);
}

static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;
	void *ptr;

	if (map->ops->map_fd_put_ptr) {
		ptr = fd_htab_map_get_ptr(map, l);
		map->ops->map_fd_put_ptr(ptr);
	}
}

static bool is_map_full(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
						PERCPU_COUNTER_BATCH) >= 0;
	return atomic_read(&htab->count) >= htab->map.max_entries;
}
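
/*
 * Illustrative note (not from the original sources): with the per-cpu
 * counter, fullness is a comparison against max_entries with a tolerance
 * of batch * num_online_cpus(). __percpu_counter_compare() only sums the
 * per-cpu deltas when the cheap approximate count is within that slack,
 * so e.g. with PERCPU_COUNTER_BATCH == 32 and 64 online CPUs the exact
 * sum is taken only when the count is within 2048 of max_entries.
 */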

static void inc_elem_count(struct bpf_htab *htab)
{
	bpf_map_inc_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
	else
		atomic_inc(&htab->count);
}

static void dec_elem_count(struct bpf_htab *htab)
{
	bpf_map_dec_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
	else
		atomic_dec(&htab->count);
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	htab_put_fd_value(htab, l);

	if (htab_is_prealloc(htab)) {
		bpf_map_dec_elem_count(&htab->map);
		check_and_free_fields(htab, l);
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		dec_elem_count(htab);
		htab_elem_free(htab, l);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
			off += size;
		}
	}
}
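
/*
 * Illustrative note (not from the original sources): for onallcpus
 * updates (the syscall path), user space supplies one flat buffer holding
 * a value per possible CPU, each slot padded to an 8-byte boundary. With
 * value_size == 12 and 4 possible CPUs the layout consumed above is:
 *
 *	offset   0: value for CPU 0	(12 bytes + 4 bytes padding)
 *	offset  16: value for CPU 1
 *	offset  32: value for CPU 2
 *	offset  48: value for CPU 3	(total: 4 * round_up(12, 8) bytes)
 */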

static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for other cpus. Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (!onallcpus) {
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
			else /* Since elem is preallocated, we cannot touch special fields */
				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}

static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			htab_put_fd_value(htab, old_elem);
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
			bpf_map_inc_elem_count(&htab->map);
		}
	} else {
		if (is_map_full(htab))
			if (!old_elem)
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				return ERR_PTR(-E2BIG);
		inc_elem_count(htab);
		l_new = bpf_mem_cache_alloc(&htab->ma);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
			if (!pptr) {
				bpf_mem_cache_free(&htab->ma, l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
			l_new->ptr_to_pptr = pptr;
			pptr = *(void **)pptr;
		}

		pcpu_init_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	dec_elem_count(htab);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
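
/*
 * Illustrative sketch (not from the original sources): the flag semantics
 * checked above as seen from a BPF program. BPF_ANY (0) always passes
 * check_flags():
 *
 *	// fails with -EEXIST if the key is already present
 *	bpf_map_update_elem(&map, &key, &val, BPF_NOEXIST);
 *	// fails with -ENOENT if the key is absent
 *	bpf_map_update_elem(&map, &key, &val, BPF_EXIST);
 *	// create or replace unconditionally
 *	bpf_map_update_elem(&map, &key, &val, BPF_ANY);
 */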

/* Called from syscall or from eBPF program */
static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
		else
			check_and_free_fields(htab, l_old);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
	check_and_free_fields(htab, elem);
	bpf_map_dec_elem_count(&htab->map);
	bpf_lru_push_free(&htab->lru, &elem->lru_node);
}

static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	copy_map_value(&htab->map,
		       l_new->key + round_up(map->key_size, 8), value);

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	htab_unlock_bucket(htab, b, hash, flags);

err_lock_bucket:
	if (ret)
		htab_lru_push_free(htab, l_new);
	else if (l_old)
		htab_lru_push_free(htab, l_old);

	return ret;
}

static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					  void *value, u64 map_flags,
					  bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					      void *value, u64 map_flags,
					      bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
err_lock_bucket:
	if (l_new) {
		bpf_map_dec_elem_count(&htab->map);
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	}
	return ret;
}

static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					    void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
	} else {
		ret = -ENOENT;
	}

	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(htab, b, hash, flags);
	if (l)
		htab_lru_push_free(htab, l);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	/* It's called from a worker thread, so disable migration here,
	 * since bpf_mem_cache_free() relies on that.
	 */
	migrate_disable();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
	migrate_enable();
}

static void htab_free_malloced_timers(struct bpf_htab *htab)
{
	int i;

	rcu_read_lock();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry(l, n, head, hash_node) {
			/* We only free timer on uref dropping to zero */
			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}

static void htab_map_free_timers(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* We only free timer on uref dropping to zero */
	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (!htab_is_prealloc(htab))
		htab_free_malloced_timers(htab);
	else
		htab_free_prealloced_timers(htab);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
	 * underneath and is responsible for waiting for callbacks to finish
	 * during bpf_mem_alloc_destroy().
	 */
	if (!htab_is_prealloc(htab)) {
		delete_all_elements(htab);
	} else {
		htab_free_prealloced_fields(htab);
		prealloc_destroy(htab);
	}

	bpf_map_free_elem_count(map);
	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}
1579 | |
1580 | static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1581 | void *value, bool is_lru_map, |
1582 | bool is_percpu, u64 flags) |
1583 | { |
1584 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1585 | struct hlist_nulls_head *head; |
1586 | unsigned long bflags; |
1587 | struct htab_elem *l; |
1588 | u32 hash, key_size; |
1589 | struct bucket *b; |
1590 | int ret; |
1591 | |
1592 | key_size = map->key_size; |
1593 | |
1594 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
1595 | b = __select_bucket(htab, hash); |
1596 | head = &b->head; |
1597 | |
1598 | ret = htab_lock_bucket(htab, b, hash, pflags: &bflags); |
1599 | if (ret) |
1600 | return ret; |
1601 | |
1602 | l = lookup_elem_raw(head, hash, key, key_size); |
1603 | if (!l) { |
1604 | ret = -ENOENT; |
1605 | } else { |
1606 | if (is_percpu) { |
1607 | u32 roundup_value_size = round_up(map->value_size, 8); |
1608 | void __percpu *pptr; |
1609 | int off = 0, cpu; |
1610 | |
1611 | pptr = htab_elem_get_ptr(l, key_size); |
1612 | for_each_possible_cpu(cpu) { |
1613 | copy_map_value_long(map: &htab->map, dst: value + off, per_cpu_ptr(pptr, cpu)); |
1614 | check_and_init_map_value(map: &htab->map, dst: value + off); |
1615 | off += roundup_value_size; |
1616 | } |
1617 | } else { |
1618 | u32 roundup_key_size = round_up(map->key_size, 8); |
1619 | |
1620 | if (flags & BPF_F_LOCK) |
1621 | copy_map_value_locked(map, dst: value, src: l->key + |
1622 | roundup_key_size, |
1623 | lock_src: true); |
1624 | else |
1625 | copy_map_value(map, dst: value, src: l->key + |
1626 | roundup_key_size); |
1627 | /* Zeroing special fields in the temp buffer */ |
1628 | check_and_init_map_value(map, dst: value); |
1629 | } |
1630 | |
1631 | hlist_nulls_del_rcu(n: &l->hash_node); |
1632 | if (!is_lru_map) |
1633 | free_htab_elem(htab, l); |
1634 | } |
1635 | |
1636 | htab_unlock_bucket(htab, b, hash, flags: bflags); |
1637 | |
1638 | if (is_lru_map && l) |
1639 | htab_lru_push_free(htab, elem: l); |
1640 | |
1641 | return ret; |
1642 | } |
1643 | |
1644 | static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1645 | void *value, u64 flags) |
1646 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
						 flags);
1649 | } |
1650 | |
1651 | static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1652 | void *key, void *value, |
1653 | u64 flags) |
1654 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
						 flags);
1657 | } |
1658 | |
1659 | static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1660 | void *value, u64 flags) |
1661 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
						 flags);
1664 | } |
1665 | |
1666 | static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1667 | void *key, void *value, |
1668 | u64 flags) |
1669 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
						 flags);
1672 | } |
1673 | |
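/* Common implementation of the batch operations. The in_batch/out_batch
 * cursor is the index of the next bucket to process: the walk starts at
 * in_batch (bucket 0 if in_batch is NULL), copies out whole buckets at a
 * time and reports -ENOENT once the last bucket has been consumed, or
 * -ENOSPC if the first non-empty bucket does not fit into the
 * user-supplied count. A hedged user-space sketch (assuming libbpf):
 *
 *	__u32 out, count = 64;
 *	err = bpf_map_lookup_batch(map_fd, NULL, &out, keys, values,
 *				   &count, NULL);
 *
 * with subsequent calls passing the previous 'out' as in_batch until
 * -ENOENT is returned.
 */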
1674 | static int |
1675 | __htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1676 | const union bpf_attr *attr, |
1677 | union bpf_attr __user *uattr, |
1678 | bool do_delete, bool is_lru_map, |
1679 | bool is_percpu) |
1680 | { |
1681 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1682 | u32 bucket_cnt, total, key_size, value_size, roundup_key_size; |
1683 | void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; |
1684 | void __user *uvalues = u64_to_user_ptr(attr->batch.values); |
1685 | void __user *ukeys = u64_to_user_ptr(attr->batch.keys); |
1686 | void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); |
1687 | u32 batch, max_count, size, bucket_size, map_id; |
1688 | struct htab_elem *node_to_free = NULL; |
1689 | u64 elem_map_flags, map_flags; |
1690 | struct hlist_nulls_head *head; |
1691 | struct hlist_nulls_node *n; |
1692 | unsigned long flags = 0; |
1693 | bool locked = false; |
1694 | struct htab_elem *l; |
1695 | struct bucket *b; |
1696 | int ret = 0; |
1697 | |
1698 | elem_map_flags = attr->batch.elem_flags; |
1699 | if ((elem_map_flags & ~BPF_F_LOCK) || |
	    ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1701 | return -EINVAL; |
1702 | |
1703 | map_flags = attr->batch.flags; |
1704 | if (map_flags) |
1705 | return -EINVAL; |
1706 | |
1707 | max_count = attr->batch.count; |
1708 | if (!max_count) |
1709 | return 0; |
1710 | |
1711 | if (put_user(0, &uattr->batch.count)) |
1712 | return -EFAULT; |
1713 | |
1714 | batch = 0; |
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1716 | return -EFAULT; |
1717 | |
1718 | if (batch >= htab->n_buckets) |
1719 | return -ENOENT; |
1720 | |
1721 | key_size = htab->map.key_size; |
1722 | roundup_key_size = round_up(htab->map.key_size, 8); |
1723 | value_size = htab->map.value_size; |
1724 | size = round_up(value_size, 8); |
1725 | if (is_percpu) |
1726 | value_size = size * num_possible_cpus(); |
1727 | total = 0; |
1728 | /* while experimenting with hash tables with sizes ranging from 10 to |
1729 | * 1000, it was observed that a bucket can have up to 5 entries. |
1730 | */ |
1731 | bucket_size = 5; |
1732 | |
1733 | alloc: |
1734 | /* We cannot do copy_from_user or copy_to_user inside |
1735 | * the rcu_read_lock. Allocate enough space here. |
1736 | */ |
	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1739 | if (!keys || !values) { |
1740 | ret = -ENOMEM; |
1741 | goto after_loop; |
1742 | } |
1743 | |
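	/* Walk protocol: each bucket is first scanned under RCU only to
	 * count entries. Only when the bucket is non-empty is the bucket
	 * lock taken (locked == true) and the scan redone under the lock,
	 * so empty buckets are skipped without any lock traffic.
	 */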
1744 | again: |
1745 | bpf_disable_instrumentation(); |
1746 | rcu_read_lock(); |
1747 | again_nocopy: |
1748 | dst_key = keys; |
1749 | dst_val = values; |
1750 | b = &htab->buckets[batch]; |
1751 | head = &b->head; |
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(htab, b, batch, &flags);
1755 | if (ret) { |
1756 | rcu_read_unlock(); |
1757 | bpf_enable_instrumentation(); |
1758 | goto after_loop; |
1759 | } |
1760 | } |
1761 | |
1762 | bucket_cnt = 0; |
1763 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
1764 | bucket_cnt++; |
1765 | |
1766 | if (bucket_cnt && !locked) { |
1767 | locked = true; |
1768 | goto again_nocopy; |
1769 | } |
1770 | |
1771 | if (bucket_cnt > (max_count - total)) { |
1772 | if (total == 0) |
1773 | ret = -ENOSPC; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the bucket lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
1778 | rcu_read_unlock(); |
1779 | bpf_enable_instrumentation(); |
1780 | goto after_loop; |
1781 | } |
1782 | |
1783 | if (bucket_cnt > bucket_size) { |
1784 | bucket_size = bucket_cnt; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the bucket lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
1789 | rcu_read_unlock(); |
1790 | bpf_enable_instrumentation(); |
		kvfree(keys);
		kvfree(values);
1793 | goto alloc; |
1794 | } |
1795 | |
1796 | /* Next block is only safe to run if you have grabbed the lock */ |
1797 | if (!locked) |
1798 | goto next_batch; |
1799 | |
1800 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1801 | memcpy(dst_key, l->key, key_size); |
1802 | |
1803 | if (is_percpu) { |
1804 | int off = 0, cpu; |
1805 | void __percpu *pptr; |
1806 | |
			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
1811 | off += size; |
1812 | } |
1813 | } else { |
1814 | value = l->key + roundup_key_size; |
1815 | if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { |
1816 | struct bpf_map **inner_map = value; |
1817 | |
1818 | /* Actual value is the id of the inner map */ |
1819 | map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); |
1820 | value = &map_id; |
1821 | } |
1822 | |
1823 | if (elem_map_flags & BPF_F_LOCK) |
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
1830 | } |
1831 | if (do_delete) { |
			hlist_nulls_del_rcu(&l->hash_node);
1833 | |
1834 | /* bpf_lru_push_free() will acquire lru_lock, which |
1835 | * may cause deadlock. See comments in function |
1836 | * prealloc_lru_pop(). Let us do bpf_lru_push_free() |
1837 | * after releasing the bucket lock. |
1838 | */ |
1839 | if (is_lru_map) { |
1840 | l->batch_flink = node_to_free; |
1841 | node_to_free = l; |
1842 | } else { |
1843 | free_htab_elem(htab, l); |
1844 | } |
1845 | } |
1846 | dst_key += key_size; |
1847 | dst_val += value_size; |
1848 | } |
1849 | |
	htab_unlock_bucket(htab, b, batch, flags);
1851 | locked = false; |
1852 | |
1853 | while (node_to_free) { |
1854 | l = node_to_free; |
1855 | node_to_free = node_to_free->batch_flink; |
		htab_lru_push_free(htab, l);
1857 | } |
1858 | |
1859 | next_batch: |
	/* If we are not copying data, we can go to the next bucket and avoid
	 * dropping the RCU read lock.
	 */
1863 | if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { |
1864 | batch++; |
1865 | goto again_nocopy; |
1866 | } |
1867 | |
1868 | rcu_read_unlock(); |
1869 | bpf_enable_instrumentation(); |
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
1874 | ret = -EFAULT; |
1875 | goto after_loop; |
1876 | } |
1877 | |
1878 | total += bucket_cnt; |
1879 | batch++; |
1880 | if (batch >= htab->n_buckets) { |
1881 | ret = -ENOENT; |
1882 | goto after_loop; |
1883 | } |
1884 | goto again; |
1885 | |
1886 | after_loop: |
1887 | if (ret == -EFAULT) |
1888 | goto out; |
1889 | |
1890 | /* copy # of entries and next batch */ |
1891 | ubatch = u64_to_user_ptr(attr->batch.out_batch); |
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1893 | put_user(total, &uattr->batch.count)) |
1894 | ret = -EFAULT; |
1895 | |
1896 | out: |
	kvfree(keys);
	kvfree(values);
1899 | return ret; |
1900 | } |
1901 | |
1902 | static int |
1903 | htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1904 | union bpf_attr __user *uattr) |
1905 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
1908 | } |
1909 | |
1910 | static int |
1911 | htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1912 | const union bpf_attr *attr, |
1913 | union bpf_attr __user *uattr) |
1914 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
1917 | } |
1918 | |
1919 | static int |
1920 | htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1921 | union bpf_attr __user *uattr) |
1922 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
1925 | } |
1926 | |
1927 | static int |
1928 | htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1929 | const union bpf_attr *attr, |
1930 | union bpf_attr __user *uattr) |
1931 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
1934 | } |
1935 | |
1936 | static int |
1937 | htab_lru_percpu_map_lookup_batch(struct bpf_map *map, |
1938 | const union bpf_attr *attr, |
1939 | union bpf_attr __user *uattr) |
1940 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
1943 | } |
1944 | |
1945 | static int |
1946 | htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1947 | const union bpf_attr *attr, |
1948 | union bpf_attr __user *uattr) |
1949 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
1952 | } |
1953 | |
1954 | static int |
1955 | htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1956 | union bpf_attr __user *uattr) |
1957 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
1960 | } |
1961 | |
1962 | static int |
1963 | htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, |
1964 | const union bpf_attr *attr, |
1965 | union bpf_attr __user *uattr) |
1966 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
1969 | } |
1970 | |
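/* Iterator state for the bpf_iter seq_file interface. bucket_id plus
 * skip_elems form a resumable cursor: skip_elems counts how many elements
 * of the current bucket were already shown, so iteration can resume close
 * to the same position after a seq_file buffer boundary.
 */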
1971 | struct bpf_iter_seq_hash_map_info { |
1972 | struct bpf_map *map; |
1973 | struct bpf_htab *htab; |
	void *percpu_value_buf; /* non-NULL means percpu hash */
1975 | u32 bucket_id; |
1976 | u32 skip_elems; |
1977 | }; |
1978 | |
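/* Find the next element to show. When an element is returned, this exits
 * with rcu_read_lock() held; the lock is dropped either here when moving
 * on to the next bucket or in bpf_hash_map_seq_stop(), keeping the element
 * valid while the iterator program runs.
 */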
1979 | static struct htab_elem * |
1980 | bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, |
1981 | struct htab_elem *prev_elem) |
1982 | { |
1983 | const struct bpf_htab *htab = info->htab; |
1984 | u32 skip_elems = info->skip_elems; |
1985 | u32 bucket_id = info->bucket_id; |
1986 | struct hlist_nulls_head *head; |
1987 | struct hlist_nulls_node *n; |
1988 | struct htab_elem *elem; |
1989 | struct bucket *b; |
1990 | u32 i, count; |
1991 | |
1992 | if (bucket_id >= htab->n_buckets) |
1993 | return NULL; |
1994 | |
1995 | /* try to find next elem in the same bucket */ |
1996 | if (prev_elem) { |
		/* no update/deletion on this bucket, prev_elem should still be
		 * valid and we won't skip elements.
		 */
2000 | n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); |
2001 | elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); |
2002 | if (elem) |
2003 | return elem; |
2004 | |
2005 | /* not found, unlock and go to the next bucket */ |
2006 | b = &htab->buckets[bucket_id++]; |
2007 | rcu_read_unlock(); |
2008 | skip_elems = 0; |
2009 | } |
2010 | |
2011 | for (i = bucket_id; i < htab->n_buckets; i++) { |
2012 | b = &htab->buckets[i]; |
2013 | rcu_read_lock(); |
2014 | |
2015 | count = 0; |
2016 | head = &b->head; |
2017 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2018 | if (count >= skip_elems) { |
2019 | info->bucket_id = i; |
2020 | info->skip_elems = count; |
2021 | return elem; |
2022 | } |
2023 | count++; |
2024 | } |
2025 | |
2026 | rcu_read_unlock(); |
2027 | skip_elems = 0; |
2028 | } |
2029 | |
2030 | info->bucket_id = i; |
2031 | info->skip_elems = 0; |
2032 | return NULL; |
2033 | } |
2034 | |
2035 | static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) |
2036 | { |
2037 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2038 | struct htab_elem *elem; |
2039 | |
2040 | elem = bpf_hash_map_seq_find_next(info, NULL); |
2041 | if (!elem) |
2042 | return NULL; |
2043 | |
2044 | if (*pos == 0) |
2045 | ++*pos; |
2046 | return elem; |
2047 | } |
2048 | |
2049 | static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
2050 | { |
2051 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2052 | |
2053 | ++*pos; |
2054 | ++info->skip_elems; |
	return bpf_hash_map_seq_find_next(info, v);
2056 | } |
2057 | |
2058 | static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) |
2059 | { |
2060 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2061 | u32 roundup_key_size, roundup_value_size; |
2062 | struct bpf_iter__bpf_map_elem ctx = {}; |
2063 | struct bpf_map *map = info->map; |
2064 | struct bpf_iter_meta meta; |
2065 | int ret = 0, off = 0, cpu; |
2066 | struct bpf_prog *prog; |
2067 | void __percpu *pptr; |
2068 | |
2069 | meta.seq = seq; |
	prog = bpf_iter_get_info(&meta, elem == NULL);
2071 | if (prog) { |
2072 | ctx.meta = &meta; |
2073 | ctx.map = info->map; |
2074 | if (elem) { |
2075 | roundup_key_size = round_up(map->key_size, 8); |
2076 | ctx.key = elem->key; |
2077 | if (!info->percpu_value_buf) { |
2078 | ctx.value = elem->key + roundup_key_size; |
2079 | } else { |
2080 | roundup_value_size = round_up(map->value_size, 8); |
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
2086 | off += roundup_value_size; |
2087 | } |
2088 | ctx.value = info->percpu_value_buf; |
2089 | } |
2090 | } |
		ret = bpf_iter_run_prog(prog, &ctx);
2092 | } |
2093 | |
2094 | return ret; |
2095 | } |
2096 | |
2097 | static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) |
2098 | { |
	return __bpf_hash_map_seq_show(seq, v);
2100 | } |
2101 | |
2102 | static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) |
2103 | { |
2104 | if (!v) |
2105 | (void)__bpf_hash_map_seq_show(seq, NULL); |
2106 | else |
2107 | rcu_read_unlock(); |
2108 | } |
2109 | |
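/* For percpu hash maps, pre-allocate one flat buffer of
 * round_up(value_size, 8) * num_possible_cpus() bytes;
 * __bpf_hash_map_seq_show() copies every CPU's value into it so the
 * iterator program sees all values through a single ctx.value pointer.
 */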
2110 | static int bpf_iter_init_hash_map(void *priv_data, |
2111 | struct bpf_iter_aux_info *aux) |
2112 | { |
2113 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2114 | struct bpf_map *map = aux->map; |
2115 | void *value_buf; |
2116 | u32 buf_size; |
2117 | |
2118 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
2119 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { |
2120 | buf_size = round_up(map->value_size, 8) * num_possible_cpus(); |
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2122 | if (!value_buf) |
2123 | return -ENOMEM; |
2124 | |
2125 | seq_info->percpu_value_buf = value_buf; |
2126 | } |
2127 | |
2128 | bpf_map_inc_with_uref(map); |
2129 | seq_info->map = map; |
2130 | seq_info->htab = container_of(map, struct bpf_htab, map); |
2131 | return 0; |
2132 | } |
2133 | |
2134 | static void bpf_iter_fini_hash_map(void *priv_data) |
2135 | { |
2136 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2137 | |
	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
2140 | } |
2141 | |
2142 | static const struct seq_operations bpf_hash_map_seq_ops = { |
2143 | .start = bpf_hash_map_seq_start, |
2144 | .next = bpf_hash_map_seq_next, |
2145 | .stop = bpf_hash_map_seq_stop, |
2146 | .show = bpf_hash_map_seq_show, |
2147 | }; |
2148 | |
2149 | static const struct bpf_iter_seq_info iter_seq_info = { |
2150 | .seq_ops = &bpf_hash_map_seq_ops, |
2151 | .init_seq_private = bpf_iter_init_hash_map, |
2152 | .fini_seq_private = bpf_iter_fini_hash_map, |
2153 | .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), |
2154 | }; |
2155 | |
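/* Implementation of the bpf_for_each_map_elem() helper for hash maps. The
 * callback runs under rcu_read_lock(), bucket by bucket, and the helper
 * returns the number of elements visited; a callback return of 0 continues
 * the walk and 1 stops it. A hedged BPF-side sketch (names are made up):
 *
 *	static long check_elem(struct bpf_map *map, void *key, void *val,
 *			       void *ctx)
 *	{
 *		return 0;
 *	}
 *	...
 *	bpf_for_each_map_elem(&my_htab, check_elem, &my_ctx, 0);
 */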
2156 | static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, |
2157 | void *callback_ctx, u64 flags) |
2158 | { |
2159 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2160 | struct hlist_nulls_head *head; |
2161 | struct hlist_nulls_node *n; |
2162 | struct htab_elem *elem; |
2163 | u32 roundup_key_size; |
2164 | int i, num_elems = 0; |
2165 | void __percpu *pptr; |
2166 | struct bucket *b; |
2167 | void *key, *val; |
2168 | bool is_percpu; |
2169 | u64 ret = 0; |
2170 | |
2171 | if (flags != 0) |
2172 | return -EINVAL; |
2173 | |
2174 | is_percpu = htab_is_percpu(htab); |
2175 | |
2176 | roundup_key_size = round_up(map->key_size, 8); |
	/* Disable migration so the percpu value prepared here is the same
	 * one the bpf program sees via bpf_map_lookup_elem().
	 */
2180 | if (is_percpu) |
2181 | migrate_disable(); |
2182 | for (i = 0; i < htab->n_buckets; i++) { |
2183 | b = &htab->buckets[i]; |
2184 | rcu_read_lock(); |
2185 | head = &b->head; |
2186 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2187 | key = elem->key; |
2188 | if (is_percpu) { |
2189 | /* current cpu value for percpu map */ |
				pptr = htab_elem_get_ptr(elem, map->key_size);
2191 | val = this_cpu_ptr(pptr); |
2192 | } else { |
2193 | val = elem->key + roundup_key_size; |
2194 | } |
2195 | num_elems++; |
2196 | ret = callback_fn((u64)(long)map, (u64)(long)key, |
2197 | (u64)(long)val, (u64)(long)callback_ctx, 0); |
2198 | /* return value: 0 - continue, 1 - stop and return */ |
2199 | if (ret) { |
2200 | rcu_read_unlock(); |
2201 | goto out; |
2202 | } |
2203 | } |
2204 | rcu_read_unlock(); |
2205 | } |
2206 | out: |
2207 | if (is_percpu) |
2208 | migrate_enable(); |
2209 | return num_elems; |
2210 | } |
2211 | |
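/* Account the memory footprint reported for this map (e.g. as the memlock
 * value in fdinfo). Preallocated maps report a fixed cost derived from
 * max_entries; non-preallocated maps use the current element count plus the
 * per-object llist_node overhead added by bpf_mem_alloc.
 */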
2212 | static u64 htab_map_mem_usage(const struct bpf_map *map) |
2213 | { |
2214 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2215 | u32 value_size = round_up(htab->map.value_size, 8); |
2216 | bool prealloc = htab_is_prealloc(htab); |
2217 | bool percpu = htab_is_percpu(htab); |
2218 | bool lru = htab_is_lru(htab); |
2219 | u64 num_entries; |
2220 | u64 usage = sizeof(struct bpf_htab); |
2221 | |
2222 | usage += sizeof(struct bucket) * htab->n_buckets; |
2223 | usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; |
2224 | if (prealloc) { |
2225 | num_entries = map->max_entries; |
2226 | if (htab_has_extra_elems(htab)) |
2227 | num_entries += num_possible_cpus(); |
2228 | |
2229 | usage += htab->elem_size * num_entries; |
2230 | |
2231 | if (percpu) |
2232 | usage += value_size * num_possible_cpus() * num_entries; |
2233 | else if (!lru) |
2234 | usage += sizeof(struct htab_elem *) * num_possible_cpus(); |
2235 | } else { |
2236 | #define LLIST_NODE_SZ sizeof(struct llist_node) |
2237 | |
		num_entries = htab->use_percpu_counter ?
				      percpu_counter_sum(&htab->pcount) :
				      atomic_read(&htab->count);
2241 | usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; |
2242 | if (percpu) { |
2243 | usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; |
2244 | usage += value_size * num_possible_cpus() * num_entries; |
2245 | } |
2246 | } |
2247 | return usage; |
2248 | } |
2249 | |
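/* The ops tables below back the UAPI map types: htab_map_ops is
 * BPF_MAP_TYPE_HASH, htab_lru_map_ops is BPF_MAP_TYPE_LRU_HASH,
 * htab_percpu_map_ops is BPF_MAP_TYPE_PERCPU_HASH, htab_lru_percpu_map_ops
 * is BPF_MAP_TYPE_LRU_PERCPU_HASH and htab_of_maps_map_ops is
 * BPF_MAP_TYPE_HASH_OF_MAPS.
 */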
2250 | BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) |
2251 | const struct bpf_map_ops htab_map_ops = { |
2252 | .map_meta_equal = bpf_map_meta_equal, |
2253 | .map_alloc_check = htab_map_alloc_check, |
2254 | .map_alloc = htab_map_alloc, |
2255 | .map_free = htab_map_free, |
2256 | .map_get_next_key = htab_map_get_next_key, |
2257 | .map_release_uref = htab_map_free_timers, |
2258 | .map_lookup_elem = htab_map_lookup_elem, |
2259 | .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, |
2260 | .map_update_elem = htab_map_update_elem, |
2261 | .map_delete_elem = htab_map_delete_elem, |
2262 | .map_gen_lookup = htab_map_gen_lookup, |
2263 | .map_seq_show_elem = htab_map_seq_show_elem, |
2264 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2265 | .map_for_each_callback = bpf_for_each_hash_elem, |
2266 | .map_mem_usage = htab_map_mem_usage, |
2267 | BATCH_OPS(htab), |
2268 | .map_btf_id = &htab_map_btf_ids[0], |
2269 | .iter_seq_info = &iter_seq_info, |
2270 | }; |
2271 | |
2272 | const struct bpf_map_ops htab_lru_map_ops = { |
2273 | .map_meta_equal = bpf_map_meta_equal, |
2274 | .map_alloc_check = htab_map_alloc_check, |
2275 | .map_alloc = htab_map_alloc, |
2276 | .map_free = htab_map_free, |
2277 | .map_get_next_key = htab_map_get_next_key, |
2278 | .map_release_uref = htab_map_free_timers, |
2279 | .map_lookup_elem = htab_lru_map_lookup_elem, |
2280 | .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, |
2281 | .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, |
2282 | .map_update_elem = htab_lru_map_update_elem, |
2283 | .map_delete_elem = htab_lru_map_delete_elem, |
2284 | .map_gen_lookup = htab_lru_map_gen_lookup, |
2285 | .map_seq_show_elem = htab_map_seq_show_elem, |
2286 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2287 | .map_for_each_callback = bpf_for_each_hash_elem, |
2288 | .map_mem_usage = htab_map_mem_usage, |
2289 | BATCH_OPS(htab_lru), |
2290 | .map_btf_id = &htab_map_btf_ids[0], |
2291 | .iter_seq_info = &iter_seq_info, |
2292 | }; |
2293 | |
2294 | /* Called from eBPF program */ |
2295 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2296 | { |
2297 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2298 | |
2299 | if (l) |
2300 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2301 | else |
2302 | return NULL; |
2303 | } |
2304 | |
2305 | static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2306 | { |
2307 | struct htab_elem *l; |
2308 | |
2309 | if (cpu >= nr_cpu_ids) |
2310 | return NULL; |
2311 | |
2312 | l = __htab_map_lookup_elem(map, key); |
2313 | if (l) |
2314 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2315 | else |
2316 | return NULL; |
2317 | } |
2318 | |
2319 | static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2320 | { |
2321 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2322 | |
2323 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2325 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2326 | } |
2327 | |
2328 | return NULL; |
2329 | } |
2330 | |
2331 | static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2332 | { |
2333 | struct htab_elem *l; |
2334 | |
2335 | if (cpu >= nr_cpu_ids) |
2336 | return NULL; |
2337 | |
2338 | l = __htab_map_lookup_elem(map, key); |
2339 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2341 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2342 | } |
2343 | |
2344 | return NULL; |
2345 | } |
2346 | |
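/* Syscall-side lookup for percpu hash maps. Unlike the BPF-side lookups
 * above, which return a single CPU's slot, this copies the value for every
 * possible CPU into one flat buffer, one round_up(value_size, 8) slot per
 * CPU.
 */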
2347 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) |
2348 | { |
2349 | struct htab_elem *l; |
2350 | void __percpu *pptr; |
2351 | int ret = -ENOENT; |
2352 | int cpu, off = 0; |
2353 | u32 size; |
2354 | |
2355 | /* per_cpu areas are zero-filled and bpf programs can only |
2356 | * access 'value_size' of them, so copying rounded areas |
2357 | * will not leak any kernel data |
2358 | */ |
2359 | size = round_up(map->value_size, 8); |
2360 | rcu_read_lock(); |
2361 | l = __htab_map_lookup_elem(map, key); |
2362 | if (!l) |
2363 | goto out; |
2364 | /* We do not mark LRU map element here in order to not mess up |
2365 | * eviction heuristics when user space does a map walk. |
2366 | */ |
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
2371 | off += size; |
2372 | } |
2373 | ret = 0; |
2374 | out: |
2375 | rcu_read_unlock(); |
2376 | return ret; |
2377 | } |
2378 | |
2379 | int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, |
2380 | u64 map_flags) |
2381 | { |
2382 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2383 | int ret; |
2384 | |
2385 | rcu_read_lock(); |
2386 | if (htab_is_lru(htab)) |
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
2392 | rcu_read_unlock(); |
2393 | |
2394 | return ret; |
2395 | } |
2396 | |
2397 | static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, |
2398 | struct seq_file *m) |
2399 | { |
2400 | struct htab_elem *l; |
2401 | void __percpu *pptr; |
2402 | int cpu; |
2403 | |
2404 | rcu_read_lock(); |
2405 | |
2406 | l = __htab_map_lookup_elem(map, key); |
2407 | if (!l) { |
2408 | rcu_read_unlock(); |
2409 | return; |
2410 | } |
2411 | |
	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");
2422 | |
2423 | rcu_read_unlock(); |
2424 | } |
2425 | |
2426 | const struct bpf_map_ops htab_percpu_map_ops = { |
2427 | .map_meta_equal = bpf_map_meta_equal, |
2428 | .map_alloc_check = htab_map_alloc_check, |
2429 | .map_alloc = htab_map_alloc, |
2430 | .map_free = htab_map_free, |
2431 | .map_get_next_key = htab_map_get_next_key, |
2432 | .map_lookup_elem = htab_percpu_map_lookup_elem, |
2433 | .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, |
2434 | .map_update_elem = htab_percpu_map_update_elem, |
2435 | .map_delete_elem = htab_map_delete_elem, |
2436 | .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, |
2437 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2438 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2439 | .map_for_each_callback = bpf_for_each_hash_elem, |
2440 | .map_mem_usage = htab_map_mem_usage, |
2441 | BATCH_OPS(htab_percpu), |
2442 | .map_btf_id = &htab_map_btf_ids[0], |
2443 | .iter_seq_info = &iter_seq_info, |
2444 | }; |
2445 | |
2446 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
2447 | .map_meta_equal = bpf_map_meta_equal, |
2448 | .map_alloc_check = htab_map_alloc_check, |
2449 | .map_alloc = htab_map_alloc, |
2450 | .map_free = htab_map_free, |
2451 | .map_get_next_key = htab_map_get_next_key, |
2452 | .map_lookup_elem = htab_lru_percpu_map_lookup_elem, |
2453 | .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, |
2454 | .map_update_elem = htab_lru_percpu_map_update_elem, |
2455 | .map_delete_elem = htab_lru_map_delete_elem, |
2456 | .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, |
2457 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2458 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2459 | .map_for_each_callback = bpf_for_each_hash_elem, |
2460 | .map_mem_usage = htab_map_mem_usage, |
2461 | BATCH_OPS(htab_lru_percpu), |
2462 | .map_btf_id = &htab_map_btf_ids[0], |
2463 | .iter_seq_info = &iter_seq_info, |
2464 | }; |
2465 | |
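/* fd-based hash maps (hash-of-maps) take a u32 fd as the value from user
 * space but store a pointer internally, hence the strict value_size check.
 */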
2466 | static int fd_htab_map_alloc_check(union bpf_attr *attr) |
2467 | { |
2468 | if (attr->value_size != sizeof(u32)) |
2469 | return -EINVAL; |
2470 | return htab_map_alloc_check(attr); |
2471 | } |
2472 | |
2473 | static void fd_htab_map_free(struct bpf_map *map) |
2474 | { |
2475 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2476 | struct hlist_nulls_node *n; |
2477 | struct hlist_nulls_head *head; |
2478 | struct htab_elem *l; |
2479 | int i; |
2480 | |
2481 | for (i = 0; i < htab->n_buckets; i++) { |
		head = select_bucket(htab, i);
2483 | |
2484 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
2485 | void *ptr = fd_htab_map_get_ptr(map, l); |
2486 | |
2487 | map->ops->map_fd_put_ptr(ptr); |
2488 | } |
2489 | } |
2490 | |
2491 | htab_map_free(map); |
2492 | } |
2493 | |
2494 | /* only called from syscall */ |
2495 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) |
2496 | { |
2497 | void **ptr; |
2498 | int ret = 0; |
2499 | |
2500 | if (!map->ops->map_fd_sys_lookup_elem) |
2501 | return -ENOTSUPP; |
2502 | |
2503 | rcu_read_lock(); |
2504 | ptr = htab_map_lookup_elem(map, key); |
2505 | if (ptr) |
2506 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); |
2507 | else |
2508 | ret = -ENOENT; |
2509 | rcu_read_unlock(); |
2510 | |
2511 | return ret; |
2512 | } |
2513 | |
2514 | /* only called from syscall */ |
2515 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, |
2516 | void *key, void *value, u64 map_flags) |
2517 | { |
2518 | void *ptr; |
2519 | int ret; |
2520 | u32 ufd = *(u32 *)value; |
2521 | |
2522 | ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); |
2523 | if (IS_ERR(ptr)) |
2524 | return PTR_ERR(ptr); |
2525 | |
	ret = htab_map_update_elem(map, key, &ptr, map_flags);
2527 | if (ret) |
2528 | map->ops->map_fd_put_ptr(ptr); |
2529 | |
2530 | return ret; |
2531 | } |
2532 | |
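/* hash-of-maps: bpf_fd_htab_map_update_elem() above converts the inner map
 * fd supplied by user space into a struct bpf_map pointer before storing
 * it; BPF-side lookups then return that pointer directly.
 */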
2533 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) |
2534 | { |
2535 | struct bpf_map *map, *inner_map_meta; |
2536 | |
	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
2544 | return map; |
2545 | } |
2546 | |
2547 | map->inner_map_meta = inner_map_meta; |
2548 | |
2549 | return map; |
2550 | } |
2551 | |
2552 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) |
2553 | { |
2554 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); |
2555 | |
2556 | if (!inner_map) |
2557 | return NULL; |
2558 | |
2559 | return READ_ONCE(*inner_map); |
2560 | } |
2561 | |
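/* Inline the hash-of-maps lookup into the calling program. The emitted
 * instructions are roughly equivalent to this C sketch:
 *
 *	elem = __htab_map_lookup_elem(map, key);
 *	if (!elem)
 *		return NULL;
 *	return *(void **)(elem->key + round_up(map->key_size, 8));
 */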
2562 | static int htab_of_map_gen_lookup(struct bpf_map *map, |
2563 | struct bpf_insn *insn_buf) |
2564 | { |
2565 | struct bpf_insn *insn = insn_buf; |
2566 | const int ret = BPF_REG_0; |
2567 | |
2568 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
2569 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
2570 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
2571 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); |
2572 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
2573 | offsetof(struct htab_elem, key) + |
2574 | round_up(map->key_size, 8)); |
2575 | *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); |
2576 | |
2577 | return insn - insn_buf; |
2578 | } |
2579 | |
2580 | static void htab_of_map_free(struct bpf_map *map) |
2581 | { |
	bpf_map_meta_free(map->inner_map_meta);
2583 | fd_htab_map_free(map); |
2584 | } |
2585 | |
2586 | const struct bpf_map_ops htab_of_maps_map_ops = { |
2587 | .map_alloc_check = fd_htab_map_alloc_check, |
2588 | .map_alloc = htab_of_map_alloc, |
2589 | .map_free = htab_of_map_free, |
2590 | .map_get_next_key = htab_map_get_next_key, |
2591 | .map_lookup_elem = htab_of_map_lookup_elem, |
2592 | .map_delete_elem = htab_map_delete_elem, |
2593 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
2594 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
2595 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, |
2596 | .map_gen_lookup = htab_of_map_gen_lookup, |
2597 | .map_check_btf = map_check_no_btf, |
2598 | .map_mem_usage = htab_map_mem_usage, |
2599 | BATCH_OPS(htab), |
2600 | .map_btf_id = &htab_map_btf_ids[0], |
2601 | }; |
2602 | |