// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch
/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior invocation
 * from one of these contexts has completed. sys_bpf() uses the same mechanism
 * by pinning the task to the current CPU and incrementing the recursion
 * protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure the deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use a raw spinlock for a preallocated hash map on an
 * RT kernel, because there was no memory allocation within the lock held
 * sections. However, after the hash map was fully converted to use
 * bpf_mem_alloc, only non-synchronous memory allocation remains for the
 * non-preallocated hash map, so it is safe to always use a raw spinlock
 * for the bucket lock.
 */
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
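
/*
 * Illustrative note (not from the original sources): each CPU keeps one
 * recursion counter per lock "slot". A bucket's slot is derived from its
 * hash in htab_lock_bucket() below:
 *
 *	slot = hash & min(HASHTAB_MAP_LOCK_MASK, n_buckets - 1);
 *
 * e.g. with n_buckets == 1024 a bucket whose hash is 0x2f maps to slot
 * 0x2f & 7 == 7, so at most 8 counters per CPU are needed regardless of
 * the table size.
 */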
85 | |
86 | struct bpf_htab { |
87 | struct bpf_map map; |
88 | struct bpf_mem_alloc ma; |
89 | struct bpf_mem_alloc pcpu_ma; |
90 | struct bucket *buckets; |
91 | void *elems; |
92 | union { |
93 | struct pcpu_freelist freelist; |
94 | struct bpf_lru lru; |
95 | }; |
96 | struct htab_elem *__percpu *; |
97 | /* number of elements in non-preallocated hashtable are kept |
98 | * in either pcount or count |
99 | */ |
100 | struct percpu_counter pcount; |
101 | atomic_t count; |
102 | bool use_percpu_counter; |
103 | u32 n_buckets; /* number of hash buckets */ |
104 | u32 elem_size; /* size of each element in bytes */ |
105 | u32 hashrnd; |
106 | struct lock_class_key lockdep_key; |
107 | int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; |
108 | }; |
109 | |
110 | /* each htab element is struct htab_elem + key + value */ |
111 | struct htab_elem { |
112 | union { |
113 | struct hlist_nulls_node hash_node; |
114 | struct { |
115 | void *padding; |
116 | union { |
117 | struct pcpu_freelist_node fnode; |
118 | struct htab_elem *batch_flink; |
119 | }; |
120 | }; |
121 | }; |
122 | union { |
123 | /* pointer to per-cpu pointer */ |
124 | void *ptr_to_pptr; |
125 | struct bpf_lru_node lru_node; |
126 | }; |
127 | u32 hash; |
128 | char key[] __aligned(8); |
129 | }; |
130 | |
131 | static inline bool htab_is_prealloc(const struct bpf_htab *htab) |
132 | { |
133 | return !(htab->map.map_flags & BPF_F_NO_PREALLOC); |
134 | } |
135 | |
136 | static void htab_init_buckets(struct bpf_htab *htab) |
137 | { |
138 | unsigned int i; |
139 | |
140 | for (i = 0; i < htab->n_buckets; i++) { |
141 | INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); |
142 | raw_spin_lock_init(&htab->buckets[i].raw_lock); |
143 | lockdep_set_class(&htab->buckets[i].raw_lock, |
144 | &htab->lockdep_key); |
145 | cond_resched(); |
146 | } |
147 | } |
148 | |
149 | static inline int htab_lock_bucket(const struct bpf_htab *htab, |
150 | struct bucket *b, u32 hash, |
151 | unsigned long *pflags) |
152 | { |
153 | unsigned long flags; |
154 | |
155 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
156 | |
157 | preempt_disable(); |
158 | local_irq_save(flags); |
159 | if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { |
160 | __this_cpu_dec(*(htab->map_locked[hash])); |
161 | local_irq_restore(flags); |
162 | preempt_enable(); |
163 | return -EBUSY; |
164 | } |
165 | |
166 | raw_spin_lock(&b->raw_lock); |
167 | *pflags = flags; |
168 | |
169 | return 0; |
170 | } |
171 | |
172 | static inline void htab_unlock_bucket(const struct bpf_htab *htab, |
173 | struct bucket *b, u32 hash, |
174 | unsigned long flags) |
175 | { |
176 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
177 | raw_spin_unlock(&b->raw_lock); |
178 | __this_cpu_dec(*(htab->map_locked[hash])); |
179 | local_irq_restore(flags); |
180 | preempt_enable(); |
181 | } |
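
/*
 * Illustrative usage sketch (not from the original sources): every
 * update/delete path in this file pairs the two helpers above around the
 * bucket list manipulation, bailing out with -EBUSY when the current CPU
 * already holds a bucket lock of this map (a re-entrant invocation):
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = htab_lock_bucket(htab, b, hash, &flags);
 *	if (ret)
 *		return ret;	// re-entered on this CPU, give up
 *	// ... modify b->head under the raw spinlock ...
 *	htab_unlock_bucket(htab, b, hash, flags);
 */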
182 | |
183 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); |
184 | |
185 | static bool htab_is_lru(const struct bpf_htab *htab) |
186 | { |
187 | return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || |
188 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
189 | } |
190 | |
191 | static bool htab_is_percpu(const struct bpf_htab *htab) |
192 | { |
193 | return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || |
194 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
195 | } |
196 | |
197 | static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, |
198 | void __percpu *pptr) |
199 | { |
200 | *(void __percpu **)(l->key + key_size) = pptr; |
201 | } |
202 | |
203 | static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) |
204 | { |
205 | return *(void __percpu **)(l->key + key_size); |
206 | } |
207 | |
208 | static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) |
209 | { |
210 | return *(void **)(l->key + roundup(map->key_size, 8)); |
211 | } |
212 | |
213 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) |
214 | { |
215 | return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); |
216 | } |
217 | |
218 | static bool (struct bpf_htab *htab) |
219 | { |
220 | return !htab_is_percpu(htab) && !htab_is_lru(htab); |
221 | } |

static void htab_free_prealloced_timers(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
		cond_resched();
	}
}

static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int i;

	if (IS_ERR_OR_NULL(htab->map.record))
		return;
	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();
	for (i = 0; i < num_entries; i++) {
		struct htab_elem *elem;

		elem = get_htab_elem(htab, i);
		if (htab_is_percpu(htab)) {
			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
			int cpu;

			for_each_possible_cpu(cpu) {
				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
				cond_resched();
			}
		} else {
			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
			cond_resched();
		}
		cond_resched();
	}
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
		cond_resched();
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 * (bucket_lock). If both locks need to be acquired together, the lock
 * order is always lru_lock -> bucket_lock and this only happens in
 * bpf_lru_list.c logic. For example, certain code path of
 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
 * will acquire lru_lock first followed by acquiring bucket_lock.
 *
 * In hashtab.c, to avoid deadlock, lock acquisition of
 * bucket_lock followed by lru_lock is not allowed. In such cases,
 * bucket_lock needs to be released first before acquiring lru_lock.
 */
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		bpf_map_inc_elem_count(&htab->map);
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	u32 num_entries = htab->map.max_entries;
	int err = -ENOMEM, i;

	if (htab_has_extra_elems(htab))
		num_entries += num_possible_cpus();

	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
					 htab->map.numa_node);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < num_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
					    GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
		cond_resched();
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, num_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, num_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
				    GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}
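
/*
 * Illustrative note (not from the original sources): the per-CPU extra
 * element allows an in-place replacement update on a full, preallocated
 * map without touching the freelist. alloc_htab_elem() below swaps it
 * with the element being replaced:
 *
 *	pl_new = this_cpu_ptr(htab->extra_elems);
 *	l_new = *pl_new;	// take this CPU's spare element
 *	*pl_new = old_elem;	// old element becomes the new spare
 */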

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}
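
/*
 * Illustrative user-space sketch (not from the original sources): a
 * minimal map creation that passes the checks above, using only
 * attributes referenced in this file. Adding BPF_F_ZERO_SEED would
 * additionally require CAP_SYS_ADMIN:
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = 4,	// non-zero
 *		.value_size  = 8,	// non-zero
 *		.max_entries = 1024,	// non-zero
 *		.map_flags   = BPF_F_NO_PREALLOC,
 *	};
 *	int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */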

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	err = bpf_map_init_elem_count(&htab->map);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_elem_count;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
							   sizeof(int),
							   sizeof(int),
							   GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_u32();

	htab_init_buckets(htab);

/* compute_batch_value() computes batch value as num_online_cpus() * 2
 * and __percpu_counter_compare() needs
 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
 * for percpu_counter to be faster than atomic_t. In practice the average bpf
 * hash map size is 10k, which means that a system with 64 cpus will fill
 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
 * define our own batch count as 32 then 10k hash map can be filled up to 80%:
 * 10k - 8k > 32 _batch_ * 64 _cpus_
 * and __percpu_counter_compare() will still be fast. At that point hash map
 * collisions will dominate its performance anyway. Assume that hash map filled
 * to 50+% isn't going to be O(1) and use the following formula to choose
 * between percpu_counter and atomic_t.
 */
#define PERCPU_COUNTER_BATCH 32
	if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
		htab->use_percpu_counter = true;

	if (htab->use_percpu_counter) {
		err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
		if (err)
			goto free_map_locked;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
			 */
			err = alloc_extra_elems(htab);
			if (err)
				goto free_prealloc;
		}
	} else {
		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
		if (err)
			goto free_map_locked;
		if (percpu) {
			err = bpf_mem_alloc_init(&htab->pcpu_ma,
						 round_up(htab->map.value_size, 8), true);
			if (err)
				goto free_map_locked;
		}
	}

	return &htab->map;

free_prealloc:
	prealloc_destroy(htab);
free_map_locked:
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
free_elem_count:
	bpf_map_free_elem_count(&htab->map);
free_htab:
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
	if (likely(key_len % 4 == 0))
		return jhash2(key, key_len / 4, hashrnd);
	return jhash(key, key_len, hashrnd);
}
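
/*
 * Illustrative note (not from the original sources): jhash2() consumes
 * the key as u32 words and is cheaper than the byte-wise jhash(), so it
 * is preferred whenever the key length is a multiple of 4. E.g. for an
 * 8-byte key the call becomes
 *
 *	hash = jhash2(key, 2, hashrnd);	// 8 / 4 == 2 words
 *
 * while a 6-byte key falls back to jhash(key, 6, hashrnd).
 */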

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event that elements moved from one bucket into another
 * while the linked list was being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}
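
/*
 * Illustrative note (not from the original sources): each bucket's nulls
 * list is initialized with the bucket index as its nulls value (see
 * INIT_HLIST_NULLS_HEAD in htab_init_buckets()). If a lockless walk ends
 * on a nulls marker whose value is not this bucket's index, an element
 * was freed and reused in another bucket mid-walk, so the walk retries:
 *
 *	if (get_nulls_value(n) != (hash & (n_buckets - 1)))
 *		goto again;	// ended on some other bucket's marker
 */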

/* Called from syscall or from eBPF program directly, so
 * arguments have to match bpf_map_lookup_elem() exactly.
 * The return value is adjusted by BPF instructions
 * in htab_map_gen_lookup().
 */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* inline bpf_map_lookup_elem() call.
 * Instead of:
 * bpf_prog
 *   bpf_map_lookup_elem
 *     map->ops->map_lookup_elem
 *       htab_map_lookup_elem
 *         __htab_map_lookup_elem
 * do:
 * bpf_prog
 *   __htab_map_lookup_elem
 */
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}
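
/*
 * Illustrative note (not from the original sources): the verifier patches
 * the three instructions above in place of the helper call, roughly:
 *
 *	call __htab_map_lookup_elem	// r0 = htab_elem or NULL
 *	if r0 == 0 goto +1		// miss: skip the adjustment
 *	r0 += offsetof(htab_elem, key) + round_up(key_size, 8)
 *					// hit: r0 now points at the value
 */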

static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
							void *key, const bool mark)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		if (mark)
			bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, true);
}

static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
{
	return __htab_lru_map_lookup_elem(map, key, false);
}

static int htab_lru_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int ref_reg = BPF_REG_1;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
			      offsetof(struct htab_elem, lru_node) +
			      offsetof(struct bpf_lru_node, ref));
	*insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
	*insn++ = BPF_ST_MEM(BPF_B, ret,
			     offsetof(struct htab_elem, lru_node) +
			     offsetof(struct bpf_lru_node, ref),
			     1);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	return insn - insn_buf;
}

static void check_and_free_fields(struct bpf_htab *htab,
				  struct htab_elem *elem)
{
	if (htab_is_percpu(htab)) {
		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
		int cpu;

		for_each_possible_cpu(cpu)
			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
	} else {
		void *map_value = elem->key + round_up(htab->map.key_size, 8);

		bpf_obj_free_fields(htab->map.record, map_value);
	}
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;
	int ret;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
	if (ret)
		return false;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			check_and_free_fields(htab, l);
			bpf_map_dec_elem_count(&htab->map);
			break;
		}

	htab_unlock_bucket(htab, b, tgt_l->hash, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i = 0;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	if (!key)
		goto find_first_elem;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l)
		goto find_first_elem;

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	check_and_free_fields(htab, l);
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
	bpf_mem_cache_free(&htab->ma, l);
}

static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
	struct bpf_map *map = &htab->map;
	void *ptr;

	if (map->ops->map_fd_put_ptr) {
		ptr = fd_htab_map_get_ptr(map, l);
		map->ops->map_fd_put_ptr(ptr);
	}
}

static bool is_map_full(struct bpf_htab *htab)
{
	if (htab->use_percpu_counter)
		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
						PERCPU_COUNTER_BATCH) >= 0;
	return atomic_read(&htab->count) >= htab->map.max_entries;
}
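
/*
 * Illustrative note (not from the original sources): with the per-cpu
 * counter, fullness is a comparison against max_entries with a tolerance
 * of batch * num_online_cpus(). __percpu_counter_compare() only sums the
 * per-cpu deltas when the cheap approximate count is within that slack,
 * so e.g. with PERCPU_COUNTER_BATCH == 32 and 64 online CPUs the exact
 * sum is taken only when the count is within 2048 of max_entries.
 */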

static void inc_elem_count(struct bpf_htab *htab)
{
	bpf_map_inc_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
	else
		atomic_inc(&htab->count);
}

static void dec_elem_count(struct bpf_htab *htab)
{
	bpf_map_dec_elem_count(&htab->map);

	if (htab->use_percpu_counter)
		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
	else
		atomic_dec(&htab->count);
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	htab_put_fd_value(htab, l);

	if (htab_is_prealloc(htab)) {
		bpf_map_dec_elem_count(&htab->map);
		check_and_free_fields(htab, l);
		__pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		dec_elem_count(htab);
		htab_elem_free(htab, l);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
			off += size;
		}
	}
}
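
/*
 * Illustrative note (not from the original sources): for onallcpus
 * updates (the syscall path), user space supplies one flat buffer holding
 * a value per possible CPU, each slot padded to an 8-byte boundary. With
 * value_size == 12 and 4 possible CPUs the layout consumed above is:
 *
 *	offset   0: value for CPU 0	(12 bytes + 4 bytes padding)
 *	offset  16: value for CPU 1
 *	offset  32: value for CPU 2
 *	offset  48: value for CPU 3	(total: 4 * round_up(12, 8) bytes)
 */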

static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for other cpus. Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (!onallcpus) {
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
			else /* Since elem is preallocated, we cannot touch special fields */
				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}

static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
	       BITS_PER_LONG == 64;
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 struct htab_elem *old_elem)
{
	u32 size = htab->map.value_size;
	bool prealloc = htab_is_prealloc(htab);
	struct htab_elem *l_new, **pl_new;
	void __percpu *pptr;

	if (prealloc) {
		if (old_elem) {
			/* if we're updating the existing element,
			 * use per-cpu extra elems to avoid freelist_pop/push
			 */
			pl_new = this_cpu_ptr(htab->extra_elems);
			l_new = *pl_new;
			htab_put_fd_value(htab, old_elem);
			*pl_new = old_elem;
		} else {
			struct pcpu_freelist_node *l;

			l = __pcpu_freelist_pop(&htab->freelist);
			if (!l)
				return ERR_PTR(-E2BIG);
			l_new = container_of(l, struct htab_elem, fnode);
			bpf_map_inc_elem_count(&htab->map);
		}
	} else {
		if (is_map_full(htab))
			if (!old_elem)
				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
				return ERR_PTR(-E2BIG);
		inc_elem_count(htab);
		l_new = bpf_mem_cache_alloc(&htab->ma);
		if (!l_new) {
			l_new = ERR_PTR(-ENOMEM);
			goto dec_count;
		}
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
			if (!pptr) {
				bpf_mem_cache_free(&htab->ma, l_new);
				l_new = ERR_PTR(-ENOMEM);
				goto dec_count;
			}
			l_new->ptr_to_pptr = pptr;
			pptr = *(void **)pptr;
		}

		pcpu_init_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else if (fd_htab_map_needs_adjust(htab)) {
		size = round_up(size, 8);
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	} else {
		copy_map_value(&htab->map,
			       l_new->key + round_up(key_size, 8),
			       value);
	}

	l_new->hash = hash;
	return l_new;
dec_count:
	dec_elem_count(htab);
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
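
/*
 * Illustrative sketch (not from the original sources): the flag semantics
 * checked above as seen from a BPF program. BPF_ANY (0) always passes
 * check_flags():
 *
 *	// fails with -EEXIST if the key is already present
 *	bpf_map_update_elem(&map, &key, &val, BPF_NOEXIST);
 *	// fails with -ENOENT if the key is absent
 *	bpf_map_update_elem(&map, &key, &val, BPF_EXIST);
 *	// create or replace unconditionally
 *	bpf_map_update_elem(&map, &key, &val, BPF_ANY);
 */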

/* Called from syscall or from eBPF program */
static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	if (unlikely(map_flags & BPF_F_LOCK)) {
		if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
			return -EINVAL;
		/* find an element without taking the bucket lock */
		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
					      htab->n_buckets);
		ret = check_flags(htab, l_old, map_flags);
		if (ret)
			return ret;
		if (l_old) {
			/* grab the element lock and update value in place */
			copy_map_value_locked(map,
					      l_old->key + round_up(key_size, 8),
					      value, false);
			return 0;
		}
		/* fall through, grab the bucket lock and lookup again.
		 * 99.9% chance that the element won't be found,
		 * but second lookup under lock has to be done.
		 */
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
		/* first lookup without the bucket lock didn't find the element,
		 * but second lookup with the bucket lock found it.
		 * This case is highly unlikely, but has to be dealt with:
		 * grab the element lock in addition to the bucket lock
		 * and update element in place
		 */
		copy_map_value_locked(map,
				      l_old->key + round_up(key_size, 8),
				      value, false);
		ret = 0;
		goto err;
	}

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		if (!htab_is_prealloc(htab))
			free_htab_elem(htab, l_old);
		else
			check_and_free_fields(htab, l_old);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
	check_and_free_fields(htab, elem);
	bpf_map_dec_elem_count(&htab->map);
	bpf_lru_push_free(&htab->lru, &elem->lru_node);
}

static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	copy_map_value(&htab->map,
		       l_new->key + round_up(map->key_size, 8), value);

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	htab_unlock_bucket(htab, b, hash, flags);

err_lock_bucket:
	if (ret)
		htab_lru_push_free(htab, l_new);
	else if (l_old)
		htab_lru_push_free(htab, l_old);

	return ret;
}

static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					  void *value, u64 map_flags,
					  bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, NULL);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					      void *value, u64 map_flags,
					      bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		goto err_lock_bucket;

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	htab_unlock_bucket(htab, b, hash, flags);
err_lock_bucket:
	if (l_new) {
		bpf_map_dec_elem_count(&htab->map);
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	}
	return ret;
}

static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					    void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
	} else {
		ret = -ENOENT;
	}

	htab_unlock_bucket(htab, b, hash, flags);
	return ret;
}

static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
		     !rcu_read_lock_bh_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size, htab->hashrnd);
	b = __select_bucket(htab, hash);
	head = &b->head;

	ret = htab_lock_bucket(htab, b, hash, &flags);
	if (ret)
		return ret;

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l)
		hlist_nulls_del_rcu(&l->hash_node);
	else
		ret = -ENOENT;

	htab_unlock_bucket(htab, b, hash, flags);
	if (l)
		htab_lru_push_free(htab, l);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	/* It's called from a worker thread, so disable migration here,
	 * since bpf_mem_cache_free() relies on that.
	 */
	migrate_disable();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
	migrate_enable();
}

static void htab_free_malloced_timers(struct bpf_htab *htab)
{
	int i;

	rcu_read_lock();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry(l, n, head, hash_node) {
			/* We only free timer on uref dropping to zero */
			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}

static void htab_map_free_timers(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* We only free timer on uref dropping to zero */
	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (!htab_is_prealloc(htab))
		htab_free_malloced_timers(htab);
	else
		htab_free_prealloced_timers(htab);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
	 * underneath and is responsible for waiting for callbacks to finish
	 * during bpf_mem_alloc_destroy().
	 */
	if (!htab_is_prealloc(htab)) {
		delete_all_elements(htab);
	} else {
		htab_free_prealloced_fields(htab);
		prealloc_destroy(htab);
	}

	bpf_map_free_elem_count(map);
	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	lockdep_unregister_key(&htab->lockdep_key);
	bpf_map_area_free(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}
1579 | |
1580 | static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1581 | void *value, bool is_lru_map, |
1582 | bool is_percpu, u64 flags) |
1583 | { |
1584 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1585 | struct hlist_nulls_head *head; |
1586 | unsigned long bflags; |
1587 | struct htab_elem *l; |
1588 | u32 hash, key_size; |
1589 | struct bucket *b; |
1590 | int ret; |
1591 | |
1592 | key_size = map->key_size; |
1593 | |
1594 | hash = htab_map_hash(key, key_len: key_size, hashrnd: htab->hashrnd); |
1595 | b = __select_bucket(htab, hash); |
1596 | head = &b->head; |
1597 | |
1598 | ret = htab_lock_bucket(htab, b, hash, pflags: &bflags); |
1599 | if (ret) |
1600 | return ret; |
1601 | |
1602 | l = lookup_elem_raw(head, hash, key, key_size); |
1603 | if (!l) { |
1604 | ret = -ENOENT; |
1605 | } else { |
1606 | if (is_percpu) { |
1607 | u32 roundup_value_size = round_up(map->value_size, 8); |
1608 | void __percpu *pptr; |
1609 | int off = 0, cpu; |
1610 | |
1611 | pptr = htab_elem_get_ptr(l, key_size); |
1612 | for_each_possible_cpu(cpu) { |
1613 | copy_map_value_long(map: &htab->map, dst: value + off, per_cpu_ptr(pptr, cpu)); |
1614 | check_and_init_map_value(map: &htab->map, dst: value + off); |
1615 | off += roundup_value_size; |
1616 | } |
1617 | } else { |
1618 | u32 roundup_key_size = round_up(map->key_size, 8); |
1619 | |
1620 | if (flags & BPF_F_LOCK) |
1621 | copy_map_value_locked(map, dst: value, src: l->key + |
1622 | roundup_key_size, |
1623 | lock_src: true); |
1624 | else |
1625 | copy_map_value(map, dst: value, src: l->key + |
1626 | roundup_key_size); |
1627 | /* Zeroing special fields in the temp buffer */ |
1628 | check_and_init_map_value(map, dst: value); |
1629 | } |
1630 | |
1631 | hlist_nulls_del_rcu(n: &l->hash_node); |
1632 | if (!is_lru_map) |
1633 | free_htab_elem(htab, l); |
1634 | } |
1635 | |
1636 | htab_unlock_bucket(htab, b, hash, flags: bflags); |
1637 | |
1638 | if (is_lru_map && l) |
1639 | htab_lru_push_free(htab, elem: l); |
1640 | |
1641 | return ret; |
1642 | } |
1643 | |
1644 | static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1645 | void *value, u64 flags) |
1646 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
						 flags);
1649 | } |
1650 | |
1651 | static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1652 | void *key, void *value, |
1653 | u64 flags) |
1654 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
						 flags);
1657 | } |
1658 | |
1659 | static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1660 | void *value, u64 flags) |
1661 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
						 flags);
1664 | } |
1665 | |
1666 | static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1667 | void *key, void *value, |
1668 | u64 flags) |
1669 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
						 flags);
1672 | } |
1673 | |
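/* Common implementation of the batch operations. The in_batch/out_batch
 * cursor is the index of the next bucket to process: the walk starts at
 * in_batch (bucket 0 if in_batch is NULL), copies out whole buckets at a
 * time and reports -ENOENT once the last bucket has been consumed, or
 * -ENOSPC if the first non-empty bucket does not fit into the
 * user-supplied count. A hedged user-space sketch (assuming libbpf):
 *
 *	__u32 out, count = 64;
 *	err = bpf_map_lookup_batch(map_fd, NULL, &out, keys, values,
 *				   &count, NULL);
 *
 * with subsequent calls passing the previous 'out' as in_batch until
 * -ENOENT is returned.
 */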
1674 | static int |
1675 | __htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1676 | const union bpf_attr *attr, |
1677 | union bpf_attr __user *uattr, |
1678 | bool do_delete, bool is_lru_map, |
1679 | bool is_percpu) |
1680 | { |
1681 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1682 | u32 bucket_cnt, total, key_size, value_size, roundup_key_size; |
1683 | void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; |
1684 | void __user *uvalues = u64_to_user_ptr(attr->batch.values); |
1685 | void __user *ukeys = u64_to_user_ptr(attr->batch.keys); |
1686 | void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); |
1687 | u32 batch, max_count, size, bucket_size, map_id; |
1688 | struct htab_elem *node_to_free = NULL; |
1689 | u64 elem_map_flags, map_flags; |
1690 | struct hlist_nulls_head *head; |
1691 | struct hlist_nulls_node *n; |
1692 | unsigned long flags = 0; |
1693 | bool locked = false; |
1694 | struct htab_elem *l; |
1695 | struct bucket *b; |
1696 | int ret = 0; |
1697 | |
1698 | elem_map_flags = attr->batch.elem_flags; |
1699 | if ((elem_map_flags & ~BPF_F_LOCK) || |
	    ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1701 | return -EINVAL; |
1702 | |
1703 | map_flags = attr->batch.flags; |
1704 | if (map_flags) |
1705 | return -EINVAL; |
1706 | |
1707 | max_count = attr->batch.count; |
1708 | if (!max_count) |
1709 | return 0; |
1710 | |
1711 | if (put_user(0, &uattr->batch.count)) |
1712 | return -EFAULT; |
1713 | |
1714 | batch = 0; |
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1716 | return -EFAULT; |
1717 | |
1718 | if (batch >= htab->n_buckets) |
1719 | return -ENOENT; |
1720 | |
1721 | key_size = htab->map.key_size; |
1722 | roundup_key_size = round_up(htab->map.key_size, 8); |
1723 | value_size = htab->map.value_size; |
1724 | size = round_up(value_size, 8); |
1725 | if (is_percpu) |
1726 | value_size = size * num_possible_cpus(); |
1727 | total = 0; |
1728 | /* while experimenting with hash tables with sizes ranging from 10 to |
1729 | * 1000, it was observed that a bucket can have up to 5 entries. |
1730 | */ |
1731 | bucket_size = 5; |
1732 | |
1733 | alloc: |
1734 | /* We cannot do copy_from_user or copy_to_user inside |
1735 | * the rcu_read_lock. Allocate enough space here. |
1736 | */ |
	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1739 | if (!keys || !values) { |
1740 | ret = -ENOMEM; |
1741 | goto after_loop; |
1742 | } |
1743 | |
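	/* Walk protocol: each bucket is first scanned under RCU only to
	 * count entries. Only when the bucket is non-empty is the bucket
	 * lock taken (locked == true) and the scan redone under the lock,
	 * so empty buckets are skipped without any lock traffic.
	 */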
1744 | again: |
1745 | bpf_disable_instrumentation(); |
1746 | rcu_read_lock(); |
1747 | again_nocopy: |
1748 | dst_key = keys; |
1749 | dst_val = values; |
1750 | b = &htab->buckets[batch]; |
1751 | head = &b->head; |
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(htab, b, batch, &flags);
1755 | if (ret) { |
1756 | rcu_read_unlock(); |
1757 | bpf_enable_instrumentation(); |
1758 | goto after_loop; |
1759 | } |
1760 | } |
1761 | |
1762 | bucket_cnt = 0; |
1763 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
1764 | bucket_cnt++; |
1765 | |
1766 | if (bucket_cnt && !locked) { |
1767 | locked = true; |
1768 | goto again_nocopy; |
1769 | } |
1770 | |
1771 | if (bucket_cnt > (max_count - total)) { |
1772 | if (total == 0) |
1773 | ret = -ENOSPC; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the bucket lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
1778 | rcu_read_unlock(); |
1779 | bpf_enable_instrumentation(); |
1780 | goto after_loop; |
1781 | } |
1782 | |
1783 | if (bucket_cnt > bucket_size) { |
1784 | bucket_size = bucket_cnt; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the bucket lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
1789 | rcu_read_unlock(); |
1790 | bpf_enable_instrumentation(); |
		kvfree(keys);
		kvfree(values);
1793 | goto alloc; |
1794 | } |
1795 | |
1796 | /* Next block is only safe to run if you have grabbed the lock */ |
1797 | if (!locked) |
1798 | goto next_batch; |
1799 | |
1800 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1801 | memcpy(dst_key, l->key, key_size); |
1802 | |
1803 | if (is_percpu) { |
1804 | int off = 0, cpu; |
1805 | void __percpu *pptr; |
1806 | |
			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
1811 | off += size; |
1812 | } |
1813 | } else { |
1814 | value = l->key + roundup_key_size; |
1815 | if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { |
1816 | struct bpf_map **inner_map = value; |
1817 | |
1818 | /* Actual value is the id of the inner map */ |
1819 | map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); |
1820 | value = &map_id; |
1821 | } |
1822 | |
1823 | if (elem_map_flags & BPF_F_LOCK) |
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
1830 | } |
1831 | if (do_delete) { |
			hlist_nulls_del_rcu(&l->hash_node);
1833 | |
1834 | /* bpf_lru_push_free() will acquire lru_lock, which |
1835 | * may cause deadlock. See comments in function |
1836 | * prealloc_lru_pop(). Let us do bpf_lru_push_free() |
1837 | * after releasing the bucket lock. |
1838 | */ |
1839 | if (is_lru_map) { |
1840 | l->batch_flink = node_to_free; |
1841 | node_to_free = l; |
1842 | } else { |
1843 | free_htab_elem(htab, l); |
1844 | } |
1845 | } |
1846 | dst_key += key_size; |
1847 | dst_val += value_size; |
1848 | } |
1849 | |
	htab_unlock_bucket(htab, b, batch, flags);
1851 | locked = false; |
1852 | |
1853 | while (node_to_free) { |
1854 | l = node_to_free; |
1855 | node_to_free = node_to_free->batch_flink; |
		htab_lru_push_free(htab, l);
1857 | } |
1858 | |
1859 | next_batch: |
	/* If we are not copying data, we can go to the next bucket and avoid
	 * dropping the RCU read lock.
	 */
1863 | if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { |
1864 | batch++; |
1865 | goto again_nocopy; |
1866 | } |
1867 | |
1868 | rcu_read_unlock(); |
1869 | bpf_enable_instrumentation(); |
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
1874 | ret = -EFAULT; |
1875 | goto after_loop; |
1876 | } |
1877 | |
1878 | total += bucket_cnt; |
1879 | batch++; |
1880 | if (batch >= htab->n_buckets) { |
1881 | ret = -ENOENT; |
1882 | goto after_loop; |
1883 | } |
1884 | goto again; |
1885 | |
1886 | after_loop: |
1887 | if (ret == -EFAULT) |
1888 | goto out; |
1889 | |
1890 | /* copy # of entries and next batch */ |
1891 | ubatch = u64_to_user_ptr(attr->batch.out_batch); |
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1893 | put_user(total, &uattr->batch.count)) |
1894 | ret = -EFAULT; |
1895 | |
1896 | out: |
	kvfree(keys);
	kvfree(values);
1899 | return ret; |
1900 | } |
1901 | |
1902 | static int |
1903 | htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1904 | union bpf_attr __user *uattr) |
1905 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
1908 | } |
1909 | |
1910 | static int |
1911 | htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1912 | const union bpf_attr *attr, |
1913 | union bpf_attr __user *uattr) |
1914 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
1917 | } |
1918 | |
1919 | static int |
1920 | htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1921 | union bpf_attr __user *uattr) |
1922 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
1925 | } |
1926 | |
1927 | static int |
1928 | htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1929 | const union bpf_attr *attr, |
1930 | union bpf_attr __user *uattr) |
1931 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
1934 | } |
1935 | |
1936 | static int |
1937 | htab_lru_percpu_map_lookup_batch(struct bpf_map *map, |
1938 | const union bpf_attr *attr, |
1939 | union bpf_attr __user *uattr) |
1940 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
1943 | } |
1944 | |
1945 | static int |
1946 | htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1947 | const union bpf_attr *attr, |
1948 | union bpf_attr __user *uattr) |
1949 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
1952 | } |
1953 | |
1954 | static int |
1955 | htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1956 | union bpf_attr __user *uattr) |
1957 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
1960 | } |
1961 | |
1962 | static int |
1963 | htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, |
1964 | const union bpf_attr *attr, |
1965 | union bpf_attr __user *uattr) |
1966 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
1969 | } |
1970 | |
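/* Iterator state for the bpf_iter seq_file interface. bucket_id plus
 * skip_elems form a resumable cursor: skip_elems counts how many elements
 * of the current bucket were already shown, so iteration can resume close
 * to the same position after a seq_file buffer boundary.
 */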
1971 | struct bpf_iter_seq_hash_map_info { |
1972 | struct bpf_map *map; |
1973 | struct bpf_htab *htab; |
	void *percpu_value_buf; /* non-NULL means percpu hash */
1975 | u32 bucket_id; |
1976 | u32 skip_elems; |
1977 | }; |
1978 | |
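/* Find the next element to show. When an element is returned, this exits
 * with rcu_read_lock() held; the lock is dropped either here when moving
 * on to the next bucket or in bpf_hash_map_seq_stop(), keeping the element
 * valid while the iterator program runs.
 */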
1979 | static struct htab_elem * |
1980 | bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, |
1981 | struct htab_elem *prev_elem) |
1982 | { |
1983 | const struct bpf_htab *htab = info->htab; |
1984 | u32 skip_elems = info->skip_elems; |
1985 | u32 bucket_id = info->bucket_id; |
1986 | struct hlist_nulls_head *head; |
1987 | struct hlist_nulls_node *n; |
1988 | struct htab_elem *elem; |
1989 | struct bucket *b; |
1990 | u32 i, count; |
1991 | |
1992 | if (bucket_id >= htab->n_buckets) |
1993 | return NULL; |
1994 | |
1995 | /* try to find next elem in the same bucket */ |
1996 | if (prev_elem) { |
		/* no update/deletion on this bucket, prev_elem should still be
		 * valid and we won't skip elements.
		 */
2000 | n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); |
2001 | elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); |
2002 | if (elem) |
2003 | return elem; |
2004 | |
2005 | /* not found, unlock and go to the next bucket */ |
2006 | b = &htab->buckets[bucket_id++]; |
2007 | rcu_read_unlock(); |
2008 | skip_elems = 0; |
2009 | } |
2010 | |
2011 | for (i = bucket_id; i < htab->n_buckets; i++) { |
2012 | b = &htab->buckets[i]; |
2013 | rcu_read_lock(); |
2014 | |
2015 | count = 0; |
2016 | head = &b->head; |
2017 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2018 | if (count >= skip_elems) { |
2019 | info->bucket_id = i; |
2020 | info->skip_elems = count; |
2021 | return elem; |
2022 | } |
2023 | count++; |
2024 | } |
2025 | |
2026 | rcu_read_unlock(); |
2027 | skip_elems = 0; |
2028 | } |
2029 | |
2030 | info->bucket_id = i; |
2031 | info->skip_elems = 0; |
2032 | return NULL; |
2033 | } |
2034 | |
2035 | static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) |
2036 | { |
2037 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2038 | struct htab_elem *elem; |
2039 | |
2040 | elem = bpf_hash_map_seq_find_next(info, NULL); |
2041 | if (!elem) |
2042 | return NULL; |
2043 | |
2044 | if (*pos == 0) |
2045 | ++*pos; |
2046 | return elem; |
2047 | } |
2048 | |
2049 | static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
2050 | { |
2051 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2052 | |
2053 | ++*pos; |
2054 | ++info->skip_elems; |
	return bpf_hash_map_seq_find_next(info, v);
2056 | } |
2057 | |
2058 | static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) |
2059 | { |
2060 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2061 | u32 roundup_key_size, roundup_value_size; |
2062 | struct bpf_iter__bpf_map_elem ctx = {}; |
2063 | struct bpf_map *map = info->map; |
2064 | struct bpf_iter_meta meta; |
2065 | int ret = 0, off = 0, cpu; |
2066 | struct bpf_prog *prog; |
2067 | void __percpu *pptr; |
2068 | |
2069 | meta.seq = seq; |
	prog = bpf_iter_get_info(&meta, elem == NULL);
2071 | if (prog) { |
2072 | ctx.meta = &meta; |
2073 | ctx.map = info->map; |
2074 | if (elem) { |
2075 | roundup_key_size = round_up(map->key_size, 8); |
2076 | ctx.key = elem->key; |
2077 | if (!info->percpu_value_buf) { |
2078 | ctx.value = elem->key + roundup_key_size; |
2079 | } else { |
2080 | roundup_value_size = round_up(map->value_size, 8); |
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
2086 | off += roundup_value_size; |
2087 | } |
2088 | ctx.value = info->percpu_value_buf; |
2089 | } |
2090 | } |
		ret = bpf_iter_run_prog(prog, &ctx);
2092 | } |
2093 | |
2094 | return ret; |
2095 | } |
2096 | |
2097 | static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) |
2098 | { |
	return __bpf_hash_map_seq_show(seq, v);
2100 | } |
2101 | |
2102 | static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) |
2103 | { |
2104 | if (!v) |
2105 | (void)__bpf_hash_map_seq_show(seq, NULL); |
2106 | else |
2107 | rcu_read_unlock(); |
2108 | } |
2109 | |
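/* For percpu hash maps, pre-allocate one flat buffer of
 * round_up(value_size, 8) * num_possible_cpus() bytes;
 * __bpf_hash_map_seq_show() copies every CPU's value into it so the
 * iterator program sees all values through a single ctx.value pointer.
 */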
2110 | static int bpf_iter_init_hash_map(void *priv_data, |
2111 | struct bpf_iter_aux_info *aux) |
2112 | { |
2113 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2114 | struct bpf_map *map = aux->map; |
2115 | void *value_buf; |
2116 | u32 buf_size; |
2117 | |
2118 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
2119 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { |
2120 | buf_size = round_up(map->value_size, 8) * num_possible_cpus(); |
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2122 | if (!value_buf) |
2123 | return -ENOMEM; |
2124 | |
2125 | seq_info->percpu_value_buf = value_buf; |
2126 | } |
2127 | |
2128 | bpf_map_inc_with_uref(map); |
2129 | seq_info->map = map; |
2130 | seq_info->htab = container_of(map, struct bpf_htab, map); |
2131 | return 0; |
2132 | } |
2133 | |
2134 | static void bpf_iter_fini_hash_map(void *priv_data) |
2135 | { |
2136 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2137 | |
	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
2140 | } |
2141 | |
2142 | static const struct seq_operations bpf_hash_map_seq_ops = { |
2143 | .start = bpf_hash_map_seq_start, |
2144 | .next = bpf_hash_map_seq_next, |
2145 | .stop = bpf_hash_map_seq_stop, |
2146 | .show = bpf_hash_map_seq_show, |
2147 | }; |
2148 | |
2149 | static const struct bpf_iter_seq_info iter_seq_info = { |
2150 | .seq_ops = &bpf_hash_map_seq_ops, |
2151 | .init_seq_private = bpf_iter_init_hash_map, |
2152 | .fini_seq_private = bpf_iter_fini_hash_map, |
2153 | .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), |
2154 | }; |
2155 | |
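/* Implementation of the bpf_for_each_map_elem() helper for hash maps. The
 * callback runs under rcu_read_lock(), bucket by bucket, and the helper
 * returns the number of elements visited; a callback return of 0 continues
 * the walk and 1 stops it. A hedged BPF-side sketch (names are made up):
 *
 *	static long check_elem(struct bpf_map *map, void *key, void *val,
 *			       void *ctx)
 *	{
 *		return 0;
 *	}
 *	...
 *	bpf_for_each_map_elem(&my_htab, check_elem, &my_ctx, 0);
 */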
2156 | static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, |
2157 | void *callback_ctx, u64 flags) |
2158 | { |
2159 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2160 | struct hlist_nulls_head *head; |
2161 | struct hlist_nulls_node *n; |
2162 | struct htab_elem *elem; |
2163 | u32 roundup_key_size; |
2164 | int i, num_elems = 0; |
2165 | void __percpu *pptr; |
2166 | struct bucket *b; |
2167 | void *key, *val; |
2168 | bool is_percpu; |
2169 | u64 ret = 0; |
2170 | |
2171 | if (flags != 0) |
2172 | return -EINVAL; |
2173 | |
2174 | is_percpu = htab_is_percpu(htab); |
2175 | |
2176 | roundup_key_size = round_up(map->key_size, 8); |
	/* Disable migration so the percpu value prepared here is the same
	 * one the bpf program sees via bpf_map_lookup_elem().
	 */
2180 | if (is_percpu) |
2181 | migrate_disable(); |
2182 | for (i = 0; i < htab->n_buckets; i++) { |
2183 | b = &htab->buckets[i]; |
2184 | rcu_read_lock(); |
2185 | head = &b->head; |
2186 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2187 | key = elem->key; |
2188 | if (is_percpu) { |
2189 | /* current cpu value for percpu map */ |
				pptr = htab_elem_get_ptr(elem, map->key_size);
2191 | val = this_cpu_ptr(pptr); |
2192 | } else { |
2193 | val = elem->key + roundup_key_size; |
2194 | } |
2195 | num_elems++; |
2196 | ret = callback_fn((u64)(long)map, (u64)(long)key, |
2197 | (u64)(long)val, (u64)(long)callback_ctx, 0); |
2198 | /* return value: 0 - continue, 1 - stop and return */ |
2199 | if (ret) { |
2200 | rcu_read_unlock(); |
2201 | goto out; |
2202 | } |
2203 | } |
2204 | rcu_read_unlock(); |
2205 | } |
2206 | out: |
2207 | if (is_percpu) |
2208 | migrate_enable(); |
2209 | return num_elems; |
2210 | } |
2211 | |
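/* Account the memory footprint reported for this map (e.g. as the memlock
 * value in fdinfo). Preallocated maps report a fixed cost derived from
 * max_entries; non-preallocated maps use the current element count plus the
 * per-object llist_node overhead added by bpf_mem_alloc.
 */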
2212 | static u64 htab_map_mem_usage(const struct bpf_map *map) |
2213 | { |
2214 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2215 | u32 value_size = round_up(htab->map.value_size, 8); |
2216 | bool prealloc = htab_is_prealloc(htab); |
2217 | bool percpu = htab_is_percpu(htab); |
2218 | bool lru = htab_is_lru(htab); |
2219 | u64 num_entries; |
2220 | u64 usage = sizeof(struct bpf_htab); |
2221 | |
2222 | usage += sizeof(struct bucket) * htab->n_buckets; |
2223 | usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; |
2224 | if (prealloc) { |
2225 | num_entries = map->max_entries; |
2226 | if (htab_has_extra_elems(htab)) |
2227 | num_entries += num_possible_cpus(); |
2228 | |
2229 | usage += htab->elem_size * num_entries; |
2230 | |
2231 | if (percpu) |
2232 | usage += value_size * num_possible_cpus() * num_entries; |
2233 | else if (!lru) |
2234 | usage += sizeof(struct htab_elem *) * num_possible_cpus(); |
2235 | } else { |
2236 | #define LLIST_NODE_SZ sizeof(struct llist_node) |
2237 | |
		num_entries = htab->use_percpu_counter ?
				      percpu_counter_sum(&htab->pcount) :
				      atomic_read(&htab->count);
2241 | usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; |
2242 | if (percpu) { |
2243 | usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; |
2244 | usage += value_size * num_possible_cpus() * num_entries; |
2245 | } |
2246 | } |
2247 | return usage; |
2248 | } |
2249 | |
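/* The ops tables below back the UAPI map types: htab_map_ops is
 * BPF_MAP_TYPE_HASH, htab_lru_map_ops is BPF_MAP_TYPE_LRU_HASH,
 * htab_percpu_map_ops is BPF_MAP_TYPE_PERCPU_HASH, htab_lru_percpu_map_ops
 * is BPF_MAP_TYPE_LRU_PERCPU_HASH and htab_of_maps_map_ops is
 * BPF_MAP_TYPE_HASH_OF_MAPS.
 */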
2250 | BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) |
2251 | const struct bpf_map_ops htab_map_ops = { |
2252 | .map_meta_equal = bpf_map_meta_equal, |
2253 | .map_alloc_check = htab_map_alloc_check, |
2254 | .map_alloc = htab_map_alloc, |
2255 | .map_free = htab_map_free, |
2256 | .map_get_next_key = htab_map_get_next_key, |
2257 | .map_release_uref = htab_map_free_timers, |
2258 | .map_lookup_elem = htab_map_lookup_elem, |
2259 | .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, |
2260 | .map_update_elem = htab_map_update_elem, |
2261 | .map_delete_elem = htab_map_delete_elem, |
2262 | .map_gen_lookup = htab_map_gen_lookup, |
2263 | .map_seq_show_elem = htab_map_seq_show_elem, |
2264 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2265 | .map_for_each_callback = bpf_for_each_hash_elem, |
2266 | .map_mem_usage = htab_map_mem_usage, |
2267 | BATCH_OPS(htab), |
2268 | .map_btf_id = &htab_map_btf_ids[0], |
2269 | .iter_seq_info = &iter_seq_info, |
2270 | }; |
2271 | |
2272 | const struct bpf_map_ops htab_lru_map_ops = { |
2273 | .map_meta_equal = bpf_map_meta_equal, |
2274 | .map_alloc_check = htab_map_alloc_check, |
2275 | .map_alloc = htab_map_alloc, |
2276 | .map_free = htab_map_free, |
2277 | .map_get_next_key = htab_map_get_next_key, |
2278 | .map_release_uref = htab_map_free_timers, |
2279 | .map_lookup_elem = htab_lru_map_lookup_elem, |
2280 | .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, |
2281 | .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, |
2282 | .map_update_elem = htab_lru_map_update_elem, |
2283 | .map_delete_elem = htab_lru_map_delete_elem, |
2284 | .map_gen_lookup = htab_lru_map_gen_lookup, |
2285 | .map_seq_show_elem = htab_map_seq_show_elem, |
2286 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2287 | .map_for_each_callback = bpf_for_each_hash_elem, |
2288 | .map_mem_usage = htab_map_mem_usage, |
2289 | BATCH_OPS(htab_lru), |
2290 | .map_btf_id = &htab_map_btf_ids[0], |
2291 | .iter_seq_info = &iter_seq_info, |
2292 | }; |
2293 | |
2294 | /* Called from eBPF program */ |
2295 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2296 | { |
2297 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2298 | |
2299 | if (l) |
2300 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2301 | else |
2302 | return NULL; |
2303 | } |
2304 | |
2305 | static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2306 | { |
2307 | struct htab_elem *l; |
2308 | |
2309 | if (cpu >= nr_cpu_ids) |
2310 | return NULL; |
2311 | |
2312 | l = __htab_map_lookup_elem(map, key); |
2313 | if (l) |
2314 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2315 | else |
2316 | return NULL; |
2317 | } |
2318 | |
2319 | static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2320 | { |
2321 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2322 | |
2323 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2325 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2326 | } |
2327 | |
2328 | return NULL; |
2329 | } |
2330 | |
2331 | static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2332 | { |
2333 | struct htab_elem *l; |
2334 | |
2335 | if (cpu >= nr_cpu_ids) |
2336 | return NULL; |
2337 | |
2338 | l = __htab_map_lookup_elem(map, key); |
2339 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2341 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2342 | } |
2343 | |
2344 | return NULL; |
2345 | } |
2346 | |
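/* Syscall-side lookup for percpu hash maps. Unlike the BPF-side lookups
 * above, which return a single CPU's slot, this copies the value for every
 * possible CPU into one flat buffer, one round_up(value_size, 8) slot per
 * CPU.
 */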
2347 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) |
2348 | { |
2349 | struct htab_elem *l; |
2350 | void __percpu *pptr; |
2351 | int ret = -ENOENT; |
2352 | int cpu, off = 0; |
2353 | u32 size; |
2354 | |
2355 | /* per_cpu areas are zero-filled and bpf programs can only |
2356 | * access 'value_size' of them, so copying rounded areas |
2357 | * will not leak any kernel data |
2358 | */ |
2359 | size = round_up(map->value_size, 8); |
2360 | rcu_read_lock(); |
2361 | l = __htab_map_lookup_elem(map, key); |
2362 | if (!l) |
2363 | goto out; |
2364 | /* We do not mark LRU map element here in order to not mess up |
2365 | * eviction heuristics when user space does a map walk. |
2366 | */ |
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
2371 | off += size; |
2372 | } |
2373 | ret = 0; |
2374 | out: |
2375 | rcu_read_unlock(); |
2376 | return ret; |
2377 | } |
2378 | |
2379 | int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, |
2380 | u64 map_flags) |
2381 | { |
2382 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2383 | int ret; |
2384 | |
2385 | rcu_read_lock(); |
2386 | if (htab_is_lru(htab)) |
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
2392 | rcu_read_unlock(); |
2393 | |
2394 | return ret; |
2395 | } |
2396 | |
2397 | static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, |
2398 | struct seq_file *m) |
2399 | { |
2400 | struct htab_elem *l; |
2401 | void __percpu *pptr; |
2402 | int cpu; |
2403 | |
2404 | rcu_read_lock(); |
2405 | |
2406 | l = __htab_map_lookup_elem(map, key); |
2407 | if (!l) { |
2408 | rcu_read_unlock(); |
2409 | return; |
2410 | } |
2411 | |
	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");
2422 | |
2423 | rcu_read_unlock(); |
2424 | } |
2425 | |
2426 | const struct bpf_map_ops htab_percpu_map_ops = { |
2427 | .map_meta_equal = bpf_map_meta_equal, |
2428 | .map_alloc_check = htab_map_alloc_check, |
2429 | .map_alloc = htab_map_alloc, |
2430 | .map_free = htab_map_free, |
2431 | .map_get_next_key = htab_map_get_next_key, |
2432 | .map_lookup_elem = htab_percpu_map_lookup_elem, |
2433 | .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, |
2434 | .map_update_elem = htab_percpu_map_update_elem, |
2435 | .map_delete_elem = htab_map_delete_elem, |
2436 | .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, |
2437 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2438 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2439 | .map_for_each_callback = bpf_for_each_hash_elem, |
2440 | .map_mem_usage = htab_map_mem_usage, |
2441 | BATCH_OPS(htab_percpu), |
2442 | .map_btf_id = &htab_map_btf_ids[0], |
2443 | .iter_seq_info = &iter_seq_info, |
2444 | }; |
2445 | |
2446 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
2447 | .map_meta_equal = bpf_map_meta_equal, |
2448 | .map_alloc_check = htab_map_alloc_check, |
2449 | .map_alloc = htab_map_alloc, |
2450 | .map_free = htab_map_free, |
2451 | .map_get_next_key = htab_map_get_next_key, |
2452 | .map_lookup_elem = htab_lru_percpu_map_lookup_elem, |
2453 | .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, |
2454 | .map_update_elem = htab_lru_percpu_map_update_elem, |
2455 | .map_delete_elem = htab_lru_map_delete_elem, |
2456 | .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, |
2457 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2458 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2459 | .map_for_each_callback = bpf_for_each_hash_elem, |
2460 | .map_mem_usage = htab_map_mem_usage, |
2461 | BATCH_OPS(htab_lru_percpu), |
2462 | .map_btf_id = &htab_map_btf_ids[0], |
2463 | .iter_seq_info = &iter_seq_info, |
2464 | }; |
2465 | |
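/* fd-based hash maps (hash-of-maps) take a u32 fd as the value from user
 * space but store a pointer internally, hence the strict value_size check.
 */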
2466 | static int fd_htab_map_alloc_check(union bpf_attr *attr) |
2467 | { |
2468 | if (attr->value_size != sizeof(u32)) |
2469 | return -EINVAL; |
2470 | return htab_map_alloc_check(attr); |
2471 | } |
2472 | |
2473 | static void fd_htab_map_free(struct bpf_map *map) |
2474 | { |
2475 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2476 | struct hlist_nulls_node *n; |
2477 | struct hlist_nulls_head *head; |
2478 | struct htab_elem *l; |
2479 | int i; |
2480 | |
2481 | for (i = 0; i < htab->n_buckets; i++) { |
		head = select_bucket(htab, i);
2483 | |
2484 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
2485 | void *ptr = fd_htab_map_get_ptr(map, l); |
2486 | |
2487 | map->ops->map_fd_put_ptr(ptr); |
2488 | } |
2489 | } |
2490 | |
2491 | htab_map_free(map); |
2492 | } |
2493 | |
2494 | /* only called from syscall */ |
2495 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) |
2496 | { |
2497 | void **ptr; |
2498 | int ret = 0; |
2499 | |
2500 | if (!map->ops->map_fd_sys_lookup_elem) |
2501 | return -ENOTSUPP; |
2502 | |
2503 | rcu_read_lock(); |
2504 | ptr = htab_map_lookup_elem(map, key); |
2505 | if (ptr) |
2506 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); |
2507 | else |
2508 | ret = -ENOENT; |
2509 | rcu_read_unlock(); |
2510 | |
2511 | return ret; |
2512 | } |
2513 | |
2514 | /* only called from syscall */ |
2515 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, |
2516 | void *key, void *value, u64 map_flags) |
2517 | { |
2518 | void *ptr; |
2519 | int ret; |
2520 | u32 ufd = *(u32 *)value; |
2521 | |
2522 | ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); |
2523 | if (IS_ERR(ptr)) |
2524 | return PTR_ERR(ptr); |
2525 | |
	ret = htab_map_update_elem(map, key, &ptr, map_flags);
2527 | if (ret) |
2528 | map->ops->map_fd_put_ptr(ptr); |
2529 | |
2530 | return ret; |
2531 | } |
2532 | |
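/* hash-of-maps: bpf_fd_htab_map_update_elem() above converts the inner map
 * fd supplied by user space into a struct bpf_map pointer before storing
 * it; BPF-side lookups then return that pointer directly.
 */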
2533 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) |
2534 | { |
2535 | struct bpf_map *map, *inner_map_meta; |
2536 | |
	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
2544 | return map; |
2545 | } |
2546 | |
2547 | map->inner_map_meta = inner_map_meta; |
2548 | |
2549 | return map; |
2550 | } |
2551 | |
2552 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) |
2553 | { |
2554 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); |
2555 | |
2556 | if (!inner_map) |
2557 | return NULL; |
2558 | |
2559 | return READ_ONCE(*inner_map); |
2560 | } |
2561 | |
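/* Inline the hash-of-maps lookup into the calling program. The emitted
 * instructions are roughly equivalent to this C sketch:
 *
 *	elem = __htab_map_lookup_elem(map, key);
 *	if (!elem)
 *		return NULL;
 *	return *(void **)(elem->key + round_up(map->key_size, 8));
 */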
2562 | static int htab_of_map_gen_lookup(struct bpf_map *map, |
2563 | struct bpf_insn *insn_buf) |
2564 | { |
2565 | struct bpf_insn *insn = insn_buf; |
2566 | const int ret = BPF_REG_0; |
2567 | |
2568 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
2569 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
2570 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
2571 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); |
2572 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
2573 | offsetof(struct htab_elem, key) + |
2574 | round_up(map->key_size, 8)); |
2575 | *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); |
2576 | |
2577 | return insn - insn_buf; |
2578 | } |
2579 | |
2580 | static void htab_of_map_free(struct bpf_map *map) |
2581 | { |
	bpf_map_meta_free(map->inner_map_meta);
2583 | fd_htab_map_free(map); |
2584 | } |
2585 | |
2586 | const struct bpf_map_ops htab_of_maps_map_ops = { |
2587 | .map_alloc_check = fd_htab_map_alloc_check, |
2588 | .map_alloc = htab_of_map_alloc, |
2589 | .map_free = htab_of_map_free, |
2590 | .map_get_next_key = htab_map_get_next_key, |
2591 | .map_lookup_elem = htab_of_map_lookup_elem, |
2592 | .map_delete_elem = htab_map_delete_elem, |
2593 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
2594 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
2595 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, |
2596 | .map_gen_lookup = htab_of_map_gen_lookup, |
2597 | .map_check_btf = map_check_no_btf, |
2598 | .map_mem_usage = htab_map_mem_usage, |
2599 | BATCH_OPS(htab), |
2600 | .map_btf_id = &htab_map_btf_ids[0], |
2601 | }; |
2602 | |