// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>

#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
#include <net/tcx.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
			IS_FD_HASH(map))

#define BPF_OBJ_FLAG_MASK	(BPF_F_RDONLY | BPF_F_WRONLY)

DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);

int sysctl_unprivileged_bpf_disabled __read_mostly =
	IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;

static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
	[_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/*
 * If we're handed a bigger struct than we know of, ensure all the unknown bits
 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
 * we don't know about yet.
 *
 * There is a ToCToU between this function call and the following
 * copy_from_user() call. However, this is not a concern since this function is
 * only meant to future-proof against bits we don't know about yet.
 */
int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
			     size_t expected_size,
			     size_t actual_size)
{
	int res;

	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
		return -E2BIG;

	if (actual_size <= expected_size)
		return 0;

	if (uaddr.is_kernel)
		res = memchr_inv(uaddr.kernel + expected_size, 0,
				 actual_size - expected_size) == NULL;
	else
		res = check_zeroed_user(uaddr.user + expected_size,
					actual_size - expected_size);
	if (res < 0)
		return res;
	return res ? 0 : -E2BIG;
}
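
/*
 * Illustrative usage sketch only; example_copy_bpf_attr() is a hypothetical
 * name and not part of this file. It shows the pattern a syscall entry point
 * is expected to follow when copying in a 'union bpf_attr' that newer user
 * space may have extended beyond what this kernel knows about.
 */
static inline int example_copy_bpf_attr(union bpf_attr *attr, bpfptr_t uattr,
					unsigned int size)
{
	int err;

	/* reject garbage in any tail bytes beyond the known struct size */
	err = bpf_check_uarg_tail_zero(uattr, sizeof(*attr), size);
	if (err)
		return err;
	size = min_t(u32, size, sizeof(*attr));

	/* copy only the bytes the caller provided, leave the rest zeroed */
	memset(attr, 0, sizeof(*attr));
	if (copy_from_bpfptr(attr, uattr, size) != 0)
		return -EFAULT;
	return 0;
}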

const struct bpf_map_ops bpf_map_offload_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = bpf_map_offload_map_alloc,
	.map_free = bpf_map_offload_map_free,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = bpf_map_offload_map_mem_usage,
};

static void bpf_map_write_active_inc(struct bpf_map *map)
{
	atomic64_inc(&map->writecnt);
}

static void bpf_map_write_active_dec(struct bpf_map *map)
{
	atomic64_dec(&map->writecnt);
}

bool bpf_map_write_active(const struct bpf_map *map)
{
	return atomic64_read(&map->writecnt) != 0;
}

static u32 bpf_map_value_size(const struct bpf_map *map)
{
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
		return round_up(map->value_size, 8) * num_possible_cpus();
	else if (IS_FD_MAP(map))
		return sizeof(u32);
	else
		return map->value_size;
}

static void maybe_wait_bpf_programs(struct bpf_map *map)
{
	/* Wait for any running BPF programs to complete so that
	 * userspace, when we return to it, knows that all programs
	 * that could be running use the new map value.
	 */
	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
	    map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
		synchronize_rcu();
}

static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
				void *key, void *value, __u64 flags)
{
	int err;

	/* Need to create a kthread, thus must support schedule */
	if (bpf_map_is_offloaded(map)) {
		return bpf_map_offload_update_elem(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		return map->ops->map_update_elem(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
		return sock_map_update_elem_sys(map, key, value, flags);
	} else if (IS_FD_PROG_ARRAY(map)) {
		return bpf_fd_array_map_update_elem(map, map_file, key, value,
						    flags);
	}

	bpf_disable_instrumentation();
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_update(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_update(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
		err = bpf_percpu_cgroup_storage_update(map, key, value,
						       flags);
	} else if (IS_FD_ARRAY(map)) {
		rcu_read_lock();
		err = bpf_fd_array_map_update_elem(map, map_file, key, value,
						   flags);
		rcu_read_unlock();
	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
		rcu_read_lock();
		err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
						  flags);
		rcu_read_unlock();
	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
		/* rcu_read_lock() is not needed */
		err = bpf_fd_reuseport_array_update_elem(map, key, value,
							 flags);
	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
		   map->map_type == BPF_MAP_TYPE_STACK ||
		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
		err = map->ops->map_push_elem(map, value, flags);
	} else {
		rcu_read_lock();
		err = map->ops->map_update_elem(map, key, value, flags);
		rcu_read_unlock();
	}
	bpf_enable_instrumentation();
	maybe_wait_bpf_programs(map);

	return err;
}

static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
			      __u64 flags)
{
	void *ptr;
	int err;

	if (bpf_map_is_offloaded(map))
		return bpf_map_offload_lookup_elem(map, key, value);

	bpf_disable_instrumentation();
	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		err = bpf_percpu_hash_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		err = bpf_percpu_array_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
		err = bpf_percpu_cgroup_storage_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
		err = bpf_stackmap_copy(map, key, value);
	} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
		err = bpf_fd_array_map_lookup_elem(map, key, value);
	} else if (IS_FD_HASH(map)) {
		err = bpf_fd_htab_map_lookup_elem(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
		   map->map_type == BPF_MAP_TYPE_STACK ||
		   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
		err = map->ops->map_peek_elem(map, value);
	} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		/* struct_ops map requires directly updating "value" */
		err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	} else {
		rcu_read_lock();
		if (map->ops->map_lookup_elem_sys_only)
			ptr = map->ops->map_lookup_elem_sys_only(map, key);
		else
			ptr = map->ops->map_lookup_elem(map, key);
		if (IS_ERR(ptr)) {
			err = PTR_ERR(ptr);
		} else if (!ptr) {
			err = -ENOENT;
		} else {
			err = 0;
			if (flags & BPF_F_LOCK)
				/* lock 'ptr' and copy everything but lock */
				copy_map_value_locked(map, value, ptr, true);
			else
				copy_map_value(map, value, ptr);
			/* mask lock and timer, since value wasn't zero-initialized */
			check_and_init_map_value(map, value);
		}
		rcu_read_unlock();
	}

	bpf_enable_instrumentation();
	maybe_wait_bpf_programs(map);

	return err;
}

/* Please do not use this function outside of the map creation path
 * (e.g. in the map update path) without taking care of setting the active
 * memory cgroup (see bpf_map_kmalloc_node() for an example).
 */
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
	/* We really just want to fail instead of triggering OOM killer
	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
	 * which is used for lower order allocation requests.
	 *
	 * It has been observed that higher order allocation requests done by
	 * vmalloc with __GFP_NORETRY being set might fail due to not trying
	 * to reclaim memory from the page cache, thus we set
	 * __GFP_RETRY_MAYFAIL to avoid such situations.
	 */

	gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
	unsigned int flags = 0;
	unsigned long align = 1;
	void *area;

	if (size >= SIZE_MAX)
		return NULL;

	/* kmalloc()'ed memory can't be mmap()'ed */
	if (mmapable) {
		BUG_ON(!PAGE_ALIGNED(size));
		align = SHMLBA;
		flags = VM_USERMAP;
	} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
				    numa_node);
		if (area != NULL)
			return area;
	}

	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
			flags, numa_node, __builtin_return_address(0));
}

void *bpf_map_area_alloc(u64 size, int numa_node)
{
	return __bpf_map_area_alloc(size, numa_node, false);
}

void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
{
	return __bpf_map_area_alloc(size, numa_node, true);
}

void bpf_map_area_free(void *area)
{
	kvfree(area);
}

static u32 bpf_map_flags_retain_permanent(u32 flags)
{
	/* Some map creation flags are not tied to the map object but
	 * rather to the map fd instead, so they have no meaning upon
	 * map object inspection since multiple file descriptors with
	 * different (access) properties can exist here. Thus, given
	 * this has zero meaning for the map itself, let's clear these
	 * here.
	 */
	return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
}

void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
	map->map_type = attr->map_type;
	map->key_size = attr->key_size;
	map->value_size = attr->value_size;
	map->max_entries = attr->max_entries;
	map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
	map->numa_node = bpf_map_attr_numa_node(attr);
	map->map_extra = attr->map_extra;
}

static int bpf_map_alloc_id(struct bpf_map *map)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock_bh(&map_idr_lock);
	id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
	if (id > 0)
		map->id = id;
	spin_unlock_bh(&map_idr_lock);
	idr_preload_end();

	if (WARN_ON_ONCE(!id))
		return -ENOSPC;

	return id > 0 ? 0 : id;
}

void bpf_map_free_id(struct bpf_map *map)
{
	unsigned long flags;

	/* Offloaded maps are removed from the IDR store when their device
	 * disappears - even if someone holds an fd to them they are unusable,
	 * the memory is gone, all ops will fail; they are simply waiting for
	 * refcnt to drop to be freed.
	 */
	if (!map->id)
		return;

	spin_lock_irqsave(&map_idr_lock, flags);

	idr_remove(&map_idr, map->id);
	map->id = 0;

	spin_unlock_irqrestore(&map_idr_lock, flags);
}

#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
	/* Currently if a map is created by a process belonging to the root
	 * memory cgroup, get_obj_cgroup_from_current() will return NULL.
	 * So we have to check map->objcg for being NULL each time it's
	 * being used.
	 */
	if (memcg_bpf_enabled())
		map->objcg = get_obj_cgroup_from_current();
}

static void bpf_map_release_memcg(struct bpf_map *map)
{
	if (map->objcg)
		obj_cgroup_put(map->objcg);
}

static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
{
	if (map->objcg)
		return get_mem_cgroup_from_objcg(map->objcg);

	return root_mem_cgroup;
}

void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
			   int node)
{
	struct mem_cgroup *memcg, *old_memcg;
	void *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
	struct mem_cgroup *memcg, *old_memcg;
	void *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
		       gfp_t flags)
{
	struct mem_cgroup *memcg, *old_memcg;
	void *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
				    size_t align, gfp_t flags)
{
	struct mem_cgroup *memcg, *old_memcg;
	void __percpu *ptr;

	memcg = bpf_map_get_memcg(map);
	old_memcg = set_active_memcg(memcg);
	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);

	return ptr;
}

#else
static void bpf_map_save_memcg(struct bpf_map *map)
{
}

static void bpf_map_release_memcg(struct bpf_map *map)
{
}
#endif
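
/*
 * Illustrative sketch only; struct example_elem and example_elem_alloc() are
 * hypothetical names, not part of this file. A map implementation that
 * allocates per-element memory from program or syscall context should use the
 * bpf_map_*alloc() wrappers above so the allocation is charged to the memory
 * cgroup the map was created in, not to whichever task triggers it.
 */
struct example_elem {
	u32 cookie;
	char data[];
};

static inline struct example_elem *example_elem_alloc(struct bpf_map *map)
{
	/* charged to map->objcg (if any); NUMA node follows the map's policy */
	return bpf_map_kmalloc_node(map, sizeof(struct example_elem) +
				    map->value_size,
				    GFP_ATOMIC | __GFP_NOWARN,
				    map->numa_node);
}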

static int btf_field_cmp(const void *a, const void *b)
{
	const struct btf_field *f1 = a, *f2 = b;

	if (f1->offset < f2->offset)
		return -1;
	else if (f1->offset > f2->offset)
		return 1;
	return 0;
}

struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
				  u32 field_mask)
{
	struct btf_field *field;

	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
		return NULL;
	field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
	if (!field || !(field->type & field_mask))
		return NULL;
	return field;
}

void btf_record_free(struct btf_record *rec)
{
	int i;

	if (IS_ERR_OR_NULL(rec))
		return;
	for (i = 0; i < rec->cnt; i++) {
		switch (rec->fields[i].type) {
		case BPF_KPTR_UNREF:
		case BPF_KPTR_REF:
		case BPF_KPTR_PERCPU:
			if (rec->fields[i].kptr.module)
				module_put(rec->fields[i].kptr.module);
			btf_put(rec->fields[i].kptr.btf);
			break;
		case BPF_LIST_HEAD:
		case BPF_LIST_NODE:
		case BPF_RB_ROOT:
		case BPF_RB_NODE:
		case BPF_SPIN_LOCK:
		case BPF_TIMER:
		case BPF_REFCOUNT:
			/* Nothing to release */
			break;
		default:
			WARN_ON_ONCE(1);
			continue;
		}
	}
	kfree(rec);
}

void bpf_map_free_record(struct bpf_map *map)
{
	btf_record_free(map->record);
	map->record = NULL;
}

struct btf_record *btf_record_dup(const struct btf_record *rec)
{
	const struct btf_field *fields;
	struct btf_record *new_rec;
	int ret, size, i;

	if (IS_ERR_OR_NULL(rec))
		return NULL;
	size = offsetof(struct btf_record, fields[rec->cnt]);
	new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
	if (!new_rec)
		return ERR_PTR(-ENOMEM);
	/* Do a deep copy of the btf_record */
	fields = rec->fields;
	new_rec->cnt = 0;
	for (i = 0; i < rec->cnt; i++) {
		switch (fields[i].type) {
		case BPF_KPTR_UNREF:
		case BPF_KPTR_REF:
		case BPF_KPTR_PERCPU:
			btf_get(fields[i].kptr.btf);
			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
				ret = -ENXIO;
				goto free;
			}
			break;
		case BPF_LIST_HEAD:
		case BPF_LIST_NODE:
		case BPF_RB_ROOT:
		case BPF_RB_NODE:
		case BPF_SPIN_LOCK:
		case BPF_TIMER:
		case BPF_REFCOUNT:
			/* Nothing to acquire */
			break;
		default:
			ret = -EFAULT;
			WARN_ON_ONCE(1);
			goto free;
		}
		new_rec->cnt++;
	}
	return new_rec;
free:
	btf_record_free(new_rec);
	return ERR_PTR(ret);
}

bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
{
	bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
	int size;

	if (!a_has_fields && !b_has_fields)
		return true;
	if (a_has_fields != b_has_fields)
		return false;
	if (rec_a->cnt != rec_b->cnt)
		return false;
	size = offsetof(struct btf_record, fields[rec_a->cnt]);
	/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
	 * members are zeroed out. So memcmp is safe to do without worrying
	 * about padding/unused fields.
	 *
	 * While spin_lock, timer, and kptr have no relation to map BTF,
	 * list_head metadata is specific to map BTF, the btf and value_rec
	 * members in particular. btf is the map BTF, while value_rec points to
	 * btf_record in that map BTF.
	 *
	 * So while by default, we don't rely on the map BTF (which the records
	 * were parsed from) matching for both records, which is not backwards
	 * compatible, in case list_head is part of it, we implicitly rely on
	 * that by way of depending on memcmp succeeding for it.
	 */
	return !memcmp(rec_a, rec_b, size);
}

void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
{
	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
		return;
	bpf_timer_cancel_and_free(obj + rec->timer_off);
}

void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
	const struct btf_field *fields;
	int i;

	if (IS_ERR_OR_NULL(rec))
		return;
	fields = rec->fields;
	for (i = 0; i < rec->cnt; i++) {
		struct btf_struct_meta *pointee_struct_meta;
		const struct btf_field *field = &fields[i];
		void *field_ptr = obj + field->offset;
		void *xchgd_field;

		switch (fields[i].type) {
		case BPF_SPIN_LOCK:
			break;
		case BPF_TIMER:
			bpf_timer_cancel_and_free(field_ptr);
			break;
		case BPF_KPTR_UNREF:
			WRITE_ONCE(*(u64 *)field_ptr, 0);
			break;
		case BPF_KPTR_REF:
		case BPF_KPTR_PERCPU:
			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
			if (!xchgd_field)
				break;

			if (!btf_is_kernel(field->kptr.btf)) {
				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
									   field->kptr.btf_id);
				migrate_disable();
				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
								 pointee_struct_meta->record : NULL,
						    fields[i].type == BPF_KPTR_PERCPU);
				migrate_enable();
			} else {
				field->kptr.dtor(xchgd_field);
			}
			break;
		case BPF_LIST_HEAD:
			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
				continue;
			bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
			break;
		case BPF_RB_ROOT:
			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
				continue;
			bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
			break;
		case BPF_LIST_NODE:
		case BPF_RB_NODE:
		case BPF_REFCOUNT:
			break;
		default:
			WARN_ON_ONCE(1);
			continue;
		}
	}
}

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);
	struct btf_record *rec = map->record;

	security_bpf_map_free(map);
	bpf_map_release_memcg(map);
	/* implementation dependent freeing */
	map->ops->map_free(map);
	/* Delay freeing of the btf_record for maps, as the map_free
	 * callback usually needs access to it. It is better to do it here
	 * than require each callback to do the free itself manually.
	 *
	 * Note that the btf_record stashed in map->inner_map_meta->record was
	 * already freed using the map_free callback for the map-in-map case,
	 * which eventually calls bpf_map_free_meta, since inner_map_meta is
	 * only a template bpf_map struct used during verification.
	 */
	btf_record_free(rec);
}

static void bpf_map_put_uref(struct bpf_map *map)
{
	if (atomic64_dec_and_test(&map->usercnt)) {
		if (map->ops->map_release_uref)
			map->ops->map_release_uref(map);
	}
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic64_dec_and_test(&map->refcnt)) {
		/* bpf_map_free_id() must be called first */
		bpf_map_free_id(map);
		btf_put(map->btf);
		INIT_WORK(&map->work, bpf_map_free_deferred);
		/* Avoid spawning kworkers, since they all might contend
		 * for the same mutex like slab_mutex.
		 */
		queue_work(system_unbound_wq, &map->work);
	}
}
EXPORT_SYMBOL_GPL(bpf_map_put);

void bpf_map_put_with_uref(struct bpf_map *map)
{
	bpf_map_put_uref(map);
	bpf_map_put(map);
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->ops->map_release)
		map->ops->map_release(map, filp);

	bpf_map_put_with_uref(map);
	return 0;
}

static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
	fmode_t mode = f.file->f_mode;

	/* Our file permissions may have been overridden by global
	 * map permissions on the syscall side.
	 */
	if (READ_ONCE(map->frozen))
		mode &= ~FMODE_CAN_WRITE;
	return mode;
}

#ifdef CONFIG_PROC_FS
/* Show the memory usage of a bpf map */
static u64 bpf_map_memory_usage(const struct bpf_map *map)
{
	return map->ops->map_mem_usage(map);
}

static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
	struct bpf_map *map = filp->private_data;
	u32 type = 0, jited = 0;

	if (map_type_contains_progs(map)) {
		spin_lock(&map->owner.lock);
		type  = map->owner.type;
		jited = map->owner.jited;
		spin_unlock(&map->owner.lock);
	}

	seq_printf(m,
		   "map_type:\t%u\n"
		   "key_size:\t%u\n"
		   "value_size:\t%u\n"
		   "max_entries:\t%u\n"
		   "map_flags:\t%#x\n"
		   "map_extra:\t%#llx\n"
		   "memlock:\t%llu\n"
		   "map_id:\t%u\n"
		   "frozen:\t%u\n",
		   map->map_type,
		   map->key_size,
		   map->value_size,
		   map->max_entries,
		   map->map_flags,
		   (unsigned long long)map->map_extra,
		   bpf_map_memory_usage(map),
		   map->id,
		   READ_ONCE(map->frozen));
	if (type) {
		seq_printf(m, "owner_prog_type:\t%u\n", type);
		seq_printf(m, "owner_jited:\t%u\n", jited);
	}
}
#endif

static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
			      loff_t *ppos)
{
	/* We need this handler such that alloc_file() enables
	 * f_mode with FMODE_CAN_READ.
	 */
	return -EINVAL;
}

static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
			       size_t siz, loff_t *ppos)
{
	/* We need this handler such that alloc_file() enables
	 * f_mode with FMODE_CAN_WRITE.
	 */
	return -EINVAL;
}

/* called for any extra memory-mapped regions (except the initial one) */
static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;

	if (vma->vm_flags & VM_MAYWRITE)
		bpf_map_write_active_inc(map);
}

/* called for all unmapped memory regions (including the initial one) */
static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;

	if (vma->vm_flags & VM_MAYWRITE)
		bpf_map_write_active_dec(map);
}

static const struct vm_operations_struct bpf_map_default_vmops = {
	.open		= bpf_map_mmap_open,
	.close		= bpf_map_mmap_close,
};

static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct bpf_map *map = filp->private_data;
	int err;

	if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
		return -ENOTSUPP;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	mutex_lock(&map->freeze_mutex);

	if (vma->vm_flags & VM_WRITE) {
		if (map->frozen) {
			err = -EPERM;
			goto out;
		}
		/* map is meant to be read-only, so do not allow mapping as
		 * writable, because it's possible to leak a writable page
		 * reference and allow user-space to still modify it after
		 * freezing, while the verifier will assume the contents do
		 * not change
		 */
		if (map->map_flags & BPF_F_RDONLY_PROG) {
			err = -EACCES;
			goto out;
		}
	}

	/* set default open/close callbacks */
	vma->vm_ops = &bpf_map_default_vmops;
	vma->vm_private_data = map;
	vm_flags_clear(vma, VM_MAYEXEC);
	if (!(vma->vm_flags & VM_WRITE))
		/* disallow re-mapping with PROT_WRITE */
		vm_flags_clear(vma, VM_MAYWRITE);

	err = map->ops->map_mmap(map, vma);
	if (err)
		goto out;

	if (vma->vm_flags & VM_MAYWRITE)
		bpf_map_write_active_inc(map);
out:
	mutex_unlock(&map->freeze_mutex);
	return err;
}

static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
{
	struct bpf_map *map = filp->private_data;

	if (map->ops->map_poll)
		return map->ops->map_poll(map, filp, pts);

	return EPOLLERR;
}

const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= bpf_map_show_fdinfo,
#endif
	.release	= bpf_map_release,
	.read		= bpf_dummy_read,
	.write		= bpf_dummy_write,
	.mmap		= bpf_map_mmap,
	.poll		= bpf_map_poll,
};

int bpf_map_new_fd(struct bpf_map *map, int flags)
{
	int ret;

	ret = security_bpf_map(map, OPEN_FMODE(flags));
	if (ret < 0)
		return ret;

	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
				flags | O_CLOEXEC);
}

int bpf_get_file_flag(int flags)
{
	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
		return -EINVAL;
	if (flags & BPF_F_RDONLY)
		return O_RDONLY;
	if (flags & BPF_F_WRONLY)
		return O_WRONLY;
	return O_RDWR;
}

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
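
/*
 * Illustrative use of the CHECK_ATTR() convention (BPF_SOME_CMD below is a
 * hypothetical command name): each syscall command defines a CMD_LAST_FIELD
 * macro naming the last 'union bpf_attr' field it consumes, and rejects the
 * request if any byte after that field is non-zero:
 *
 *	#define BPF_SOME_CMD_LAST_FIELD	map_fd
 *
 *	static int some_cmd(union bpf_attr *attr)
 *	{
 *		if (CHECK_ATTR(BPF_SOME_CMD))
 *			return -EINVAL;
 *		...
 *	}
 *
 * This is what allows newer user space to pass a larger bpf_attr to an older
 * kernel, as long as the fields the kernel does not know about are zero.
 */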

/* dst and src must have at least "size" number of bytes.
 * Return strlen on success and < 0 on error.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
	const char *end = src + size;
	const char *orig_src = src;

	memset(dst, 0, size);
	/* Copy all isalnum(), '_' and '.' chars. */
	while (src < end && *src) {
		if (!isalnum(*src) &&
		    *src != '_' && *src != '.')
			return -EINVAL;
		*dst++ = *src++;
	}

	/* No '\0' found in "size" number of bytes */
	if (src == end)
		return -EINVAL;

	return src - orig_src;
}
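
/*
 * Illustrative behaviour (examples are hypothetical, not from this file):
 * with size == BPF_OBJ_NAME_LEN, "my_map.v2" is accepted and its length
 * returned, "bad-name" is rejected with -EINVAL because '-' is neither
 * alphanumeric nor '_' or '.', and a name that fills all "size" bytes
 * without a terminating '\0' is rejected as well.
 */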

int map_check_no_btf(const struct bpf_map *map,
		     const struct btf *btf,
		     const struct btf_type *key_type,
		     const struct btf_type *value_type)
{
	return -ENOTSUPP;
}

static int map_check_btf(struct bpf_map *map, const struct btf *btf,
			 u32 btf_key_id, u32 btf_value_id)
{
	const struct btf_type *key_type, *value_type;
	u32 key_size, value_size;
	int ret = 0;

	/* Some maps allow key to be unspecified. */
	if (btf_key_id) {
		key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
		if (!key_type || key_size != map->key_size)
			return -EINVAL;
	} else {
		key_type = btf_type_by_id(btf, 0);
		if (!map->ops->map_check_btf)
			return -EINVAL;
	}

	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
	if (!value_type || value_size != map->value_size)
		return -EINVAL;

	map->record = btf_parse_fields(btf, value_type,
				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
				       BPF_RB_ROOT | BPF_REFCOUNT,
				       map->value_size);
	if (!IS_ERR_OR_NULL(map->record)) {
		int i;

		if (!bpf_capable()) {
			ret = -EPERM;
			goto free_map_tab;
		}
		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
			ret = -EACCES;
			goto free_map_tab;
		}
		for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
			switch (map->record->field_mask & (1 << i)) {
			case 0:
				continue;
			case BPF_SPIN_LOCK:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY &&
				    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			case BPF_TIMER:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			case BPF_KPTR_UNREF:
			case BPF_KPTR_REF:
			case BPF_KPTR_PERCPU:
			case BPF_REFCOUNT:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY &&
				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			case BPF_LIST_HEAD:
			case BPF_RB_ROOT:
				if (map->map_type != BPF_MAP_TYPE_HASH &&
				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
				    map->map_type != BPF_MAP_TYPE_ARRAY) {
					ret = -EOPNOTSUPP;
					goto free_map_tab;
				}
				break;
			default:
				/* Fail if map_type checks are missing for a field type */
				ret = -EOPNOTSUPP;
				goto free_map_tab;
			}
		}
	}

	ret = btf_check_and_fixup_fields(btf, map->record);
	if (ret < 0)
		goto free_map_tab;

	if (map->ops->map_check_btf) {
		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
		if (ret < 0)
			goto free_map_tab;
	}

	return ret;
free_map_tab:
	bpf_map_free_record(map);
	return ret;
}

#define BPF_MAP_CREATE_LAST_FIELD map_extra
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	const struct bpf_map_ops *ops;
	int numa_node = bpf_map_attr_numa_node(attr);
	u32 map_type = attr->map_type;
	struct bpf_map *map;
	int f_flags;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	if (attr->btf_vmlinux_value_type_id) {
		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
		    attr->btf_key_type_id || attr->btf_value_type_id)
			return -EINVAL;
	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
		return -EINVAL;
	}

	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
	    attr->map_extra != 0)
		return -EINVAL;

	f_flags = bpf_get_file_flag(attr->map_flags);
	if (f_flags < 0)
		return f_flags;

	if (numa_node != NUMA_NO_NODE &&
	    ((unsigned int)numa_node >= nr_node_ids ||
	     !node_online(numa_node)))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map_type = attr->map_type;
	if (map_type >= ARRAY_SIZE(bpf_map_types))
		return -EINVAL;
	map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
	ops = bpf_map_types[map_type];
	if (!ops)
		return -EINVAL;

	if (ops->map_alloc_check) {
		err = ops->map_alloc_check(attr);
		if (err)
			return err;
	}
	if (attr->map_ifindex)
		ops = &bpf_map_offload_ops;
	if (!ops->map_mem_usage)
		return -EINVAL;

	/* Intent here is for unprivileged_bpf_disabled to block BPF map
	 * creation for unprivileged users; other actions depend
	 * on fd availability and access to bpffs, so are dependent on
	 * object creation success. Even with unprivileged BPF disabled,
	 * capability checks are still carried out.
	 */
	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
		return -EPERM;

	/* check privileged map type permissions */
	switch (map_type) {
	case BPF_MAP_TYPE_ARRAY:
	case BPF_MAP_TYPE_PERCPU_ARRAY:
	case BPF_MAP_TYPE_PROG_ARRAY:
	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
	case BPF_MAP_TYPE_CGROUP_ARRAY:
	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
	case BPF_MAP_TYPE_HASH:
	case BPF_MAP_TYPE_PERCPU_HASH:
	case BPF_MAP_TYPE_HASH_OF_MAPS:
	case BPF_MAP_TYPE_RINGBUF:
	case BPF_MAP_TYPE_USER_RINGBUF:
	case BPF_MAP_TYPE_CGROUP_STORAGE:
	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
		/* unprivileged */
		break;
	case BPF_MAP_TYPE_SK_STORAGE:
	case BPF_MAP_TYPE_INODE_STORAGE:
	case BPF_MAP_TYPE_TASK_STORAGE:
	case BPF_MAP_TYPE_CGRP_STORAGE:
	case BPF_MAP_TYPE_BLOOM_FILTER:
	case BPF_MAP_TYPE_LPM_TRIE:
	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
	case BPF_MAP_TYPE_STACK_TRACE:
	case BPF_MAP_TYPE_QUEUE:
	case BPF_MAP_TYPE_STACK:
	case BPF_MAP_TYPE_LRU_HASH:
	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
	case BPF_MAP_TYPE_STRUCT_OPS:
	case BPF_MAP_TYPE_CPUMAP:
		if (!bpf_capable())
			return -EPERM;
		break;
	case BPF_MAP_TYPE_SOCKMAP:
	case BPF_MAP_TYPE_SOCKHASH:
	case BPF_MAP_TYPE_DEVMAP:
	case BPF_MAP_TYPE_DEVMAP_HASH:
	case BPF_MAP_TYPE_XSKMAP:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		break;
	default:
		WARN(1, "unsupported map type %d", map_type);
		return -EPERM;
	}

	map = ops->map_alloc(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);
	map->ops = ops;
	map->map_type = map_type;

	err = bpf_obj_name_cpy(map->name, attr->map_name,
			       sizeof(attr->map_name));
	if (err < 0)
		goto free_map;

	atomic64_set(&map->refcnt, 1);
	atomic64_set(&map->usercnt, 1);
	mutex_init(&map->freeze_mutex);
	spin_lock_init(&map->owner.lock);

	if (attr->btf_key_type_id || attr->btf_value_type_id ||
	    /* Even if the map's value is a kernel struct,
	     * the bpf_prog.o must have BTF to begin with
	     * to figure out the corresponding kernel
	     * counterpart. Thus, attr->btf_fd has
	     * to be valid also.
	     */
	    attr->btf_vmlinux_value_type_id) {
		struct btf *btf;

		btf = btf_get_by_fd(attr->btf_fd);
		if (IS_ERR(btf)) {
			err = PTR_ERR(btf);
			goto free_map;
		}
		if (btf_is_kernel(btf)) {
			btf_put(btf);
			err = -EACCES;
			goto free_map;
		}
		map->btf = btf;

		if (attr->btf_value_type_id) {
			err = map_check_btf(map, btf, attr->btf_key_type_id,
					    attr->btf_value_type_id);
			if (err)
				goto free_map;
		}

		map->btf_key_type_id = attr->btf_key_type_id;
		map->btf_value_type_id = attr->btf_value_type_id;
		map->btf_vmlinux_value_type_id =
			attr->btf_vmlinux_value_type_id;
	}

	err = security_bpf_map_alloc(map);
	if (err)
		goto free_map;

	err = bpf_map_alloc_id(map);
	if (err)
		goto free_map_sec;

	bpf_map_save_memcg(map);

	err = bpf_map_new_fd(map, f_flags);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_map_put_with_uref() is needed because the above
		 * bpf_map_alloc_id() has published the map
		 * to userspace and userspace may
		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
		 */
		bpf_map_put_with_uref(map);
		return err;
	}

	return err;

free_map_sec:
	security_bpf_map_free(map);
free_map:
	btf_put(map->btf);
	map->ops->map_free(map);
	return err;
}

/* if error is returned, fd is released.
 * On success the caller should complete fd access with a matching fdput()
 */
struct bpf_map *__bpf_map_get(struct fd f)
{
	if (!f.file)
		return ERR_PTR(-EBADF);
	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	return f.file->private_data;
}

void bpf_map_inc(struct bpf_map *map)
{
	atomic64_inc(&map->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc);

void bpf_map_inc_with_uref(struct bpf_map *map)
{
	atomic64_inc(&map->refcnt);
	atomic64_inc(&map->usercnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);

struct bpf_map *bpf_map_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_map *map;

	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return map;

	bpf_map_inc(map);
	fdput(f);

	return map;
}
EXPORT_SYMBOL(bpf_map_get);

struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_map *map;

	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return map;

	bpf_map_inc_with_uref(map);
	fdput(f);

	return map;
}

/* map_idr_lock should have been held or the map should have been
 * protected by the rcu read lock.
 */
struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
	int refold;

	refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
	if (!refold)
		return ERR_PTR(-ENOENT);
	if (uref)
		atomic64_inc(&map->usercnt);

	return map;
}

struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
	spin_lock_bh(&map_idr_lock);
	map = __bpf_map_inc_not_zero(map, false);
	spin_unlock_bh(&map_idr_lock);

	return map;
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);

int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	return -ENOTSUPP;
}

static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
	if (key_size)
		return vmemdup_user(ukey, key_size);

	if (ukey)
		return ERR_PTR(-EINVAL);

	return NULL;
}

static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
{
	if (key_size)
		return kvmemdup_bpfptr(ukey, key_size);

	if (!bpfptr_is_null(ukey))
		return ERR_PTR(-EINVAL);

	return NULL;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags

static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *uvalue = u64_to_user_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	if (attr->flags & ~BPF_F_LOCK)
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
		err = -EPERM;
		goto err_put;
	}

	if ((attr->flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		err = -EINVAL;
		goto err_put;
	}

	key = __bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	value_size = bpf_map_value_size(map);

	err = -ENOMEM;
	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
		if (copy_from_user(value, uvalue, value_size))
			err = -EFAULT;
		else
			err = bpf_map_copy_value(map, key, value, attr->flags);
		goto free_value;
	}

	err = bpf_map_copy_value(map, key, value, attr->flags);
	if (err)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kvfree(value);
free_key:
	kvfree(key);
err_put:
	fdput(f);
	return err;
}


#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
	bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);
	bpf_map_write_active_inc(map);
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	if ((attr->flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		err = -EINVAL;
		goto err_put;
	}

	key = ___bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	value_size = bpf_map_value_size(map);
	value = kvmemdup_bpfptr(uvalue, value_size);
	if (IS_ERR(value)) {
		err = PTR_ERR(value);
		goto free_key;
	}

	err = bpf_map_update_value(map, f.file, key, value, attr->flags);

	kvfree(value);
free_key:
	kvfree(key);
err_put:
	bpf_map_write_active_dec(map);
	fdput(f);
	return err;
}

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);
	bpf_map_write_active_inc(map);
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	key = ___bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	if (bpf_map_is_offloaded(map)) {
		err = bpf_map_offload_delete_elem(map, key);
		goto out;
	} else if (IS_FD_PROG_ARRAY(map) ||
		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		/* These maps require sleepable context */
		err = map->ops->map_delete_elem(map, key);
		goto out;
	}

	bpf_disable_instrumentation();
	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();
	bpf_enable_instrumentation();
	maybe_wait_bpf_programs(map);
out:
	kvfree(key);
err_put:
	bpf_map_write_active_dec(map);
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *unext_key = u64_to_user_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
		err = -EPERM;
		goto err_put;
	}

	if (ukey) {
		key = __bpf_copy_key(ukey, map->key_size);
		if (IS_ERR(key)) {
			err = PTR_ERR(key);
			goto err_put;
		}
	} else {
		key = NULL;
	}

	err = -ENOMEM;
	next_key = kvmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	if (bpf_map_is_offloaded(map)) {
		err = bpf_map_offload_get_next_key(map, key, next_key);
		goto out;
	}

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
out:
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kvfree(next_key);
free_key:
	kvfree(key);
err_put:
	fdput(f);
	return err;
}

int generic_map_delete_batch(struct bpf_map *map,
			     const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	void __user *keys = u64_to_user_ptr(attr->batch.keys);
	u32 cp, max_count;
	int err = 0;
	void *key;

	if (attr->batch.elem_flags & ~BPF_F_LOCK)
		return -EINVAL;

	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		return -EINVAL;
	}

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
	if (!key)
		return -ENOMEM;

	for (cp = 0; cp < max_count; cp++) {
		err = -EFAULT;
		if (copy_from_user(key, keys + cp * map->key_size,
				   map->key_size))
			break;

		if (bpf_map_is_offloaded(map)) {
			err = bpf_map_offload_delete_elem(map, key);
			break;
		}

		bpf_disable_instrumentation();
		rcu_read_lock();
		err = map->ops->map_delete_elem(map, key);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		if (err)
			break;
		cond_resched();
	}
	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
		err = -EFAULT;

	kvfree(key);

	maybe_wait_bpf_programs(map);
	return err;
}

int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
			     const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	void __user *values = u64_to_user_ptr(attr->batch.values);
	void __user *keys = u64_to_user_ptr(attr->batch.keys);
	u32 value_size, cp, max_count;
	void *key, *value;
	int err = 0;

	if (attr->batch.elem_flags & ~BPF_F_LOCK)
		return -EINVAL;

	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		return -EINVAL;
	}

	value_size = bpf_map_value_size(map);

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
	if (!key)
		return -ENOMEM;

	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value) {
		kvfree(key);
		return -ENOMEM;
	}

	for (cp = 0; cp < max_count; cp++) {
		err = -EFAULT;
		if (copy_from_user(key, keys + cp * map->key_size,
				   map->key_size) ||
		    copy_from_user(value, values + cp * value_size, value_size))
			break;

		err = bpf_map_update_value(map, map_file, key, value,
					   attr->batch.elem_flags);

		if (err)
			break;
		cond_resched();
	}

	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
		err = -EFAULT;

	kvfree(value);
	kvfree(key);
	return err;
}

#define MAP_LOOKUP_RETRIES 3
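
/*
 * Retry semantics of the batched lookup below: generic_map_lookup_batch()
 * walks the map with map_get_next_key() and then copies the value for that
 * key. If the element is deleted in between, the value copy returns -ENOENT;
 * the walk is retried from the same previous key up to MAP_LOOKUP_RETRIES
 * times before the batch gives up with -EINTR, letting user space restart
 * from the key reported in batch.out_batch.
 */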

int generic_map_lookup_batch(struct bpf_map *map,
			     const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	void __user *values = u64_to_user_ptr(attr->batch.values);
	void __user *keys = u64_to_user_ptr(attr->batch.keys);
	void *buf, *buf_prevkey, *prev_key, *key, *value;
	int err, retry = MAP_LOOKUP_RETRIES;
	u32 value_size, cp, max_count;

	if (attr->batch.elem_flags & ~BPF_F_LOCK)
		return -EINVAL;

	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK))
		return -EINVAL;

	value_size = bpf_map_value_size(map);

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
	if (!buf_prevkey)
		return -ENOMEM;

	buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
	if (!buf) {
		kvfree(buf_prevkey);
		return -ENOMEM;
	}

	err = -EFAULT;
	prev_key = NULL;
	if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
		goto free_buf;
	key = buf;
	value = key + map->key_size;
	if (ubatch)
		prev_key = buf_prevkey;

	for (cp = 0; cp < max_count;) {
		rcu_read_lock();
		err = map->ops->map_get_next_key(map, prev_key, key);
		rcu_read_unlock();
		if (err)
			break;
		err = bpf_map_copy_value(map, key, value,
					 attr->batch.elem_flags);

		if (err == -ENOENT) {
			if (retry) {
				retry--;
				continue;
			}
			err = -EINTR;
			break;
		}

		if (err)
			goto free_buf;

		if (copy_to_user(keys + cp * map->key_size, key,
				 map->key_size)) {
			err = -EFAULT;
			goto free_buf;
		}
		if (copy_to_user(values + cp * value_size, value, value_size)) {
			err = -EFAULT;
			goto free_buf;
		}

		if (!prev_key)
			prev_key = buf_prevkey;

		swap(prev_key, key);
		retry = MAP_LOOKUP_RETRIES;
		cp++;
		cond_resched();
	}

	if (err == -EFAULT)
		goto free_buf;

	if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
		    (cp && copy_to_user(uobatch, prev_key, map->key_size))))
		err = -EFAULT;

free_buf:
	kvfree(buf_prevkey);
	kvfree(buf);
	return err;
}

#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags

static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *uvalue = u64_to_user_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
		return -EINVAL;

	if (attr->flags & ~BPF_F_LOCK)
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);
	bpf_map_write_active_inc(map);
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	if (attr->flags &&
	    (map->map_type == BPF_MAP_TYPE_QUEUE ||
	     map->map_type == BPF_MAP_TYPE_STACK)) {
		err = -EINVAL;
		goto err_put;
	}

	if ((attr->flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		err = -EINVAL;
		goto err_put;
	}

	key = __bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	value_size = bpf_map_value_size(map);

	err = -ENOMEM;
	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		goto free_key;

	err = -ENOTSUPP;
	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
	    map->map_type == BPF_MAP_TYPE_STACK) {
		err = map->ops->map_pop_elem(map, value);
	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		if (!bpf_map_is_offloaded(map)) {
			bpf_disable_instrumentation();
			rcu_read_lock();
			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
			rcu_read_unlock();
			bpf_enable_instrumentation();
		}
	}

	if (err)
		goto free_value;

	if (copy_to_user(uvalue, value, value_size) != 0) {
		err = -EFAULT;
		goto free_value;
	}

	err = 0;

free_value:
	kvfree(value);
free_key:
	kvfree(key);
err_put:
	bpf_map_write_active_dec(map);
	fdput(f);
	return err;
}

#define BPF_MAP_FREEZE_LAST_FIELD map_fd

static int map_freeze(const union bpf_attr *attr)
{
	int err = 0, ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;

	if (CHECK_ATTR(BPF_MAP_FREEZE))
		return -EINVAL;

	f = fdget(ufd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
		fdput(f);
		return -ENOTSUPP;
	}

	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		fdput(f);
		return -EPERM;
	}

	mutex_lock(&map->freeze_mutex);
	if (bpf_map_write_active(map)) {
		err = -EBUSY;
		goto err_put;
	}
	if (READ_ONCE(map->frozen)) {
		err = -EBUSY;
		goto err_put;
	}

	WRITE_ONCE(map->frozen, true);
err_put:
	mutex_unlock(&map->freeze_mutex);
	fdput(f);
	return err;
}

static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
	[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	const struct bpf_prog_ops *ops;

	if (type >= ARRAY_SIZE(bpf_prog_types))
		return -EINVAL;
	type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
	ops = bpf_prog_types[type];
	if (!ops)
		return -EINVAL;

	if (!bpf_prog_is_offloaded(prog->aux))
		prog->aux->ops = ops;
	else
		prog->aux->ops = &bpf_offload_prog_ops;
	prog->type = type;
	return 0;
}

enum bpf_audit {
	BPF_AUDIT_LOAD,
	BPF_AUDIT_UNLOAD,
	BPF_AUDIT_MAX,
};

static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
	[BPF_AUDIT_LOAD]   = "LOAD",
	[BPF_AUDIT_UNLOAD] = "UNLOAD",
};

static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
{
	struct audit_context *ctx = NULL;
	struct audit_buffer *ab;

	if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
		return;
	if (audit_enabled == AUDIT_OFF)
		return;
	if (!in_irq() && !irqs_disabled())
		ctx = audit_context();
	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
	if (unlikely(!ab))
		return;
	audit_log_format(ab, "prog-id=%u op=%s",
			 prog->aux->id, bpf_audit_str[op]);
	audit_log_end(ab);
}

static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock_bh(&prog_idr_lock);
	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
	if (id > 0)
		prog->aux->id = id;
	spin_unlock_bh(&prog_idr_lock);
	idr_preload_end();

	/* id is in [1, INT_MAX) */
	if (WARN_ON_ONCE(!id))
		return -ENOSPC;

	return id > 0 ? 0 : id;
}

void bpf_prog_free_id(struct bpf_prog *prog)
{
	unsigned long flags;

	/* cBPF to eBPF migrations are currently not in the idr store.
	 * Offloaded programs are removed from the store when their device
	 * disappears - even if someone grabs an fd to them they are unusable,
	 * simply waiting for refcnt to drop to be freed.
	 */
	if (!prog->aux->id)
		return;

	spin_lock_irqsave(&prog_idr_lock, flags);
	idr_remove(&prog_idr, prog->aux->id);
	prog->aux->id = 0;
	spin_unlock_irqrestore(&prog_idr_lock, flags);
}

static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	kvfree(aux->func_info);
	kfree(aux->func_info_aux);
	free_uid(aux->user);
	security_bpf_prog_free(aux);
	bpf_prog_free(aux->prog);
}

static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
	bpf_prog_kallsyms_del_all(prog);
	btf_put(prog->aux->btf);
	module_put(prog->aux->mod);
	kvfree(prog->aux->jited_linfo);
	kvfree(prog->aux->linfo);
	kfree(prog->aux->kfunc_tab);
	if (prog->aux->attach_btf)
		btf_put(prog->aux->attach_btf);

	if (deferred) {
		if (prog->aux->sleepable)
			call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
		else
			call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
	} else {
		__bpf_prog_put_rcu(&prog->aux->rcu);
	}
}

static void bpf_prog_put_deferred(struct work_struct *work)
{
	struct bpf_prog_aux *aux;
	struct bpf_prog *prog;

	aux = container_of(work, struct bpf_prog_aux, work);
	prog = aux->prog;
	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
2144 bpf_audit_prog(prog, op: BPF_AUDIT_UNLOAD);
2145 bpf_prog_free_id(prog);
2146 __bpf_prog_put_noref(prog, deferred: true);
2147}
2148
2149static void __bpf_prog_put(struct bpf_prog *prog)
2150{
2151 struct bpf_prog_aux *aux = prog->aux;
2152
2153 if (atomic64_dec_and_test(v: &aux->refcnt)) {
2154 if (in_irq() || irqs_disabled()) {
2155 INIT_WORK(&aux->work, bpf_prog_put_deferred);
2156 schedule_work(work: &aux->work);
2157 } else {
2158 bpf_prog_put_deferred(work: &aux->work);
2159 }
2160 }
2161}
2162
2163void bpf_prog_put(struct bpf_prog *prog)
2164{
2165 __bpf_prog_put(prog);
2166}
2167EXPORT_SYMBOL_GPL(bpf_prog_put);
2168
2169static int bpf_prog_release(struct inode *inode, struct file *filp)
2170{
2171 struct bpf_prog *prog = filp->private_data;
2172
2173 bpf_prog_put(prog);
2174 return 0;
2175}
2176
2177struct bpf_prog_kstats {
2178 u64 nsecs;
2179 u64 cnt;
2180 u64 misses;
2181};
2182
2183void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
2184{
2185 struct bpf_prog_stats *stats;
2186 unsigned int flags;
2187
2188 stats = this_cpu_ptr(prog->stats);
2189 flags = u64_stats_update_begin_irqsave(syncp: &stats->syncp);
2190 u64_stats_inc(p: &stats->misses);
2191 u64_stats_update_end_irqrestore(syncp: &stats->syncp, flags);
2192}
2193
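/* Aggregate per-CPU run-time statistics for @prog into @stats. The
 * u64_stats_fetch_begin()/u64_stats_fetch_retry() loop ensures a consistent
 * snapshot of each CPU's counters even on 32-bit architectures where 64-bit
 * reads are not atomic.
 */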
2194static void bpf_prog_get_stats(const struct bpf_prog *prog,
2195 struct bpf_prog_kstats *stats)
2196{
2197 u64 nsecs = 0, cnt = 0, misses = 0;
2198 int cpu;
2199
2200 for_each_possible_cpu(cpu) {
2201 const struct bpf_prog_stats *st;
2202 unsigned int start;
2203 u64 tnsecs, tcnt, tmisses;
2204
2205 st = per_cpu_ptr(prog->stats, cpu);
2206 do {
2207 start = u64_stats_fetch_begin(syncp: &st->syncp);
2208 tnsecs = u64_stats_read(p: &st->nsecs);
2209 tcnt = u64_stats_read(p: &st->cnt);
2210 tmisses = u64_stats_read(p: &st->misses);
2211 } while (u64_stats_fetch_retry(syncp: &st->syncp, start));
2212 nsecs += tnsecs;
2213 cnt += tcnt;
2214 misses += tmisses;
2215 }
2216 stats->nsecs = nsecs;
2217 stats->cnt = cnt;
2218 stats->misses = misses;
2219}
2220
2221#ifdef CONFIG_PROC_FS
2222static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
2223{
2224 const struct bpf_prog *prog = filp->private_data;
2225 char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2226 struct bpf_prog_kstats stats;
2227
2228 bpf_prog_get_stats(prog, stats: &stats);
2229 bin2hex(dst: prog_tag, src: prog->tag, count: sizeof(prog->tag));
2230 seq_printf(m,
2231 fmt: "prog_type:\t%u\n"
2232 "prog_jited:\t%u\n"
2233 "prog_tag:\t%s\n"
2234 "memlock:\t%llu\n"
2235 "prog_id:\t%u\n"
2236 "run_time_ns:\t%llu\n"
2237 "run_cnt:\t%llu\n"
2238 "recursion_misses:\t%llu\n"
2239 "verified_insns:\t%u\n",
2240 prog->type,
2241 prog->jited,
2242 prog_tag,
2243 prog->pages * 1ULL << PAGE_SHIFT,
2244 prog->aux->id,
2245 stats.nsecs,
2246 stats.cnt,
2247 stats.misses,
2248 prog->aux->verified_insns);
2249}
2250#endif
2251
2252const struct file_operations bpf_prog_fops = {
2253#ifdef CONFIG_PROC_FS
2254 .show_fdinfo = bpf_prog_show_fdinfo,
2255#endif
2256 .release = bpf_prog_release,
2257 .read = bpf_dummy_read,
2258 .write = bpf_dummy_write,
2259};
2260
2261int bpf_prog_new_fd(struct bpf_prog *prog)
2262{
2263 int ret;
2264
2265 ret = security_bpf_prog(prog);
2266 if (ret < 0)
2267 return ret;
2268
2269 return anon_inode_getfd(name: "bpf-prog", fops: &bpf_prog_fops, priv: prog,
2270 O_RDWR | O_CLOEXEC);
2271}
2272
2273static struct bpf_prog *____bpf_prog_get(struct fd f)
2274{
2275 if (!f.file)
2276 return ERR_PTR(error: -EBADF);
2277 if (f.file->f_op != &bpf_prog_fops) {
2278 fdput(fd: f);
2279 return ERR_PTR(error: -EINVAL);
2280 }
2281
2282 return f.file->private_data;
2283}
2284
2285void bpf_prog_add(struct bpf_prog *prog, int i)
2286{
2287 atomic64_add(i, v: &prog->aux->refcnt);
2288}
2289EXPORT_SYMBOL_GPL(bpf_prog_add);
2290
2291void bpf_prog_sub(struct bpf_prog *prog, int i)
2292{
2293 /* Only to be used for undoing previous bpf_prog_add() in some
2294 * error path. We still know that another entity in our call
2295 * path holds a reference to the program, thus atomic_sub() can
2296 * be safely used in such cases!
2297 */
2298 WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
2299}
2300EXPORT_SYMBOL_GPL(bpf_prog_sub);
2301
2302void bpf_prog_inc(struct bpf_prog *prog)
2303{
2304 atomic64_inc(v: &prog->aux->refcnt);
2305}
2306EXPORT_SYMBOL_GPL(bpf_prog_inc);
2307
2308/* prog_idr_lock should have been held */
2309struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
2310{
2311 int refold;
2312
2313 refold = atomic64_fetch_add_unless(v: &prog->aux->refcnt, a: 1, u: 0);
2314
2315 if (!refold)
2316 return ERR_PTR(error: -ENOENT);
2317
2318 return prog;
2319}
2320EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
2321
2322bool bpf_prog_get_ok(struct bpf_prog *prog,
2323 enum bpf_prog_type *attach_type, bool attach_drv)
2324{
2325 /* not an attachment, just a refcount inc, always allow */
2326 if (!attach_type)
2327 return true;
2328
2329 if (prog->type != *attach_type)
2330 return false;
2331 if (bpf_prog_is_offloaded(aux: prog->aux) && !attach_drv)
2332 return false;
2333
2334 return true;
2335}
2336
2337static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
2338 bool attach_drv)
2339{
2340 struct fd f = fdget(fd: ufd);
2341 struct bpf_prog *prog;
2342
2343 prog = ____bpf_prog_get(f);
2344 if (IS_ERR(ptr: prog))
2345 return prog;
2346 if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
2347 prog = ERR_PTR(error: -EINVAL);
2348 goto out;
2349 }
2350
2351 bpf_prog_inc(prog);
2352out:
2353 fdput(fd: f);
2354 return prog;
2355}
2356
2357struct bpf_prog *bpf_prog_get(u32 ufd)
2358{
2359 return __bpf_prog_get(ufd, NULL, attach_drv: false);
2360}
2361
2362struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
2363 bool attach_drv)
2364{
2365 return __bpf_prog_get(ufd, attach_type: &type, attach_drv);
2366}
2367EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
2368
/* Initially all BPF programs could be loaded without specifying
 * expected_attach_type. Later, specifying expected_attach_type at load time
 * became required for some of them so that the program could be validated
 * properly. Program types that are allowed to be loaded both with and
 * without expected_attach_type (for backward compatibility) should have the
 * default attach type assigned to expected_attach_type in the latter case,
 * so that it can still be validated at attach time.
 *
 * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if the
 * prog type requires it but has some attach types that have to stay backward
 * compatible. See the example sketch after the function below.
 */
2381static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
2382{
2383 switch (attr->prog_type) {
2384 case BPF_PROG_TYPE_CGROUP_SOCK:
2385 /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
2386 * exist so checking for non-zero is the way to go here.
2387 */
2388 if (!attr->expected_attach_type)
2389 attr->expected_attach_type =
2390 BPF_CGROUP_INET_SOCK_CREATE;
2391 break;
2392 case BPF_PROG_TYPE_SK_REUSEPORT:
2393 if (!attr->expected_attach_type)
2394 attr->expected_attach_type =
2395 BPF_SK_REUSEPORT_SELECT;
2396 break;
2397 }
2398}
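
/* Example (userspace side, illustrative sketch only): a legacy loader that
 * issues
 *
 *	union bpf_attr attr = {
 *		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
 *		.insns     = ptr_to_u64(insns),
 *		.insn_cnt  = insn_cnt,
 *		.license   = ptr_to_u64("GPL"),
 *	};
 *	bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 *
 * leaves expected_attach_type at 0 (bpf() standing in for
 * syscall(__NR_bpf, ...), ptr_to_u64() being a hypothetical pointer-to-__u64
 * helper). The fixup above treats such a load as if
 * BPF_CGROUP_INET_SOCK_CREATE had been requested, so attach-time validation
 * keeps working.
 */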
2399
2400static int
2401bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
2402 enum bpf_attach_type expected_attach_type,
2403 struct btf *attach_btf, u32 btf_id,
2404 struct bpf_prog *dst_prog)
2405{
2406 if (btf_id) {
2407 if (btf_id > BTF_MAX_TYPE)
2408 return -EINVAL;
2409
2410 if (!attach_btf && !dst_prog)
2411 return -EINVAL;
2412
2413 switch (prog_type) {
2414 case BPF_PROG_TYPE_TRACING:
2415 case BPF_PROG_TYPE_LSM:
2416 case BPF_PROG_TYPE_STRUCT_OPS:
2417 case BPF_PROG_TYPE_EXT:
2418 break;
2419 default:
2420 return -EINVAL;
2421 }
2422 }
2423
2424 if (attach_btf && (!btf_id || dst_prog))
2425 return -EINVAL;
2426
2427 if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
2428 prog_type != BPF_PROG_TYPE_EXT)
2429 return -EINVAL;
2430
2431 switch (prog_type) {
2432 case BPF_PROG_TYPE_CGROUP_SOCK:
2433 switch (expected_attach_type) {
2434 case BPF_CGROUP_INET_SOCK_CREATE:
2435 case BPF_CGROUP_INET_SOCK_RELEASE:
2436 case BPF_CGROUP_INET4_POST_BIND:
2437 case BPF_CGROUP_INET6_POST_BIND:
2438 return 0;
2439 default:
2440 return -EINVAL;
2441 }
2442 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2443 switch (expected_attach_type) {
2444 case BPF_CGROUP_INET4_BIND:
2445 case BPF_CGROUP_INET6_BIND:
2446 case BPF_CGROUP_INET4_CONNECT:
2447 case BPF_CGROUP_INET6_CONNECT:
2448 case BPF_CGROUP_UNIX_CONNECT:
2449 case BPF_CGROUP_INET4_GETPEERNAME:
2450 case BPF_CGROUP_INET6_GETPEERNAME:
2451 case BPF_CGROUP_UNIX_GETPEERNAME:
2452 case BPF_CGROUP_INET4_GETSOCKNAME:
2453 case BPF_CGROUP_INET6_GETSOCKNAME:
2454 case BPF_CGROUP_UNIX_GETSOCKNAME:
2455 case BPF_CGROUP_UDP4_SENDMSG:
2456 case BPF_CGROUP_UDP6_SENDMSG:
2457 case BPF_CGROUP_UNIX_SENDMSG:
2458 case BPF_CGROUP_UDP4_RECVMSG:
2459 case BPF_CGROUP_UDP6_RECVMSG:
2460 case BPF_CGROUP_UNIX_RECVMSG:
2461 return 0;
2462 default:
2463 return -EINVAL;
2464 }
2465 case BPF_PROG_TYPE_CGROUP_SKB:
2466 switch (expected_attach_type) {
2467 case BPF_CGROUP_INET_INGRESS:
2468 case BPF_CGROUP_INET_EGRESS:
2469 return 0;
2470 default:
2471 return -EINVAL;
2472 }
2473 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2474 switch (expected_attach_type) {
2475 case BPF_CGROUP_SETSOCKOPT:
2476 case BPF_CGROUP_GETSOCKOPT:
2477 return 0;
2478 default:
2479 return -EINVAL;
2480 }
2481 case BPF_PROG_TYPE_SK_LOOKUP:
2482 if (expected_attach_type == BPF_SK_LOOKUP)
2483 return 0;
2484 return -EINVAL;
2485 case BPF_PROG_TYPE_SK_REUSEPORT:
2486 switch (expected_attach_type) {
2487 case BPF_SK_REUSEPORT_SELECT:
2488 case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
2489 return 0;
2490 default:
2491 return -EINVAL;
2492 }
2493 case BPF_PROG_TYPE_NETFILTER:
2494 if (expected_attach_type == BPF_NETFILTER)
2495 return 0;
2496 return -EINVAL;
2497 case BPF_PROG_TYPE_SYSCALL:
2498 case BPF_PROG_TYPE_EXT:
2499 if (expected_attach_type)
2500 return -EINVAL;
2501 fallthrough;
2502 default:
2503 return 0;
2504 }
2505}
2506
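/* Program types that can influence packet processing or other networking
 * state require CAP_NET_ADMIN (or CAP_SYS_ADMIN) to load, in addition to the
 * capability checks that apply to all program types; see bpf_prog_load().
 */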
2507static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2508{
2509 switch (prog_type) {
2510 case BPF_PROG_TYPE_SCHED_CLS:
2511 case BPF_PROG_TYPE_SCHED_ACT:
2512 case BPF_PROG_TYPE_XDP:
2513 case BPF_PROG_TYPE_LWT_IN:
2514 case BPF_PROG_TYPE_LWT_OUT:
2515 case BPF_PROG_TYPE_LWT_XMIT:
2516 case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2517 case BPF_PROG_TYPE_SK_SKB:
2518 case BPF_PROG_TYPE_SK_MSG:
2519 case BPF_PROG_TYPE_FLOW_DISSECTOR:
2520 case BPF_PROG_TYPE_CGROUP_DEVICE:
2521 case BPF_PROG_TYPE_CGROUP_SOCK:
2522 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2523 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2524 case BPF_PROG_TYPE_CGROUP_SYSCTL:
2525 case BPF_PROG_TYPE_SOCK_OPS:
2526 case BPF_PROG_TYPE_EXT: /* extends any prog */
2527 case BPF_PROG_TYPE_NETFILTER:
2528 return true;
2529 case BPF_PROG_TYPE_CGROUP_SKB:
2530 /* always unpriv */
2531 case BPF_PROG_TYPE_SK_REUSEPORT:
2532 /* equivalent to SOCKET_FILTER. need CAP_BPF only */
2533 default:
2534 return false;
2535 }
2536}
2537
2538static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2539{
2540 switch (prog_type) {
2541 case BPF_PROG_TYPE_KPROBE:
2542 case BPF_PROG_TYPE_TRACEPOINT:
2543 case BPF_PROG_TYPE_PERF_EVENT:
2544 case BPF_PROG_TYPE_RAW_TRACEPOINT:
2545 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2546 case BPF_PROG_TYPE_TRACING:
2547 case BPF_PROG_TYPE_LSM:
2548 case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
2549 case BPF_PROG_TYPE_EXT: /* extends any prog */
2550 return true;
2551 default:
2552 return false;
2553 }
2554}
2555
2556/* last field in 'union bpf_attr' used by this command */
2557#define BPF_PROG_LOAD_LAST_FIELD log_true_size
2558
2559static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
2560{
2561 enum bpf_prog_type type = attr->prog_type;
2562 struct bpf_prog *prog, *dst_prog = NULL;
2563 struct btf *attach_btf = NULL;
2564 int err;
2565 char license[128];
2566
2567 if (CHECK_ATTR(BPF_PROG_LOAD))
2568 return -EINVAL;
2569
2570 if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2571 BPF_F_ANY_ALIGNMENT |
2572 BPF_F_TEST_STATE_FREQ |
2573 BPF_F_SLEEPABLE |
2574 BPF_F_TEST_RND_HI32 |
2575 BPF_F_XDP_HAS_FRAGS |
2576 BPF_F_XDP_DEV_BOUND_ONLY))
2577 return -EINVAL;
2578
2579 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2580 (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2581 !bpf_capable())
2582 return -EPERM;
2583
2584 /* Intent here is for unprivileged_bpf_disabled to block BPF program
2585 * creation for unprivileged users; other actions depend
2586 * on fd availability and access to bpffs, so are dependent on
2587 * object creation success. Even with unprivileged BPF disabled,
2588 * capability checks are still carried out for these
2589 * and other operations.
2590 */
2591 if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
2592 return -EPERM;
2593
2594 if (attr->insn_cnt == 0 ||
2595 attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
2596 return -E2BIG;
2597 if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2598 type != BPF_PROG_TYPE_CGROUP_SKB &&
2599 !bpf_capable())
2600 return -EPERM;
2601
2602 if (is_net_admin_prog_type(prog_type: type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
2603 return -EPERM;
2604 if (is_perfmon_prog_type(prog_type: type) && !perfmon_capable())
2605 return -EPERM;
2606
	/* attach_prog_fd/attach_btf_obj_fd can specify the fd of either a
	 * bpf_prog or a btf object, so we need to check which one it is.
	 */
2610 if (attr->attach_prog_fd) {
2611 dst_prog = bpf_prog_get(ufd: attr->attach_prog_fd);
2612 if (IS_ERR(ptr: dst_prog)) {
2613 dst_prog = NULL;
2614 attach_btf = btf_get_by_fd(fd: attr->attach_btf_obj_fd);
2615 if (IS_ERR(ptr: attach_btf))
2616 return -EINVAL;
2617 if (!btf_is_kernel(btf: attach_btf)) {
2618 /* attaching through specifying bpf_prog's BTF
2619 * objects directly might be supported eventually
2620 */
2621 btf_put(btf: attach_btf);
2622 return -ENOTSUPP;
2623 }
2624 }
2625 } else if (attr->attach_btf_id) {
2626 /* fall back to vmlinux BTF, if BTF type ID is specified */
2627 attach_btf = bpf_get_btf_vmlinux();
2628 if (IS_ERR(ptr: attach_btf))
2629 return PTR_ERR(ptr: attach_btf);
2630 if (!attach_btf)
2631 return -EINVAL;
2632 btf_get(btf: attach_btf);
2633 }
2634
2635 bpf_prog_load_fixup_attach_type(attr);
2636 if (bpf_prog_load_check_attach(prog_type: type, expected_attach_type: attr->expected_attach_type,
2637 attach_btf, btf_id: attr->attach_btf_id,
2638 dst_prog)) {
2639 if (dst_prog)
2640 bpf_prog_put(dst_prog);
2641 if (attach_btf)
2642 btf_put(btf: attach_btf);
2643 return -EINVAL;
2644 }
2645
2646 /* plain bpf_prog allocation */
2647 prog = bpf_prog_alloc(size: bpf_prog_size(proglen: attr->insn_cnt), GFP_USER);
2648 if (!prog) {
2649 if (dst_prog)
2650 bpf_prog_put(dst_prog);
2651 if (attach_btf)
2652 btf_put(btf: attach_btf);
2653 return -ENOMEM;
2654 }
2655
2656 prog->expected_attach_type = attr->expected_attach_type;
2657 prog->aux->attach_btf = attach_btf;
2658 prog->aux->attach_btf_id = attr->attach_btf_id;
2659 prog->aux->dst_prog = dst_prog;
2660 prog->aux->dev_bound = !!attr->prog_ifindex;
2661 prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
2662 prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
2663
2664 err = security_bpf_prog_alloc(aux: prog->aux);
2665 if (err)
2666 goto free_prog;
2667
2668 prog->aux->user = get_current_user();
2669 prog->len = attr->insn_cnt;
2670
2671 err = -EFAULT;
2672 if (copy_from_bpfptr(dst: prog->insns,
2673 src: make_bpfptr(addr: attr->insns, is_kernel: uattr.is_kernel),
2674 size: bpf_prog_insn_size(prog)) != 0)
2675 goto free_prog_sec;
2676 /* copy eBPF program license from user space */
2677 if (strncpy_from_bpfptr(dst: license,
2678 src: make_bpfptr(addr: attr->license, is_kernel: uattr.is_kernel),
2679 count: sizeof(license) - 1) < 0)
2680 goto free_prog_sec;
2681 license[sizeof(license) - 1] = 0;
2682
2683 /* eBPF programs must be GPL compatible to use GPL-ed functions */
2684 prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
2685
2686 prog->orig_prog = NULL;
2687 prog->jited = 0;
2688
2689 atomic64_set(v: &prog->aux->refcnt, i: 1);
2690
2691 if (bpf_prog_is_dev_bound(aux: prog->aux)) {
2692 err = bpf_prog_dev_bound_init(prog, attr);
2693 if (err)
2694 goto free_prog_sec;
2695 }
2696
2697 if (type == BPF_PROG_TYPE_EXT && dst_prog &&
2698 bpf_prog_is_dev_bound(aux: dst_prog->aux)) {
2699 err = bpf_prog_dev_bound_inherit(new_prog: prog, old_prog: dst_prog);
2700 if (err)
2701 goto free_prog_sec;
2702 }
2703
2704 /* find program type: socket_filter vs tracing_filter */
2705 err = find_prog_type(type, prog);
2706 if (err < 0)
2707 goto free_prog_sec;
2708
2709 prog->aux->load_time = ktime_get_boottime_ns();
2710 err = bpf_obj_name_cpy(dst: prog->aux->name, src: attr->prog_name,
2711 size: sizeof(attr->prog_name));
2712 if (err < 0)
2713 goto free_prog_sec;
2714
2715 /* run eBPF verifier */
2716 err = bpf_check(fp: &prog, attr, uattr, uattr_size);
2717 if (err < 0)
2718 goto free_used_maps;
2719
2720 prog = bpf_prog_select_runtime(fp: prog, err: &err);
2721 if (err < 0)
2722 goto free_used_maps;
2723
2724 err = bpf_prog_alloc_id(prog);
2725 if (err)
2726 goto free_used_maps;
2727
2728 /* Upon success of bpf_prog_alloc_id(), the BPF prog is
2729 * effectively publicly exposed. However, retrieving via
2730 * bpf_prog_get_fd_by_id() will take another reference,
2731 * therefore it cannot be gone underneath us.
2732 *
2733 * Only for the time /after/ successful bpf_prog_new_fd()
2734 * and before returning to userspace, we might just hold
2735 * one reference and any parallel close on that fd could
2736 * rip everything out. Hence, below notifications must
2737 * happen before bpf_prog_new_fd().
2738 *
2739 * Also, any failure handling from this point onwards must
2740 * be using bpf_prog_put() given the program is exposed.
2741 */
2742 bpf_prog_kallsyms_add(fp: prog);
2743 perf_event_bpf_event(prog, type: PERF_BPF_EVENT_PROG_LOAD, flags: 0);
2744 bpf_audit_prog(prog, op: BPF_AUDIT_LOAD);
2745
2746 err = bpf_prog_new_fd(prog);
2747 if (err < 0)
2748 bpf_prog_put(prog);
2749 return err;
2750
2751free_used_maps:
2752 /* In case we have subprogs, we need to wait for a grace
2753 * period before we can tear down JIT memory since symbols
2754 * are already exposed under kallsyms.
2755 */
2756 __bpf_prog_put_noref(prog, deferred: prog->aux->real_func_cnt);
2757 return err;
2758free_prog_sec:
2759 free_uid(prog->aux->user);
2760 security_bpf_prog_free(aux: prog->aux);
2761free_prog:
2762 if (prog->aux->attach_btf)
2763 btf_put(btf: prog->aux->attach_btf);
2764 bpf_prog_free(fp: prog);
2765 return err;
2766}
2767
2768#define BPF_OBJ_LAST_FIELD path_fd
2769
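/* BPF_OBJ_PIN: pin the object referred to by attr->bpf_fd at attr->pathname.
 * When BPF_F_PATH_FD is set, the pathname is resolved relative to
 * attr->path_fd; otherwise it is resolved relative to the current working
 * directory (AT_FDCWD).
 */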
2770static int bpf_obj_pin(const union bpf_attr *attr)
2771{
2772 int path_fd;
2773
2774 if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
2775 return -EINVAL;
2776
2777 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2778 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2779 return -EINVAL;
2780
2781 path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2782 return bpf_obj_pin_user(ufd: attr->bpf_fd, path_fd,
2783 u64_to_user_ptr(attr->pathname));
2784}
2785
2786static int bpf_obj_get(const union bpf_attr *attr)
2787{
2788 int path_fd;
2789
2790 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
2791 attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
2792 return -EINVAL;
2793
2794 /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
2795 if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2796 return -EINVAL;
2797
2798 path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2799 return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
2800 flags: attr->file_flags);
2801}
2802
2803void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
2804 const struct bpf_link_ops *ops, struct bpf_prog *prog)
2805{
2806 atomic64_set(v: &link->refcnt, i: 1);
2807 link->type = type;
2808 link->id = 0;
2809 link->ops = ops;
2810 link->prog = prog;
2811}
2812
2813static void bpf_link_free_id(int id)
2814{
2815 if (!id)
2816 return;
2817
2818 spin_lock_bh(lock: &link_idr_lock);
2819 idr_remove(&link_idr, id);
2820 spin_unlock_bh(lock: &link_idr_lock);
2821}
2822
/* Clean up a bpf_link and its corresponding anon_inode file and FD. Once the
 * anon_inode is created, the bpf_link can't simply be kfree()'d because the
 * anon_inode's release() call is deferred. This helper marks the bpf_link as
 * defunct, releases the anon_inode file and puts the reserved FD. The
 * bpf_prog's refcnt is not decremented; dropping it is the responsibility of
 * the calling code that failed to complete bpf_link initialization.
 * This helper eventually calls the link's dealloc callback, but does not call
 * its release callback.
 */
2832void bpf_link_cleanup(struct bpf_link_primer *primer)
2833{
2834 primer->link->prog = NULL;
2835 bpf_link_free_id(id: primer->id);
2836 fput(primer->file);
2837 put_unused_fd(fd: primer->fd);
2838}
2839
2840void bpf_link_inc(struct bpf_link *link)
2841{
2842 atomic64_inc(v: &link->refcnt);
2843}
2844
2845/* bpf_link_free is guaranteed to be called from process context */
2846static void bpf_link_free(struct bpf_link *link)
2847{
2848 bpf_link_free_id(id: link->id);
2849 if (link->prog) {
2850 /* detach BPF program, clean up used resources */
2851 link->ops->release(link);
2852 bpf_prog_put(link->prog);
2853 }
2854 /* free bpf_link and its containing memory */
2855 link->ops->dealloc(link);
2856}
2857
2858static void bpf_link_put_deferred(struct work_struct *work)
2859{
2860 struct bpf_link *link = container_of(work, struct bpf_link, work);
2861
2862 bpf_link_free(link);
2863}
2864
/* bpf_link_put() might be called from atomic context. Freeing the link may
 * need to acquire sleeping locks, so defer the actual free to a workqueue
 * running in process context.
 */
2868void bpf_link_put(struct bpf_link *link)
2869{
2870 if (!atomic64_dec_and_test(v: &link->refcnt))
2871 return;
2872
2873 INIT_WORK(&link->work, bpf_link_put_deferred);
2874 schedule_work(work: &link->work);
2875}
2876EXPORT_SYMBOL(bpf_link_put);
2877
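/* bpf_link_put_direct() is for callers known to run in process context; it
 * frees the link synchronously instead of bouncing through the workqueue
 * that bpf_link_put() uses.
 */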
2878static void bpf_link_put_direct(struct bpf_link *link)
2879{
2880 if (!atomic64_dec_and_test(v: &link->refcnt))
2881 return;
2882 bpf_link_free(link);
2883}
2884
2885static int bpf_link_release(struct inode *inode, struct file *filp)
2886{
2887 struct bpf_link *link = filp->private_data;
2888
2889 bpf_link_put_direct(link);
2890 return 0;
2891}
2892
2893#ifdef CONFIG_PROC_FS
2894#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
2895#define BPF_MAP_TYPE(_id, _ops)
2896#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
2897static const char *bpf_link_type_strs[] = {
2898 [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
2899#include <linux/bpf_types.h>
2900};
2901#undef BPF_PROG_TYPE
2902#undef BPF_MAP_TYPE
2903#undef BPF_LINK_TYPE
2904
2905static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
2906{
2907 const struct bpf_link *link = filp->private_data;
2908 const struct bpf_prog *prog = link->prog;
2909 char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2910
2911 seq_printf(m,
2912 fmt: "link_type:\t%s\n"
2913 "link_id:\t%u\n",
2914 bpf_link_type_strs[link->type],
2915 link->id);
2916 if (prog) {
2917 bin2hex(dst: prog_tag, src: prog->tag, count: sizeof(prog->tag));
2918 seq_printf(m,
2919 fmt: "prog_tag:\t%s\n"
2920 "prog_id:\t%u\n",
2921 prog_tag,
2922 prog->aux->id);
2923 }
2924 if (link->ops->show_fdinfo)
2925 link->ops->show_fdinfo(link, m);
2926}
2927#endif
2928
2929static const struct file_operations bpf_link_fops = {
2930#ifdef CONFIG_PROC_FS
2931 .show_fdinfo = bpf_link_show_fdinfo,
2932#endif
2933 .release = bpf_link_release,
2934 .read = bpf_dummy_read,
2935 .write = bpf_dummy_write,
2936};
2937
2938static int bpf_link_alloc_id(struct bpf_link *link)
2939{
2940 int id;
2941
2942 idr_preload(GFP_KERNEL);
2943 spin_lock_bh(lock: &link_idr_lock);
2944 id = idr_alloc_cyclic(&link_idr, ptr: link, start: 1, INT_MAX, GFP_ATOMIC);
2945 spin_unlock_bh(lock: &link_idr_lock);
2946 idr_preload_end();
2947
2948 return id;
2949}
2950
/* Prepare a bpf_link to be exposed to user-space by allocating an anon_inode
 * file, reserving an unused FD and allocating an ID from link_idr. This is to
 * be paired with bpf_link_settle() to install the FD and ID and expose the
 * bpf_link to user-space, if the bpf_link is successfully attached. If not,
 * the bpf_link and pre-allocated resources are to be freed with a
 * bpf_link_cleanup() call. All the transient state is passed around in
 * struct bpf_link_primer.
 * This is the preferred way to create and initialize a bpf_link, especially
 * when there are complicated and expensive operations between creating the
 * bpf_link itself and attaching it to a BPF hook. By using bpf_link_prime()
 * and bpf_link_settle(), kernel code using bpf_link doesn't have to perform
 * expensive (and potentially failing) rollback operations in the rare case
 * that the file, FD, or ID can't be allocated. See the usage sketch after
 * bpf_link_settle() below.
 */
2964int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
2965{
2966 struct file *file;
2967 int fd, id;
2968
2969 fd = get_unused_fd_flags(O_CLOEXEC);
2970 if (fd < 0)
2971 return fd;
2972
2974 id = bpf_link_alloc_id(link);
2975 if (id < 0) {
2976 put_unused_fd(fd);
2977 return id;
2978 }
2979
2980 file = anon_inode_getfile(name: "bpf_link", fops: &bpf_link_fops, priv: link, O_CLOEXEC);
2981 if (IS_ERR(ptr: file)) {
2982 bpf_link_free_id(id);
2983 put_unused_fd(fd);
2984 return PTR_ERR(ptr: file);
2985 }
2986
2987 primer->link = link;
2988 primer->file = file;
2989 primer->fd = fd;
2990 primer->id = id;
2991 return 0;
2992}
2993
2994int bpf_link_settle(struct bpf_link_primer *primer)
2995{
2996 /* make bpf_link fetchable by ID */
2997 spin_lock_bh(lock: &link_idr_lock);
2998 primer->link->id = primer->id;
2999 spin_unlock_bh(lock: &link_idr_lock);
3000 /* make bpf_link fetchable by FD */
3001 fd_install(fd: primer->fd, file: primer->file);
3002 /* pass through installed FD */
3003 return primer->fd;
3004}
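
/* Illustrative usage sketch (not kernel code as-is): a typical link-attach
 * implementation pairs bpf_link_prime()/bpf_link_settle() as follows,
 * assuming a hypothetical struct my_link, my_link_lops, my_attach_hook()
 * and link type MY_LINK_TYPE:
 *
 *	struct bpf_link_primer primer;
 *	struct my_link *link;
 *	int err;
 *
 *	link = kzalloc(sizeof(*link), GFP_USER);
 *	if (!link)
 *		return -ENOMEM;
 *	bpf_link_init(&link->link, MY_LINK_TYPE, &my_link_lops, prog);
 *
 *	err = bpf_link_prime(&link->link, &primer);
 *	if (err) {
 *		kfree(link);
 *		return err;
 *	}
 *	err = my_attach_hook(link);
 *	if (err) {
 *		bpf_link_cleanup(&primer);
 *		return err;
 *	}
 *	return bpf_link_settle(&primer);
 *
 * This mirrors what e.g. bpf_raw_tp_link_attach() below does for raw
 * tracepoints.
 */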
3005
3006int bpf_link_new_fd(struct bpf_link *link)
3007{
3008 return anon_inode_getfd(name: "bpf-link", fops: &bpf_link_fops, priv: link, O_CLOEXEC);
3009}
3010
3011struct bpf_link *bpf_link_get_from_fd(u32 ufd)
3012{
3013 struct fd f = fdget(fd: ufd);
3014 struct bpf_link *link;
3015
3016 if (!f.file)
3017 return ERR_PTR(error: -EBADF);
3018 if (f.file->f_op != &bpf_link_fops) {
3019 fdput(fd: f);
3020 return ERR_PTR(error: -EINVAL);
3021 }
3022
3023 link = f.file->private_data;
3024 bpf_link_inc(link);
3025 fdput(fd: f);
3026
3027 return link;
3028}
3029EXPORT_SYMBOL(bpf_link_get_from_fd);
3030
3031static void bpf_tracing_link_release(struct bpf_link *link)
3032{
3033 struct bpf_tracing_link *tr_link =
3034 container_of(link, struct bpf_tracing_link, link.link);
3035
3036 WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
3037 tr_link->trampoline));
3038
3039 bpf_trampoline_put(tr: tr_link->trampoline);
3040
3041 /* tgt_prog is NULL if target is a kernel function */
3042 if (tr_link->tgt_prog)
3043 bpf_prog_put(tr_link->tgt_prog);
3044}
3045
3046static void bpf_tracing_link_dealloc(struct bpf_link *link)
3047{
3048 struct bpf_tracing_link *tr_link =
3049 container_of(link, struct bpf_tracing_link, link.link);
3050
3051 kfree(objp: tr_link);
3052}
3053
3054static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
3055 struct seq_file *seq)
3056{
3057 struct bpf_tracing_link *tr_link =
3058 container_of(link, struct bpf_tracing_link, link.link);
3059 u32 target_btf_id, target_obj_id;
3060
3061 bpf_trampoline_unpack_key(key: tr_link->trampoline->key,
3062 obj_id: &target_obj_id, btf_id: &target_btf_id);
3063 seq_printf(m: seq,
3064 fmt: "attach_type:\t%d\n"
3065 "target_obj_id:\t%u\n"
3066 "target_btf_id:\t%u\n",
3067 tr_link->attach_type,
3068 target_obj_id,
3069 target_btf_id);
3070}
3071
3072static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
3073 struct bpf_link_info *info)
3074{
3075 struct bpf_tracing_link *tr_link =
3076 container_of(link, struct bpf_tracing_link, link.link);
3077
3078 info->tracing.attach_type = tr_link->attach_type;
3079 bpf_trampoline_unpack_key(key: tr_link->trampoline->key,
3080 obj_id: &info->tracing.target_obj_id,
3081 btf_id: &info->tracing.target_btf_id);
3082
3083 return 0;
3084}
3085
3086static const struct bpf_link_ops bpf_tracing_link_lops = {
3087 .release = bpf_tracing_link_release,
3088 .dealloc = bpf_tracing_link_dealloc,
3089 .show_fdinfo = bpf_tracing_link_show_fdinfo,
3090 .fill_link_info = bpf_tracing_link_fill_link_info,
3091};
3092
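/* Attach @prog to its tracing target via a trampoline and expose the
 * attachment as a BPF_LINK_TYPE_TRACING link. For BPF_PROG_TYPE_EXT programs
 * the caller may name a new target through @tgt_prog_fd + @btf_id; otherwise
 * the target recorded in prog->aux at load time is (re)used.
 */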
3093static int bpf_tracing_prog_attach(struct bpf_prog *prog,
3094 int tgt_prog_fd,
3095 u32 btf_id,
3096 u64 bpf_cookie)
3097{
3098 struct bpf_link_primer link_primer;
3099 struct bpf_prog *tgt_prog = NULL;
3100 struct bpf_trampoline *tr = NULL;
3101 struct bpf_tracing_link *link;
3102 u64 key = 0;
3103 int err;
3104
3105 switch (prog->type) {
3106 case BPF_PROG_TYPE_TRACING:
3107 if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
3108 prog->expected_attach_type != BPF_TRACE_FEXIT &&
3109 prog->expected_attach_type != BPF_MODIFY_RETURN) {
3110 err = -EINVAL;
3111 goto out_put_prog;
3112 }
3113 break;
3114 case BPF_PROG_TYPE_EXT:
3115 if (prog->expected_attach_type != 0) {
3116 err = -EINVAL;
3117 goto out_put_prog;
3118 }
3119 break;
3120 case BPF_PROG_TYPE_LSM:
3121 if (prog->expected_attach_type != BPF_LSM_MAC) {
3122 err = -EINVAL;
3123 goto out_put_prog;
3124 }
3125 break;
3126 default:
3127 err = -EINVAL;
3128 goto out_put_prog;
3129 }
3130
3131 if (!!tgt_prog_fd != !!btf_id) {
3132 err = -EINVAL;
3133 goto out_put_prog;
3134 }
3135
3136 if (tgt_prog_fd) {
3137 /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
3138 if (prog->type != BPF_PROG_TYPE_EXT) {
3139 err = -EINVAL;
3140 goto out_put_prog;
3141 }
3142
3143 tgt_prog = bpf_prog_get(ufd: tgt_prog_fd);
3144 if (IS_ERR(ptr: tgt_prog)) {
3145 err = PTR_ERR(ptr: tgt_prog);
3146 tgt_prog = NULL;
3147 goto out_put_prog;
3148 }
3149
3150 key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
3151 }
3152
3153 link = kzalloc(size: sizeof(*link), GFP_USER);
3154 if (!link) {
3155 err = -ENOMEM;
3156 goto out_put_prog;
3157 }
3158 bpf_link_init(link: &link->link.link, type: BPF_LINK_TYPE_TRACING,
3159 ops: &bpf_tracing_link_lops, prog);
3160 link->attach_type = prog->expected_attach_type;
3161 link->link.cookie = bpf_cookie;
3162
3163 mutex_lock(&prog->aux->dst_mutex);
3164
	/* There are a few possible cases here:
	 *
	 * - if prog->aux->dst_trampoline is set, the program was just loaded
	 *   and not yet attached to anything, so we can use the values stored
	 *   in prog->aux
	 *
	 * - if prog->aux->dst_trampoline is NULL, the program has already been
	 *   attached to a target and its initial target was cleared (below)
	 *
	 * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
	 *   target_btf_id using the link_create API.
	 *
	 * - if tgt_prog == NULL, this function was called using the old
	 *   raw_tracepoint_open API, and we need a target from prog->aux
	 *
	 * - if both prog->aux->dst_trampoline and tgt_prog are NULL, the
	 *   program was detached and is going for re-attachment.
	 */
3183 if (!prog->aux->dst_trampoline && !tgt_prog) {
		/*
		 * Allow re-attach for TRACING and LSM programs. If it's
		 * currently linked, bpf_trampoline_link_prog() will fail.
		 * EXT programs need to specify tgt_prog_fd, so they
		 * re-attach via a separate code path.
		 */
3190 if (prog->type != BPF_PROG_TYPE_TRACING &&
3191 prog->type != BPF_PROG_TYPE_LSM) {
3192 err = -EINVAL;
3193 goto out_unlock;
3194 }
3195 btf_id = prog->aux->attach_btf_id;
3196 key = bpf_trampoline_compute_key(NULL, btf: prog->aux->attach_btf, btf_id);
3197 }
3198
3199 if (!prog->aux->dst_trampoline ||
3200 (key && key != prog->aux->dst_trampoline->key)) {
3201 /* If there is no saved target, or the specified target is
3202 * different from the destination specified at load time, we
3203 * need a new trampoline and a check for compatibility
3204 */
3205 struct bpf_attach_target_info tgt_info = {};
3206
3207 err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
3208 tgt_info: &tgt_info);
3209 if (err)
3210 goto out_unlock;
3211
3212 if (tgt_info.tgt_mod) {
3213 module_put(module: prog->aux->mod);
3214 prog->aux->mod = tgt_info.tgt_mod;
3215 }
3216
3217 tr = bpf_trampoline_get(key, tgt_info: &tgt_info);
3218 if (!tr) {
3219 err = -ENOMEM;
3220 goto out_unlock;
3221 }
3222 } else {
3223 /* The caller didn't specify a target, or the target was the
3224 * same as the destination supplied during program load. This
3225 * means we can reuse the trampoline and reference from program
3226 * load time, and there is no need to allocate a new one. This
3227 * can only happen once for any program, as the saved values in
3228 * prog->aux are cleared below.
3229 */
3230 tr = prog->aux->dst_trampoline;
3231 tgt_prog = prog->aux->dst_prog;
3232 }
3233
3234 err = bpf_link_prime(link: &link->link.link, primer: &link_primer);
3235 if (err)
3236 goto out_unlock;
3237
3238 err = bpf_trampoline_link_prog(link: &link->link, tr);
3239 if (err) {
3240 bpf_link_cleanup(primer: &link_primer);
3241 link = NULL;
3242 goto out_unlock;
3243 }
3244
3245 link->tgt_prog = tgt_prog;
3246 link->trampoline = tr;
3247
3248 /* Always clear the trampoline and target prog from prog->aux to make
3249 * sure the original attach destination is not kept alive after a
3250 * program is (re-)attached to another target.
3251 */
3252 if (prog->aux->dst_prog &&
3253 (tgt_prog_fd || tr != prog->aux->dst_trampoline))
3254 /* got extra prog ref from syscall, or attaching to different prog */
3255 bpf_prog_put(prog->aux->dst_prog);
3256 if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
3257 /* we allocated a new trampoline, so free the old one */
3258 bpf_trampoline_put(tr: prog->aux->dst_trampoline);
3259
3260 prog->aux->dst_prog = NULL;
3261 prog->aux->dst_trampoline = NULL;
3262 mutex_unlock(lock: &prog->aux->dst_mutex);
3263
3264 return bpf_link_settle(primer: &link_primer);
3265out_unlock:
3266 if (tr && tr != prog->aux->dst_trampoline)
3267 bpf_trampoline_put(tr);
3268 mutex_unlock(lock: &prog->aux->dst_mutex);
3269 kfree(objp: link);
3270out_put_prog:
3271 if (tgt_prog_fd && tgt_prog)
3272 bpf_prog_put(tgt_prog);
3273 return err;
3274}
3275
3276struct bpf_raw_tp_link {
3277 struct bpf_link link;
3278 struct bpf_raw_event_map *btp;
3279};
3280
3281static void bpf_raw_tp_link_release(struct bpf_link *link)
3282{
3283 struct bpf_raw_tp_link *raw_tp =
3284 container_of(link, struct bpf_raw_tp_link, link);
3285
3286 bpf_probe_unregister(btp: raw_tp->btp, prog: raw_tp->link.prog);
3287 bpf_put_raw_tracepoint(btp: raw_tp->btp);
3288}
3289
3290static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
3291{
3292 struct bpf_raw_tp_link *raw_tp =
3293 container_of(link, struct bpf_raw_tp_link, link);
3294
3295 kfree(objp: raw_tp);
3296}
3297
3298static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
3299 struct seq_file *seq)
3300{
3301 struct bpf_raw_tp_link *raw_tp_link =
3302 container_of(link, struct bpf_raw_tp_link, link);
3303
3304 seq_printf(m: seq,
3305 fmt: "tp_name:\t%s\n",
3306 raw_tp_link->btp->tp->name);
3307}
3308
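/* Copy the NUL-terminated string @buf of length @len (excluding the
 * terminator) into the user buffer @ubuf of size @ulen. If the buffer is too
 * small, copy a truncated but still NUL-terminated string and return -ENOSPC.
 * Callers guarantee @ulen is non-zero whenever @ubuf is set.
 */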
3309static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
3310 u32 len)
3311{
3312 if (ulen >= len + 1) {
3313 if (copy_to_user(to: ubuf, from: buf, n: len + 1))
3314 return -EFAULT;
3315 } else {
3316 char zero = '\0';
3317
3318 if (copy_to_user(to: ubuf, from: buf, n: ulen - 1))
3319 return -EFAULT;
3320 if (put_user(zero, ubuf + ulen - 1))
3321 return -EFAULT;
3322 return -ENOSPC;
3323 }
3324
3325 return 0;
3326}
3327
3328static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
3329 struct bpf_link_info *info)
3330{
3331 struct bpf_raw_tp_link *raw_tp_link =
3332 container_of(link, struct bpf_raw_tp_link, link);
3333 char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
3334 const char *tp_name = raw_tp_link->btp->tp->name;
3335 u32 ulen = info->raw_tracepoint.tp_name_len;
3336 size_t tp_len = strlen(tp_name);
3337
3338 if (!ulen ^ !ubuf)
3339 return -EINVAL;
3340
3341 info->raw_tracepoint.tp_name_len = tp_len + 1;
3342
3343 if (!ubuf)
3344 return 0;
3345
3346 return bpf_copy_to_user(ubuf, buf: tp_name, ulen, len: tp_len);
3347}
3348
3349static const struct bpf_link_ops bpf_raw_tp_link_lops = {
3350 .release = bpf_raw_tp_link_release,
3351 .dealloc = bpf_raw_tp_link_dealloc,
3352 .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
3353 .fill_link_info = bpf_raw_tp_link_fill_link_info,
3354};
3355
3356#ifdef CONFIG_PERF_EVENTS
3357struct bpf_perf_link {
3358 struct bpf_link link;
3359 struct file *perf_file;
3360};
3361
3362static void bpf_perf_link_release(struct bpf_link *link)
3363{
3364 struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3365 struct perf_event *event = perf_link->perf_file->private_data;
3366
3367 perf_event_free_bpf_prog(event);
3368 fput(perf_link->perf_file);
3369}
3370
3371static void bpf_perf_link_dealloc(struct bpf_link *link)
3372{
3373 struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
3374
3375 kfree(objp: perf_link);
3376}
3377
3378static int bpf_perf_link_fill_common(const struct perf_event *event,
3379 char __user *uname, u32 ulen,
3380 u64 *probe_offset, u64 *probe_addr,
3381 u32 *fd_type, unsigned long *missed)
3382{
3383 const char *buf;
3384 u32 prog_id;
3385 size_t len;
3386 int err;
3387
3388 if (!ulen ^ !uname)
3389 return -EINVAL;
3390
3391 err = bpf_get_perf_event_info(event, prog_id: &prog_id, fd_type, buf: &buf,
3392 probe_offset, probe_addr, missed);
3393 if (err)
3394 return err;
3395 if (!uname)
3396 return 0;
3397 if (buf) {
3398 len = strlen(buf);
3399 err = bpf_copy_to_user(ubuf: uname, buf, ulen, len);
3400 if (err)
3401 return err;
3402 } else {
3403 char zero = '\0';
3404
3405 if (put_user(zero, uname))
3406 return -EFAULT;
3407 }
3408 return 0;
3409}
3410
3411#ifdef CONFIG_KPROBE_EVENTS
3412static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
3413 struct bpf_link_info *info)
3414{
3415 unsigned long missed;
3416 char __user *uname;
3417 u64 addr, offset;
3418 u32 ulen, type;
3419 int err;
3420
3421 uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
3422 ulen = info->perf_event.kprobe.name_len;
3423 err = bpf_perf_link_fill_common(event, uname, ulen, probe_offset: &offset, probe_addr: &addr,
3424 fd_type: &type, missed: &missed);
3425 if (err)
3426 return err;
3427 if (type == BPF_FD_TYPE_KRETPROBE)
3428 info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
3429 else
3430 info->perf_event.type = BPF_PERF_EVENT_KPROBE;
3431
3432 info->perf_event.kprobe.offset = offset;
3433 info->perf_event.kprobe.missed = missed;
3434 if (!kallsyms_show_value(current_cred()))
3435 addr = 0;
3436 info->perf_event.kprobe.addr = addr;
3437 return 0;
3438}
3439#endif
3440
3441#ifdef CONFIG_UPROBE_EVENTS
3442static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
3443 struct bpf_link_info *info)
3444{
3445 char __user *uname;
3446 u64 addr, offset;
3447 u32 ulen, type;
3448 int err;
3449
3450 uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
3451 ulen = info->perf_event.uprobe.name_len;
3452 err = bpf_perf_link_fill_common(event, uname, ulen, probe_offset: &offset, probe_addr: &addr,
3453 fd_type: &type, NULL);
3454 if (err)
3455 return err;
3456
3457 if (type == BPF_FD_TYPE_URETPROBE)
3458 info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
3459 else
3460 info->perf_event.type = BPF_PERF_EVENT_UPROBE;
3461 info->perf_event.uprobe.offset = offset;
3462 return 0;
3463}
3464#endif
3465
3466static int bpf_perf_link_fill_probe(const struct perf_event *event,
3467 struct bpf_link_info *info)
3468{
3469#ifdef CONFIG_KPROBE_EVENTS
3470 if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
3471 return bpf_perf_link_fill_kprobe(event, info);
3472#endif
3473#ifdef CONFIG_UPROBE_EVENTS
3474 if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
3475 return bpf_perf_link_fill_uprobe(event, info);
3476#endif
3477 return -EOPNOTSUPP;
3478}
3479
3480static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
3481 struct bpf_link_info *info)
3482{
3483 char __user *uname;
3484 u32 ulen;
3485
3486 uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
3487 ulen = info->perf_event.tracepoint.name_len;
3488 info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
3489 return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
3490}
3491
3492static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
3493 struct bpf_link_info *info)
3494{
3495 info->perf_event.event.type = event->attr.type;
3496 info->perf_event.event.config = event->attr.config;
3497 info->perf_event.type = BPF_PERF_EVENT_EVENT;
3498 return 0;
3499}
3500
3501static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
3502 struct bpf_link_info *info)
3503{
3504 struct bpf_perf_link *perf_link;
3505 const struct perf_event *event;
3506
3507 perf_link = container_of(link, struct bpf_perf_link, link);
3508 event = perf_get_event(file: perf_link->perf_file);
3509 if (IS_ERR(ptr: event))
3510 return PTR_ERR(ptr: event);
3511
3512 switch (event->prog->type) {
3513 case BPF_PROG_TYPE_PERF_EVENT:
3514 return bpf_perf_link_fill_perf_event(event, info);
3515 case BPF_PROG_TYPE_TRACEPOINT:
3516 return bpf_perf_link_fill_tracepoint(event, info);
3517 case BPF_PROG_TYPE_KPROBE:
3518 return bpf_perf_link_fill_probe(event, info);
3519 default:
3520 return -EOPNOTSUPP;
3521 }
3522}
3523
3524static const struct bpf_link_ops bpf_perf_link_lops = {
3525 .release = bpf_perf_link_release,
3526 .dealloc = bpf_perf_link_dealloc,
3527 .fill_link_info = bpf_perf_link_fill_link_info,
3528};
3529
3530static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3531{
3532 struct bpf_link_primer link_primer;
3533 struct bpf_perf_link *link;
3534 struct perf_event *event;
3535 struct file *perf_file;
3536 int err;
3537
3538 if (attr->link_create.flags)
3539 return -EINVAL;
3540
3541 perf_file = perf_event_get(fd: attr->link_create.target_fd);
3542 if (IS_ERR(ptr: perf_file))
3543 return PTR_ERR(ptr: perf_file);
3544
3545 link = kzalloc(size: sizeof(*link), GFP_USER);
3546 if (!link) {
3547 err = -ENOMEM;
3548 goto out_put_file;
3549 }
3550 bpf_link_init(link: &link->link, type: BPF_LINK_TYPE_PERF_EVENT, ops: &bpf_perf_link_lops, prog);
3551 link->perf_file = perf_file;
3552
3553 err = bpf_link_prime(link: &link->link, primer: &link_primer);
3554 if (err) {
3555 kfree(objp: link);
3556 goto out_put_file;
3557 }
3558
3559 event = perf_file->private_data;
3560 err = perf_event_set_bpf_prog(event, prog, bpf_cookie: attr->link_create.perf_event.bpf_cookie);
3561 if (err) {
3562 bpf_link_cleanup(primer: &link_primer);
3563 goto out_put_file;
3564 }
3565 /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
3566 bpf_prog_inc(prog);
3567
3568 return bpf_link_settle(primer: &link_primer);
3569
3570out_put_file:
3571 fput(perf_file);
3572 return err;
3573}
3574#else
3575static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
3576{
3577 return -EOPNOTSUPP;
3578}
3579#endif /* CONFIG_PERF_EVENTS */
3580
3581static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
3582 const char __user *user_tp_name)
3583{
3584 struct bpf_link_primer link_primer;
3585 struct bpf_raw_tp_link *link;
3586 struct bpf_raw_event_map *btp;
3587 const char *tp_name;
3588 char buf[128];
3589 int err;
3590
3591 switch (prog->type) {
3592 case BPF_PROG_TYPE_TRACING:
3593 case BPF_PROG_TYPE_EXT:
3594 case BPF_PROG_TYPE_LSM:
3595 if (user_tp_name)
3596 /* The attach point for this category of programs
3597 * should be specified via btf_id during program load.
3598 */
3599 return -EINVAL;
3600 if (prog->type == BPF_PROG_TYPE_TRACING &&
3601 prog->expected_attach_type == BPF_TRACE_RAW_TP) {
3602 tp_name = prog->aux->attach_func_name;
3603 break;
3604 }
3605 return bpf_tracing_prog_attach(prog, tgt_prog_fd: 0, btf_id: 0, bpf_cookie: 0);
3606 case BPF_PROG_TYPE_RAW_TRACEPOINT:
3607 case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
3608 if (strncpy_from_user(dst: buf, src: user_tp_name, count: sizeof(buf) - 1) < 0)
3609 return -EFAULT;
3610 buf[sizeof(buf) - 1] = 0;
3611 tp_name = buf;
3612 break;
3613 default:
3614 return -EINVAL;
3615 }
3616
3617 btp = bpf_get_raw_tracepoint(name: tp_name);
3618 if (!btp)
3619 return -ENOENT;
3620
3621 link = kzalloc(size: sizeof(*link), GFP_USER);
3622 if (!link) {
3623 err = -ENOMEM;
3624 goto out_put_btp;
3625 }
3626 bpf_link_init(link: &link->link, type: BPF_LINK_TYPE_RAW_TRACEPOINT,
3627 ops: &bpf_raw_tp_link_lops, prog);
3628 link->btp = btp;
3629
3630 err = bpf_link_prime(link: &link->link, primer: &link_primer);
3631 if (err) {
3632 kfree(objp: link);
3633 goto out_put_btp;
3634 }
3635
3636 err = bpf_probe_register(btp: link->btp, prog);
3637 if (err) {
3638 bpf_link_cleanup(primer: &link_primer);
3639 goto out_put_btp;
3640 }
3641
3642 return bpf_link_settle(primer: &link_primer);
3643
3644out_put_btp:
3645 bpf_put_raw_tracepoint(btp);
3646 return err;
3647}
3648
3649#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
3650
3651static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
3652{
3653 struct bpf_prog *prog;
3654 int fd;
3655
3656 if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
3657 return -EINVAL;
3658
3659 prog = bpf_prog_get(ufd: attr->raw_tracepoint.prog_fd);
3660 if (IS_ERR(ptr: prog))
3661 return PTR_ERR(ptr: prog);
3662
3663 fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
3664 if (fd < 0)
3665 bpf_prog_put(prog);
3666 return fd;
3667}
3668
3669static enum bpf_prog_type
3670attach_type_to_prog_type(enum bpf_attach_type attach_type)
3671{
3672 switch (attach_type) {
3673 case BPF_CGROUP_INET_INGRESS:
3674 case BPF_CGROUP_INET_EGRESS:
3675 return BPF_PROG_TYPE_CGROUP_SKB;
3676 case BPF_CGROUP_INET_SOCK_CREATE:
3677 case BPF_CGROUP_INET_SOCK_RELEASE:
3678 case BPF_CGROUP_INET4_POST_BIND:
3679 case BPF_CGROUP_INET6_POST_BIND:
3680 return BPF_PROG_TYPE_CGROUP_SOCK;
3681 case BPF_CGROUP_INET4_BIND:
3682 case BPF_CGROUP_INET6_BIND:
3683 case BPF_CGROUP_INET4_CONNECT:
3684 case BPF_CGROUP_INET6_CONNECT:
3685 case BPF_CGROUP_UNIX_CONNECT:
3686 case BPF_CGROUP_INET4_GETPEERNAME:
3687 case BPF_CGROUP_INET6_GETPEERNAME:
3688 case BPF_CGROUP_UNIX_GETPEERNAME:
3689 case BPF_CGROUP_INET4_GETSOCKNAME:
3690 case BPF_CGROUP_INET6_GETSOCKNAME:
3691 case BPF_CGROUP_UNIX_GETSOCKNAME:
3692 case BPF_CGROUP_UDP4_SENDMSG:
3693 case BPF_CGROUP_UDP6_SENDMSG:
3694 case BPF_CGROUP_UNIX_SENDMSG:
3695 case BPF_CGROUP_UDP4_RECVMSG:
3696 case BPF_CGROUP_UDP6_RECVMSG:
3697 case BPF_CGROUP_UNIX_RECVMSG:
3698 return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
3699 case BPF_CGROUP_SOCK_OPS:
3700 return BPF_PROG_TYPE_SOCK_OPS;
3701 case BPF_CGROUP_DEVICE:
3702 return BPF_PROG_TYPE_CGROUP_DEVICE;
3703 case BPF_SK_MSG_VERDICT:
3704 return BPF_PROG_TYPE_SK_MSG;
3705 case BPF_SK_SKB_STREAM_PARSER:
3706 case BPF_SK_SKB_STREAM_VERDICT:
3707 case BPF_SK_SKB_VERDICT:
3708 return BPF_PROG_TYPE_SK_SKB;
3709 case BPF_LIRC_MODE2:
3710 return BPF_PROG_TYPE_LIRC_MODE2;
3711 case BPF_FLOW_DISSECTOR:
3712 return BPF_PROG_TYPE_FLOW_DISSECTOR;
3713 case BPF_CGROUP_SYSCTL:
3714 return BPF_PROG_TYPE_CGROUP_SYSCTL;
3715 case BPF_CGROUP_GETSOCKOPT:
3716 case BPF_CGROUP_SETSOCKOPT:
3717 return BPF_PROG_TYPE_CGROUP_SOCKOPT;
3718 case BPF_TRACE_ITER:
3719 case BPF_TRACE_RAW_TP:
3720 case BPF_TRACE_FENTRY:
3721 case BPF_TRACE_FEXIT:
3722 case BPF_MODIFY_RETURN:
3723 return BPF_PROG_TYPE_TRACING;
3724 case BPF_LSM_MAC:
3725 return BPF_PROG_TYPE_LSM;
3726 case BPF_SK_LOOKUP:
3727 return BPF_PROG_TYPE_SK_LOOKUP;
3728 case BPF_XDP:
3729 return BPF_PROG_TYPE_XDP;
3730 case BPF_LSM_CGROUP:
3731 return BPF_PROG_TYPE_LSM;
3732 case BPF_TCX_INGRESS:
3733 case BPF_TCX_EGRESS:
3734 case BPF_NETKIT_PRIMARY:
3735 case BPF_NETKIT_PEER:
3736 return BPF_PROG_TYPE_SCHED_CLS;
3737 default:
3738 return BPF_PROG_TYPE_UNSPEC;
3739 }
3740}
3741
3742static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
3743 enum bpf_attach_type attach_type)
3744{
3745 enum bpf_prog_type ptype;
3746
3747 switch (prog->type) {
3748 case BPF_PROG_TYPE_CGROUP_SOCK:
3749 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3750 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3751 case BPF_PROG_TYPE_SK_LOOKUP:
3752 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
3753 case BPF_PROG_TYPE_CGROUP_SKB:
3754 if (!capable(CAP_NET_ADMIN))
3755 /* cg-skb progs can be loaded by unpriv user.
3756 * check permissions at attach time.
3757 */
3758 return -EPERM;
3759 return prog->enforce_expected_attach_type &&
3760 prog->expected_attach_type != attach_type ?
3761 -EINVAL : 0;
3762 case BPF_PROG_TYPE_EXT:
3763 return 0;
3764 case BPF_PROG_TYPE_NETFILTER:
3765 if (attach_type != BPF_NETFILTER)
3766 return -EINVAL;
3767 return 0;
3768 case BPF_PROG_TYPE_PERF_EVENT:
3769 case BPF_PROG_TYPE_TRACEPOINT:
3770 if (attach_type != BPF_PERF_EVENT)
3771 return -EINVAL;
3772 return 0;
3773 case BPF_PROG_TYPE_KPROBE:
3774 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
3775 attach_type != BPF_TRACE_KPROBE_MULTI)
3776 return -EINVAL;
3777 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
3778 attach_type != BPF_TRACE_UPROBE_MULTI)
3779 return -EINVAL;
3780 if (attach_type != BPF_PERF_EVENT &&
3781 attach_type != BPF_TRACE_KPROBE_MULTI &&
3782 attach_type != BPF_TRACE_UPROBE_MULTI)
3783 return -EINVAL;
3784 return 0;
3785 case BPF_PROG_TYPE_SCHED_CLS:
3786 if (attach_type != BPF_TCX_INGRESS &&
3787 attach_type != BPF_TCX_EGRESS &&
3788 attach_type != BPF_NETKIT_PRIMARY &&
3789 attach_type != BPF_NETKIT_PEER)
3790 return -EINVAL;
3791 return 0;
3792 default:
3793 ptype = attach_type_to_prog_type(attach_type);
3794 if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
3795 return -EINVAL;
3796 return 0;
3797 }
3798}
3799
3800#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
3801
3802#define BPF_F_ATTACH_MASK_BASE \
3803 (BPF_F_ALLOW_OVERRIDE | \
3804 BPF_F_ALLOW_MULTI | \
3805 BPF_F_REPLACE)
3806
3807#define BPF_F_ATTACH_MASK_MPROG \
3808 (BPF_F_REPLACE | \
3809 BPF_F_BEFORE | \
3810 BPF_F_AFTER | \
3811 BPF_F_ID | \
3812 BPF_F_LINK)
3813
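/* Attach types backed by the bpf_mprog infrastructure (tcx, netkit) accept
 * the _MPROG flag set above; all other attach types accept the legacy _BASE
 * set. The two sets are validated separately in bpf_prog_attach() below.
 */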
3814static int bpf_prog_attach(const union bpf_attr *attr)
3815{
3816 enum bpf_prog_type ptype;
3817 struct bpf_prog *prog;
3818 int ret;
3819
3820 if (CHECK_ATTR(BPF_PROG_ATTACH))
3821 return -EINVAL;
3822
3823 ptype = attach_type_to_prog_type(attach_type: attr->attach_type);
3824 if (ptype == BPF_PROG_TYPE_UNSPEC)
3825 return -EINVAL;
3826 if (bpf_mprog_supported(type: ptype)) {
3827 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
3828 return -EINVAL;
3829 } else {
3830 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
3831 return -EINVAL;
3832 if (attr->relative_fd ||
3833 attr->expected_revision)
3834 return -EINVAL;
3835 }
3836
3837 prog = bpf_prog_get_type(ufd: attr->attach_bpf_fd, type: ptype);
3838 if (IS_ERR(ptr: prog))
3839 return PTR_ERR(ptr: prog);
3840
3841 if (bpf_prog_attach_check_attach_type(prog, attach_type: attr->attach_type)) {
3842 bpf_prog_put(prog);
3843 return -EINVAL;
3844 }
3845
3846 switch (ptype) {
3847 case BPF_PROG_TYPE_SK_SKB:
3848 case BPF_PROG_TYPE_SK_MSG:
3849 ret = sock_map_get_from_fd(attr, prog);
3850 break;
3851 case BPF_PROG_TYPE_LIRC_MODE2:
3852 ret = lirc_prog_attach(attr, prog);
3853 break;
3854 case BPF_PROG_TYPE_FLOW_DISSECTOR:
3855 ret = netns_bpf_prog_attach(attr, prog);
3856 break;
3857 case BPF_PROG_TYPE_CGROUP_DEVICE:
3858 case BPF_PROG_TYPE_CGROUP_SKB:
3859 case BPF_PROG_TYPE_CGROUP_SOCK:
3860 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3861 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3862 case BPF_PROG_TYPE_CGROUP_SYSCTL:
3863 case BPF_PROG_TYPE_SOCK_OPS:
3864 case BPF_PROG_TYPE_LSM:
3865 if (ptype == BPF_PROG_TYPE_LSM &&
3866 prog->expected_attach_type != BPF_LSM_CGROUP)
3867 ret = -EINVAL;
3868 else
3869 ret = cgroup_bpf_prog_attach(attr, ptype, prog);
3870 break;
3871 case BPF_PROG_TYPE_SCHED_CLS:
3872 if (attr->attach_type == BPF_TCX_INGRESS ||
3873 attr->attach_type == BPF_TCX_EGRESS)
3874 ret = tcx_prog_attach(attr, prog);
3875 else
3876 ret = netkit_prog_attach(attr, prog);
3877 break;
3878 default:
3879 ret = -EINVAL;
3880 }
3881
3882 if (ret)
3883 bpf_prog_put(prog);
3884 return ret;
3885}
3886
3887#define BPF_PROG_DETACH_LAST_FIELD expected_revision
3888
3889static int bpf_prog_detach(const union bpf_attr *attr)
3890{
3891 struct bpf_prog *prog = NULL;
3892 enum bpf_prog_type ptype;
3893 int ret;
3894
3895 if (CHECK_ATTR(BPF_PROG_DETACH))
3896 return -EINVAL;
3897
3898 ptype = attach_type_to_prog_type(attach_type: attr->attach_type);
3899 if (bpf_mprog_supported(type: ptype)) {
3900 if (ptype == BPF_PROG_TYPE_UNSPEC)
3901 return -EINVAL;
3902 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
3903 return -EINVAL;
3904 if (attr->attach_bpf_fd) {
3905 prog = bpf_prog_get_type(ufd: attr->attach_bpf_fd, type: ptype);
3906 if (IS_ERR(ptr: prog))
3907 return PTR_ERR(ptr: prog);
3908 }
3909 } else if (attr->attach_flags ||
3910 attr->relative_fd ||
3911 attr->expected_revision) {
3912 return -EINVAL;
3913 }
3914
3915 switch (ptype) {
3916 case BPF_PROG_TYPE_SK_MSG:
3917 case BPF_PROG_TYPE_SK_SKB:
3918 ret = sock_map_prog_detach(attr, ptype);
3919 break;
3920 case BPF_PROG_TYPE_LIRC_MODE2:
3921 ret = lirc_prog_detach(attr);
3922 break;
3923 case BPF_PROG_TYPE_FLOW_DISSECTOR:
3924 ret = netns_bpf_prog_detach(attr, ptype);
3925 break;
3926 case BPF_PROG_TYPE_CGROUP_DEVICE:
3927 case BPF_PROG_TYPE_CGROUP_SKB:
3928 case BPF_PROG_TYPE_CGROUP_SOCK:
3929 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3930 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3931 case BPF_PROG_TYPE_CGROUP_SYSCTL:
3932 case BPF_PROG_TYPE_SOCK_OPS:
3933 case BPF_PROG_TYPE_LSM:
3934 ret = cgroup_bpf_prog_detach(attr, ptype);
3935 break;
3936 case BPF_PROG_TYPE_SCHED_CLS:
3937 if (attr->attach_type == BPF_TCX_INGRESS ||
3938 attr->attach_type == BPF_TCX_EGRESS)
3939 ret = tcx_prog_detach(attr, prog);
3940 else
3941 ret = netkit_prog_detach(attr, prog);
3942 break;
3943 default:
3944 ret = -EINVAL;
3945 }
3946
3947 if (prog)
3948 bpf_prog_put(prog);
3949 return ret;
3950}
3951
3952#define BPF_PROG_QUERY_LAST_FIELD query.revision
3953
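/*
 * Handler for the BPF_PROG_QUERY command: requires CAP_NET_ADMIN and
 * routes the query to the subsystem owning the given attach type
 * (cgroup, lirc, netns, sockmap, tcx or netkit).
 */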
3954static int bpf_prog_query(const union bpf_attr *attr,
3955 union bpf_attr __user *uattr)
3956{
3957 if (!capable(CAP_NET_ADMIN))
3958 return -EPERM;
3959 if (CHECK_ATTR(BPF_PROG_QUERY))
3960 return -EINVAL;
3961 if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
3962 return -EINVAL;
3963
3964 switch (attr->query.attach_type) {
3965 case BPF_CGROUP_INET_INGRESS:
3966 case BPF_CGROUP_INET_EGRESS:
3967 case BPF_CGROUP_INET_SOCK_CREATE:
3968 case BPF_CGROUP_INET_SOCK_RELEASE:
3969 case BPF_CGROUP_INET4_BIND:
3970 case BPF_CGROUP_INET6_BIND:
3971 case BPF_CGROUP_INET4_POST_BIND:
3972 case BPF_CGROUP_INET6_POST_BIND:
3973 case BPF_CGROUP_INET4_CONNECT:
3974 case BPF_CGROUP_INET6_CONNECT:
3975 case BPF_CGROUP_UNIX_CONNECT:
3976 case BPF_CGROUP_INET4_GETPEERNAME:
3977 case BPF_CGROUP_INET6_GETPEERNAME:
3978 case BPF_CGROUP_UNIX_GETPEERNAME:
3979 case BPF_CGROUP_INET4_GETSOCKNAME:
3980 case BPF_CGROUP_INET6_GETSOCKNAME:
3981 case BPF_CGROUP_UNIX_GETSOCKNAME:
3982 case BPF_CGROUP_UDP4_SENDMSG:
3983 case BPF_CGROUP_UDP6_SENDMSG:
3984 case BPF_CGROUP_UNIX_SENDMSG:
3985 case BPF_CGROUP_UDP4_RECVMSG:
3986 case BPF_CGROUP_UDP6_RECVMSG:
3987 case BPF_CGROUP_UNIX_RECVMSG:
3988 case BPF_CGROUP_SOCK_OPS:
3989 case BPF_CGROUP_DEVICE:
3990 case BPF_CGROUP_SYSCTL:
3991 case BPF_CGROUP_GETSOCKOPT:
3992 case BPF_CGROUP_SETSOCKOPT:
3993 case BPF_LSM_CGROUP:
3994 return cgroup_bpf_prog_query(attr, uattr);
3995 case BPF_LIRC_MODE2:
3996 return lirc_prog_query(attr, uattr);
3997 case BPF_FLOW_DISSECTOR:
3998 case BPF_SK_LOOKUP:
3999 return netns_bpf_prog_query(attr, uattr);
4000 case BPF_SK_SKB_STREAM_PARSER:
4001 case BPF_SK_SKB_STREAM_VERDICT:
4002 case BPF_SK_MSG_VERDICT:
4003 case BPF_SK_SKB_VERDICT:
4004 return sock_map_bpf_prog_query(attr, uattr);
4005 case BPF_TCX_INGRESS:
4006 case BPF_TCX_EGRESS:
4007 return tcx_prog_query(attr, uattr);
4008 case BPF_NETKIT_PRIMARY:
4009 case BPF_NETKIT_PEER:
4010 return netkit_prog_query(attr, uattr);
4011 default:
4012 return -EINVAL;
4013 }
4014}
4015
4016#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
4017
4018static int bpf_prog_test_run(const union bpf_attr *attr,
4019 union bpf_attr __user *uattr)
4020{
4021 struct bpf_prog *prog;
4022 int ret = -ENOTSUPP;
4023
4024 if (CHECK_ATTR(BPF_PROG_TEST_RUN))
4025 return -EINVAL;
4026
4027 if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
4028 (!attr->test.ctx_size_in && attr->test.ctx_in))
4029 return -EINVAL;
4030
4031 if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
4032 (!attr->test.ctx_size_out && attr->test.ctx_out))
4033 return -EINVAL;
4034
4035	prog = bpf_prog_get(attr->test.prog_fd);
4036	if (IS_ERR(prog))
4037		return PTR_ERR(prog);
4038
4039 if (prog->aux->ops->test_run)
4040 ret = prog->aux->ops->test_run(prog, attr, uattr);
4041
4042 bpf_prog_put(prog);
4043 return ret;
4044}
4045
4046#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
4047
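/*
 * Shared implementation of BPF_{PROG,MAP,BTF,LINK}_GET_NEXT_ID: looks up
 * the first ID greater than attr->start_id in the given IDR and reports
 * it back through uattr->next_id.
 */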
4048static int bpf_obj_get_next_id(const union bpf_attr *attr,
4049 union bpf_attr __user *uattr,
4050 struct idr *idr,
4051 spinlock_t *lock)
4052{
4053 u32 next_id = attr->start_id;
4054 int err = 0;
4055
4056 if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
4057 return -EINVAL;
4058
4059 if (!capable(CAP_SYS_ADMIN))
4060 return -EPERM;
4061
4062 next_id++;
4063 spin_lock_bh(lock);
4064	if (!idr_get_next(idr, &next_id))
4065 err = -ENOENT;
4066 spin_unlock_bh(lock);
4067
4068 if (!err)
4069 err = put_user(next_id, &uattr->next_id);
4070
4071 return err;
4072}
4073
4074struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
4075{
4076 struct bpf_map *map;
4077
4078	spin_lock_bh(&map_idr_lock);
4079again:
4080	map = idr_get_next(&map_idr, id);
4081	if (map) {
4082		map = __bpf_map_inc_not_zero(map, false);
4083		if (IS_ERR(map)) {
4084			(*id)++;
4085			goto again;
4086		}
4087	}
4088	spin_unlock_bh(&map_idr_lock);
4089
4090 return map;
4091}
4092
4093struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
4094{
4095 struct bpf_prog *prog;
4096
4097	spin_lock_bh(&prog_idr_lock);
4098again:
4099	prog = idr_get_next(&prog_idr, id);
4100	if (prog) {
4101		prog = bpf_prog_inc_not_zero(prog);
4102		if (IS_ERR(prog)) {
4103			(*id)++;
4104			goto again;
4105		}
4106	}
4107	spin_unlock_bh(&prog_idr_lock);
4108
4109 return prog;
4110}
4111
4112#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
4113
4114struct bpf_prog *bpf_prog_by_id(u32 id)
4115{
4116 struct bpf_prog *prog;
4117
4118	if (!id)
4119		return ERR_PTR(-ENOENT);
4120
4121	spin_lock_bh(&prog_idr_lock);
4122	prog = idr_find(&prog_idr, id);
4123	if (prog)
4124		prog = bpf_prog_inc_not_zero(prog);
4125	else
4126		prog = ERR_PTR(-ENOENT);
4127	spin_unlock_bh(&prog_idr_lock);
4128 return prog;
4129}
4130
4131static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
4132{
4133 struct bpf_prog *prog;
4134 u32 id = attr->prog_id;
4135 int fd;
4136
4137 if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
4138 return -EINVAL;
4139
4140 if (!capable(CAP_SYS_ADMIN))
4141 return -EPERM;
4142
4143	prog = bpf_prog_by_id(id);
4144	if (IS_ERR(prog))
4145		return PTR_ERR(prog);
4146
4147 fd = bpf_prog_new_fd(prog);
4148 if (fd < 0)
4149 bpf_prog_put(prog);
4150
4151 return fd;
4152}
4153
4154#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
4155
4156static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
4157{
4158 struct bpf_map *map;
4159 u32 id = attr->map_id;
4160 int f_flags;
4161 int fd;
4162
4163 if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
4164 attr->open_flags & ~BPF_OBJ_FLAG_MASK)
4165 return -EINVAL;
4166
4167 if (!capable(CAP_SYS_ADMIN))
4168 return -EPERM;
4169
4170	f_flags = bpf_get_file_flag(attr->open_flags);
4171	if (f_flags < 0)
4172		return f_flags;
4173
4174	spin_lock_bh(&map_idr_lock);
4175	map = idr_find(&map_idr, id);
4176	if (map)
4177		map = __bpf_map_inc_not_zero(map, true);
4178	else
4179		map = ERR_PTR(-ENOENT);
4180	spin_unlock_bh(&map_idr_lock);
4181
4182	if (IS_ERR(map))
4183		return PTR_ERR(map);
4184
4185	fd = bpf_map_new_fd(map, f_flags);
4186 if (fd < 0)
4187 bpf_map_put_with_uref(map);
4188
4189 return fd;
4190}
4191
4192static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
4193 unsigned long addr, u32 *off,
4194 u32 *type)
4195{
4196 const struct bpf_map *map;
4197 int i;
4198
4199 mutex_lock(&prog->aux->used_maps_mutex);
4200 for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
4201 map = prog->aux->used_maps[i];
4202 if (map == (void *)addr) {
4203 *type = BPF_PSEUDO_MAP_FD;
4204 goto out;
4205 }
4206 if (!map->ops->map_direct_value_meta)
4207 continue;
4208 if (!map->ops->map_direct_value_meta(map, addr, off)) {
4209 *type = BPF_PSEUDO_MAP_VALUE;
4210 goto out;
4211 }
4212 }
4213 map = NULL;
4214
4215out:
4216	mutex_unlock(&prog->aux->used_maps_mutex);
4217 return map;
4218}
4219
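/*
 * Produce a sanitized copy of the program's instructions for dumping to
 * user space: rewrites internal opcodes (tail calls, BPF_CALL_ARGS,
 * BPF_PROBE_MEM loads) back to their user-visible forms, clears call
 * immediates when raw dumping is not allowed, and converts map pointer
 * immediates into map IDs. The caller must kfree() the returned buffer.
 */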
4220static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
4221 const struct cred *f_cred)
4222{
4223 const struct bpf_map *map;
4224 struct bpf_insn *insns;
4225 u32 off, type;
4226 u64 imm;
4227 u8 code;
4228 int i;
4229
4230	insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
4231			GFP_USER);
4232 if (!insns)
4233 return insns;
4234
4235 for (i = 0; i < prog->len; i++) {
4236 code = insns[i].code;
4237
4238 if (code == (BPF_JMP | BPF_TAIL_CALL)) {
4239 insns[i].code = BPF_JMP | BPF_CALL;
4240 insns[i].imm = BPF_FUNC_tail_call;
4241 /* fall-through */
4242 }
4243 if (code == (BPF_JMP | BPF_CALL) ||
4244 code == (BPF_JMP | BPF_CALL_ARGS)) {
4245 if (code == (BPF_JMP | BPF_CALL_ARGS))
4246 insns[i].code = BPF_JMP | BPF_CALL;
4247			if (!bpf_dump_raw_ok(f_cred))
4248 insns[i].imm = 0;
4249 continue;
4250 }
4251 if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
4252 insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
4253 continue;
4254 }
4255
4256 if (code != (BPF_LD | BPF_IMM | BPF_DW))
4257 continue;
4258
4259 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
4260		map = bpf_map_from_imm(prog, imm, &off, &type);
4261 if (map) {
4262 insns[i].src_reg = type;
4263 insns[i].imm = map->id;
4264 insns[i + 1].imm = off;
4265 continue;
4266 }
4267 }
4268
4269 return insns;
4270}
4271
4272static int set_info_rec_size(struct bpf_prog_info *info)
4273{
4274 /*
4275 * Ensure info.*_rec_size is the same as kernel expected size
4276 *
4277 * or
4278 *
4279 * Only allow zero *_rec_size if both _rec_size and _cnt are
4280 * zero. In this case, the kernel will set the expected
4281 * _rec_size back to the info.
4282 */
4283
4284 if ((info->nr_func_info || info->func_info_rec_size) &&
4285 info->func_info_rec_size != sizeof(struct bpf_func_info))
4286 return -EINVAL;
4287
4288 if ((info->nr_line_info || info->line_info_rec_size) &&
4289 info->line_info_rec_size != sizeof(struct bpf_line_info))
4290 return -EINVAL;
4291
4292 if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
4293 info->jited_line_info_rec_size != sizeof(__u64))
4294 return -EINVAL;
4295
4296 info->func_info_rec_size = sizeof(struct bpf_func_info);
4297 info->line_info_rec_size = sizeof(struct bpf_line_info);
4298 info->jited_line_info_rec_size = sizeof(__u64);
4299
4300 return 0;
4301}
4302
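/*
 * Fill struct bpf_prog_info for BPF_OBJ_GET_INFO_BY_FD on a program fd.
 * Sensitive fields (xlated/jited instructions, ksyms, jited line info)
 * are only reported to callers that are bpf_capable(), and raw kernel
 * addresses additionally require bpf_dump_raw_ok() on the file's creds.
 */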
4303static int bpf_prog_get_info_by_fd(struct file *file,
4304 struct bpf_prog *prog,
4305 const union bpf_attr *attr,
4306 union bpf_attr __user *uattr)
4307{
4308 struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4309 struct btf *attach_btf = bpf_prog_get_target_btf(prog);
4310 struct bpf_prog_info info;
4311 u32 info_len = attr->info.info_len;
4312 struct bpf_prog_kstats stats;
4313 char __user *uinsns;
4314 u32 ulen;
4315 int err;
4316
4317	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4318	if (err)
4319		return err;
4320	info_len = min_t(u32, sizeof(info), info_len);
4321
4322	memset(&info, 0, sizeof(info));
4323	if (copy_from_user(&info, uinfo, info_len))
4324		return -EFAULT;
4325
4326	info.type = prog->type;
4327	info.id = prog->aux->id;
4328	info.load_time = prog->aux->load_time;
4329	info.created_by_uid = from_kuid_munged(current_user_ns(),
4330					       prog->aux->user->uid);
4331 info.gpl_compatible = prog->gpl_compatible;
4332
4333 memcpy(info.tag, prog->tag, sizeof(prog->tag));
4334 memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
4335
4336 mutex_lock(&prog->aux->used_maps_mutex);
4337 ulen = info.nr_map_ids;
4338 info.nr_map_ids = prog->aux->used_map_cnt;
4339 ulen = min_t(u32, info.nr_map_ids, ulen);
4340 if (ulen) {
4341 u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
4342 u32 i;
4343
4344 for (i = 0; i < ulen; i++)
4345 if (put_user(prog->aux->used_maps[i]->id,
4346 &user_map_ids[i])) {
4347				mutex_unlock(&prog->aux->used_maps_mutex);
4348 return -EFAULT;
4349 }
4350 }
4351	mutex_unlock(&prog->aux->used_maps_mutex);
4352
4353 err = set_info_rec_size(&info);
4354 if (err)
4355 return err;
4356
4357	bpf_prog_get_stats(prog, &stats);
4358 info.run_time_ns = stats.nsecs;
4359 info.run_cnt = stats.cnt;
4360 info.recursion_misses = stats.misses;
4361
4362 info.verified_insns = prog->aux->verified_insns;
4363
4364 if (!bpf_capable()) {
4365 info.jited_prog_len = 0;
4366 info.xlated_prog_len = 0;
4367 info.nr_jited_ksyms = 0;
4368 info.nr_jited_func_lens = 0;
4369 info.nr_func_info = 0;
4370 info.nr_line_info = 0;
4371 info.nr_jited_line_info = 0;
4372 goto done;
4373 }
4374
4375 ulen = info.xlated_prog_len;
4376 info.xlated_prog_len = bpf_prog_insn_size(prog);
4377 if (info.xlated_prog_len && ulen) {
4378 struct bpf_insn *insns_sanitized;
4379 bool fault;
4380
4381		if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
4382			info.xlated_prog_insns = 0;
4383			goto done;
4384		}
4385		insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
4386		if (!insns_sanitized)
4387			return -ENOMEM;
4388		uinsns = u64_to_user_ptr(info.xlated_prog_insns);
4389		ulen = min_t(u32, info.xlated_prog_len, ulen);
4390		fault = copy_to_user(uinsns, insns_sanitized, ulen);
4391		kfree(insns_sanitized);
4392		if (fault)
4393			return -EFAULT;
4394	}
4395
4396	if (bpf_prog_is_offloaded(prog->aux)) {
4397		err = bpf_prog_offload_info_fill(&info, prog);
4398 if (err)
4399 return err;
4400 goto done;
4401 }
4402
4403 /* NOTE: the following code is supposed to be skipped for offload.
4404 * bpf_prog_offload_info_fill() is the place to fill similar fields
4405 * for offload.
4406 */
4407 ulen = info.jited_prog_len;
4408 if (prog->aux->func_cnt) {
4409 u32 i;
4410
4411 info.jited_prog_len = 0;
4412 for (i = 0; i < prog->aux->func_cnt; i++)
4413 info.jited_prog_len += prog->aux->func[i]->jited_len;
4414 } else {
4415 info.jited_prog_len = prog->jited_len;
4416 }
4417
4418 if (info.jited_prog_len && ulen) {
4419		if (bpf_dump_raw_ok(file->f_cred)) {
4420 uinsns = u64_to_user_ptr(info.jited_prog_insns);
4421 ulen = min_t(u32, info.jited_prog_len, ulen);
4422
4423 /* for multi-function programs, copy the JITed
4424 * instructions for all the functions
4425 */
4426 if (prog->aux->func_cnt) {
4427 u32 len, free, i;
4428 u8 *img;
4429
4430 free = ulen;
4431 for (i = 0; i < prog->aux->func_cnt; i++) {
4432 len = prog->aux->func[i]->jited_len;
4433 len = min_t(u32, len, free);
4434 img = (u8 *) prog->aux->func[i]->bpf_func;
4435					if (copy_to_user(uinsns, img, len))
4436 return -EFAULT;
4437 uinsns += len;
4438 free -= len;
4439 if (!free)
4440 break;
4441 }
4442 } else {
4443				if (copy_to_user(uinsns, prog->bpf_func, ulen))
4444 return -EFAULT;
4445 }
4446 } else {
4447 info.jited_prog_insns = 0;
4448 }
4449 }
4450
4451 ulen = info.nr_jited_ksyms;
4452 info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
4453 if (ulen) {
4454		if (bpf_dump_raw_ok(file->f_cred)) {
4455 unsigned long ksym_addr;
4456 u64 __user *user_ksyms;
4457 u32 i;
4458
4459 /* copy the address of the kernel symbol
4460 * corresponding to each function
4461 */
4462 ulen = min_t(u32, info.nr_jited_ksyms, ulen);
4463 user_ksyms = u64_to_user_ptr(info.jited_ksyms);
4464 if (prog->aux->func_cnt) {
4465 for (i = 0; i < ulen; i++) {
4466 ksym_addr = (unsigned long)
4467 prog->aux->func[i]->bpf_func;
4468 if (put_user((u64) ksym_addr,
4469 &user_ksyms[i]))
4470 return -EFAULT;
4471 }
4472 } else {
4473 ksym_addr = (unsigned long) prog->bpf_func;
4474 if (put_user((u64) ksym_addr, &user_ksyms[0]))
4475 return -EFAULT;
4476 }
4477 } else {
4478 info.jited_ksyms = 0;
4479 }
4480 }
4481
4482 ulen = info.nr_jited_func_lens;
4483 info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
4484 if (ulen) {
4485		if (bpf_dump_raw_ok(file->f_cred)) {
4486 u32 __user *user_lens;
4487 u32 func_len, i;
4488
4489 /* copy the JITed image lengths for each function */
4490 ulen = min_t(u32, info.nr_jited_func_lens, ulen);
4491 user_lens = u64_to_user_ptr(info.jited_func_lens);
4492 if (prog->aux->func_cnt) {
4493 for (i = 0; i < ulen; i++) {
4494 func_len =
4495 prog->aux->func[i]->jited_len;
4496 if (put_user(func_len, &user_lens[i]))
4497 return -EFAULT;
4498 }
4499 } else {
4500 func_len = prog->jited_len;
4501 if (put_user(func_len, &user_lens[0]))
4502 return -EFAULT;
4503 }
4504 } else {
4505 info.jited_func_lens = 0;
4506 }
4507 }
4508
4509	if (prog->aux->btf)
4510		info.btf_id = btf_obj_id(prog->aux->btf);
4511	info.attach_btf_id = prog->aux->attach_btf_id;
4512	if (attach_btf)
4513		info.attach_btf_obj_id = btf_obj_id(attach_btf);
4514
4515 ulen = info.nr_func_info;
4516 info.nr_func_info = prog->aux->func_info_cnt;
4517 if (info.nr_func_info && ulen) {
4518 char __user *user_finfo;
4519
4520 user_finfo = u64_to_user_ptr(info.func_info);
4521 ulen = min_t(u32, info.nr_func_info, ulen);
4522		if (copy_to_user(user_finfo, prog->aux->func_info,
4523				 info.func_info_rec_size * ulen))
4524 return -EFAULT;
4525 }
4526
4527 ulen = info.nr_line_info;
4528 info.nr_line_info = prog->aux->nr_linfo;
4529 if (info.nr_line_info && ulen) {
4530 __u8 __user *user_linfo;
4531
4532 user_linfo = u64_to_user_ptr(info.line_info);
4533 ulen = min_t(u32, info.nr_line_info, ulen);
4534		if (copy_to_user(user_linfo, prog->aux->linfo,
4535				 info.line_info_rec_size * ulen))
4536 return -EFAULT;
4537 }
4538
4539 ulen = info.nr_jited_line_info;
4540 if (prog->aux->jited_linfo)
4541 info.nr_jited_line_info = prog->aux->nr_linfo;
4542 else
4543 info.nr_jited_line_info = 0;
4544 if (info.nr_jited_line_info && ulen) {
4545		if (bpf_dump_raw_ok(file->f_cred)) {
4546 unsigned long line_addr;
4547 __u64 __user *user_linfo;
4548 u32 i;
4549
4550 user_linfo = u64_to_user_ptr(info.jited_line_info);
4551 ulen = min_t(u32, info.nr_jited_line_info, ulen);
4552 for (i = 0; i < ulen; i++) {
4553 line_addr = (unsigned long)prog->aux->jited_linfo[i];
4554 if (put_user((__u64)line_addr, &user_linfo[i]))
4555 return -EFAULT;
4556 }
4557 } else {
4558 info.jited_line_info = 0;
4559 }
4560 }
4561
4562 ulen = info.nr_prog_tags;
4563 info.nr_prog_tags = prog->aux->func_cnt ? : 1;
4564 if (ulen) {
4565 __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
4566 u32 i;
4567
4568 user_prog_tags = u64_to_user_ptr(info.prog_tags);
4569 ulen = min_t(u32, info.nr_prog_tags, ulen);
4570 if (prog->aux->func_cnt) {
4571 for (i = 0; i < ulen; i++) {
4572				if (copy_to_user(user_prog_tags[i],
4573						 prog->aux->func[i]->tag,
4574						 BPF_TAG_SIZE))
4575 return -EFAULT;
4576 }
4577 } else {
4578			if (copy_to_user(user_prog_tags[0],
4579					 prog->tag, BPF_TAG_SIZE))
4580 return -EFAULT;
4581 }
4582 }
4583
4584done:
4585	if (copy_to_user(uinfo, &info, info_len) ||
4586 put_user(info_len, &uattr->info.info_len))
4587 return -EFAULT;
4588
4589 return 0;
4590}
4591
4592static int bpf_map_get_info_by_fd(struct file *file,
4593 struct bpf_map *map,
4594 const union bpf_attr *attr,
4595 union bpf_attr __user *uattr)
4596{
4597 struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4598 struct bpf_map_info info;
4599 u32 info_len = attr->info.info_len;
4600 int err;
4601
4602	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4603 if (err)
4604 return err;
4605 info_len = min_t(u32, sizeof(info), info_len);
4606
4607 memset(&info, 0, sizeof(info));
4608 info.type = map->map_type;
4609 info.id = map->id;
4610 info.key_size = map->key_size;
4611 info.value_size = map->value_size;
4612 info.max_entries = map->max_entries;
4613 info.map_flags = map->map_flags;
4614 info.map_extra = map->map_extra;
4615 memcpy(info.name, map->name, sizeof(map->name));
4616
4617 if (map->btf) {
4618		info.btf_id = btf_obj_id(map->btf);
4619 info.btf_key_type_id = map->btf_key_type_id;
4620 info.btf_value_type_id = map->btf_value_type_id;
4621 }
4622 info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
4623
4624 if (bpf_map_is_offloaded(map)) {
4625		err = bpf_map_offload_info_fill(&info, map);
4626 if (err)
4627 return err;
4628 }
4629
4630	if (copy_to_user(uinfo, &info, info_len) ||
4631 put_user(info_len, &uattr->info.info_len))
4632 return -EFAULT;
4633
4634 return 0;
4635}
4636
4637static int bpf_btf_get_info_by_fd(struct file *file,
4638 struct btf *btf,
4639 const union bpf_attr *attr,
4640 union bpf_attr __user *uattr)
4641{
4642 struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4643 u32 info_len = attr->info.info_len;
4644 int err;
4645
4646	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
4647 if (err)
4648 return err;
4649
4650 return btf_get_info_by_fd(btf, attr, uattr);
4651}
4652
4653static int bpf_link_get_info_by_fd(struct file *file,
4654 struct bpf_link *link,
4655 const union bpf_attr *attr,
4656 union bpf_attr __user *uattr)
4657{
4658 struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4659 struct bpf_link_info info;
4660 u32 info_len = attr->info.info_len;
4661 int err;
4662
4663	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
4664	if (err)
4665		return err;
4666	info_len = min_t(u32, sizeof(info), info_len);
4667
4668	memset(&info, 0, sizeof(info));
4669	if (copy_from_user(&info, uinfo, info_len))
4670 return -EFAULT;
4671
4672 info.type = link->type;
4673 info.id = link->id;
4674 if (link->prog)
4675 info.prog_id = link->prog->aux->id;
4676
4677 if (link->ops->fill_link_info) {
4678 err = link->ops->fill_link_info(link, &info);
4679 if (err)
4680 return err;
4681 }
4682
4683	if (copy_to_user(uinfo, &info, info_len) ||
4684 put_user(info_len, &uattr->info.info_len))
4685 return -EFAULT;
4686
4687 return 0;
4688}
4689
4690
4691#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
4692
4693static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
4694 union bpf_attr __user *uattr)
4695{
4696 int ufd = attr->info.bpf_fd;
4697 struct fd f;
4698 int err;
4699
4700 if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
4701 return -EINVAL;
4702
4703	f = fdget(ufd);
4704	if (!f.file)
4705		return -EBADFD;
4706
4707	if (f.file->f_op == &bpf_prog_fops)
4708		err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
4709					      uattr);
4710	else if (f.file->f_op == &bpf_map_fops)
4711		err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
4712					     uattr);
4713	else if (f.file->f_op == &btf_fops)
4714		err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
4715	else if (f.file->f_op == &bpf_link_fops)
4716		err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
4717					      attr, uattr);
4718	else
4719		err = -EINVAL;
4720
4721	fdput(f);
4722 return err;
4723}
4724
4725#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
4726
4727static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
4728{
4729 if (CHECK_ATTR(BPF_BTF_LOAD))
4730 return -EINVAL;
4731
4732 if (!bpf_capable())
4733 return -EPERM;
4734
4735	return btf_new_fd(attr, uattr, uattr_size);
4736}
4737
4738#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
4739
4740static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
4741{
4742 if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
4743 return -EINVAL;
4744
4745 if (!capable(CAP_SYS_ADMIN))
4746 return -EPERM;
4747
4748	return btf_get_fd_by_id(attr->btf_id);
4749}
4750
4751static int bpf_task_fd_query_copy(const union bpf_attr *attr,
4752 union bpf_attr __user *uattr,
4753 u32 prog_id, u32 fd_type,
4754 const char *buf, u64 probe_offset,
4755 u64 probe_addr)
4756{
4757 char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
4758 u32 len = buf ? strlen(buf) : 0, input_len;
4759 int err = 0;
4760
4761 if (put_user(len, &uattr->task_fd_query.buf_len))
4762 return -EFAULT;
4763 input_len = attr->task_fd_query.buf_len;
4764 if (input_len && ubuf) {
4765 if (!len) {
4766 /* nothing to copy, just make ubuf NULL terminated */
4767 char zero = '\0';
4768
4769 if (put_user(zero, ubuf))
4770 return -EFAULT;
4771 } else if (input_len >= len + 1) {
4772 /* ubuf can hold the string with NULL terminator */
4773			if (copy_to_user(ubuf, buf, len + 1))
4774 return -EFAULT;
4775 } else {
4776 /* ubuf cannot hold the string with NULL terminator,
4777 * do a partial copy with NULL terminator.
4778 */
4779 char zero = '\0';
4780
4781 err = -ENOSPC;
4782			if (copy_to_user(ubuf, buf, input_len - 1))
4783 return -EFAULT;
4784 if (put_user(zero, ubuf + input_len - 1))
4785 return -EFAULT;
4786 }
4787 }
4788
4789 if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
4790 put_user(fd_type, &uattr->task_fd_query.fd_type) ||
4791 put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
4792 put_user(probe_addr, &uattr->task_fd_query.probe_addr))
4793 return -EFAULT;
4794
4795 return err;
4796}
4797
4798#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
4799
4800static int bpf_task_fd_query(const union bpf_attr *attr,
4801 union bpf_attr __user *uattr)
4802{
4803 pid_t pid = attr->task_fd_query.pid;
4804 u32 fd = attr->task_fd_query.fd;
4805 const struct perf_event *event;
4806 struct task_struct *task;
4807 struct file *file;
4808 int err;
4809
4810 if (CHECK_ATTR(BPF_TASK_FD_QUERY))
4811 return -EINVAL;
4812
4813 if (!capable(CAP_SYS_ADMIN))
4814 return -EPERM;
4815
4816 if (attr->task_fd_query.flags != 0)
4817 return -EINVAL;
4818
4819 rcu_read_lock();
4820	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
4821 rcu_read_unlock();
4822 if (!task)
4823 return -ENOENT;
4824
4825 err = 0;
4826 file = fget_task(task, fd);
4827	put_task_struct(task);
4828 if (!file)
4829 return -EBADF;
4830
4831 if (file->f_op == &bpf_link_fops) {
4832 struct bpf_link *link = file->private_data;
4833
4834 if (link->ops == &bpf_raw_tp_link_lops) {
4835 struct bpf_raw_tp_link *raw_tp =
4836 container_of(link, struct bpf_raw_tp_link, link);
4837 struct bpf_raw_event_map *btp = raw_tp->btp;
4838
4839			err = bpf_task_fd_query_copy(attr, uattr,
4840						     raw_tp->link.prog->aux->id,
4841						     BPF_FD_TYPE_RAW_TRACEPOINT,
4842						     btp->tp->name, 0, 0);
4843 goto put_file;
4844 }
4845 goto out_not_supp;
4846 }
4847
4848 event = perf_get_event(file);
4849	if (!IS_ERR(event)) {
4850		u64 probe_offset, probe_addr;
4851		u32 prog_id, fd_type;
4852		const char *buf;
4853
4854		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
4855					      &buf, &probe_offset,
4856					      &probe_addr, NULL);
4857 if (!err)
4858 err = bpf_task_fd_query_copy(attr, uattr, prog_id,
4859 fd_type, buf,
4860 probe_offset,
4861 probe_addr);
4862 goto put_file;
4863 }
4864
4865out_not_supp:
4866 err = -ENOTSUPP;
4867put_file:
4868 fput(file);
4869 return err;
4870}
4871
4872#define BPF_MAP_BATCH_LAST_FIELD batch.flags
4873
4874#define BPF_DO_BATCH(fn, ...) \
4875 do { \
4876 if (!fn) { \
4877 err = -ENOTSUPP; \
4878 goto err_put; \
4879 } \
4880 err = fn(__VA_ARGS__); \
4881 } while (0)
4882
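/*
 * Common entry point for the map batch commands (lookup, lookup-and-delete,
 * update, delete). Checks FMODE_CAN_READ/FMODE_CAN_WRITE permissions on the
 * map fd and dispatches to the map's batch operation if implemented.
 */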
4883static int bpf_map_do_batch(const union bpf_attr *attr,
4884 union bpf_attr __user *uattr,
4885 int cmd)
4886{
4887 bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
4888 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
4889 bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
4890 struct bpf_map *map;
4891 int err, ufd;
4892 struct fd f;
4893
4894 if (CHECK_ATTR(BPF_MAP_BATCH))
4895 return -EINVAL;
4896
4897 ufd = attr->batch.map_fd;
4898	f = fdget(ufd);
4899	map = __bpf_map_get(f);
4900	if (IS_ERR(map))
4901		return PTR_ERR(map);
4902 if (has_write)
4903 bpf_map_write_active_inc(map);
4904 if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
4905 err = -EPERM;
4906 goto err_put;
4907 }
4908 if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
4909 err = -EPERM;
4910 goto err_put;
4911 }
4912
4913 if (cmd == BPF_MAP_LOOKUP_BATCH)
4914 BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
4915 else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
4916 BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
4917 else if (cmd == BPF_MAP_UPDATE_BATCH)
4918 BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
4919 else
4920 BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
4921err_put:
4922 if (has_write)
4923 bpf_map_write_active_dec(map);
4924	fdput(f);
4925 return err;
4926}
4927
4928#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
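/*
 * Handler for the BPF_LINK_CREATE command: takes a reference on the
 * program fd, validates the requested attach type against the program
 * type and creates the corresponding link (cgroup, tracing, netns, xdp,
 * tcx, netkit, netfilter, perf event, kprobe/uprobe multi, ...).
 */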
4929static int link_create(union bpf_attr *attr, bpfptr_t uattr)
4930{
4931 struct bpf_prog *prog;
4932 int ret;
4933
4934 if (CHECK_ATTR(BPF_LINK_CREATE))
4935 return -EINVAL;
4936
4937 if (attr->link_create.attach_type == BPF_STRUCT_OPS)
4938 return bpf_struct_ops_link_create(attr);
4939
4940	prog = bpf_prog_get(attr->link_create.prog_fd);
4941	if (IS_ERR(prog))
4942		return PTR_ERR(prog);
4943
4944	ret = bpf_prog_attach_check_attach_type(prog,
4945						attr->link_create.attach_type);
4946 if (ret)
4947 goto out;
4948
4949 switch (prog->type) {
4950 case BPF_PROG_TYPE_CGROUP_SKB:
4951 case BPF_PROG_TYPE_CGROUP_SOCK:
4952 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4953 case BPF_PROG_TYPE_SOCK_OPS:
4954 case BPF_PROG_TYPE_CGROUP_DEVICE:
4955 case BPF_PROG_TYPE_CGROUP_SYSCTL:
4956 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4957 ret = cgroup_bpf_link_attach(attr, prog);
4958 break;
4959 case BPF_PROG_TYPE_EXT:
4960		ret = bpf_tracing_prog_attach(prog,
4961					      attr->link_create.target_fd,
4962					      attr->link_create.target_btf_id,
4963					      attr->link_create.tracing.cookie);
4964 break;
4965 case BPF_PROG_TYPE_LSM:
4966 case BPF_PROG_TYPE_TRACING:
4967 if (attr->link_create.attach_type != prog->expected_attach_type) {
4968 ret = -EINVAL;
4969 goto out;
4970 }
4971 if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
4972 ret = bpf_raw_tp_link_attach(prog, NULL);
4973 else if (prog->expected_attach_type == BPF_TRACE_ITER)
4974 ret = bpf_iter_link_attach(attr, uattr, prog);
4975 else if (prog->expected_attach_type == BPF_LSM_CGROUP)
4976 ret = cgroup_bpf_link_attach(attr, prog);
4977 else
4978			ret = bpf_tracing_prog_attach(prog,
4979						      attr->link_create.target_fd,
4980						      attr->link_create.target_btf_id,
4981						      attr->link_create.tracing.cookie);
4982 break;
4983 case BPF_PROG_TYPE_FLOW_DISSECTOR:
4984 case BPF_PROG_TYPE_SK_LOOKUP:
4985 ret = netns_bpf_link_create(attr, prog);
4986 break;
4987#ifdef CONFIG_NET
4988 case BPF_PROG_TYPE_XDP:
4989 ret = bpf_xdp_link_attach(attr, prog);
4990 break;
4991 case BPF_PROG_TYPE_SCHED_CLS:
4992 if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
4993 attr->link_create.attach_type == BPF_TCX_EGRESS)
4994 ret = tcx_link_attach(attr, prog);
4995 else
4996 ret = netkit_link_attach(attr, prog);
4997 break;
4998 case BPF_PROG_TYPE_NETFILTER:
4999 ret = bpf_nf_link_attach(attr, prog);
5000 break;
5001#endif
5002 case BPF_PROG_TYPE_PERF_EVENT:
5003 case BPF_PROG_TYPE_TRACEPOINT:
5004 ret = bpf_perf_link_attach(attr, prog);
5005 break;
5006 case BPF_PROG_TYPE_KPROBE:
5007 if (attr->link_create.attach_type == BPF_PERF_EVENT)
5008 ret = bpf_perf_link_attach(attr, prog);
5009 else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
5010 ret = bpf_kprobe_multi_link_attach(attr, prog);
5011 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
5012 ret = bpf_uprobe_multi_link_attach(attr, prog);
5013 break;
5014 default:
5015 ret = -EINVAL;
5016 }
5017
5018out:
5019 if (ret < 0)
5020 bpf_prog_put(prog);
5021 return ret;
5022}
5023
5024static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
5025{
5026 struct bpf_map *new_map, *old_map = NULL;
5027 int ret;
5028
5029	new_map = bpf_map_get(attr->link_update.new_map_fd);
5030	if (IS_ERR(new_map))
5031		return PTR_ERR(new_map);
5032
5033	if (attr->link_update.flags & BPF_F_REPLACE) {
5034		old_map = bpf_map_get(attr->link_update.old_map_fd);
5035		if (IS_ERR(old_map)) {
5036			ret = PTR_ERR(old_map);
5037 goto out_put;
5038 }
5039 } else if (attr->link_update.old_map_fd) {
5040 ret = -EINVAL;
5041 goto out_put;
5042 }
5043
5044 ret = link->ops->update_map(link, new_map, old_map);
5045
5046 if (old_map)
5047 bpf_map_put(old_map);
5048out_put:
5049 bpf_map_put(new_map);
5050 return ret;
5051}
5052
5053#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
5054
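/*
 * Handler for the BPF_LINK_UPDATE command: replaces the program (or, for
 * links that implement update_map, the map) backing an existing link.
 * With BPF_F_REPLACE the old object fd names the entry the caller expects
 * to be currently attached; the matching is done by the link's ops.
 */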
5055static int link_update(union bpf_attr *attr)
5056{
5057 struct bpf_prog *old_prog = NULL, *new_prog;
5058 struct bpf_link *link;
5059 u32 flags;
5060 int ret;
5061
5062 if (CHECK_ATTR(BPF_LINK_UPDATE))
5063 return -EINVAL;
5064
5065 flags = attr->link_update.flags;
5066 if (flags & ~BPF_F_REPLACE)
5067 return -EINVAL;
5068
5069 link = bpf_link_get_from_fd(attr->link_update.link_fd);
5070	if (IS_ERR(link))
5071		return PTR_ERR(link);
5072
5073 if (link->ops->update_map) {
5074 ret = link_update_map(link, attr);
5075 goto out_put_link;
5076 }
5077
5078	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
5079	if (IS_ERR(new_prog)) {
5080		ret = PTR_ERR(new_prog);
5081		goto out_put_link;
5082	}
5083
5084	if (flags & BPF_F_REPLACE) {
5085		old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
5086		if (IS_ERR(old_prog)) {
5087			ret = PTR_ERR(old_prog);
5088 old_prog = NULL;
5089 goto out_put_progs;
5090 }
5091 } else if (attr->link_update.old_prog_fd) {
5092 ret = -EINVAL;
5093 goto out_put_progs;
5094 }
5095
5096 if (link->ops->update_prog)
5097 ret = link->ops->update_prog(link, new_prog, old_prog);
5098 else
5099 ret = -EINVAL;
5100
5101out_put_progs:
5102 if (old_prog)
5103 bpf_prog_put(old_prog);
5104 if (ret)
5105 bpf_prog_put(new_prog);
5106out_put_link:
5107 bpf_link_put_direct(link);
5108 return ret;
5109}
5110
5111#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
5112
5113static int link_detach(union bpf_attr *attr)
5114{
5115 struct bpf_link *link;
5116 int ret;
5117
5118 if (CHECK_ATTR(BPF_LINK_DETACH))
5119 return -EINVAL;
5120
5121 link = bpf_link_get_from_fd(attr->link_detach.link_fd);
5122	if (IS_ERR(link))
5123		return PTR_ERR(link);
5124
5125 if (link->ops->detach)
5126 ret = link->ops->detach(link);
5127 else
5128 ret = -EOPNOTSUPP;
5129
5130 bpf_link_put_direct(link);
5131 return ret;
5132}
5133
5134static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
5135{
5136	return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
5137}
5138
5139struct bpf_link *bpf_link_by_id(u32 id)
5140{
5141 struct bpf_link *link;
5142
5143	if (!id)
5144		return ERR_PTR(-ENOENT);
5145
5146	spin_lock_bh(&link_idr_lock);
5147	/* before link is "settled", ID is 0, pretend it doesn't exist yet */
5148	link = idr_find(&link_idr, id);
5149	if (link) {
5150		if (link->id)
5151			link = bpf_link_inc_not_zero(link);
5152		else
5153			link = ERR_PTR(-EAGAIN);
5154	} else {
5155		link = ERR_PTR(-ENOENT);
5156	}
5157	spin_unlock_bh(&link_idr_lock);
5158 return link;
5159}
5160
5161struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
5162{
5163 struct bpf_link *link;
5164
5165	spin_lock_bh(&link_idr_lock);
5166again:
5167	link = idr_get_next(&link_idr, id);
5168	if (link) {
5169		link = bpf_link_inc_not_zero(link);
5170		if (IS_ERR(link)) {
5171			(*id)++;
5172			goto again;
5173		}
5174	}
5175	spin_unlock_bh(&link_idr_lock);
5176
5177 return link;
5178}
5179
5180#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
5181
5182static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
5183{
5184 struct bpf_link *link;
5185 u32 id = attr->link_id;
5186 int fd;
5187
5188 if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
5189 return -EINVAL;
5190
5191 if (!capable(CAP_SYS_ADMIN))
5192 return -EPERM;
5193
5194	link = bpf_link_by_id(id);
5195	if (IS_ERR(link))
5196		return PTR_ERR(link);
5197
5198 fd = bpf_link_new_fd(link);
5199 if (fd < 0)
5200 bpf_link_put_direct(link);
5201
5202 return fd;
5203}
5204
5205DEFINE_MUTEX(bpf_stats_enabled_mutex);
5206
5207static int bpf_stats_release(struct inode *inode, struct file *file)
5208{
5209 mutex_lock(&bpf_stats_enabled_mutex);
5210	static_key_slow_dec(&bpf_stats_enabled_key.key);
5211	mutex_unlock(&bpf_stats_enabled_mutex);
5212 return 0;
5213}
5214
5215static const struct file_operations bpf_stats_fops = {
5216 .release = bpf_stats_release,
5217};
5218
5219static int bpf_enable_runtime_stats(void)
5220{
5221 int fd;
5222
5223 mutex_lock(&bpf_stats_enabled_mutex);
5224
5225 /* Set a very high limit to avoid overflow */
5226	if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
5227		mutex_unlock(&bpf_stats_enabled_mutex);
5228		return -EBUSY;
5229	}
5230
5231	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
5232	if (fd >= 0)
5233		static_key_slow_inc(&bpf_stats_enabled_key.key);
5234
5235	mutex_unlock(&bpf_stats_enabled_mutex);
5236 return fd;
5237}
5238
5239#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
5240
5241static int bpf_enable_stats(union bpf_attr *attr)
5242{
5243
5244 if (CHECK_ATTR(BPF_ENABLE_STATS))
5245 return -EINVAL;
5246
5247 if (!capable(CAP_SYS_ADMIN))
5248 return -EPERM;
5249
5250 switch (attr->enable_stats.type) {
5251 case BPF_STATS_RUN_TIME:
5252 return bpf_enable_runtime_stats();
5253 default:
5254 break;
5255 }
5256 return -EINVAL;
5257}
5258
5259#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
5260
5261static int bpf_iter_create(union bpf_attr *attr)
5262{
5263 struct bpf_link *link;
5264 int err;
5265
5266 if (CHECK_ATTR(BPF_ITER_CREATE))
5267 return -EINVAL;
5268
5269 if (attr->iter_create.flags)
5270 return -EINVAL;
5271
5272 link = bpf_link_get_from_fd(attr->iter_create.link_fd);
5273	if (IS_ERR(link))
5274		return PTR_ERR(link);
5275
5276 err = bpf_iter_new_fd(link);
5277 bpf_link_put_direct(link);
5278
5279 return err;
5280}
5281
5282#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
5283
5284static int bpf_prog_bind_map(union bpf_attr *attr)
5285{
5286 struct bpf_prog *prog;
5287 struct bpf_map *map;
5288 struct bpf_map **used_maps_old, **used_maps_new;
5289 int i, ret = 0;
5290
5291 if (CHECK_ATTR(BPF_PROG_BIND_MAP))
5292 return -EINVAL;
5293
5294 if (attr->prog_bind_map.flags)
5295 return -EINVAL;
5296
5297	prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
5298	if (IS_ERR(prog))
5299		return PTR_ERR(prog);
5300
5301	map = bpf_map_get(attr->prog_bind_map.map_fd);
5302	if (IS_ERR(map)) {
5303		ret = PTR_ERR(map);
5304 goto out_prog_put;
5305 }
5306
5307 mutex_lock(&prog->aux->used_maps_mutex);
5308
5309 used_maps_old = prog->aux->used_maps;
5310
5311 for (i = 0; i < prog->aux->used_map_cnt; i++)
5312 if (used_maps_old[i] == map) {
5313 bpf_map_put(map);
5314 goto out_unlock;
5315 }
5316
5317	used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
5318				      sizeof(used_maps_new[0]),
5319				      GFP_KERNEL);
5320 if (!used_maps_new) {
5321 ret = -ENOMEM;
5322 goto out_unlock;
5323 }
5324
5325 memcpy(used_maps_new, used_maps_old,
5326 sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
5327 used_maps_new[prog->aux->used_map_cnt] = map;
5328
5329 prog->aux->used_map_cnt++;
5330 prog->aux->used_maps = used_maps_new;
5331
5332	kfree(used_maps_old);
5333
5334out_unlock:
5335	mutex_unlock(&prog->aux->used_maps_mutex);
5336
5337 if (ret)
5338 bpf_map_put(map);
5339out_prog_put:
5340 bpf_prog_put(prog);
5341 return ret;
5342}
5343
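/*
 * Top-level dispatcher shared by the bpf(2) syscall and BPF_PROG_TYPE_SYSCALL
 * programs: copies the (possibly truncated) bpf_attr from the caller,
 * verifying that any trailing bytes beyond what this kernel knows are zero,
 * consults the LSM hook and then dispatches on the command.
 */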
5344static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
5345{
5346 union bpf_attr attr;
5347 int err;
5348
5349	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
5350 if (err)
5351 return err;
5352 size = min_t(u32, size, sizeof(attr));
5353
5354 /* copy attributes from user space, may be less than sizeof(bpf_attr) */
5355 memset(&attr, 0, sizeof(attr));
5356	if (copy_from_bpfptr(&attr, uattr, size) != 0)
5357		return -EFAULT;
5358
5359	err = security_bpf(cmd, &attr, size);
5360 if (err < 0)
5361 return err;
5362
5363 switch (cmd) {
5364 case BPF_MAP_CREATE:
5365		err = map_create(&attr);
5366		break;
5367	case BPF_MAP_LOOKUP_ELEM:
5368		err = map_lookup_elem(&attr);
5369		break;
5370	case BPF_MAP_UPDATE_ELEM:
5371		err = map_update_elem(&attr, uattr);
5372		break;
5373	case BPF_MAP_DELETE_ELEM:
5374		err = map_delete_elem(&attr, uattr);
5375		break;
5376	case BPF_MAP_GET_NEXT_KEY:
5377		err = map_get_next_key(&attr);
5378		break;
5379	case BPF_MAP_FREEZE:
5380		err = map_freeze(&attr);
5381		break;
5382	case BPF_PROG_LOAD:
5383		err = bpf_prog_load(&attr, uattr, size);
5384		break;
5385	case BPF_OBJ_PIN:
5386		err = bpf_obj_pin(&attr);
5387		break;
5388	case BPF_OBJ_GET:
5389		err = bpf_obj_get(&attr);
5390		break;
5391	case BPF_PROG_ATTACH:
5392		err = bpf_prog_attach(&attr);
5393		break;
5394	case BPF_PROG_DETACH:
5395		err = bpf_prog_detach(&attr);
5396		break;
5397	case BPF_PROG_QUERY:
5398		err = bpf_prog_query(&attr, uattr.user);
5399		break;
5400	case BPF_PROG_TEST_RUN:
5401		err = bpf_prog_test_run(&attr, uattr.user);
5402		break;
5403	case BPF_PROG_GET_NEXT_ID:
5404		err = bpf_obj_get_next_id(&attr, uattr.user,
5405					  &prog_idr, &prog_idr_lock);
5406		break;
5407	case BPF_MAP_GET_NEXT_ID:
5408		err = bpf_obj_get_next_id(&attr, uattr.user,
5409					  &map_idr, &map_idr_lock);
5410		break;
5411	case BPF_BTF_GET_NEXT_ID:
5412		err = bpf_obj_get_next_id(&attr, uattr.user,
5413					  &btf_idr, &btf_idr_lock);
5414		break;
5415	case BPF_PROG_GET_FD_BY_ID:
5416		err = bpf_prog_get_fd_by_id(&attr);
5417		break;
5418	case BPF_MAP_GET_FD_BY_ID:
5419		err = bpf_map_get_fd_by_id(&attr);
5420		break;
5421	case BPF_OBJ_GET_INFO_BY_FD:
5422		err = bpf_obj_get_info_by_fd(&attr, uattr.user);
5423		break;
5424	case BPF_RAW_TRACEPOINT_OPEN:
5425		err = bpf_raw_tracepoint_open(&attr);
5426		break;
5427	case BPF_BTF_LOAD:
5428		err = bpf_btf_load(&attr, uattr, size);
5429		break;
5430	case BPF_BTF_GET_FD_BY_ID:
5431		err = bpf_btf_get_fd_by_id(&attr);
5432		break;
5433	case BPF_TASK_FD_QUERY:
5434		err = bpf_task_fd_query(&attr, uattr.user);
5435		break;
5436	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
5437		err = map_lookup_and_delete_elem(&attr);
5438		break;
5439	case BPF_MAP_LOOKUP_BATCH:
5440		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
5441		break;
5442	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
5443		err = bpf_map_do_batch(&attr, uattr.user,
5444				       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
5445		break;
5446	case BPF_MAP_UPDATE_BATCH:
5447		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
5448		break;
5449	case BPF_MAP_DELETE_BATCH:
5450		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
5451		break;
5452	case BPF_LINK_CREATE:
5453		err = link_create(&attr, uattr);
5454		break;
5455	case BPF_LINK_UPDATE:
5456		err = link_update(&attr);
5457		break;
5458	case BPF_LINK_GET_FD_BY_ID:
5459		err = bpf_link_get_fd_by_id(&attr);
5460		break;
5461	case BPF_LINK_GET_NEXT_ID:
5462		err = bpf_obj_get_next_id(&attr, uattr.user,
5463					  &link_idr, &link_idr_lock);
5464		break;
5465	case BPF_ENABLE_STATS:
5466		err = bpf_enable_stats(&attr);
5467		break;
5468	case BPF_ITER_CREATE:
5469		err = bpf_iter_create(&attr);
5470		break;
5471	case BPF_LINK_DETACH:
5472		err = link_detach(&attr);
5473		break;
5474	case BPF_PROG_BIND_MAP:
5475		err = bpf_prog_bind_map(&attr);
5476		break;
5477 default:
5478 err = -EINVAL;
5479 break;
5480 }
5481
5482 return err;
5483}
5484
5485SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
5486{
5487	return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
5488}
5489
5490static bool syscall_prog_is_valid_access(int off, int size,
5491 enum bpf_access_type type,
5492 const struct bpf_prog *prog,
5493 struct bpf_insn_access_aux *info)
5494{
5495 if (off < 0 || off >= U16_MAX)
5496 return false;
5497 if (off % size != 0)
5498 return false;
5499 return true;
5500}
5501
5502BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
5503{
5504 switch (cmd) {
5505 case BPF_MAP_CREATE:
5506 case BPF_MAP_DELETE_ELEM:
5507 case BPF_MAP_UPDATE_ELEM:
5508 case BPF_MAP_FREEZE:
5509 case BPF_MAP_GET_FD_BY_ID:
5510 case BPF_PROG_LOAD:
5511 case BPF_BTF_LOAD:
5512 case BPF_LINK_CREATE:
5513 case BPF_RAW_TRACEPOINT_OPEN:
5514 break;
5515 default:
5516 return -EINVAL;
5517 }
5518	return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
5519}
5520
5521
5522/* To shut up -Wmissing-prototypes.
5523 * This function is used by the kernel light skeleton
5524 * to load bpf programs when modules are loaded or during kernel boot.
5525 * See tools/lib/bpf/skel_internal.h
5526 */
5527int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
5528
5529int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
5530{
5531 struct bpf_prog * __maybe_unused prog;
5532 struct bpf_tramp_run_ctx __maybe_unused run_ctx;
5533
5534 switch (cmd) {
5535#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
5536 case BPF_PROG_TEST_RUN:
5537 if (attr->test.data_in || attr->test.data_out ||
5538 attr->test.ctx_out || attr->test.duration ||
5539 attr->test.repeat || attr->test.flags)
5540 return -EINVAL;
5541
5542		prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
5543		if (IS_ERR(prog))
5544			return PTR_ERR(prog);
5545
5546 if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
5547 attr->test.ctx_size_in > U16_MAX) {
5548 bpf_prog_put(prog);
5549 return -EINVAL;
5550 }
5551
5552 run_ctx.bpf_cookie = 0;
5553		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
5554			/* recursion detected */
5555			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
5556			bpf_prog_put(prog);
5557			return -EBUSY;
5558		}
5559		attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
5560		__bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
5561						&run_ctx);
5562 bpf_prog_put(prog);
5563 return 0;
5564#endif
5565 default:
5566		return ____bpf_sys_bpf(cmd, attr, size);
5567 }
5568}
5569EXPORT_SYMBOL(kern_sys_bpf);
5570
5571static const struct bpf_func_proto bpf_sys_bpf_proto = {
5572 .func = bpf_sys_bpf,
5573 .gpl_only = false,
5574 .ret_type = RET_INTEGER,
5575 .arg1_type = ARG_ANYTHING,
5576 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5577 .arg3_type = ARG_CONST_SIZE,
5578};
5579
5580const struct bpf_func_proto * __weak
5581tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5582{
5583 return bpf_base_func_proto(func_id);
5584}
5585
5586BPF_CALL_1(bpf_sys_close, u32, fd)
5587{
5588 /* When bpf program calls this helper there should not be
5589 * an fdget() without matching completed fdput().
5590 * This helper is allowed in the following callchain only:
5591 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
5592 */
5593 return close_fd(fd);
5594}
5595
5596static const struct bpf_func_proto bpf_sys_close_proto = {
5597 .func = bpf_sys_close,
5598 .gpl_only = false,
5599 .ret_type = RET_INTEGER,
5600 .arg1_type = ARG_ANYTHING,
5601};
5602
5603BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
5604{
5605 if (flags)
5606 return -EINVAL;
5607
5608 if (name_sz <= 1 || name[name_sz - 1])
5609 return -EINVAL;
5610
5611 if (!bpf_dump_raw_ok(current_cred()))
5612 return -EPERM;
5613
5614 *res = kallsyms_lookup_name(name);
5615 return *res ? 0 : -ENOENT;
5616}
5617
5618static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
5619 .func = bpf_kallsyms_lookup_name,
5620 .gpl_only = false,
5621 .ret_type = RET_INTEGER,
5622 .arg1_type = ARG_PTR_TO_MEM,
5623 .arg2_type = ARG_CONST_SIZE_OR_ZERO,
5624 .arg3_type = ARG_ANYTHING,
5625 .arg4_type = ARG_PTR_TO_LONG,
5626};
5627
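/*
 * Helper set available to BPF_PROG_TYPE_SYSCALL programs; bpf_sys_bpf()
 * itself is only offered to perfmon-capable callers, everything else
 * falls back to the tracing helper protos.
 */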
5628static const struct bpf_func_proto *
5629syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5630{
5631 switch (func_id) {
5632 case BPF_FUNC_sys_bpf:
5633 return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
5634 case BPF_FUNC_btf_find_by_name_kind:
5635 return &bpf_btf_find_by_name_kind_proto;
5636 case BPF_FUNC_sys_close:
5637 return &bpf_sys_close_proto;
5638 case BPF_FUNC_kallsyms_lookup_name:
5639 return &bpf_kallsyms_lookup_name_proto;
5640 default:
5641 return tracing_prog_func_proto(func_id, prog);
5642 }
5643}
5644
5645const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
5646 .get_func_proto = syscall_prog_func_proto,
5647 .is_valid_access = syscall_prog_is_valid_access,
5648};
5649
5650const struct bpf_prog_ops bpf_syscall_prog_ops = {
5651 .test_run = bpf_prog_test_run_syscall,
5652};
5653
5654#ifdef CONFIG_SYSCTL
5655static int bpf_stats_handler(struct ctl_table *table, int write,
5656 void *buffer, size_t *lenp, loff_t *ppos)
5657{
5658 struct static_key *key = (struct static_key *)table->data;
5659 static int saved_val;
5660 int val, ret;
5661 struct ctl_table tmp = {
5662 .data = &val,
5663 .maxlen = sizeof(val),
5664 .mode = table->mode,
5665 .extra1 = SYSCTL_ZERO,
5666 .extra2 = SYSCTL_ONE,
5667 };
5668
5669 if (write && !capable(CAP_SYS_ADMIN))
5670 return -EPERM;
5671
5672 mutex_lock(&bpf_stats_enabled_mutex);
5673 val = saved_val;
5674 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5675 if (write && !ret && val != saved_val) {
5676 if (val)
5677 static_key_slow_inc(key);
5678 else
5679 static_key_slow_dec(key);
5680 saved_val = val;
5681 }
5682	mutex_unlock(&bpf_stats_enabled_mutex);
5683 return ret;
5684}
5685
5686void __weak unpriv_ebpf_notify(int new_state)
5687{
5688}
5689
5690static int bpf_unpriv_handler(struct ctl_table *table, int write,
5691 void *buffer, size_t *lenp, loff_t *ppos)
5692{
5693 int ret, unpriv_enable = *(int *)table->data;
5694 bool locked_state = unpriv_enable == 1;
5695 struct ctl_table tmp = *table;
5696
5697 if (write && !capable(CAP_SYS_ADMIN))
5698 return -EPERM;
5699
5700 tmp.data = &unpriv_enable;
5701 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5702 if (write && !ret) {
5703 if (locked_state && unpriv_enable != 1)
5704 return -EPERM;
5705 *(int *)table->data = unpriv_enable;
5706 }
5707
5708 if (write)
5709		unpriv_ebpf_notify(unpriv_enable);
5710
5711 return ret;
5712}
5713
5714static struct ctl_table bpf_syscall_table[] = {
5715 {
5716 .procname = "unprivileged_bpf_disabled",
5717 .data = &sysctl_unprivileged_bpf_disabled,
5718 .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
5719 .mode = 0644,
5720 .proc_handler = bpf_unpriv_handler,
5721 .extra1 = SYSCTL_ZERO,
5722 .extra2 = SYSCTL_TWO,
5723 },
5724 {
5725 .procname = "bpf_stats_enabled",
5726 .data = &bpf_stats_enabled_key.key,
5727 .mode = 0644,
5728 .proc_handler = bpf_stats_handler,
5729 },
5730 { }
5731};
5732
5733static int __init bpf_syscall_sysctl_init(void)
5734{
5735 register_sysctl_init("kernel", bpf_syscall_table);
5736 return 0;
5737}
5738late_initcall(bpf_syscall_sysctl_init);
5739#endif /* CONFIG_SYSCTL */
5740