tun.c source code [linux/drivers/net/tun.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* TUN - Universal TUN/TAP device driver.
4	* Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
5	*
6	* $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
7	*/
8
9	/*
10	* Changes:
11	*
12	* Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
13	* Add TUNSETLINK ioctl to set the link encapsulation
14	*
15	* Mark Smith <markzzzsmith@yahoo.com.au>
16	* Use eth_random_addr() for tap MAC address.
17	*
18	* Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20
19	* Fixes in packet dropping, queue length setting and queue wakeup.
20	* Increased default tx queue length.
21	* Added ethtool API.
22	* Minor cleanups
23	*
24	* Daniel Podlejski <underley@underley.eu.org>
25	* Modifications for 2.3.99-pre5 kernel.
26	*/
27
28	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
29
30	#define DRV_NAME "tun"
31	#define DRV_VERSION "1.6"
32	#define DRV_DESCRIPTION "Universal TUN/TAP device driver"
33	#define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
34
35	#include <linux/module.h>
36	#include <linux/errno.h>
37	#include <linux/kernel.h>
38	#include <linux/sched/signal.h>
39	#include <linux/major.h>
40	#include <linux/slab.h>
41	#include <linux/poll.h>
42	#include <linux/fcntl.h>
43	#include <linux/init.h>
44	#include <linux/skbuff.h>
45	#include <linux/netdevice.h>
46	#include <linux/etherdevice.h>
47	#include <linux/miscdevice.h>
48	#include <linux/ethtool.h>
49	#include <linux/rtnetlink.h>
50	#include <linux/compat.h>
51	#include <linux/if.h>
52	#include <linux/if_arp.h>
53	#include <linux/if_ether.h>
54	#include <linux/if_tun.h>
55	#include <linux/if_vlan.h>
56	#include <linux/crc32.h>
57	#include <linux/nsproxy.h>
58	#include <linux/virtio_net.h>
59	#include <linux/rcupdate.h>
60	#include <net/net_namespace.h>
61	#include <net/netns/generic.h>
62	#include <net/rtnetlink.h>
63	#include <net/sock.h>
64	#include <net/xdp.h>
65	#include <net/ip_tunnels.h>
66	#include <linux/seq_file.h>
67	#include <linux/uio.h>
68	#include <linux/skb_array.h>
69	#include <linux/bpf.h>
70	#include <linux/bpf_trace.h>
71	#include <linux/mutex.h>
72	#include <linux/ieee802154.h>
73	#include <linux/if_ltalk.h>
74	#include <uapi/linux/if_fddi.h>
75	#include <uapi/linux/if_hippi.h>
76	#include <uapi/linux/if_fc.h>
77	#include <net/ax25.h>
78	#include <net/rose.h>
79	#include <net/6lowpan.h>
80
81	#include <linux/uaccess.h>
82	#include <linux/proc_fs.h>
83
static void tun_default_link_ksettings(struct net_device *dev,
				       struct ethtool_link_ksettings *cmd);

/* Padding expressed as NET_IP_ALIGN plus NET_SKB_PAD. */
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
 * overload it to mean fasync when stored there.
 */
#define TUN_FASYNC	IFF_ATTACH_QUEUE
/* High bits in flags field are unused. */
#define TUN_VNET_LE     0x80000000
#define TUN_VNET_BE     0x40000000

/* Subset of ifr_flags that tun_net_init() copies into tun->flags. */
#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
101
102	#define GOODCOPY_LEN 128
103
104	#define FLT_EXACT_COUNT 8
105	struct tap_filter {
106	unsigned int count; / Number of addrs. Zero means disabled /
107	u32 mask[`2`]; / Mask of the hashed addrs /
108	unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
109	};
110
/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
 * to max number of VCPUs in guest. */
#define MAX_TAP_QUEUES 256
/* Upper bound on tun->flow_count enforced in tun_flow_update(). */
#define MAX_TAP_FLOWS  4096

#define TUN_FLOW_EXPIRE (3 * HZ)
117
118	/ A tun_file connects an open character device to a tuntap netdevice. It*
119	* also contains all socket related structures (except sock_fprog and tap_filter)
120	* to serve as one transmit queue for tuntap device. The sock_fprog and
121	* tap_filter were kept in tun_struct since they were used for filtering for the
122	* netdevice not for a specific queue (at least I didn't see the requirement for
123	* this).
124	*
125	* RCU usage:
126	* The tun_file and tun_struct are loosely coupled, the pointer from one to the
127	* other can only be read while rcu_read_lock or rtnl_lock is held.
128	*/
129	struct tun_file {
130	struct sock sk;
131	struct socket socket;
132	struct tun_struct __rcu *tun;
133	struct fasync_struct *fasync;
134	/ only used for fasnyc /
135	unsigned int flags;
136	union {
137	u16 queue_index;
138	unsigned int ifindex;
139	};
140	struct napi_struct napi;
141	bool napi_enabled;
142	bool napi_frags_enabled;
143	struct mutex napi_mutex; / Protects access to the above napi /
144	struct list_head next;
145	struct tun_struct *detached;
146	struct ptr_ring tx_ring;
147	struct xdp_rxq_info xdp_rxq;
148	};
149
/* A page pointer paired with an integer count. */
struct tun_page {
	struct page *page;
	int count;
};
154
155	struct tun_flow_entry {
156	struct hlist_node hash_link;
157	struct rcu_head rcu;
158	struct tun_struct *tun;
159
160	u32 rxhash;
161	u32 rps_rxhash;
162	int queue_index;
163	unsigned long updated ____cacheline_aligned_in_smp;
164	};
165
166	#define TUN_NUM_FLOW_ENTRIES 1024
167	#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)
168
169	struct tun_prog {
170	struct rcu_head rcu;
171	struct bpf_prog *prog;
172	};
173
174	/ Since the socket were moved to tun_file, to preserve the behavior of persist*
175	* device, socket filter, sndbuf and vnet header size were restore when the
176	* file were attached to a persist device.
177	*/
178	struct tun_struct {
179	struct tun_file __rcu *tfiles[MAX_TAP_QUEUES];
180	unsigned int numqueues;
181	unsigned int flags;
182	kuid_t owner;
183	kgid_t group;
184
185	struct net_device *dev;
186	netdev_features_t set_features;
187	#define TUN_USER_FEATURES (NETIF_F_HW_CSUM\|NETIF_F_TSO_ECN\|NETIF_F_TSO\| \
188	NETIF_F_TSO6 \| NETIF_F_GSO_UDP_L4)
189
190	int align;
191	int vnet_hdr_sz;
192	int sndbuf;
193	struct tap_filter txflt;
194	struct sock_fprog fprog;
195	/ protected by rtnl lock /
196	bool filter_attached;
197	u32 msg_enable;
198	spinlock_t lock;
199	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
200	struct timer_list flow_gc_timer;
201	unsigned long ageing_time;
202	unsigned int numdisabled;
203	struct list_head disabled;
204	void *security;
205	u32 flow_count;
206	u32 rx_batched;
207	atomic_long_t rx_frame_errors;
208	struct bpf_prog __rcu *xdp_prog;
209	struct tun_prog __rcu *steering_prog;
210	struct tun_prog __rcu *filter_prog;
211	struct ethtool_link_ksettings link_ksettings;
212	/ init args /
213	struct file *file;
214	struct ifreq *ifr;
215	};
216
217	struct veth {
218	__be16 h_vlan_proto;
219	__be16 h_vlan_TCI;
220	};
221
222	static void tun_flow_init(struct tun_struct *tun);
223	static void tun_flow_uninit(struct tun_struct *tun);
224
225	static int tun_napi_receive(struct napi_struct napi, int* budget)
226	{
227	struct tun_file tfile = container_of(napi, struct* tun_file, napi);
228	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
229	struct sk_buff_head process_queue;
230	struct sk_buff *skb;
231	int received = `0`;
232
233	__skb_queue_head_init(list: &process_queue);
234
235	spin_lock(lock: &queue->lock);
236	skb_queue_splice_tail_init(list: queue, head: &process_queue);
237	spin_unlock(lock: &queue->lock);
238
239	while (received < budget && (skb = __skb_dequeue(list: &process_queue))) {
240	napi_gro_receive(napi, skb);
241	++received;
242	}
243
244	if (!skb_queue_empty(list: &process_queue)) {
245	spin_lock(lock: &queue->lock);
246	skb_queue_splice(list: &process_queue, head: queue);
247	spin_unlock(lock: &queue->lock);
248	}
249
250	return received;
251	}
252
/*
 * NAPI poll callback (registered in tun_napi_init()): process up to @budget
 * packets and complete the NAPI instance when fewer than @budget were handled.
 *
 * Returns the number of packets processed.
 */
static int tun_napi_poll(struct napi_struct *napi, int budget)
{
	unsigned int received;

	received = tun_napi_receive(napi, budget);

	if (received < budget)
		napi_complete_done(napi, received);

	return received;
}
264
265	static void tun_napi_init(struct tun_struct tun, struct* tun_file *tfile,
266	bool napi_en, bool napi_frags)
267	{
268	tfile->napi_enabled = napi_en;
269	tfile->napi_frags_enabled = napi_en && napi_frags;
270	if (napi_en) {
271	netif_napi_add_tx(dev: tun->dev, napi: &tfile->napi, poll: tun_napi_poll);
272	napi_enable(n: &tfile->napi);
273	}
274	}
275
276	static void tun_napi_enable(struct tun_file *tfile)
277	{
278	if (tfile->napi_enabled)
279	napi_enable(n: &tfile->napi);
280	}
281
282	static void tun_napi_disable(struct tun_file *tfile)
283	{
284	if (tfile->napi_enabled)
285	napi_disable(n: &tfile->napi);
286	}
287
288	static void tun_napi_del(struct tun_file *tfile)
289	{
290	if (tfile->napi_enabled)
291	netif_napi_del(napi: &tfile->napi);
292	}
293
294	static bool tun_napi_frags_enabled(const struct tun_file *tfile)
295	{
296	return tfile->napi_frags_enabled;
297	}
298
299	#ifdef CONFIG_TUN_VNET_CROSS_LE
300	static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
301	{
302	return tun->flags & TUN_VNET_BE ? false :
303	virtio_legacy_is_little_endian();
304	}
305
306	static long tun_get_vnet_be(struct tun_struct tun, int* __user *argp)
307	{
308	int be = !!(tun->flags & TUN_VNET_BE);
309
310	if (put_user(be, argp))
311	return -EFAULT;
312
313	return `0`;
314	}
315
316	static long tun_set_vnet_be(struct tun_struct tun, int* __user *argp)
317	{
318	int be;
319
320	if (get_user(be, argp))
321	return -EFAULT;
322
323	if (be)
324	tun->flags \|= TUN_VNET_BE;
325	else
326	tun->flags &= ~TUN_VNET_BE;
327
328	return `0`;
329	}
330	#else
331	static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
332	{
333	return virtio_legacy_is_little_endian();
334	}
335
336	static long tun_get_vnet_be(struct tun_struct tun, int* __user *argp)
337	{
338	return -EINVAL;
339	}
340
341	static long tun_set_vnet_be(struct tun_struct tun, int* __user *argp)
342	{
343	return -EINVAL;
344	}
345	#endif /* CONFIG_TUN_VNET_CROSS_LE */
346
347	static inline bool tun_is_little_endian(struct tun_struct *tun)
348	{
349	return tun->flags & TUN_VNET_LE \|\|
350	tun_legacy_is_little_endian(tun);
351	}
352
353	static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
354	{
355	return __virtio16_to_cpu(little_endian: tun_is_little_endian(tun), val);
356	}
357
358	static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
359	{
360	return __cpu_to_virtio16(little_endian: tun_is_little_endian(tun), val);
361	}
362
363	static inline u32 tun_hashfn(u32 rxhash)
364	{
365	return rxhash & TUN_MASK_FLOW_ENTRIES;
366	}
367
368	static struct tun_flow_entry tun_flow_find(struct* hlist_head *head, u32 rxhash)
369	{
370	struct tun_flow_entry *e;
371
372	hlist_for_each_entry_rcu(e, head, hash_link) {
373	if (e->rxhash == rxhash)
374	return e;
375	}
376	return NULL;
377	}
378
379	static struct tun_flow_entry tun_flow_create(struct* tun_struct *tun,
380	struct hlist_head *head,
381	u32 rxhash, u16 queue_index)
382	{
383	struct tun_flow_entry e = kmalloc(size: sizeof(e), GFP_ATOMIC);
384
385	if (e) {
386	netif_info(tun, tx_queued, tun->dev,
387	"create flow: hash %u index %u\n",
388	rxhash, queue_index);
389	e->updated = jiffies;
390	e->rxhash = rxhash;
391	e->rps_rxhash = `0`;
392	e->queue_index = queue_index;
393	e->tun = tun;
394	hlist_add_head_rcu(n: &e->hash_link, h: head);
395	++tun->flow_count;
396	}
397	return e;
398	}
399
400	static void tun_flow_delete(struct tun_struct tun, struct* tun_flow_entry *e)
401	{
402	netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n",
403	e->rxhash, e->queue_index);
404	hlist_del_rcu(n: &e->hash_link);
405	kfree_rcu(e, rcu);
406	--tun->flow_count;
407	}
408
409	static void tun_flow_flush(struct tun_struct *tun)
410	{
411	int i;
412
413	spin_lock_bh(lock: &tun->lock);
414	for (i = `0`; i < TUN_NUM_FLOW_ENTRIES; i++) {
415	struct tun_flow_entry *e;
416	struct hlist_node *n;
417
418	hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
419	tun_flow_delete(tun, e);
420	}
421	spin_unlock_bh(lock: &tun->lock);
422	}
423
424	static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
425	{
426	int i;
427
428	spin_lock_bh(lock: &tun->lock);
429	for (i = `0`; i < TUN_NUM_FLOW_ENTRIES; i++) {
430	struct tun_flow_entry *e;
431	struct hlist_node *n;
432
433	hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
434	if (e->queue_index == queue_index)
435	tun_flow_delete(tun, e);
436	}
437	}
438	spin_unlock_bh(lock: &tun->lock);
439	}
440
441	static void tun_flow_cleanup(struct timer_list *t)
442	{
443	struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
444	unsigned long delay = tun->ageing_time;
445	unsigned long next_timer = jiffies + delay;
446	unsigned long count = `0`;
447	int i;
448
449	spin_lock(lock: &tun->lock);
450	for (i = `0`; i < TUN_NUM_FLOW_ENTRIES; i++) {
451	struct tun_flow_entry *e;
452	struct hlist_node *n;
453
454	hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
455	unsigned long this_timer;
456
457	this_timer = e->updated + delay;
458	if (time_before_eq(this_timer, jiffies)) {
459	tun_flow_delete(tun, e);
460	continue;
461	}
462	count++;
463	if (time_before(this_timer, next_timer))
464	next_timer = this_timer;
465	}
466	}
467
468	if (count)
469	mod_timer(timer: &tun->flow_gc_timer, expires: round_jiffies_up(j: next_timer));
470	spin_unlock(lock: &tun->lock);
471	}
472
473	static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
474	struct tun_file *tfile)
475	{
476	struct hlist_head *head;
477	struct tun_flow_entry *e;
478	unsigned long delay = tun->ageing_time;
479	u16 queue_index = tfile->queue_index;
480
481	head = &tun->flows[tun_hashfn(rxhash)];
482
483	rcu_read_lock();
484
485	e = tun_flow_find(head, rxhash);
486	if (likely(e)) {
487	/ TODO: keep queueing to old queue until it's empty? /
488	if (READ_ONCE(e->queue_index) != queue_index)
489	WRITE_ONCE(e->queue_index, queue_index);
490	if (e->updated != jiffies)
491	e->updated = jiffies;
492	sock_rps_record_flow_hash(hash: e->rps_rxhash);
493	} else {
494	spin_lock_bh(lock: &tun->lock);
495	if (!tun_flow_find(head, rxhash) &&
496	tun->flow_count < MAX_TAP_FLOWS)
497	tun_flow_create(tun, head, rxhash, queue_index);
498
499	if (!timer_pending(timer: &tun->flow_gc_timer))
500	mod_timer(timer: &tun->flow_gc_timer,
501	expires: round_jiffies_up(j: jiffies + delay));
502	spin_unlock_bh(lock: &tun->lock);
503	}
504
505	rcu_read_unlock();
506	}
507
508	/ Save the hash received in the stack receive path and update the*
509	* flow_hash table accordingly.
510	*/
511	static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
512	{
513	if (unlikely(e->rps_rxhash != hash))
514	e->rps_rxhash = hash;
515	}
516
517	/ We try to identify a flow through its rxhash. The reason that*
518	* we do not check rxq no. is because some cards(e.g 82599), chooses
519	* the rxq based on the txq where the last packet of the flow comes. As
520	* the userspace application move between processors, we may get a
521	* different rxq no. here.
522	*/
523	static u16 tun_automq_select_queue(struct tun_struct tun, struct* sk_buff *skb)
524	{
525	struct tun_flow_entry *e;
526	u32 txq = `0`;
527	u32 numqueues = `0`;
528
529	numqueues = READ_ONCE(tun->numqueues);
530
531	txq = __skb_get_hash_symmetric(skb);
532	e = tun_flow_find(head: &tun->flows[tun_hashfn(rxhash: txq)], rxhash: txq);
533	if (e) {
534	tun_flow_save_rps_rxhash(e, hash: txq);
535	txq = e->queue_index;
536	} else {
537	/ use multiply and shift instead of expensive divide /
538	txq = ((u64)txq * numqueues) >> `32`;
539	}
540
541	return txq;
542	}
543
544	static u16 tun_ebpf_select_queue(struct tun_struct tun, struct* sk_buff *skb)
545	{
546	struct tun_prog *prog;
547	u32 numqueues;
548	u16 ret = `0`;
549
550	numqueues = READ_ONCE(tun->numqueues);
551	if (!numqueues)
552	return `0`;
553
554	prog = rcu_dereference(tun->steering_prog);
555	if (prog)
556	ret = bpf_prog_run_clear_cb(prog: prog->prog, skb);
557
558	return ret % numqueues;
559	}
560
561	static u16 tun_select_queue(struct net_device dev, struct* sk_buff *skb,
562	struct net_device *sb_dev)
563	{
564	struct tun_struct *tun = netdev_priv(dev);
565	u16 ret;
566
567	rcu_read_lock();
568	if (rcu_dereference(tun->steering_prog))
569	ret = tun_ebpf_select_queue(tun, skb);
570	else
571	ret = tun_automq_select_queue(tun, skb);
572	rcu_read_unlock();
573
574	return ret;
575	}
576
577	static inline bool tun_not_capable(struct tun_struct *tun)
578	{
579	const struct cred *cred = current_cred();
580	struct net *net = dev_net(dev: tun->dev);
581
582	return ((uid_valid(uid: tun->owner) && !uid_eq(left: cred->euid, right: tun->owner)) \|\|
583	(gid_valid(gid: tun->group) && !in_egroup_p(tun->group))) &&
584	!ns_capable(ns: net->user_ns, CAP_NET_ADMIN);
585	}
586
587	static void tun_set_real_num_queues(struct tun_struct *tun)
588	{
589	netif_set_real_num_tx_queues(dev: tun->dev, txq: tun->numqueues);
590	netif_set_real_num_rx_queues(dev: tun->dev, rxq: tun->numqueues);
591	}
592
593	static void tun_disable_queue(struct tun_struct tun, struct* tun_file *tfile)
594	{
595	tfile->detached = tun;
596	list_add_tail(new: &tfile->next, head: &tun->disabled);
597	++tun->numdisabled;
598	}
599
600	static struct tun_struct tun_enable_queue(struct* tun_file *tfile)
601	{
602	struct tun_struct *tun = tfile->detached;
603
604	tfile->detached = NULL;
605	list_del_init(entry: &tfile->next);
606	--tun->numdisabled;
607	return tun;
608	}
609
/*
 * Release one tx_ring element. The pointer is either a tagged XDP frame
 * (returned with xdp_return_frame()) or an skb. NULL is ignored.
 */
void tun_ptr_free(void *ptr)
{
	if (!ptr)
		return;
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		xdp_return_frame(xdpf);
	} else {
		__skb_array_destroy_skb(ptr);
	}
}
EXPORT_SYMBOL_GPL(tun_ptr_free);
623
624	static void tun_queue_purge(struct tun_file *tfile)
625	{
626	void *ptr;
627
628	while ((ptr = ptr_ring_consume(r: &tfile->tx_ring)) != NULL)
629	tun_ptr_free(ptr);
630
631	skb_queue_purge(list: &tfile->sk.sk_write_queue);
632	skb_queue_purge(list: &tfile->sk.sk_error_queue);
633	}
634
635	static void __tun_detach(struct tun_file *tfile, bool clean)
636	{
637	struct tun_file *ntfile;
638	struct tun_struct *tun;
639
640	tun = rtnl_dereference(tfile->tun);
641
642	if (tun && clean) {
643	if (!tfile->detached)
644	tun_napi_disable(tfile);
645	tun_napi_del(tfile);
646	}
647
648	if (tun && !tfile->detached) {
649	u16 index = tfile->queue_index;
650	BUG_ON(index >= tun->numqueues);
651
652	rcu_assign_pointer(tun->tfiles[index],
653	tun->tfiles[tun->numqueues - `1`]);
654	ntfile = rtnl_dereference(tun->tfiles[index]);
655	ntfile->queue_index = index;
656	rcu_assign_pointer(tun->tfiles[tun->numqueues - `1`],
657	NULL);
658
659	--tun->numqueues;
660	if (clean) {
661	RCU_INIT_POINTER(tfile->tun, NULL);
662	sock_put(sk: &tfile->sk);
663	} else {
664	tun_disable_queue(tun, tfile);
665	tun_napi_disable(tfile);
666	}
667
668	synchronize_net();
669	tun_flow_delete_by_queue(tun, queue_index: tun->numqueues + `1`);
670	/ Drop read queue /
671	tun_queue_purge(tfile);
672	tun_set_real_num_queues(tun);
673	} else if (tfile->detached && clean) {
674	tun = tun_enable_queue(tfile);
675	sock_put(sk: &tfile->sk);
676	}
677
678	if (clean) {
679	if (tun && tun->numqueues == `0` && tun->numdisabled == `0`) {
680	netif_carrier_off(dev: tun->dev);
681
682	if (!(tun->flags & IFF_PERSIST) &&
683	tun->dev->reg_state == NETREG_REGISTERED)
684	unregister_netdevice(dev: tun->dev);
685	}
686	if (tun)
687	xdp_rxq_info_unreg(xdp_rxq: &tfile->xdp_rxq);
688	ptr_ring_cleanup(r: &tfile->tx_ring, destroy: tun_ptr_free);
689	}
690	}
691
692	static void tun_detach(struct tun_file *tfile, bool clean)
693	{
694	struct tun_struct *tun;
695	struct net_device *dev;
696
697	rtnl_lock();
698	tun = rtnl_dereference(tfile->tun);
699	dev = tun ? tun->dev : NULL;
700	__tun_detach(tfile, clean);
701	if (dev)
702	netdev_state_change(dev);
703	rtnl_unlock();
704
705	if (clean)
706	sock_put(sk: &tfile->sk);
707	}
708
709	static void tun_detach_all(struct net_device *dev)
710	{
711	struct tun_struct *tun = netdev_priv(dev);
712	struct tun_file tfile, tmp;
713	int i, n = tun->numqueues;
714
715	for (i = `0`; i < n; i++) {
716	tfile = rtnl_dereference(tun->tfiles[i]);
717	BUG_ON(!tfile);
718	tun_napi_disable(tfile);
719	tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
720	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
721	RCU_INIT_POINTER(tfile->tun, NULL);
722	--tun->numqueues;
723	}
724	list_for_each_entry(tfile, &tun->disabled, next) {
725	tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
726	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
727	RCU_INIT_POINTER(tfile->tun, NULL);
728	}
729	BUG_ON(tun->numqueues != `0`);
730
731	synchronize_net();
732	for (i = `0`; i < n; i++) {
733	tfile = rtnl_dereference(tun->tfiles[i]);
734	tun_napi_del(tfile);
735	/ Drop read queue /
736	tun_queue_purge(tfile);
737	xdp_rxq_info_unreg(xdp_rxq: &tfile->xdp_rxq);
738	sock_put(sk: &tfile->sk);
739	}
740	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
741	tun_napi_del(tfile);
742	tun_enable_queue(tfile);
743	tun_queue_purge(tfile);
744	xdp_rxq_info_unreg(xdp_rxq: &tfile->xdp_rxq);
745	sock_put(sk: &tfile->sk);
746	}
747	BUG_ON(tun->numdisabled != `0`);
748
749	if (tun->flags & IFF_PERSIST)
750	module_put(THIS_MODULE);
751	}
752
753	static int tun_attach(struct tun_struct tun, struct* file *file,
754	bool skip_filter, bool napi, bool napi_frags,
755	bool publish_tun)
756	{
757	struct tun_file *tfile = file->private_data;
758	struct net_device *dev = tun->dev;
759	int err;
760
761	err = security_tun_dev_attach(sk: tfile->socket.sk, security: tun->security);
762	if (err < `0`)
763	goto out;
764
765	err = -EINVAL;
766	if (rtnl_dereference(tfile->tun) && !tfile->detached)
767	goto out;
768
769	err = -EBUSY;
770	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == `1`)
771	goto out;
772
773	err = -E2BIG;
774	if (!tfile->detached &&
775	tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
776	goto out;
777
778	err = `0`;
779
780	/ Re-attach the filter to persist device /
781	if (!skip_filter && (tun->filter_attached == true)) {
782	lock_sock(sk: tfile->socket.sk);
783	err = sk_attach_filter(fprog: &tun->fprog, sk: tfile->socket.sk);
784	release_sock(sk: tfile->socket.sk);
785	if (!err)
786	goto out;
787	}
788
789	if (!tfile->detached &&
790	ptr_ring_resize(r: &tfile->tx_ring, size: dev->tx_queue_len,
791	GFP_KERNEL, destroy: tun_ptr_free)) {
792	err = -ENOMEM;
793	goto out;
794	}
795
796	tfile->queue_index = tun->numqueues;
797	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
798
799	if (tfile->detached) {
800	/ Re-attach detached tfile, updating XDP queue_index /
801	WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
802
803	if (tfile->xdp_rxq.queue_index != tfile->queue_index)
804	tfile->xdp_rxq.queue_index = tfile->queue_index;
805	} else {
806	/ Setup XDP RX-queue info, for new tfile getting attached /
807	err = xdp_rxq_info_reg(xdp_rxq: &tfile->xdp_rxq,
808	dev: tun->dev, queue_index: tfile->queue_index, napi_id: `0`);
809	if (err < `0`)
810	goto out;
811	err = xdp_rxq_info_reg_mem_model(xdp_rxq: &tfile->xdp_rxq,
812	type: MEM_TYPE_PAGE_SHARED, NULL);
813	if (err < `0`) {
814	xdp_rxq_info_unreg(xdp_rxq: &tfile->xdp_rxq);
815	goto out;
816	}
817	err = `0`;
818	}
819
820	if (tfile->detached) {
821	tun_enable_queue(tfile);
822	tun_napi_enable(tfile);
823	} else {
824	sock_hold(sk: &tfile->sk);
825	tun_napi_init(tun, tfile, napi_en: napi, napi_frags);
826	}
827
828	if (rtnl_dereference(tun->xdp_prog))
829	sock_set_flag(sk: &tfile->sk, flag: SOCK_XDP);
830
831	/ device is allowed to go away first, so no need to hold extra*
832	* refcnt.
833	*/
834
835	/ Publish tfile->tun and tun->tfiles only after we've fully*
836	* initialized tfile; otherwise we risk using half-initialized
837	* object.
838	*/
839	if (publish_tun)
840	rcu_assign_pointer(tfile->tun, tun);
841	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
842	tun->numqueues++;
843	tun_set_real_num_queues(tun);
844	out:
845	return err;
846	}
847
848	static struct tun_struct tun_get(struct* tun_file *tfile)
849	{
850	struct tun_struct *tun;
851
852	rcu_read_lock();
853	tun = rcu_dereference(tfile->tun);
854	if (tun)
855	dev_hold(dev: tun->dev);
856	rcu_read_unlock();
857
858	return tun;
859	}
860
861	static void tun_put(struct tun_struct *tun)
862	{
863	dev_put(dev: tun->dev);
864	}
865
866	/ TAP filtering /
867	static void addr_hash_set(u32 mask, const* u8 *addr)
868	{
869	int n = ether_crc(ETH_ALEN, addr) >> `26`;
870	mask[n >> `5`] \|= (`1` << (n & `31`));
871	}
872
873	static unsigned int addr_hash_test(const u32 mask, const* u8 *addr)
874	{
875	int n = ether_crc(ETH_ALEN, addr) >> `26`;
876	return mask[n >> `5`] & (`1` << (n & `31`));
877	}
878
879	static int update_filter(struct tap_filter filter, void* __user *arg)
880	{
881	struct { u8 u[ETH_ALEN]; } *addr;
882	struct tun_filter uf;
883	int err, alen, n, nexact;
884
885	if (copy_from_user(to: &uf, from: arg, n: sizeof(uf)))
886	return -EFAULT;
887
888	if (!uf.count) {
889	/ Disabled /
890	filter->count = `0`;
891	return `0`;
892	}
893
894	alen = ETH_ALEN * uf.count;
895	addr = memdup_user(arg + sizeof(uf), alen);
896	if (IS_ERR(ptr: addr))
897	return PTR_ERR(ptr: addr);
898
899	/ The filter is updated without holding any locks. Which is*
900	* perfectly safe. We disable it first and in the worst
901	* case we'll accept a few undesired packets. */
902	filter->count = `0`;
903	wmb();
904
905	/ Use first set of addresses as an exact filter /
906	for (n = `0`; n < uf.count && n < FLT_EXACT_COUNT; n++)
907	memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
908
909	nexact = n;
910
911	/ Remaining multicast addresses are hashed,*
912	* unicast will leave the filter disabled. */
913	memset(filter->mask, `0`, sizeof(filter->mask));
914	for (; n < uf.count; n++) {
915	if (!is_multicast_ether_addr(addr: addr[n].u)) {
916	err = `0`; / no filter /
917	goto free_addr;
918	}
919	addr_hash_set(mask: filter->mask, addr: addr[n].u);
920	}
921
922	/ For ALLMULTI just set the mask to all ones.*
923	* This overrides the mask populated above. */
924	if ((uf.flags & TUN_FLT_ALLMULTI))
925	memset(filter->mask, ~`0`, sizeof(filter->mask));
926
927	/ Now enable the filter /
928	wmb();
929	filter->count = nexact;
930
931	/ Return the number of exact filters /
932	err = nexact;
933	free_addr:
934	kfree(objp: addr);
935	return err;
936	}
937
938	/ Returns: 0 - drop, !=0 - accept /
939	static int run_filter(struct tap_filter filter, const* struct sk_buff *skb)
940	{
941	/ Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect*
942	* at this point. */
943	struct ethhdr eh = (struct* ethhdr *) skb->data;
944	int i;
945
946	/ Exact match /
947	for (i = `0`; i < filter->count; i++)
948	if (ether_addr_equal(addr1: eh->h_dest, addr2: filter->addr[i]))
949	return `1`;
950
951	/ Inexact match (multicast only) /
952	if (is_multicast_ether_addr(addr: eh->h_dest))
953	return addr_hash_test(mask: filter->mask, addr: eh->h_dest);
954
955	return `0`;
956	}
957
958	/*
959	* Checks whether the packet is accepted or not.
960	* Returns: 0 - drop, !=0 - accept
961	*/
962	static int check_filter(struct tap_filter filter, const* struct sk_buff *skb)
963	{
964	if (!filter->count)
965	return `1`;
966
967	return run_filter(filter, skb);
968	}
969
970	/ Network device part of the driver /
971
972	static const struct ethtool_ops tun_ethtool_ops;
973
974	static int tun_net_init(struct net_device *dev)
975	{
976	struct tun_struct *tun = netdev_priv(dev);
977	struct ifreq *ifr = tun->ifr;
978	int err;
979
980	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
981	if (!dev->tstats)
982	return -ENOMEM;
983
984	spin_lock_init(&tun->lock);
985
986	err = security_tun_dev_alloc_security(security: &tun->security);
987	if (err < `0`) {
988	free_percpu(pdata: dev->tstats);
989	return err;
990	}
991
992	tun_flow_init(tun);
993
994	dev->hw_features = NETIF_F_SG \| NETIF_F_FRAGLIST \|
995	TUN_USER_FEATURES \| NETIF_F_HW_VLAN_CTAG_TX \|
996	NETIF_F_HW_VLAN_STAG_TX;
997	dev->features = dev->hw_features \| NETIF_F_LLTX;
998	dev->vlan_features = dev->features &
999	~(NETIF_F_HW_VLAN_CTAG_TX \|
1000	NETIF_F_HW_VLAN_STAG_TX);
1001
1002	tun->flags = (tun->flags & ~TUN_FEATURES) \|
1003	(ifr->ifr_flags & TUN_FEATURES);
1004
1005	INIT_LIST_HEAD(list: &tun->disabled);
1006	err = tun_attach(tun, file: tun->file, skip_filter: false, napi: ifr->ifr_flags & IFF_NAPI,
1007	napi_frags: ifr->ifr_flags & IFF_NAPI_FRAGS, publish_tun: false);
1008	if (err < `0`) {
1009	tun_flow_uninit(tun);
1010	security_tun_dev_free_security(security: tun->security);
1011	free_percpu(pdata: dev->tstats);
1012	return err;
1013	}
1014	return `0`;
1015	}
1016
/* Net device detach from fd. */
static void tun_net_uninit(struct net_device *dev)
{
	tun_detach_all(dev);
}
1022
/* Net device open: start all tx queues. Always returns 0. */
static int tun_net_open(struct net_device *dev)
{
	netif_tx_start_all_queues(dev);

	return 0;
}
1030
/* Net device close: stop all tx queues. Always returns 0. */
static int tun_net_close(struct net_device *dev)
{
	netif_tx_stop_all_queues(dev);
	return 0;
}
1037
/*
 * Transmit-path hook for automatic queue selection. Only does work when
 * CONFIG_RPS is enabled, the device has a single queue and RPS is in use;
 * otherwise it is a no-op.
 */
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
		/* Select queue was not called for the skbuff, so we extract the
		 * RPS hash and save it into the flow_table here.
		 */
		struct tun_flow_entry *e;
		__u32 rxhash;

		rxhash = __skb_get_hash_symmetric(skb);
		e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
		if (e)
			tun_flow_save_rps_rxhash(e, rxhash);
	}
#endif
}
1056
1057	static unsigned int run_ebpf_filter(struct tun_struct *tun,
1058	struct sk_buff *skb,
1059	int len)
1060	{
1061	struct tun_prog *prog = rcu_dereference(tun->filter_prog);
1062
1063	if (prog)
1064	len = bpf_prog_run_clear_cb(prog: prog->prog, skb);
1065
1066	return len;
1067	}
1068
1069	/ Net device start xmit /
1070	static netdev_tx_t tun_net_xmit(struct sk_buff skb, struct* net_device *dev)
1071	{
1072	struct tun_struct *tun = netdev_priv(dev);
1073	enum skb_drop_reason drop_reason;
1074	int txq = skb->queue_mapping;
1075	struct netdev_queue *queue;
1076	struct tun_file *tfile;
1077	int len = skb->len;
1078
1079	rcu_read_lock();
1080	tfile = rcu_dereference(tun->tfiles[txq]);
1081
1082	/ Drop packet if interface is not attached /
1083	if (!tfile) {
1084	drop_reason = SKB_DROP_REASON_DEV_READY;
1085	goto drop;
1086	}
1087
1088	if (!rcu_dereference(tun->steering_prog))
1089	tun_automq_xmit(tun, skb);
1090
1091	netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);
1092
1093	/ Drop if the filter does not like it.*
1094	* This is a noop if the filter is disabled.
1095	* Filter can be enabled only for the TAP devices. */
1096	if (!check_filter(filter: &tun->txflt, skb)) {
1097	drop_reason = SKB_DROP_REASON_TAP_TXFILTER;
1098	goto drop;
1099	}
1100
1101	if (tfile->socket.sk->sk_filter &&
1102	sk_filter(sk: tfile->socket.sk, skb)) {
1103	drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
1104	goto drop;
1105	}
1106
1107	len = run_ebpf_filter(tun, skb, len);
1108	if (len == `0`) {
1109	drop_reason = SKB_DROP_REASON_TAP_FILTER;
1110	goto drop;
1111	}
1112
1113	if (pskb_trim(skb, len)) {
1114	drop_reason = SKB_DROP_REASON_NOMEM;
1115	goto drop;
1116	}
1117
1118	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
1119	drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
1120	goto drop;
1121	}
1122
1123	skb_tx_timestamp(skb);
1124
1125	/ Orphan the skb - required as we might hang on to it*
1126	* for indefinite time.
1127	*/
1128	skb_orphan(skb);
1129
1130	nf_reset_ct(skb);
1131
1132	if (ptr_ring_produce(r: &tfile->tx_ring, ptr: skb)) {
1133	drop_reason = SKB_DROP_REASON_FULL_RING;
1134	goto drop;
1135	}
1136
1137	/ NETIF_F_LLTX requires to do our own update of trans_start /
1138	queue = netdev_get_tx_queue(dev, index: txq);
1139	txq_trans_cond_update(txq: queue);
1140
1141	/ Notify and wake up reader process /
1142	if (tfile->flags & TUN_FASYNC)
1143	kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1144	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1145
1146	rcu_read_unlock();
1147	return NETDEV_TX_OK;
1148
1149	drop:
1150	dev_core_stats_tx_dropped_inc(dev);
1151	skb_tx_error(skb);
1152	kfree_skb_reason(skb, reason: drop_reason);
1153	rcu_read_unlock();
1154	return NET_XMIT_DROP;
1155	}
1156
/*
 * Multicast-list callback (installed as .ndo_set_rx_mode in tap_netdev_ops).
 * Deliberately empty: see the comment in the body.
 */
static void tun_net_mclist(struct net_device *dev)
{
	/*
	 * This callback is supposed to deal with mc filter in
	 * _rx_ path and has nothing to do with the _tx_ path.
	 * In rx path we always accept everything userspace gives us.
	 */
}
1165
1166	static netdev_features_t tun_net_fix_features(struct net_device *dev,
1167	netdev_features_t features)
1168	{
1169	struct tun_struct *tun = netdev_priv(dev);
1170
1171	return (features & tun->set_features) \| (features & ~TUN_USER_FEATURES);
1172	}
1173
1174	static void tun_set_headroom(struct net_device dev, int* new_hr)
1175	{
1176	struct tun_struct *tun = netdev_priv(dev);
1177
1178	if (new_hr < NET_SKB_PAD)
1179	new_hr = NET_SKB_PAD;
1180
1181	tun->align = new_hr;
1182	}
1183
1184	static void
1185	tun_net_get_stats64(struct net_device dev, struct* rtnl_link_stats64 *stats)
1186	{
1187	struct tun_struct *tun = netdev_priv(dev);
1188
1189	dev_get_tstats64(dev, s: stats);
1190
1191	stats->rx_frame_errors +=
1192	(unsigned long)atomic_long_read(v: &tun->rx_frame_errors);
1193	}
1194
1195	static int tun_xdp_set(struct net_device dev, struct* bpf_prog *prog,
1196	struct netlink_ext_ack *extack)
1197	{
1198	struct tun_struct *tun = netdev_priv(dev);
1199	struct tun_file *tfile;
1200	struct bpf_prog *old_prog;
1201	int i;
1202
1203	old_prog = rtnl_dereference(tun->xdp_prog);
1204	rcu_assign_pointer(tun->xdp_prog, prog);
1205	if (old_prog)
1206	bpf_prog_put(prog: old_prog);
1207
1208	for (i = `0`; i < tun->numqueues; i++) {
1209	tfile = rtnl_dereference(tun->tfiles[i]);
1210	if (prog)
1211	sock_set_flag(sk: &tfile->sk, flag: SOCK_XDP);
1212	else
1213	sock_reset_flag(sk: &tfile->sk, flag: SOCK_XDP);
1214	}
1215	list_for_each_entry(tfile, &tun->disabled, next) {
1216	if (prog)
1217	sock_set_flag(sk: &tfile->sk, flag: SOCK_XDP);
1218	else
1219	sock_reset_flag(sk: &tfile->sk, flag: SOCK_XDP);
1220	}
1221
1222	return `0`;
1223	}
1224
1225	static int tun_xdp(struct net_device dev, struct* netdev_bpf *xdp)
1226	{
1227	switch (xdp->command) {
1228	case XDP_SETUP_PROG:
1229	return tun_xdp_set(dev, prog: xdp->prog, extack: xdp->extack);
1230	default:
1231	return -EINVAL;
1232	}
1233	}
1234
1235	static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
1236	{
1237	if (new_carrier) {
1238	struct tun_struct *tun = netdev_priv(dev);
1239
1240	if (!tun->numqueues)
1241	return -EPERM;
1242
1243	netif_carrier_on(dev);
1244	} else {
1245	netif_carrier_off(dev);
1246	}
1247	return `0`;
1248	}
1249
1250	static const struct net_device_ops tun_netdev_ops = {
1251	.ndo_init = tun_net_init,
1252	.ndo_uninit = tun_net_uninit,
1253	.ndo_open = tun_net_open,
1254	.ndo_stop = tun_net_close,
1255	.ndo_start_xmit = tun_net_xmit,
1256	.ndo_fix_features = tun_net_fix_features,
1257	.ndo_select_queue = tun_select_queue,
1258	.ndo_set_rx_headroom = tun_set_headroom,
1259	.ndo_get_stats64 = tun_net_get_stats64,
1260	.ndo_change_carrier = tun_net_change_carrier,
1261	};
1262
1263	static void __tun_xdp_flush_tfile(struct tun_file *tfile)
1264	{
1265	/ Notify and wake up reader process /
1266	if (tfile->flags & TUN_FASYNC)
1267	kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1268	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1269	}
1270
1271	static int tun_xdp_xmit(struct net_device dev, int* n,
1272	struct xdp_frame **frames, u32 flags)
1273	{
1274	struct tun_struct *tun = netdev_priv(dev);
1275	struct tun_file *tfile;
1276	u32 numqueues;
1277	int nxmit = `0`;
1278	int i;
1279
1280	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
1281	return -EINVAL;
1282
1283	rcu_read_lock();
1284
1285	resample:
1286	numqueues = READ_ONCE(tun->numqueues);
1287	if (!numqueues) {
1288	rcu_read_unlock();
1289	return -ENXIO; / Caller will free/return all frames /
1290	}
1291
1292	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
1293	numqueues]);
1294	if (unlikely(!tfile))
1295	goto resample;
1296
1297	spin_lock(lock: &tfile->tx_ring.producer_lock);
1298	for (i = `0`; i < n; i++) {
1299	struct xdp_frame *xdp = frames[i];
1300	/ Encode the XDP flag into lowest bit for consumer to differ*
1301	* XDP buffer from sk_buff.
1302	*/
1303	void *frame = tun_xdp_to_ptr(xdp);
1304
1305	if (__ptr_ring_produce(r: &tfile->tx_ring, ptr: frame)) {
1306	dev_core_stats_tx_dropped_inc(dev);
1307	break;
1308	}
1309	nxmit++;
1310	}
1311	spin_unlock(lock: &tfile->tx_ring.producer_lock);
1312
1313	if (flags & XDP_XMIT_FLUSH)
1314	__tun_xdp_flush_tfile(tfile);
1315
1316	rcu_read_unlock();
1317	return nxmit;
1318	}
1319
1320	static int tun_xdp_tx(struct net_device dev, struct* xdp_buff *xdp)
1321	{
1322	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
1323	int nxmit;
1324
1325	if (unlikely(!frame))
1326	return -EOVERFLOW;
1327
1328	nxmit = tun_xdp_xmit(dev, n: `1`, frames: &frame, XDP_XMIT_FLUSH);
1329	if (!nxmit)
1330	xdp_return_frame_rx_napi(xdpf: frame);
1331	return nxmit;
1332	}
1333
1334	static const struct net_device_ops tap_netdev_ops = {
1335	.ndo_init = tun_net_init,
1336	.ndo_uninit = tun_net_uninit,
1337	.ndo_open = tun_net_open,
1338	.ndo_stop = tun_net_close,
1339	.ndo_start_xmit = tun_net_xmit,
1340	.ndo_fix_features = tun_net_fix_features,
1341	.ndo_set_rx_mode = tun_net_mclist,
1342	.ndo_set_mac_address = eth_mac_addr,
1343	.ndo_validate_addr = eth_validate_addr,
1344	.ndo_select_queue = tun_select_queue,
1345	.ndo_features_check = passthru_features_check,
1346	.ndo_set_rx_headroom = tun_set_headroom,
1347	.ndo_get_stats64 = dev_get_tstats64,
1348	.ndo_bpf = tun_xdp,
1349	.ndo_xdp_xmit = tun_xdp_xmit,
1350	.ndo_change_carrier = tun_net_change_carrier,
1351	};
1352
1353	static void tun_flow_init(struct tun_struct *tun)
1354	{
1355	int i;
1356
1357	for (i = `0`; i < TUN_NUM_FLOW_ENTRIES; i++)
1358	INIT_HLIST_HEAD(&tun->flows[i]);
1359
1360	tun->ageing_time = TUN_FLOW_EXPIRE;
1361	timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, `0`);
1362	mod_timer(timer: &tun->flow_gc_timer,
1363	expires: round_jiffies_up(j: jiffies + tun->ageing_time));
1364	}
1365
1366	static void tun_flow_uninit(struct tun_struct *tun)
1367	{
1368	del_timer_sync(timer: &tun->flow_gc_timer);
1369	tun_flow_flush(tun);
1370	}
1371
1372	#define MIN_MTU 68
1373	#define MAX_MTU 65535
1374
1375	/ Initialize net device. /
1376	static void tun_net_initialize(struct net_device *dev)
1377	{
1378	struct tun_struct *tun = netdev_priv(dev);
1379
1380	switch (tun->flags & TUN_TYPE_MASK) {
1381	case IFF_TUN:
1382	dev->netdev_ops = &tun_netdev_ops;
1383	dev->header_ops = &ip_tunnel_header_ops;
1384
1385	/ Point-to-Point TUN Device /
1386	dev->hard_header_len = `0`;
1387	dev->addr_len = `0`;
1388	dev->mtu = `1500`;
1389
1390	/ Zero header length /
1391	dev->type = ARPHRD_NONE;
1392	dev->flags = IFF_POINTOPOINT \| IFF_NOARP \| IFF_MULTICAST;
1393	break;
1394
1395	case IFF_TAP:
1396	dev->netdev_ops = &tap_netdev_ops;
1397	/ Ethernet TAP Device /
1398	ether_setup(dev);
1399	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1400	dev->priv_flags \|= IFF_LIVE_ADDR_CHANGE;
1401
1402	eth_hw_addr_random(dev);
1403
1404	/ Currently tun does not support XDP, only tap does. /
1405	dev->xdp_features = NETDEV_XDP_ACT_BASIC \|
1406	NETDEV_XDP_ACT_REDIRECT \|
1407	NETDEV_XDP_ACT_NDO_XMIT;
1408
1409	break;
1410	}
1411
1412	dev->min_mtu = MIN_MTU;
1413	dev->max_mtu = MAX_MTU - dev->hard_header_len;
1414	}
1415
1416	static bool tun_sock_writeable(struct tun_struct tun, struct* tun_file *tfile)
1417	{
1418	struct sock *sk = tfile->socket.sk;
1419
1420	return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
1421	}
1422
1423	/ Character device part /
1424
1425	/ Poll /
1426	static __poll_t tun_chr_poll(struct file file, poll_table wait)
1427	{
1428	struct tun_file *tfile = file->private_data;
1429	struct tun_struct *tun = tun_get(tfile);
1430	struct sock *sk;
1431	__poll_t mask = `0`;
1432
1433	if (!tun)
1434	return EPOLLERR;
1435
1436	sk = tfile->socket.sk;
1437
1438	poll_wait(filp: file, wait_address: sk_sleep(sk), p: wait);
1439
1440	if (!ptr_ring_empty(r: &tfile->tx_ring))
1441	mask \|= EPOLLIN \| EPOLLRDNORM;
1442
1443	/ Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to*
1444	* guarantee EPOLLOUT to be raised by either here or
1445	* tun_sock_write_space(). Then process could get notification
1446	* after it writes to a down device and meets -EIO.
1447	*/
1448	if (tun_sock_writeable(tun, tfile) \|\|
1449	(!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, addr: &sk->sk_socket->flags) &&
1450	tun_sock_writeable(tun, tfile)))
1451	mask \|= EPOLLOUT \| EPOLLWRNORM;
1452
1453	if (tun->dev->reg_state != NETREG_REGISTERED)
1454	mask = EPOLLERR;
1455
1456	tun_put(tun);
1457	return mask;
1458	}
1459
1460	static struct sk_buff tun_napi_alloc_frags(struct* tun_file *tfile,
1461	size_t len,
1462	const struct iov_iter *it)
1463	{
1464	struct sk_buff *skb;
1465	size_t linear;
1466	int err;
1467	int i;
1468
1469	if (it->nr_segs > MAX_SKB_FRAGS + `1` \|\|
1470	len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN))
1471	return ERR_PTR(error: -EMSGSIZE);
1472
1473	local_bh_disable();
1474	skb = napi_get_frags(napi: &tfile->napi);
1475	local_bh_enable();
1476	if (!skb)
1477	return ERR_PTR(error: -ENOMEM);
1478
1479	linear = iov_iter_single_seg_count(i: it);
1480	err = __skb_grow(skb, len: linear);
1481	if (err)
1482	goto free;
1483
1484	skb->len = len;
1485	skb->data_len = len - linear;
1486	skb->truesize += skb->data_len;
1487
1488	for (i = `1`; i < it->nr_segs; i++) {
1489	const struct iovec *iov = iter_iov(iter: it);
1490	size_t fragsz = iov->iov_len;
1491	struct page *page;
1492	void *frag;
1493
1494	if (fragsz == `0` \|\| fragsz > PAGE_SIZE) {
1495	err = -EINVAL;
1496	goto free;
1497	}
1498	frag = netdev_alloc_frag(fragsz);
1499	if (!frag) {
1500	err = -ENOMEM;
1501	goto free;
1502	}
1503	page = virt_to_head_page(x: frag);
1504	skb_fill_page_desc(skb, i: i - `1`, page,
1505	off: frag - page_address(page), size: fragsz);
1506	}
1507
1508	return skb;
1509	free:
1510	/ frees skb and all frags allocated with napi_alloc_frag() /
1511	napi_free_frags(napi: &tfile->napi);
1512	return ERR_PTR(error: err);
1513	}
1514
1515	/ prepad is the amount to reserve at front. len is length after that.*
1516	* linear is a hint as to how much to copy (usually headers). */
1517	static struct sk_buff tun_alloc_skb(struct* tun_file *tfile,
1518	size_t prepad, size_t len,
1519	size_t linear, int noblock)
1520	{
1521	struct sock *sk = tfile->socket.sk;
1522	struct sk_buff *skb;
1523	int err;
1524
1525	/ Under a page? Don't bother with paged skb. /
1526	if (prepad + len < PAGE_SIZE)
1527	linear = len;
1528
1529	if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
1530	linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
1531	skb = sock_alloc_send_pskb(sk, header_len: prepad + linear, data_len: len - linear, noblock,
1532	errcode: &err, PAGE_ALLOC_COSTLY_ORDER);
1533	if (!skb)
1534	return ERR_PTR(error: err);
1535
1536	skb_reserve(skb, len: prepad);
1537	skb_put(skb, len: linear);
1538	skb->data_len = len - linear;
1539	skb->len += len - linear;
1540
1541	return skb;
1542	}
1543
1544	static void tun_rx_batched(struct tun_struct tun, struct* tun_file *tfile,
1545	struct sk_buff skb, int* more)
1546	{
1547	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1548	struct sk_buff_head process_queue;
1549	u32 rx_batched = tun->rx_batched;
1550	bool rcv = false;
1551
1552	if (!rx_batched \|\| (!more && skb_queue_empty(list: queue))) {
1553	local_bh_disable();
1554	skb_record_rx_queue(skb, rx_queue: tfile->queue_index);
1555	netif_receive_skb(skb);
1556	local_bh_enable();
1557	return;
1558	}
1559
1560	spin_lock(lock: &queue->lock);
1561	if (!more \|\| skb_queue_len(list_: queue) == rx_batched) {
1562	__skb_queue_head_init(list: &process_queue);
1563	skb_queue_splice_tail_init(list: queue, head: &process_queue);
1564	rcv = true;
1565	} else {
1566	__skb_queue_tail(list: queue, newsk: skb);
1567	}
1568	spin_unlock(lock: &queue->lock);
1569
1570	if (rcv) {
1571	struct sk_buff *nskb;
1572
1573	local_bh_disable();
1574	while ((nskb = __skb_dequeue(list: &process_queue))) {
1575	skb_record_rx_queue(skb: nskb, rx_queue: tfile->queue_index);
1576	netif_receive_skb(skb: nskb);
1577	}
1578	skb_record_rx_queue(skb, rx_queue: tfile->queue_index);
1579	netif_receive_skb(skb);
1580	local_bh_enable();
1581	}
1582	}
1583
1584	static bool tun_can_build_skb(struct tun_struct tun, struct* tun_file *tfile,
1585	int len, int noblock, bool zerocopy)
1586	{
1587	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
1588	return false;
1589
1590	if (tfile->socket.sk->sk_sndbuf != INT_MAX)
1591	return false;
1592
1593	if (!noblock)
1594	return false;
1595
1596	if (zerocopy)
1597	return false;
1598
1599	if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) +
1600	SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
1601	return false;
1602
1603	return true;
1604	}
1605
1606	static struct sk_buff __tun_build_skb(struct* tun_file *tfile,
1607	struct page_frag alloc_frag, char* *buf,
1608	int buflen, int len, int pad)
1609	{
1610	struct sk_buff *skb = build_skb(data: buf, frag_size: buflen);
1611
1612	if (!skb)
1613	return ERR_PTR(error: -ENOMEM);
1614
1615	skb_reserve(skb, len: pad);
1616	skb_put(skb, len);
1617	skb_set_owner_w(skb, sk: tfile->socket.sk);
1618
1619	get_page(page: alloc_frag->page);
1620	alloc_frag->offset += buflen;
1621
1622	return skb;
1623	}
1624
1625	static int tun_xdp_act(struct tun_struct tun, struct* bpf_prog *xdp_prog,
1626	struct xdp_buff *xdp, u32 act)
1627	{
1628	int err;
1629
1630	switch (act) {
1631	case XDP_REDIRECT:
1632	err = xdp_do_redirect(dev: tun->dev, xdp, prog: xdp_prog);
1633	if (err)
1634	return err;
1635	break;
1636	case XDP_TX:
1637	err = tun_xdp_tx(dev: tun->dev, xdp);
1638	if (err < `0`)
1639	return err;
1640	break;
1641	case XDP_PASS:
1642	break;
1643	default:
1644	bpf_warn_invalid_xdp_action(dev: tun->dev, prog: xdp_prog, act);
1645	fallthrough;
1646	case XDP_ABORTED:
1647	trace_xdp_exception(dev: tun->dev, xdp: xdp_prog, act);
1648	fallthrough;
1649	case XDP_DROP:
1650	dev_core_stats_rx_dropped_inc(dev: tun->dev);
1651	break;
1652	}
1653
1654	return act;
1655	}
1656
1657	static struct sk_buff tun_build_skb(struct* tun_struct *tun,
1658	struct tun_file *tfile,
1659	struct iov_iter *from,
1660	struct virtio_net_hdr *hdr,
1661	int len, int *skb_xdp)
1662	{
1663	struct page_frag *alloc_frag = &current->task_frag;
1664	struct bpf_prog *xdp_prog;
1665	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1666	char *buf;
1667	size_t copied;
1668	int pad = TUN_RX_PAD;
1669	int err = `0`;
1670
1671	rcu_read_lock();
1672	xdp_prog = rcu_dereference(tun->xdp_prog);
1673	if (xdp_prog)
1674	pad += XDP_PACKET_HEADROOM;
1675	buflen += SKB_DATA_ALIGN(len + pad);
1676	rcu_read_unlock();
1677
1678	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
1679	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
1680	return ERR_PTR(error: -ENOMEM);
1681
1682	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1683	copied = copy_page_from_iter(page: alloc_frag->page,
1684	offset: alloc_frag->offset + pad,
1685	bytes: len, i: from);
1686	if (copied != len)
1687	return ERR_PTR(error: -EFAULT);
1688
1689	/ There's a small window that XDP may be set after the check*
1690	* of xdp_prog above, this should be rare and for simplicity
1691	* we do XDP on skb in case the headroom is not enough.
1692	*/
1693	if (hdr->gso_type \|\| !xdp_prog) {
1694	*skb_xdp = `1`;
1695	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
1696	pad);
1697	}
1698
1699	*skb_xdp = `0`;
1700
1701	local_bh_disable();
1702	rcu_read_lock();
1703	xdp_prog = rcu_dereference(tun->xdp_prog);
1704	if (xdp_prog) {
1705	struct xdp_buff xdp;
1706	u32 act;
1707
1708	xdp_init_buff(xdp: &xdp, frame_sz: buflen, rxq: &tfile->xdp_rxq);
1709	xdp_prepare_buff(xdp: &xdp, hard_start: buf, headroom: pad, data_len: len, meta_valid: false);
1710
1711	act = bpf_prog_run_xdp(prog: xdp_prog, xdp: &xdp);
1712	if (act == XDP_REDIRECT \|\| act == XDP_TX) {
1713	get_page(page: alloc_frag->page);
1714	alloc_frag->offset += buflen;
1715	}
1716	err = tun_xdp_act(tun, xdp_prog, xdp: &xdp, act);
1717	if (err < `0`) {
1718	if (act == XDP_REDIRECT \|\| act == XDP_TX)
1719	put_page(page: alloc_frag->page);
1720	goto out;
1721	}
1722
1723	if (err == XDP_REDIRECT)
1724	xdp_do_flush();
1725	if (err != XDP_PASS)
1726	goto out;
1727
1728	pad = xdp.data - xdp.data_hard_start;
1729	len = xdp.data_end - xdp.data;
1730	}
1731	rcu_read_unlock();
1732	local_bh_enable();
1733
1734	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
1735
1736	out:
1737	rcu_read_unlock();
1738	local_bh_enable();
1739	return NULL;
1740	}
1741
1742	/ Get packet from user space buffer /
1743	static ssize_t tun_get_user(struct tun_struct tun, struct* tun_file *tfile,
1744	void msg_control, struct* iov_iter *from,
1745	int noblock, bool more)
1746	{
1747	struct tun_pi pi = { `0`, cpu_to_be16(ETH_P_IP) };
1748	struct sk_buff *skb;
1749	size_t total_len = iov_iter_count(i: from);
1750	size_t len = total_len, align = tun->align, linear;
1751	struct virtio_net_hdr gso = { `0` };
1752	int good_linear;
1753	int copylen;
1754	bool zerocopy = false;
1755	int err;
1756	u32 rxhash = `0`;
1757	int skb_xdp = `1`;
1758	bool frags = tun_napi_frags_enabled(tfile);
1759	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1760
1761	if (!(tun->flags & IFF_NO_PI)) {
1762	if (len < sizeof(pi))
1763	return -EINVAL;
1764	len -= sizeof(pi);
1765
1766	if (!copy_from_iter_full(addr: &pi, bytes: sizeof(pi), i: from))
1767	return -EFAULT;
1768	}
1769
1770	if (tun->flags & IFF_VNET_HDR) {
1771	int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
1772
1773	if (len < vnet_hdr_sz)
1774	return -EINVAL;
1775	len -= vnet_hdr_sz;
1776
1777	if (!copy_from_iter_full(addr: &gso, bytes: sizeof(gso), i: from))
1778	return -EFAULT;
1779
1780	if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1781	tun16_to_cpu(tun, val: gso.csum_start) + tun16_to_cpu(tun, val: gso.csum_offset) + `2` > tun16_to_cpu(tun, val: gso.hdr_len))
1782	gso.hdr_len = cpu_to_tun16(tun, val: tun16_to_cpu(tun, val: gso.csum_start) + tun16_to_cpu(tun, val: gso.csum_offset) + `2`);
1783
1784	if (tun16_to_cpu(tun, val: gso.hdr_len) > len)
1785	return -EINVAL;
1786	iov_iter_advance(i: from, bytes: vnet_hdr_sz - sizeof(gso));
1787	}
1788
1789	if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
1790	align += NET_IP_ALIGN;
1791	if (unlikely(len < ETH_HLEN \|\|
1792	(gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
1793	return -EINVAL;
1794	}
1795
1796	good_linear = SKB_MAX_HEAD(align);
1797
1798	if (msg_control) {
1799	struct iov_iter i = *from;
1800
1801	/ There are 256 bytes to be copied in skb, so there is*
1802	* enough room for skb expand head in case it is used.
1803	* The rest of the buffer is mapped from userspace.
1804	*/
1805	copylen = gso.hdr_len ? tun16_to_cpu(tun, val: gso.hdr_len) : GOODCOPY_LEN;
1806	if (copylen > good_linear)
1807	copylen = good_linear;
1808	linear = copylen;
1809	iov_iter_advance(i: &i, bytes: copylen);
1810	if (iov_iter_npages(i: &i, INT_MAX) <= MAX_SKB_FRAGS)
1811	zerocopy = true;
1812	}
1813
1814	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1815	/ For the packet that is not easy to be processed*
1816	* (e.g gso or jumbo packet), we will do it at after
1817	* skb was created with generic XDP routine.
1818	*/
1819	skb = tun_build_skb(tun, tfile, from, hdr: &gso, len, skb_xdp: &skb_xdp);
1820	err = PTR_ERR_OR_ZERO(ptr: skb);
1821	if (err)
1822	goto drop;
1823	if (!skb)
1824	return total_len;
1825	} else {
1826	if (!zerocopy) {
1827	copylen = len;
1828	if (tun16_to_cpu(tun, val: gso.hdr_len) > good_linear)
1829	linear = good_linear;
1830	else
1831	linear = tun16_to_cpu(tun, val: gso.hdr_len);
1832	}
1833
1834	if (frags) {
1835	mutex_lock(&tfile->napi_mutex);
1836	skb = tun_napi_alloc_frags(tfile, len: copylen, it: from);
1837	/ tun_napi_alloc_frags() enforces a layout for the skb.*
1838	* If zerocopy is enabled, then this layout will be
1839	* overwritten by zerocopy_sg_from_iter().
1840	*/
1841	zerocopy = false;
1842	} else {
1843	if (!linear)
1844	linear = min_t(size_t, good_linear, copylen);
1845
1846	skb = tun_alloc_skb(tfile, prepad: align, len: copylen, linear,
1847	noblock);
1848	}
1849
1850	err = PTR_ERR_OR_ZERO(ptr: skb);
1851	if (err)
1852	goto drop;
1853
1854	if (zerocopy)
1855	err = zerocopy_sg_from_iter(skb, frm: from);
1856	else
1857	err = skb_copy_datagram_from_iter(skb, offset: `0`, from, len);
1858
1859	if (err) {
1860	err = -EFAULT;
1861	drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
1862	goto drop;
1863	}
1864	}
1865
1866	if (virtio_net_hdr_to_skb(skb, hdr: &gso, little_endian: tun_is_little_endian(tun))) {
1867	atomic_long_inc(v: &tun->rx_frame_errors);
1868	err = -EINVAL;
1869	goto free_skb;
1870	}
1871
1872	switch (tun->flags & TUN_TYPE_MASK) {
1873	case IFF_TUN:
1874	if (tun->flags & IFF_NO_PI) {
1875	u8 ip_version = skb->len ? (skb->data[`0`] >> `4`) : `0`;
1876
1877	switch (ip_version) {
1878	case `4`:
1879	pi.proto = htons(ETH_P_IP);
1880	break;
1881	case `6`:
1882	pi.proto = htons(ETH_P_IPV6);
1883	break;
1884	default:
1885	err = -EINVAL;
1886	goto drop;
1887	}
1888	}
1889
1890	skb_reset_mac_header(skb);
1891	skb->protocol = pi.proto;
1892	skb->dev = tun->dev;
1893	break;
1894	case IFF_TAP:
1895	if (frags && !pskb_may_pull(skb, ETH_HLEN)) {
1896	err = -ENOMEM;
1897	drop_reason = SKB_DROP_REASON_HDR_TRUNC;
1898	goto drop;
1899	}
1900	skb->protocol = eth_type_trans(skb, dev: tun->dev);
1901	break;
1902	}
1903
1904	/ copy skb_ubuf_info for callback when skb has no error /
1905	if (zerocopy) {
1906	skb_zcopy_init(skb, uarg: msg_control);
1907	} else if (msg_control) {
1908	struct ubuf_info *uarg = msg_control;
1909	uarg->callback(NULL, uarg, false);
1910	}
1911
1912	skb_reset_network_header(skb);
1913	skb_probe_transport_header(skb);
1914	skb_record_rx_queue(skb, rx_queue: tfile->queue_index);
1915
1916	if (skb_xdp) {
1917	struct bpf_prog *xdp_prog;
1918	int ret;
1919
1920	local_bh_disable();
1921	rcu_read_lock();
1922	xdp_prog = rcu_dereference(tun->xdp_prog);
1923	if (xdp_prog) {
1924	ret = do_xdp_generic(xdp_prog, skb);
1925	if (ret != XDP_PASS) {
1926	rcu_read_unlock();
1927	local_bh_enable();
1928	goto unlock_frags;
1929	}
1930	}
1931	rcu_read_unlock();
1932	local_bh_enable();
1933	}
1934
1935	/ Compute the costly rx hash only if needed for flow updates.*
1936	* We may get a very small possibility of OOO during switching, not
1937	* worth to optimize.
1938	*/
1939	if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > `1` &&
1940	!tfile->detached)
1941	rxhash = __skb_get_hash_symmetric(skb);
1942
1943	rcu_read_lock();
1944	if (unlikely(!(tun->dev->flags & IFF_UP))) {
1945	err = -EIO;
1946	rcu_read_unlock();
1947	drop_reason = SKB_DROP_REASON_DEV_READY;
1948	goto drop;
1949	}
1950
1951	if (frags) {
1952	u32 headlen;
1953
1954	/ Exercise flow dissector code path. /
1955	skb_push(skb, ETH_HLEN);
1956	headlen = eth_get_headlen(dev: tun->dev, data: skb->data,
1957	len: skb_headlen(skb));
1958
1959	if (unlikely(headlen > skb_headlen(skb))) {
1960	WARN_ON_ONCE(`1`);
1961	err = -ENOMEM;
1962	dev_core_stats_rx_dropped_inc(dev: tun->dev);
1963	napi_busy:
1964	napi_free_frags(napi: &tfile->napi);
1965	rcu_read_unlock();
1966	mutex_unlock(lock: &tfile->napi_mutex);
1967	return err;
1968	}
1969
1970	if (likely(napi_schedule_prep(&tfile->napi))) {
1971	local_bh_disable();
1972	napi_gro_frags(napi: &tfile->napi);
1973	napi_complete(n: &tfile->napi);
1974	local_bh_enable();
1975	} else {
1976	err = -EBUSY;
1977	goto napi_busy;
1978	}
1979	mutex_unlock(lock: &tfile->napi_mutex);
1980	} else if (tfile->napi_enabled) {
1981	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1982	int queue_len;
1983
1984	spin_lock_bh(lock: &queue->lock);
1985
1986	if (unlikely(tfile->detached)) {
1987	spin_unlock_bh(lock: &queue->lock);
1988	rcu_read_unlock();
1989	err = -EBUSY;
1990	goto free_skb;
1991	}
1992
1993	__skb_queue_tail(list: queue, newsk: skb);
1994	queue_len = skb_queue_len(list_: queue);
1995	spin_unlock(lock: &queue->lock);
1996
1997	if (!more \|\| queue_len > NAPI_POLL_WEIGHT)
1998	napi_schedule(n: &tfile->napi);
1999
2000	local_bh_enable();
2001	} else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
2002	tun_rx_batched(tun, tfile, skb, more);
2003	} else {
2004	netif_rx(skb);
2005	}
2006	rcu_read_unlock();
2007
2008	preempt_disable();
2009	dev_sw_netstats_rx_add(dev: tun->dev, len);
2010	preempt_enable();
2011
2012	if (rxhash)
2013	tun_flow_update(tun, rxhash, tfile);
2014
2015	return total_len;
2016
2017	drop:
2018	if (err != -EAGAIN)
2019	dev_core_stats_rx_dropped_inc(dev: tun->dev);
2020
2021	free_skb:
2022	if (!IS_ERR_OR_NULL(ptr: skb))
2023	kfree_skb_reason(skb, reason: drop_reason);
2024
2025	unlock_frags:
2026	if (frags) {
2027	tfile->napi.skb = NULL;
2028	mutex_unlock(lock: &tfile->napi_mutex);
2029	}
2030
2031	return err ?: total_len;
2032	}
2033
2034	static ssize_t tun_chr_write_iter(struct kiocb iocb, struct* iov_iter *from)
2035	{
2036	struct file *file = iocb->ki_filp;
2037	struct tun_file *tfile = file->private_data;
2038	struct tun_struct *tun = tun_get(tfile);
2039	ssize_t result;
2040	int noblock = `0`;
2041
2042	if (!tun)
2043	return -EBADFD;
2044
2045	if ((file->f_flags & O_NONBLOCK) \|\| (iocb->ki_flags & IOCB_NOWAIT))
2046	noblock = `1`;
2047
2048	result = tun_get_user(tun, tfile, NULL, from, noblock, more: false);
2049
2050	tun_put(tun);
2051	return result;
2052	}
2053
2054	static ssize_t tun_put_user_xdp(struct tun_struct *tun,
2055	struct tun_file *tfile,
2056	struct xdp_frame *xdp_frame,
2057	struct iov_iter *iter)
2058	{
2059	int vnet_hdr_sz = `0`;
2060	size_t size = xdp_frame->len;
2061	size_t ret;
2062
2063	if (tun->flags & IFF_VNET_HDR) {
2064	struct virtio_net_hdr gso = { `0` };
2065
2066	vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2067	if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
2068	return -EINVAL;
2069	if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
2070	sizeof(gso)))
2071	return -EFAULT;
2072	iov_iter_advance(i: iter, bytes: vnet_hdr_sz - sizeof(gso));
2073	}
2074
2075	ret = copy_to_iter(addr: xdp_frame->data, bytes: size, i: iter) + vnet_hdr_sz;
2076
2077	preempt_disable();
2078	dev_sw_netstats_tx_add(dev: tun->dev, packets: `1`, len: ret);
2079	preempt_enable();
2080
2081	return ret;
2082	}
2083
2084	/ Put packet to the user space buffer /
2085	static ssize_t tun_put_user(struct tun_struct *tun,
2086	struct tun_file *tfile,
2087	struct sk_buff *skb,
2088	struct iov_iter *iter)
2089	{
2090	struct tun_pi pi = { `0`, skb->protocol };
2091	ssize_t total;
2092	int vlan_offset = `0`;
2093	int vlan_hlen = `0`;
2094	int vnet_hdr_sz = `0`;
2095
2096	if (skb_vlan_tag_present(skb))
2097	vlan_hlen = VLAN_HLEN;
2098
2099	if (tun->flags & IFF_VNET_HDR)
2100	vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2101
2102	total = skb->len + vlan_hlen + vnet_hdr_sz;
2103
2104	if (!(tun->flags & IFF_NO_PI)) {
2105	if (iov_iter_count(i: iter) < sizeof(pi))
2106	return -EINVAL;
2107
2108	total += sizeof(pi);
2109	if (iov_iter_count(i: iter) < total) {
2110	/ Packet will be striped /
2111	pi.flags \|= TUN_PKT_STRIP;
2112	}
2113
2114	if (copy_to_iter(addr: &pi, bytes: sizeof(pi), i: iter) != sizeof(pi))
2115	return -EFAULT;
2116	}
2117
2118	if (vnet_hdr_sz) {
2119	struct virtio_net_hdr gso;
2120
2121	if (iov_iter_count(i: iter) < vnet_hdr_sz)
2122	return -EINVAL;
2123
2124	if (virtio_net_hdr_from_skb(skb, hdr: &gso,
2125	little_endian: tun_is_little_endian(tun), has_data_valid: true,
2126	vlan_hlen)) {
2127	struct skb_shared_info *sinfo = skb_shinfo(skb);
2128	pr_err("unexpected GSO type: "
2129	"0x%x, gso_size %d, hdr_len %d\n",
2130	sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
2131	tun16_to_cpu(tun, gso.hdr_len));
2132	print_hex_dump(KERN_ERR, prefix_str: "tun: ",
2133	prefix_type: DUMP_PREFIX_NONE,
2134	rowsize: `16`, groupsize: `1`, buf: skb->head,
2135	min((int)tun16_to_cpu(tun, gso.hdr_len), `64`), ascii: true);
2136	WARN_ON_ONCE(`1`);
2137	return -EINVAL;
2138	}
2139
2140	if (copy_to_iter(addr: &gso, bytes: sizeof(gso), i: iter) != sizeof(gso))
2141	return -EFAULT;
2142
2143	iov_iter_advance(i: iter, bytes: vnet_hdr_sz - sizeof(gso));
2144	}
2145
2146	if (vlan_hlen) {
2147	int ret;
2148	struct veth veth;
2149
2150	veth.h_vlan_proto = skb->vlan_proto;
2151	veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
2152
2153	vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
2154
2155	ret = skb_copy_datagram_iter(from: skb, offset: `0`, to: iter, size: vlan_offset);
2156	if (ret \|\| !iov_iter_count(i: iter))
2157	goto done;
2158
2159	ret = copy_to_iter(addr: &veth, bytes: sizeof(veth), i: iter);
2160	if (ret != sizeof(veth) \|\| !iov_iter_count(i: iter))
2161	goto done;
2162	}
2163
2164	skb_copy_datagram_iter(from: skb, offset: vlan_offset, to: iter, size: skb->len - vlan_offset);
2165
2166	done:
2167	/ caller is in process context, /
2168	preempt_disable();
2169	dev_sw_netstats_tx_add(dev: tun->dev, packets: `1`, len: skb->len + vlan_hlen);
2170	preempt_enable();
2171
2172	return total;
2173	}
2174
2175	static void tun_ring_recv(struct* tun_file tfile, int* noblock, int *err)
2176	{
2177	DECLARE_WAITQUEUE(wait, current);
2178	void *ptr = NULL;
2179	int error = `0`;
2180
2181	ptr = ptr_ring_consume(r: &tfile->tx_ring);
2182	if (ptr)
2183	goto out;
2184	if (noblock) {
2185	error = -EAGAIN;
2186	goto out;
2187	}
2188
2189	add_wait_queue(wq_head: &tfile->socket.wq.wait, wq_entry: &wait);
2190
2191	while (`1`) {
2192	set_current_state(TASK_INTERRUPTIBLE);
2193	ptr = ptr_ring_consume(r: &tfile->tx_ring);
2194	if (ptr)
2195	break;
2196	if (signal_pending(current)) {
2197	error = -ERESTARTSYS;
2198	break;
2199	}
2200	if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
2201	error = -EFAULT;
2202	break;
2203	}
2204
2205	schedule();
2206	}
2207
2208	__set_current_state(TASK_RUNNING);
2209	remove_wait_queue(wq_head: &tfile->socket.wq.wait, wq_entry: &wait);
2210
2211	out:
2212	*err = error;
2213	return ptr;
2214	}
2215
2216	static ssize_t tun_do_read(struct tun_struct tun, struct* tun_file *tfile,
2217	struct iov_iter *to,
2218	int noblock, void *ptr)
2219	{
2220	ssize_t ret;
2221	int err;
2222
2223	if (!iov_iter_count(i: to)) {
2224	tun_ptr_free(ptr);
2225	return `0`;
2226	}
2227
2228	if (!ptr) {
2229	/ Read frames from ring /
2230	ptr = tun_ring_recv(tfile, noblock, err: &err);
2231	if (!ptr)
2232	return err;
2233	}
2234
2235	if (tun_is_xdp_frame(ptr)) {
2236	struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2237
2238	ret = tun_put_user_xdp(tun, tfile, xdp_frame: xdpf, iter: to);
2239	xdp_return_frame(xdpf);
2240	} else {
2241	struct sk_buff *skb = ptr;
2242
2243	ret = tun_put_user(tun, tfile, skb, iter: to);
2244	if (unlikely(ret < `0`))
2245	kfree_skb(skb);
2246	else
2247	consume_skb(skb);
2248	}
2249
2250	return ret;
2251	}
2252
2253	static ssize_t tun_chr_read_iter(struct kiocb iocb, struct* iov_iter *to)
2254	{
2255	struct file *file = iocb->ki_filp;
2256	struct tun_file *tfile = file->private_data;
2257	struct tun_struct *tun = tun_get(tfile);
2258	ssize_t len = iov_iter_count(i: to), ret;
2259	int noblock = `0`;
2260
2261	if (!tun)
2262	return -EBADFD;
2263
2264	if ((file->f_flags & O_NONBLOCK) \|\| (iocb->ki_flags & IOCB_NOWAIT))
2265	noblock = `1`;
2266
2267	ret = tun_do_read(tun, tfile, to, noblock, NULL);
2268	ret = min_t(ssize_t, ret, len);
2269	if (ret > `0`)
2270	iocb->ki_pos = ret;
2271	tun_put(tun);
2272	return ret;
2273	}
2274
2275	static void tun_prog_free(struct rcu_head *rcu)
2276	{
2277	struct tun_prog prog = container_of(rcu, struct* tun_prog, rcu);
2278
2279	bpf_prog_destroy(fp: prog->prog);
2280	kfree(objp: prog);
2281	}
2282
2283	static int __tun_set_ebpf(struct tun_struct *tun,
2284	struct tun_prog __rcu **prog_p,
2285	struct bpf_prog *prog)
2286	{
2287	struct tun_prog old, new = NULL;
2288
2289	if (prog) {
2290	new = kmalloc(size: sizeof(*new), GFP_KERNEL);
2291	if (!new)
2292	return -ENOMEM;
2293	new->prog = prog;
2294	}
2295
2296	spin_lock_bh(lock: &tun->lock);
2297	old = rcu_dereference_protected(*prog_p,
2298	lockdep_is_held(&tun->lock));
2299	rcu_assign_pointer(*prog_p, new);
2300	spin_unlock_bh(lock: &tun->lock);
2301
2302	if (old)
2303	call_rcu(head: &old->rcu, func: tun_prog_free);
2304
2305	return `0`;
2306	}
2307
2308	static void tun_free_netdev(struct net_device *dev)
2309	{
2310	struct tun_struct *tun = netdev_priv(dev);
2311
2312	BUG_ON(!(list_empty(&tun->disabled)));
2313
2314	free_percpu(pdata: dev->tstats);
2315	tun_flow_uninit(tun);
2316	security_tun_dev_free_security(security: tun->security);
2317	__tun_set_ebpf(tun, prog_p: &tun->steering_prog, NULL);
2318	__tun_set_ebpf(tun, prog_p: &tun->filter_prog, NULL);
2319	}
2320
2321	static void tun_setup(struct net_device *dev)
2322	{
2323	struct tun_struct *tun = netdev_priv(dev);
2324
2325	tun->owner = INVALID_UID;
2326	tun->group = INVALID_GID;
2327	tun_default_link_ksettings(dev, cmd: &tun->link_ksettings);
2328
2329	dev->ethtool_ops = &tun_ethtool_ops;
2330	dev->needs_free_netdev = true;
2331	dev->priv_destructor = tun_free_netdev;
2332	/ We prefer our own queue length /
2333	dev->tx_queue_len = TUN_READQ_SIZE;
2334	}
2335
2336	/ Trivial set of netlink ops to allow deleting tun or tap*
2337	* device with netlink.
2338	*/
2339	static int tun_validate(struct nlattr tb[], struct* nlattr *data[],
2340	struct netlink_ext_ack *extack)
2341	{
2342	NL_SET_ERR_MSG(extack,
2343	"tun/tap creation via rtnetlink is not supported.");
2344	return -EOPNOTSUPP;
2345	}
2346
2347	static size_t tun_get_size(const struct net_device *dev)
2348	{
2349	BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
2350	BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));
2351
2352	return nla_total_size(payload: sizeof(uid_t)) + / OWNER /
2353	nla_total_size(payload: sizeof(gid_t)) + / GROUP /
2354	nla_total_size(payload: sizeof(u8)) + / TYPE /
2355	nla_total_size(payload: sizeof(u8)) + / PI /
2356	nla_total_size(payload: sizeof(u8)) + / VNET_HDR /
2357	nla_total_size(payload: sizeof(u8)) + / PERSIST /
2358	nla_total_size(payload: sizeof(u8)) + / MULTI_QUEUE /
2359	nla_total_size(payload: sizeof(u32)) + / NUM_QUEUES /
2360	nla_total_size(payload: sizeof(u32)) + / NUM_DISABLED_QUEUES /
2361	`0`;
2362	}
2363
2364	static int tun_fill_info(struct sk_buff skb, const* struct net_device *dev)
2365	{
2366	struct tun_struct *tun = netdev_priv(dev);
2367
2368	if (nla_put_u8(skb, attrtype: IFLA_TUN_TYPE, value: tun->flags & TUN_TYPE_MASK))
2369	goto nla_put_failure;
2370	if (uid_valid(uid: tun->owner) &&
2371	nla_put_u32(skb, attrtype: IFLA_TUN_OWNER,
2372	value: from_kuid_munged(current_user_ns(), uid: tun->owner)))
2373	goto nla_put_failure;
2374	if (gid_valid(gid: tun->group) &&
2375	nla_put_u32(skb, attrtype: IFLA_TUN_GROUP,
2376	value: from_kgid_munged(current_user_ns(), gid: tun->group)))
2377	goto nla_put_failure;
2378	if (nla_put_u8(skb, attrtype: IFLA_TUN_PI, value: !(tun->flags & IFF_NO_PI)))
2379	goto nla_put_failure;
2380	if (nla_put_u8(skb, attrtype: IFLA_TUN_VNET_HDR, value: !!(tun->flags & IFF_VNET_HDR)))
2381	goto nla_put_failure;
2382	if (nla_put_u8(skb, attrtype: IFLA_TUN_PERSIST, value: !!(tun->flags & IFF_PERSIST)))
2383	goto nla_put_failure;
2384	if (nla_put_u8(skb, attrtype: IFLA_TUN_MULTI_QUEUE,
2385	value: !!(tun->flags & IFF_MULTI_QUEUE)))
2386	goto nla_put_failure;
2387	if (tun->flags & IFF_MULTI_QUEUE) {
2388	if (nla_put_u32(skb, attrtype: IFLA_TUN_NUM_QUEUES, value: tun->numqueues))
2389	goto nla_put_failure;
2390	if (nla_put_u32(skb, attrtype: IFLA_TUN_NUM_DISABLED_QUEUES,
2391	value: tun->numdisabled))
2392	goto nla_put_failure;
2393	}
2394
2395	return `0`;
2396
2397	nla_put_failure:
2398	return -EMSGSIZE;
2399	}
2400
2401	static struct rtnl_link_ops tun_link_ops __read_mostly = {
2402	.kind = DRV_NAME,
2403	.priv_size = sizeof(struct tun_struct),
2404	.setup = tun_setup,
2405	.validate = tun_validate,
2406	.get_size = tun_get_size,
2407	.fill_info = tun_fill_info,
2408	};
2409
2410	static void tun_sock_write_space(struct sock *sk)
2411	{
2412	struct tun_file *tfile;
2413	wait_queue_head_t *wqueue;
2414
2415	if (!sock_writeable(sk))
2416	return;
2417
2418	if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, addr: &sk->sk_socket->flags))
2419	return;
2420
2421	wqueue = sk_sleep(sk);
2422	if (wqueue && waitqueue_active(wq_head: wqueue))
2423	wake_up_interruptible_sync_poll(wqueue, EPOLLOUT \|
2424	EPOLLWRNORM \| EPOLLWRBAND);
2425
2426	tfile = container_of(sk, struct tun_file, sk);
2427	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
2428	}
2429
2430	static void tun_put_page(struct tun_page *tpage)
2431	{
2432	if (tpage->page)
2433	__page_frag_cache_drain(page: tpage->page, count: tpage->count);
2434	}
2435
2436	static int tun_xdp_one(struct tun_struct *tun,
2437	struct tun_file *tfile,
2438	struct xdp_buff xdp, int* *flush,
2439	struct tun_page *tpage)
2440	{
2441	unsigned int datasize = xdp->data_end - xdp->data;
2442	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
2443	struct virtio_net_hdr *gso = &hdr->gso;
2444	struct bpf_prog *xdp_prog;
2445	struct sk_buff *skb = NULL;
2446	struct sk_buff_head *queue;
2447	u32 rxhash = `0`, act;
2448	int buflen = hdr->buflen;
2449	int ret = `0`;
2450	bool skb_xdp = false;
2451	struct page *page;
2452
2453	xdp_prog = rcu_dereference(tun->xdp_prog);
2454	if (xdp_prog) {
2455	if (gso->gso_type) {
2456	skb_xdp = true;
2457	goto build;
2458	}
2459
2460	xdp_init_buff(xdp, frame_sz: buflen, rxq: &tfile->xdp_rxq);
2461	xdp_set_data_meta_invalid(xdp);
2462
2463	act = bpf_prog_run_xdp(prog: xdp_prog, xdp);
2464	ret = tun_xdp_act(tun, xdp_prog, xdp, act);
2465	if (ret < `0`) {
2466	put_page(page: virt_to_head_page(x: xdp->data));
2467	return ret;
2468	}
2469
2470	switch (ret) {
2471	case XDP_REDIRECT:
2472	*flush = true;
2473	fallthrough;
2474	case XDP_TX:
2475	return `0`;
2476	case XDP_PASS:
2477	break;
2478	default:
2479	page = virt_to_head_page(x: xdp->data);
2480	if (tpage->page == page) {
2481	++tpage->count;
2482	} else {
2483	tun_put_page(tpage);
2484	tpage->page = page;
2485	tpage->count = `1`;
2486	}
2487	return `0`;
2488	}
2489	}
2490
2491	build:
2492	skb = build_skb(data: xdp->data_hard_start, frag_size: buflen);
2493	if (!skb) {
2494	ret = -ENOMEM;
2495	goto out;
2496	}
2497
2498	skb_reserve(skb, len: xdp->data - xdp->data_hard_start);
2499	skb_put(skb, len: xdp->data_end - xdp->data);
2500
2501	if (virtio_net_hdr_to_skb(skb, hdr: gso, little_endian: tun_is_little_endian(tun))) {
2502	atomic_long_inc(v: &tun->rx_frame_errors);
2503	kfree_skb(skb);
2504	ret = -EINVAL;
2505	goto out;
2506	}
2507
2508	skb->protocol = eth_type_trans(skb, dev: tun->dev);
2509	skb_reset_network_header(skb);
2510	skb_probe_transport_header(skb);
2511	skb_record_rx_queue(skb, rx_queue: tfile->queue_index);
2512
2513	if (skb_xdp) {
2514	ret = do_xdp_generic(xdp_prog, skb);
2515	if (ret != XDP_PASS) {
2516	ret = `0`;
2517	goto out;
2518	}
2519	}
2520
2521	if (!rcu_dereference(tun->steering_prog) && tun->numqueues > `1` &&
2522	!tfile->detached)
2523	rxhash = __skb_get_hash_symmetric(skb);
2524
2525	if (tfile->napi_enabled) {
2526	queue = &tfile->sk.sk_write_queue;
2527	spin_lock(lock: &queue->lock);
2528
2529	if (unlikely(tfile->detached)) {
2530	spin_unlock(lock: &queue->lock);
2531	kfree_skb(skb);
2532	return -EBUSY;
2533	}
2534
2535	__skb_queue_tail(list: queue, newsk: skb);
2536	spin_unlock(lock: &queue->lock);
2537	ret = `1`;
2538	} else {
2539	netif_receive_skb(skb);
2540	ret = `0`;
2541	}
2542
2543	/ No need to disable preemption here since this function is*
2544	* always called with bh disabled
2545	*/
2546	dev_sw_netstats_rx_add(dev: tun->dev, len: datasize);
2547
2548	if (rxhash)
2549	tun_flow_update(tun, rxhash, tfile);
2550
2551	out:
2552	return ret;
2553	}
2554
2555	static int tun_sendmsg(struct socket sock, struct* msghdr *m, size_t total_len)
2556	{
2557	int ret, i;
2558	struct tun_file tfile = container_of(sock, struct* tun_file, socket);
2559	struct tun_struct *tun = tun_get(tfile);
2560	struct tun_msg_ctl *ctl = m->msg_control;
2561	struct xdp_buff *xdp;
2562
2563	if (!tun)
2564	return -EBADFD;
2565
2566	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
2567	ctl && ctl->type == TUN_MSG_PTR) {
2568	struct tun_page tpage;
2569	int n = ctl->num;
2570	int flush = `0`, queued = `0`;
2571
2572	memset(&tpage, `0`, sizeof(tpage));
2573
2574	local_bh_disable();
2575	rcu_read_lock();
2576
2577	for (i = `0`; i < n; i++) {
2578	xdp = &((struct xdp_buff *)ctl->ptr)[i];
2579	ret = tun_xdp_one(tun, tfile, xdp, flush: &flush, tpage: &tpage);
2580	if (ret > `0`)
2581	queued += ret;
2582	}
2583
2584	if (flush)
2585	xdp_do_flush();
2586
2587	if (tfile->napi_enabled && queued > `0`)
2588	napi_schedule(n: &tfile->napi);
2589
2590	rcu_read_unlock();
2591	local_bh_enable();
2592
2593	tun_put_page(tpage: &tpage);
2594
2595	ret = total_len;
2596	goto out;
2597	}
2598
2599	ret = tun_get_user(tun, tfile, msg_control: ctl ? ctl->ptr : NULL, from: &m->msg_iter,
2600	noblock: m->msg_flags & MSG_DONTWAIT,
2601	more: m->msg_flags & MSG_MORE);
2602	out:
2603	tun_put(tun);
2604	return ret;
2605	}
2606
2607	static int tun_recvmsg(struct socket sock, struct* msghdr *m, size_t total_len,
2608	int flags)
2609	{
2610	struct tun_file tfile = container_of(sock, struct* tun_file, socket);
2611	struct tun_struct *tun = tun_get(tfile);
2612	void *ptr = m->msg_control;
2613	int ret;
2614
2615	if (!tun) {
2616	ret = -EBADFD;
2617	goto out_free;
2618	}
2619
2620	if (flags & ~(MSG_DONTWAIT\|MSG_TRUNC\|MSG_ERRQUEUE)) {
2621	ret = -EINVAL;
2622	goto out_put_tun;
2623	}
2624	if (flags & MSG_ERRQUEUE) {
2625	ret = sock_recv_errqueue(sk: sock->sk, msg: m, len: total_len,
2626	SOL_PACKET, TUN_TX_TIMESTAMP);
2627	goto out;
2628	}
2629	ret = tun_do_read(tun, tfile, to: &m->msg_iter, noblock: flags & MSG_DONTWAIT, ptr);
2630	if (ret > (ssize_t)total_len) {
2631	m->msg_flags \|= MSG_TRUNC;
2632	ret = flags & MSG_TRUNC ? ret : total_len;
2633	}
2634	out:
2635	tun_put(tun);
2636	return ret;
2637
2638	out_put_tun:
2639	tun_put(tun);
2640	out_free:
2641	tun_ptr_free(ptr);
2642	return ret;
2643	}
2644
2645	static int tun_ptr_peek_len(void *ptr)
2646	{
2647	if (likely(ptr)) {
2648	if (tun_is_xdp_frame(ptr)) {
2649	struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2650
2651	return xdpf->len;
2652	}
2653	return __skb_array_len_with_tag(skb: ptr);
2654	} else {
2655	return `0`;
2656	}
2657	}
2658
2659	static int tun_peek_len(struct socket *sock)
2660	{
2661	struct tun_file tfile = container_of(sock, struct* tun_file, socket);
2662	struct tun_struct *tun;
2663	int ret = `0`;
2664
2665	tun = tun_get(tfile);
2666	if (!tun)
2667	return `0`;
2668
2669	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
2670	tun_put(tun);
2671
2672	return ret;
2673	}
2674
2675	/ Ops structure to mimic raw sockets with tun /
2676	static const struct proto_ops tun_socket_ops = {
2677	.peek_len = tun_peek_len,
2678	.sendmsg = tun_sendmsg,
2679	.recvmsg = tun_recvmsg,
2680	};
2681
2682	static struct proto tun_proto = {
2683	.name = "tun",
2684	.owner = THIS_MODULE,
2685	.obj_size = sizeof(struct tun_file),
2686	};
2687
2688	static int tun_flags(struct tun_struct *tun)
2689	{
2690	return tun->flags & (TUN_FEATURES \| IFF_PERSIST \| IFF_TUN \| IFF_TAP);
2691	}
2692
2693	static ssize_t tun_flags_show(struct device dev, struct* device_attribute *attr,
2694	char *buf)
2695	{
2696	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2697	return sysfs_emit(buf, fmt: "0x%x\n", tun_flags(tun));
2698	}
2699
2700	static ssize_t owner_show(struct device dev, struct* device_attribute *attr,
2701	char *buf)
2702	{
2703	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2704	return uid_valid(uid: tun->owner)?
2705	sysfs_emit(buf, fmt: "%u\n",
2706	from_kuid_munged(current_user_ns(), uid: tun->owner)) :
2707	sysfs_emit(buf, fmt: "-1\n");
2708	}
2709
2710	static ssize_t group_show(struct device dev, struct* device_attribute *attr,
2711	char *buf)
2712	{
2713	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2714	return gid_valid(gid: tun->group) ?
2715	sysfs_emit(buf, fmt: "%u\n",
2716	from_kgid_munged(current_user_ns(), gid: tun->group)) :
2717	sysfs_emit(buf, fmt: "-1\n");
2718	}
2719
2720	static DEVICE_ATTR_RO(tun_flags);
2721	static DEVICE_ATTR_RO(owner);
2722	static DEVICE_ATTR_RO(group);
2723
2724	static struct attribute *tun_dev_attrs[] = {
2725	&dev_attr_tun_flags.attr,
2726	&dev_attr_owner.attr,
2727	&dev_attr_group.attr,
2728	NULL
2729	};
2730
2731	static const struct attribute_group tun_attr_group = {
2732	.attrs = tun_dev_attrs
2733	};
2734
2735	static int tun_set_iff(struct net net, struct* file file, struct* ifreq *ifr)
2736	{
2737	struct tun_struct *tun;
2738	struct tun_file *tfile = file->private_data;
2739	struct net_device *dev;
2740	int err;
2741
2742	if (tfile->detached)
2743	return -EINVAL;
2744
2745	if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
2746	if (!capable(CAP_NET_ADMIN))
2747	return -EPERM;
2748
2749	if (!(ifr->ifr_flags & IFF_NAPI) \|\|
2750	(ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
2751	return -EINVAL;
2752	}
2753
2754	dev = __dev_get_by_name(net, name: ifr->ifr_name);
2755	if (dev) {
2756	if (ifr->ifr_flags & IFF_TUN_EXCL)
2757	return -EBUSY;
2758	if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
2759	tun = netdev_priv(dev);
2760	else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
2761	tun = netdev_priv(dev);
2762	else
2763	return -EINVAL;
2764
2765	if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
2766	!!(tun->flags & IFF_MULTI_QUEUE))
2767	return -EINVAL;
2768
2769	if (tun_not_capable(tun))
2770	return -EPERM;
2771	err = security_tun_dev_open(security: tun->security);
2772	if (err < `0`)
2773	return err;
2774
2775	err = tun_attach(tun, file, skip_filter: ifr->ifr_flags & IFF_NOFILTER,
2776	napi: ifr->ifr_flags & IFF_NAPI,
2777	napi_frags: ifr->ifr_flags & IFF_NAPI_FRAGS, publish_tun: true);
2778	if (err < `0`)
2779	return err;
2780
2781	if (tun->flags & IFF_MULTI_QUEUE &&
2782	(tun->numqueues + tun->numdisabled > `1`)) {
2783	/ One or more queue has already been attached, no need*
2784	* to initialize the device again.
2785	*/
2786	netdev_state_change(dev);
2787	return `0`;
2788	}
2789
2790	tun->flags = (tun->flags & ~TUN_FEATURES) \|
2791	(ifr->ifr_flags & TUN_FEATURES);
2792
2793	netdev_state_change(dev);
2794	} else {
2795	char *name;
2796	unsigned long flags = `0`;
2797	int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
2798	MAX_TAP_QUEUES : `1`;
2799
2800	if (!ns_capable(ns: net->user_ns, CAP_NET_ADMIN))
2801	return -EPERM;
2802	err = security_tun_dev_create();
2803	if (err < `0`)
2804	return err;
2805
2806	/ Set dev type /
2807	if (ifr->ifr_flags & IFF_TUN) {
2808	/ TUN device /
2809	flags \|= IFF_TUN;
2810	name = "tun%d";
2811	} else if (ifr->ifr_flags & IFF_TAP) {
2812	/ TAP device /
2813	flags \|= IFF_TAP;
2814	name = "tap%d";
2815	} else
2816	return -EINVAL;
2817
2818	if (*ifr->ifr_name)
2819	name = ifr->ifr_name;
2820
2821	dev = alloc_netdev_mqs(sizeof_priv: sizeof(struct tun_struct), name,
2822	NET_NAME_UNKNOWN, setup: tun_setup, txqs: queues,
2823	rxqs: queues);
2824
2825	if (!dev)
2826	return -ENOMEM;
2827
2828	dev_net_set(dev, net);
2829	dev->rtnl_link_ops = &tun_link_ops;
2830	dev->ifindex = tfile->ifindex;
2831	dev->sysfs_groups[`0`] = &tun_attr_group;
2832
2833	tun = netdev_priv(dev);
2834	tun->dev = dev;
2835	tun->flags = flags;
2836	tun->txflt.count = `0`;
2837	tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
2838
2839	tun->align = NET_SKB_PAD;
2840	tun->filter_attached = false;
2841	tun->sndbuf = tfile->socket.sk->sk_sndbuf;
2842	tun->rx_batched = `0`;
2843	RCU_INIT_POINTER(tun->steering_prog, NULL);
2844
2845	tun->ifr = ifr;
2846	tun->file = file;
2847
2848	tun_net_initialize(dev);
2849
2850	err = register_netdevice(dev: tun->dev);
2851	if (err < `0`) {
2852	free_netdev(dev);
2853	return err;
2854	}
2855	/ free_netdev() won't check refcnt, to avoid race*
2856	* with dev_put() we need publish tun after registration.
2857	*/
2858	rcu_assign_pointer(tfile->tun, tun);
2859	}
2860
2861	if (ifr->ifr_flags & IFF_NO_CARRIER)
2862	netif_carrier_off(dev: tun->dev);
2863	else
2864	netif_carrier_on(dev: tun->dev);
2865
2866	/ Make sure persistent devices do not get stuck in*
2867	* xoff state.
2868	*/
2869	if (netif_running(dev: tun->dev))
2870	netif_tx_wake_all_queues(dev: tun->dev);
2871
2872	strcpy(p: ifr->ifr_name, q: tun->dev->name);
2873	return `0`;
2874	}
2875
2876	static void tun_get_iff(struct tun_struct tun, struct* ifreq *ifr)
2877	{
2878	strcpy(p: ifr->ifr_name, q: tun->dev->name);
2879
2880	ifr->ifr_flags = tun_flags(tun);
2881
2882	}
2883
2884	/ This is like a cut-down ethtool ops, except done via tun fd so no*
2885	* privs required. */
2886	static int set_offload(struct tun_struct tun, unsigned* long arg)
2887	{
2888	netdev_features_t features = `0`;
2889
2890	if (arg & TUN_F_CSUM) {
2891	features \|= NETIF_F_HW_CSUM;
2892	arg &= ~TUN_F_CSUM;
2893
2894	if (arg & (TUN_F_TSO4\|TUN_F_TSO6)) {
2895	if (arg & TUN_F_TSO_ECN) {
2896	features \|= NETIF_F_TSO_ECN;
2897	arg &= ~TUN_F_TSO_ECN;
2898	}
2899	if (arg & TUN_F_TSO4)
2900	features \|= NETIF_F_TSO;
2901	if (arg & TUN_F_TSO6)
2902	features \|= NETIF_F_TSO6;
2903	arg &= ~(TUN_F_TSO4\|TUN_F_TSO6);
2904	}
2905
2906	arg &= ~TUN_F_UFO;
2907
2908	/ TODO: for now USO4 and USO6 should work simultaneously /
2909	if (arg & TUN_F_USO4 && arg & TUN_F_USO6) {
2910	features \|= NETIF_F_GSO_UDP_L4;
2911	arg &= ~(TUN_F_USO4 \| TUN_F_USO6);
2912	}
2913	}
2914
2915	/ This gives the user a way to test for new features in future by*
2916	* trying to set them. */
2917	if (arg)
2918	return -EINVAL;
2919
2920	tun->set_features = features;
2921	tun->dev->wanted_features &= ~TUN_USER_FEATURES;
2922	tun->dev->wanted_features \|= features;
2923	netdev_update_features(dev: tun->dev);
2924
2925	return `0`;
2926	}
2927
2928	static void tun_detach_filter(struct tun_struct tun, int* n)
2929	{
2930	int i;
2931	struct tun_file *tfile;
2932
2933	for (i = `0`; i < n; i++) {
2934	tfile = rtnl_dereference(tun->tfiles[i]);
2935	lock_sock(sk: tfile->socket.sk);
2936	sk_detach_filter(sk: tfile->socket.sk);
2937	release_sock(sk: tfile->socket.sk);
2938	}
2939
2940	tun->filter_attached = false;
2941	}
2942
2943	static int tun_attach_filter(struct tun_struct *tun)
2944	{
2945	int i, ret = `0`;
2946	struct tun_file *tfile;
2947
2948	for (i = `0`; i < tun->numqueues; i++) {
2949	tfile = rtnl_dereference(tun->tfiles[i]);
2950	lock_sock(sk: tfile->socket.sk);
2951	ret = sk_attach_filter(fprog: &tun->fprog, sk: tfile->socket.sk);
2952	release_sock(sk: tfile->socket.sk);
2953	if (ret) {
2954	tun_detach_filter(tun, n: i);
2955	return ret;
2956	}
2957	}
2958
2959	tun->filter_attached = true;
2960	return ret;
2961	}
2962
2963	static void tun_set_sndbuf(struct tun_struct *tun)
2964	{
2965	struct tun_file *tfile;
2966	int i;
2967
2968	for (i = `0`; i < tun->numqueues; i++) {
2969	tfile = rtnl_dereference(tun->tfiles[i]);
2970	tfile->socket.sk->sk_sndbuf = tun->sndbuf;
2971	}
2972	}
2973
2974	static int tun_set_queue(struct file file, struct* ifreq *ifr)
2975	{
2976	struct tun_file *tfile = file->private_data;
2977	struct tun_struct *tun;
2978	int ret = `0`;
2979
2980	rtnl_lock();
2981
2982	if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
2983	tun = tfile->detached;
2984	if (!tun) {
2985	ret = -EINVAL;
2986	goto unlock;
2987	}
2988	ret = security_tun_dev_attach_queue(security: tun->security);
2989	if (ret < `0`)
2990	goto unlock;
2991	ret = tun_attach(tun, file, skip_filter: false, napi: tun->flags & IFF_NAPI,
2992	napi_frags: tun->flags & IFF_NAPI_FRAGS, publish_tun: true);
2993	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
2994	tun = rtnl_dereference(tfile->tun);
2995	if (!tun \|\| !(tun->flags & IFF_MULTI_QUEUE) \|\| tfile->detached)
2996	ret = -EINVAL;
2997	else
2998	__tun_detach(tfile, clean: false);
2999	} else
3000	ret = -EINVAL;
3001
3002	if (ret >= `0`)
3003	netdev_state_change(dev: tun->dev);
3004
3005	unlock:
3006	rtnl_unlock();
3007	return ret;
3008	}
3009
3010	static int tun_set_ebpf(struct tun_struct tun, struct* tun_prog __rcu **prog_p,
3011	void __user *data)
3012	{
3013	struct bpf_prog *prog;
3014	int fd;
3015
3016	if (copy_from_user(to: &fd, from: data, n: sizeof(fd)))
3017	return -EFAULT;
3018
3019	if (fd == -`1`) {
3020	prog = NULL;
3021	} else {
3022	prog = bpf_prog_get_type(ufd: fd, type: BPF_PROG_TYPE_SOCKET_FILTER);
3023	if (IS_ERR(ptr: prog))
3024	return PTR_ERR(ptr: prog);
3025	}
3026
3027	return __tun_set_ebpf(tun, prog_p, prog);
3028	}
3029
3030	/ Return correct value for tun->dev->addr_len based on tun->dev->type. /
3031	static unsigned char tun_get_addr_len(unsigned short type)
3032	{
3033	switch (type) {
3034	case ARPHRD_IP6GRE:
3035	case ARPHRD_TUNNEL6:
3036	return sizeof(struct in6_addr);
3037	case ARPHRD_IPGRE:
3038	case ARPHRD_TUNNEL:
3039	case ARPHRD_SIT:
3040	return `4`;
3041	case ARPHRD_ETHER:
3042	return ETH_ALEN;
3043	case ARPHRD_IEEE802154:
3044	case ARPHRD_IEEE802154_MONITOR:
3045	return IEEE802154_EXTENDED_ADDR_LEN;
3046	case ARPHRD_PHONET_PIPE:
3047	case ARPHRD_PPP:
3048	case ARPHRD_NONE:
3049	return `0`;
3050	case ARPHRD_6LOWPAN:
3051	return EUI64_ADDR_LEN;
3052	case ARPHRD_FDDI:
3053	return FDDI_K_ALEN;
3054	case ARPHRD_HIPPI:
3055	return HIPPI_ALEN;
3056	case ARPHRD_IEEE802:
3057	return FC_ALEN;
3058	case ARPHRD_ROSE:
3059	return ROSE_ADDR_LEN;
3060	case ARPHRD_NETROM:
3061	return AX25_ADDR_LEN;
3062	case ARPHRD_LOCALTLK:
3063	return LTALK_ALEN;
3064	default:
3065	return `0`;
3066	}
3067	}
3068
3069	static long __tun_chr_ioctl(struct file file, unsigned* int cmd,
3070	unsigned long arg, int ifreq_len)
3071	{
3072	struct tun_file *tfile = file->private_data;
3073	struct net *net = sock_net(sk: &tfile->sk);
3074	struct tun_struct *tun;
3075	void __user* argp = (void __user*)arg;
3076	unsigned int carrier;
3077	struct ifreq ifr;
3078	kuid_t owner;
3079	kgid_t group;
3080	int ifindex;
3081	int sndbuf;
3082	int vnet_hdr_sz;
3083	int le;
3084	int ret;
3085	bool do_notify = false;
3086
3087	if (cmd == TUNSETIFF \|\| cmd == TUNSETQUEUE \|\|
3088	(_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
3089	if (copy_from_user(to: &ifr, from: argp, n: ifreq_len))
3090	return -EFAULT;
3091	} else {
3092	memset(&ifr, `0`, sizeof(ifr));
3093	}
3094	if (cmd == TUNGETFEATURES) {
3095	/ Currently this just means: "what IFF flags are valid?".*
3096	* This is needed because we never checked for invalid flags on
3097	* TUNSETIFF.
3098	*/
3099	return put_user(IFF_TUN \| IFF_TAP \| IFF_NO_CARRIER \|
3100	TUN_FEATURES, (unsigned int __user*)argp);
3101	} else if (cmd == TUNSETQUEUE) {
3102	return tun_set_queue(file, ifr: &ifr);
3103	} else if (cmd == SIOCGSKNS) {
3104	if (!ns_capable(ns: net->user_ns, CAP_NET_ADMIN))
3105	return -EPERM;
3106	return open_related_ns(ns: &net->ns, get_ns: get_net_ns);
3107	}
3108
3109	rtnl_lock();
3110
3111	tun = tun_get(tfile);
3112	if (cmd == TUNSETIFF) {
3113	ret = -EEXIST;
3114	if (tun)
3115	goto unlock;
3116
3117	ifr.ifr_name[IFNAMSIZ-`1`] = `'\0'`;
3118
3119	ret = tun_set_iff(net, file, ifr: &ifr);
3120
3121	if (ret)
3122	goto unlock;
3123
3124	if (copy_to_user(to: argp, from: &ifr, n: ifreq_len))
3125	ret = -EFAULT;
3126	goto unlock;
3127	}
3128	if (cmd == TUNSETIFINDEX) {
3129	ret = -EPERM;
3130	if (tun)
3131	goto unlock;
3132
3133	ret = -EFAULT;
3134	if (copy_from_user(to: &ifindex, from: argp, n: sizeof(ifindex)))
3135	goto unlock;
3136	ret = -EINVAL;
3137	if (ifindex < `0`)
3138	goto unlock;
3139	ret = `0`;
3140	tfile->ifindex = ifindex;
3141	goto unlock;
3142	}
3143
3144	ret = -EBADFD;
3145	if (!tun)
3146	goto unlock;
3147
3148	netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);
3149
3150	net = dev_net(dev: tun->dev);
3151	ret = `0`;
3152	switch (cmd) {
3153	case TUNGETIFF:
3154	tun_get_iff(tun, ifr: &ifr);
3155
3156	if (tfile->detached)
3157	ifr.ifr_flags \|= IFF_DETACH_QUEUE;
3158	if (!tfile->socket.sk->sk_filter)
3159	ifr.ifr_flags \|= IFF_NOFILTER;
3160
3161	if (copy_to_user(to: argp, from: &ifr, n: ifreq_len))
3162	ret = -EFAULT;
3163	break;
3164
3165	case TUNSETNOCSUM:
3166	/ Disable/Enable checksum /
3167
3168	/ [unimplemented] /
3169	netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
3170	arg ? "disabled" : "enabled");
3171	break;
3172
3173	case TUNSETPERSIST:
3174	/ Disable/Enable persist mode. Keep an extra reference to the*
3175	* module to prevent the module being unprobed.
3176	*/
3177	if (arg && !(tun->flags & IFF_PERSIST)) {
3178	tun->flags \|= IFF_PERSIST;
3179	__module_get(THIS_MODULE);
3180	do_notify = true;
3181	}
3182	if (!arg && (tun->flags & IFF_PERSIST)) {
3183	tun->flags &= ~IFF_PERSIST;
3184	module_put(THIS_MODULE);
3185	do_notify = true;
3186	}
3187
3188	netif_info(tun, drv, tun->dev, "persist %s\n",
3189	arg ? "enabled" : "disabled");
3190	break;
3191
3192	case TUNSETOWNER:
3193	/ Set owner of the device /
3194	owner = make_kuid(current_user_ns(), uid: arg);
3195	if (!uid_valid(uid: owner)) {
3196	ret = -EINVAL;
3197	break;
3198	}
3199	tun->owner = owner;
3200	do_notify = true;
3201	netif_info(tun, drv, tun->dev, "owner set to %u\n",
3202	from_kuid(&init_user_ns, tun->owner));
3203	break;
3204
3205	case TUNSETGROUP:
3206	/ Set group of the device /
3207	group = make_kgid(current_user_ns(), gid: arg);
3208	if (!gid_valid(gid: group)) {
3209	ret = -EINVAL;
3210	break;
3211	}
3212	tun->group = group;
3213	do_notify = true;
3214	netif_info(tun, drv, tun->dev, "group set to %u\n",
3215	from_kgid(&init_user_ns, tun->group));
3216	break;
3217
3218	case TUNSETLINK:
3219	/ Only allow setting the type when the interface is down /
3220	if (tun->dev->flags & IFF_UP) {
3221	netif_info(tun, drv, tun->dev,
3222	"Linktype set failed because interface is up\n");
3223	ret = -EBUSY;
3224	} else {
3225	ret = call_netdevice_notifiers(val: NETDEV_PRE_TYPE_CHANGE,
3226	dev: tun->dev);
3227	ret = notifier_to_errno(ret);
3228	if (ret) {
3229	netif_info(tun, drv, tun->dev,
3230	"Refused to change device type\n");
3231	break;
3232	}
3233	tun->dev->type = (int) arg;
3234	tun->dev->addr_len = tun_get_addr_len(type: tun->dev->type);
3235	netif_info(tun, drv, tun->dev, "linktype set to %d\n",
3236	tun->dev->type);
3237	call_netdevice_notifiers(val: NETDEV_POST_TYPE_CHANGE,
3238	dev: tun->dev);
3239	}
3240	break;
3241
3242	case TUNSETDEBUG:
3243	tun->msg_enable = (u32)arg;
3244	break;
3245
3246	case TUNSETOFFLOAD:
3247	ret = set_offload(tun, arg);
3248	break;
3249
3250	case TUNSETTXFILTER:
3251	/ Can be set only for TAPs /
3252	ret = -EINVAL;
3253	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3254	break;
3255	ret = update_filter(filter: &tun->txflt, arg: (void __user *)arg);
3256	break;
3257
3258	case SIOCGIFHWADDR:
3259	/ Get hw address /
3260	dev_get_mac_address(sa: &ifr.ifr_hwaddr, net, dev_name: tun->dev->name);
3261	if (copy_to_user(to: argp, from: &ifr, n: ifreq_len))
3262	ret = -EFAULT;
3263	break;
3264
3265	case SIOCSIFHWADDR:
3266	/ Set hw address /
3267	ret = dev_set_mac_address_user(dev: tun->dev, sa: &ifr.ifr_hwaddr, NULL);
3268	break;
3269
3270	case TUNGETSNDBUF:
3271	sndbuf = tfile->socket.sk->sk_sndbuf;
3272	if (copy_to_user(to: argp, from: &sndbuf, n: sizeof(sndbuf)))
3273	ret = -EFAULT;
3274	break;
3275
3276	case TUNSETSNDBUF:
3277	if (copy_from_user(to: &sndbuf, from: argp, n: sizeof(sndbuf))) {
3278	ret = -EFAULT;
3279	break;
3280	}
3281	if (sndbuf <= `0`) {
3282	ret = -EINVAL;
3283	break;
3284	}
3285
3286	tun->sndbuf = sndbuf;
3287	tun_set_sndbuf(tun);
3288	break;
3289
3290	case TUNGETVNETHDRSZ:
3291	vnet_hdr_sz = tun->vnet_hdr_sz;
3292	if (copy_to_user(to: argp, from: &vnet_hdr_sz, n: sizeof(vnet_hdr_sz)))
3293	ret = -EFAULT;
3294	break;
3295
3296	case TUNSETVNETHDRSZ:
3297	if (copy_from_user(to: &vnet_hdr_sz, from: argp, n: sizeof(vnet_hdr_sz))) {
3298	ret = -EFAULT;
3299	break;
3300	}
3301	if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
3302	ret = -EINVAL;
3303	break;
3304	}
3305
3306	tun->vnet_hdr_sz = vnet_hdr_sz;
3307	break;
3308
3309	case TUNGETVNETLE:
3310	le = !!(tun->flags & TUN_VNET_LE);
3311	if (put_user(le, (int __user *)argp))
3312	ret = -EFAULT;
3313	break;
3314
3315	case TUNSETVNETLE:
3316	if (get_user(le, (int __user *)argp)) {
3317	ret = -EFAULT;
3318	break;
3319	}
3320	if (le)
3321	tun->flags \|= TUN_VNET_LE;
3322	else
3323	tun->flags &= ~TUN_VNET_LE;
3324	break;
3325
3326	case TUNGETVNETBE:
3327	ret = tun_get_vnet_be(tun, argp);
3328	break;
3329
3330	case TUNSETVNETBE:
3331	ret = tun_set_vnet_be(tun, argp);
3332	break;
3333
3334	case TUNATTACHFILTER:
3335	/ Can be set only for TAPs /
3336	ret = -EINVAL;
3337	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3338	break;
3339	ret = -EFAULT;
3340	if (copy_from_user(to: &tun->fprog, from: argp, n: sizeof(tun->fprog)))
3341	break;
3342
3343	ret = tun_attach_filter(tun);
3344	break;
3345
3346	case TUNDETACHFILTER:
3347	/ Can be set only for TAPs /
3348	ret = -EINVAL;
3349	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3350	break;
3351	ret = `0`;
3352	tun_detach_filter(tun, n: tun->numqueues);
3353	break;
3354
3355	case TUNGETFILTER:
3356	ret = -EINVAL;
3357	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3358	break;
3359	ret = -EFAULT;
3360	if (copy_to_user(to: argp, from: &tun->fprog, n: sizeof(tun->fprog)))
3361	break;
3362	ret = `0`;
3363	break;
3364
3365	case TUNSETSTEERINGEBPF:
3366	ret = tun_set_ebpf(tun, prog_p: &tun->steering_prog, data: argp);
3367	break;
3368
3369	case TUNSETFILTEREBPF:
3370	ret = tun_set_ebpf(tun, prog_p: &tun->filter_prog, data: argp);
3371	break;
3372
3373	case TUNSETCARRIER:
3374	ret = -EFAULT;
3375	if (copy_from_user(to: &carrier, from: argp, n: sizeof(carrier)))
3376	goto unlock;
3377
3378	ret = tun_net_change_carrier(dev: tun->dev, new_carrier: (bool)carrier);
3379	break;
3380
3381	case TUNGETDEVNETNS:
3382	ret = -EPERM;
3383	if (!ns_capable(ns: net->user_ns, CAP_NET_ADMIN))
3384	goto unlock;
3385	ret = open_related_ns(ns: &net->ns, get_ns: get_net_ns);
3386	break;
3387
3388	default:
3389	ret = -EINVAL;
3390	break;
3391	}
3392
3393	if (do_notify)
3394	netdev_state_change(dev: tun->dev);
3395
3396	unlock:
3397	rtnl_unlock();
3398	if (tun)
3399	tun_put(tun);
3400	return ret;
3401	}
3402
3403	static long tun_chr_ioctl(struct file *file,
3404	unsigned int cmd, unsigned long arg)
3405	{
3406	return __tun_chr_ioctl(file, cmd, arg, ifreq_len: sizeof (struct ifreq));
3407	}
3408
#ifdef CONFIG_COMPAT
/* ->compat_ioctl entry point: converts @arg for a compat caller and forwards
 * to the common handler using the compat ifreq size.
 */
static long tun_chr_compat_ioctl(struct file *file,
				 unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	/* Commands whose argument is a user pointer. */
	case TUNSETIFF:
	case TUNGETIFF:
	case TUNSETTXFILTER:
	case TUNGETSNDBUF:
	case TUNSETSNDBUF:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
		arg = (unsigned long)compat_ptr(arg);
		break;
	/* Everything else is truncated to a compat unsigned long. */
	default:
		arg = (compat_ulong_t)arg;
		break;
	}

	/*
	 * compat_ifreq is shorter than ifreq, so we must not access beyond
	 * the end of that structure. All fields that are used in this
	 * driver are compatible though, we don't need to convert the
	 * contents.
	 */
	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
}
#endif /* CONFIG_COMPAT */
3437
3438	static int tun_chr_fasync(int fd, struct file file, int* on)
3439	{
3440	struct tun_file *tfile = file->private_data;
3441	int ret;
3442
3443	if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < `0`)
3444	goto out;
3445
3446	if (on) {
3447	__f_setown(filp: file, task_pid(current), PIDTYPE_TGID, force: `0`);
3448	tfile->flags \|= TUN_FASYNC;
3449	} else
3450	tfile->flags &= ~TUN_FASYNC;
3451	ret = `0`;
3452	out:
3453	return ret;
3454	}
3455
3456	static int tun_chr_open(struct inode inode, struct* file * file)
3457	{
3458	struct net *net = current->nsproxy->net_ns;
3459	struct tun_file *tfile;
3460
3461	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
3462	prot: &tun_proto, kern: `0`);
3463	if (!tfile)
3464	return -ENOMEM;
3465	if (ptr_ring_init(r: &tfile->tx_ring, size: `0`, GFP_KERNEL)) {
3466	sk_free(sk: &tfile->sk);
3467	return -ENOMEM;
3468	}
3469
3470	mutex_init(&tfile->napi_mutex);
3471	RCU_INIT_POINTER(tfile->tun, NULL);
3472	tfile->flags = `0`;
3473	tfile->ifindex = `0`;
3474
3475	init_waitqueue_head(&tfile->socket.wq.wait);
3476
3477	tfile->socket.file = file;
3478	tfile->socket.ops = &tun_socket_ops;
3479
3480	sock_init_data_uid(sock: &tfile->socket, sk: &tfile->sk, current_fsuid());
3481
3482	tfile->sk.sk_write_space = tun_sock_write_space;
3483	tfile->sk.sk_sndbuf = INT_MAX;
3484
3485	file->private_data = tfile;
3486	INIT_LIST_HEAD(list: &tfile->next);
3487
3488	sock_set_flag(sk: &tfile->sk, flag: SOCK_ZEROCOPY);
3489
3490	/ tun groks IOCB_NOWAIT just fine, mark it as such /
3491	file->f_mode \|= FMODE_NOWAIT;
3492	return `0`;
3493	}
3494
3495	static int tun_chr_close(struct inode inode, struct* file *file)
3496	{
3497	struct tun_file *tfile = file->private_data;
3498
3499	tun_detach(tfile, clean: true);
3500
3501	return `0`;
3502	}
3503
#ifdef CONFIG_PROC_FS
/*
 * show_fdinfo file operation: prints an "iff:" line with the interface
 * name obtained through tun_get_iff().
 *
 * The ifreq is zeroed first, so an empty name is printed when tun_get()
 * returns no device for this file.
 */
static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
{
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun;
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));

	rtnl_lock();
	tun = tun_get(tfile);
	if (tun)
		tun_get_iff(tun, &ifr);
	rtnl_unlock();

	/* Drop the reference taken by tun_get() once RTNL is released. */
	if (tun)
		tun_put(tun);

	seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
}
#endif
3525
3526	static const struct file_operations tun_fops = {
3527	.owner = THIS_MODULE,
3528	.llseek = no_llseek,
3529	.read_iter = tun_chr_read_iter,
3530	.write_iter = tun_chr_write_iter,
3531	.poll = tun_chr_poll,
3532	.unlocked_ioctl = tun_chr_ioctl,
3533	#ifdef CONFIG_COMPAT
3534	.compat_ioctl = tun_chr_compat_ioctl,
3535	#endif
3536	.open = tun_chr_open,
3537	.release = tun_chr_close,
3538	.fasync = tun_chr_fasync,
3539	#ifdef CONFIG_PROC_FS
3540	.show_fdinfo = tun_chr_show_fdinfo,
3541	#endif
3542	};
3543
3544	static struct miscdevice tun_miscdev = {
3545	.minor = TUN_MINOR,
3546	.name = "tun",
3547	.nodename = "net/tun",
3548	.fops = &tun_fops,
3549	};
3550
/* ethtool interface */
3552
3553	static void tun_default_link_ksettings(struct net_device *dev,
3554	struct ethtool_link_ksettings *cmd)
3555	{
3556	ethtool_link_ksettings_zero_link_mode(cmd, supported);
3557	ethtool_link_ksettings_zero_link_mode(cmd, advertising);
3558	cmd->base.speed = SPEED_10000;
3559	cmd->base.duplex = DUPLEX_FULL;
3560	cmd->base.port = PORT_TP;
3561	cmd->base.phy_address = `0`;
3562	cmd->base.autoneg = AUTONEG_DISABLE;
3563	}
3564
3565	static int tun_get_link_ksettings(struct net_device *dev,
3566	struct ethtool_link_ksettings *cmd)
3567	{
3568	struct tun_struct *tun = netdev_priv(dev);
3569
3570	memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
3571	return `0`;
3572	}
3573
3574	static int tun_set_link_ksettings(struct net_device *dev,
3575	const struct ethtool_link_ksettings *cmd)
3576	{
3577	struct tun_struct *tun = netdev_priv(dev);
3578
3579	memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
3580	return `0`;
3581	}
3582
3583	static void tun_get_drvinfo(struct net_device dev, struct* ethtool_drvinfo *info)
3584	{
3585	struct tun_struct *tun = netdev_priv(dev);
3586
3587	strscpy(p: info->driver, DRV_NAME, size: sizeof(info->driver));
3588	strscpy(p: info->version, DRV_VERSION, size: sizeof(info->version));
3589
3590	switch (tun->flags & TUN_TYPE_MASK) {
3591	case IFF_TUN:
3592	strscpy(p: info->bus_info, q: "tun", size: sizeof(info->bus_info));
3593	break;
3594	case IFF_TAP:
3595	strscpy(p: info->bus_info, q: "tap", size: sizeof(info->bus_info));
3596	break;
3597	}
3598	}
3599
3600	static u32 tun_get_msglevel(struct net_device *dev)
3601	{
3602	struct tun_struct *tun = netdev_priv(dev);
3603
3604	return tun->msg_enable;
3605	}
3606
3607	static void tun_set_msglevel(struct net_device *dev, u32 value)
3608	{
3609	struct tun_struct *tun = netdev_priv(dev);
3610
3611	tun->msg_enable = value;
3612	}
3613
3614	static int tun_get_coalesce(struct net_device *dev,
3615	struct ethtool_coalesce *ec,
3616	struct kernel_ethtool_coalesce *kernel_coal,
3617	struct netlink_ext_ack *extack)
3618	{
3619	struct tun_struct *tun = netdev_priv(dev);
3620
3621	ec->rx_max_coalesced_frames = tun->rx_batched;
3622
3623	return `0`;
3624	}
3625
3626	static int tun_set_coalesce(struct net_device *dev,
3627	struct ethtool_coalesce *ec,
3628	struct kernel_ethtool_coalesce *kernel_coal,
3629	struct netlink_ext_ack *extack)
3630	{
3631	struct tun_struct *tun = netdev_priv(dev);
3632
3633	if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
3634	tun->rx_batched = NAPI_POLL_WEIGHT;
3635	else
3636	tun->rx_batched = ec->rx_max_coalesced_frames;
3637
3638	return `0`;
3639	}
3640
3641	static const struct ethtool_ops tun_ethtool_ops = {
3642	.supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
3643	.get_drvinfo = tun_get_drvinfo,
3644	.get_msglevel = tun_get_msglevel,
3645	.set_msglevel = tun_set_msglevel,
3646	.get_link = ethtool_op_get_link,
3647	.get_ts_info = ethtool_op_get_ts_info,
3648	.get_coalesce = tun_get_coalesce,
3649	.set_coalesce = tun_set_coalesce,
3650	.get_link_ksettings = tun_get_link_ksettings,
3651	.set_link_ksettings = tun_set_link_ksettings,
3652	};
3653
3654	static int tun_queue_resize(struct tun_struct *tun)
3655	{
3656	struct net_device *dev = tun->dev;
3657	struct tun_file *tfile;
3658	struct ptr_ring **rings;
3659	int n = tun->numqueues + tun->numdisabled;
3660	int ret, i;
3661
3662	rings = kmalloc_array(n, size: sizeof(*rings), GFP_KERNEL);
3663	if (!rings)
3664	return -ENOMEM;
3665
3666	for (i = `0`; i < tun->numqueues; i++) {
3667	tfile = rtnl_dereference(tun->tfiles[i]);
3668	rings[i] = &tfile->tx_ring;
3669	}
3670	list_for_each_entry(tfile, &tun->disabled, next)
3671	rings[i++] = &tfile->tx_ring;
3672
3673	ret = ptr_ring_resize_multiple(rings, nrings: n,
3674	size: dev->tx_queue_len, GFP_KERNEL,
3675	destroy: tun_ptr_free);
3676
3677	kfree(objp: rings);
3678	return ret;
3679	}
3680
3681	static int tun_device_event(struct notifier_block *unused,
3682	unsigned long event, void *ptr)
3683	{
3684	struct net_device *dev = netdev_notifier_info_to_dev(info: ptr);
3685	struct tun_struct *tun = netdev_priv(dev);
3686	int i;
3687
3688	if (dev->rtnl_link_ops != &tun_link_ops)
3689	return NOTIFY_DONE;
3690
3691	switch (event) {
3692	case NETDEV_CHANGE_TX_QUEUE_LEN:
3693	if (tun_queue_resize(tun))
3694	return NOTIFY_BAD;
3695	break;
3696	case NETDEV_UP:
3697	for (i = `0`; i < tun->numqueues; i++) {
3698	struct tun_file *tfile;
3699
3700	tfile = rtnl_dereference(tun->tfiles[i]);
3701	tfile->socket.sk->sk_write_space(tfile->socket.sk);
3702	}
3703	break;
3704	default:
3705	break;
3706	}
3707
3708	return NOTIFY_DONE;
3709	}
3710
3711	static struct notifier_block tun_notifier_block __read_mostly = {
3712	.notifier_call = tun_device_event,
3713	};
3714
3715	static int __init tun_init(void)
3716	{
3717	int ret = `0`;
3718
3719	pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
3720
3721	ret = rtnl_link_register(ops: &tun_link_ops);
3722	if (ret) {
3723	pr_err("Can't register link_ops\n");
3724	goto err_linkops;
3725	}
3726
3727	ret = misc_register(misc: &tun_miscdev);
3728	if (ret) {
3729	pr_err("Can't register misc device %d\n", TUN_MINOR);
3730	goto err_misc;
3731	}
3732
3733	ret = register_netdevice_notifier(nb: &tun_notifier_block);
3734	if (ret) {
3735	pr_err("Can't register netdevice notifier\n");
3736	goto err_notifier;
3737	}
3738
3739	return `0`;
3740
3741	err_notifier:
3742	misc_deregister(misc: &tun_miscdev);
3743	err_misc:
3744	rtnl_link_unregister(ops: &tun_link_ops);
3745	err_linkops:
3746	return ret;
3747	}
3748
3749	static void __exit tun_cleanup(void)
3750	{
3751	misc_deregister(misc: &tun_miscdev);
3752	rtnl_link_unregister(ops: &tun_link_ops);
3753	unregister_netdevice_notifier(nb: &tun_notifier_block);
3754	}
3755
3756	/ Get an underlying socket object from tun file. Returns error unless file is*
3757	* attached to a device. The returned object works like a packet socket, it
3758	* can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
3759	* holding a reference to the file for as long as the socket is in use. */
3760	struct socket tun_get_socket(struct* file *file)
3761	{
3762	struct tun_file *tfile;
3763	if (file->f_op != &tun_fops)
3764	return ERR_PTR(error: -EINVAL);
3765	tfile = file->private_data;
3766	if (!tfile)
3767	return ERR_PTR(error: -EBADFD);
3768	return &tfile->socket;
3769	}
3770	EXPORT_SYMBOL_GPL(tun_get_socket);
3771
3772	struct ptr_ring tun_get_tx_ring(struct* file *file)
3773	{
3774	struct tun_file *tfile;
3775
3776	if (file->f_op != &tun_fops)
3777	return ERR_PTR(error: -EINVAL);
3778	tfile = file->private_data;
3779	if (!tfile)
3780	return ERR_PTR(error: -EBADFD);
3781	return &tfile->tx_ring;
3782	}
3783	EXPORT_SYMBOL_GPL(tun_get_tx_ring);
3784
3785	module_init(tun_init);
3786	module_exit(tun_cleanup);
3787	MODULE_DESCRIPTION(DRV_DESCRIPTION);
3788	MODULE_AUTHOR(DRV_COPYRIGHT);
3789	MODULE_LICENSE("GPL");
3790	MODULE_ALIAS_MISCDEV(TUN_MINOR);
3791	MODULE_ALIAS("devname:net/tun");
3792

source code of linux/drivers/net/tun.c