// SPDX-License-Identifier: GPL-2.0-only
#include <linux/etherdevice.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched/signal.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>
#include <linux/uio.h>

#include <net/gso.h>
#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <linux/virtio_net.h>
#include <linux/skb_array.h>

#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)

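/*
 * TAP_VNET_LE and TAP_VNET_BE live in the high bits of q->flags, well
 * clear of the IFF_* values, so a single word carries both the
 * interface flags and the negotiated virtio_net header byte order.
 */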
#define TAP_VNET_LE 0x80000000
#define TAP_VNET_BE 0x40000000

#ifdef CONFIG_TUN_VNET_CROSS_LE
static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
{
	return q->flags & TAP_VNET_BE ? false :
		virtio_legacy_is_little_endian();
}

static long tap_get_vnet_be(struct tap_queue *q, int __user *sp)
{
	int s = !!(q->flags & TAP_VNET_BE);

	if (put_user(s, sp))
		return -EFAULT;

	return 0;
}

static long tap_set_vnet_be(struct tap_queue *q, int __user *sp)
{
	int s;

	if (get_user(s, sp))
		return -EFAULT;

	if (s)
		q->flags |= TAP_VNET_BE;
	else
		q->flags &= ~TAP_VNET_BE;

	return 0;
}
#else
static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
{
	return virtio_legacy_is_little_endian();
}

static long tap_get_vnet_be(struct tap_queue *q, int __user *argp)
{
	return -EINVAL;
}

static long tap_set_vnet_be(struct tap_queue *q, int __user *argp)
{
	return -EINVAL;
}
#endif /* CONFIG_TUN_VNET_CROSS_LE */

static inline bool tap_is_little_endian(struct tap_queue *q)
{
	return q->flags & TAP_VNET_LE ||
	       tap_legacy_is_little_endian(q);
}

static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val)
{
	return __virtio16_to_cpu(tap_is_little_endian(q), val);
}

static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val)
{
	return __cpu_to_virtio16(tap_is_little_endian(q), val);
}
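
/*
 * Illustrative use of the helpers above (a sketch, not a call site in
 * this file): a guest-endian header field is read as
 *
 *	u16 hdr_len = tap16_to_cpu(q, vnet_hdr.hdr_len);
 *
 * so callers never have to test TAP_VNET_LE/TAP_VNET_BE themselves.
 */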

static struct proto tap_proto = {
	.name = "tap",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct tap_queue),
};

#define TAP_NUM_DEVS (1U << MINORBITS)

static LIST_HEAD(major_list);

struct major_info {
	struct rcu_head rcu;
	dev_t major;
	struct idr minor_idr;
	spinlock_t minor_lock;
	const char *device_name;
	struct list_head next;
};

#define GOODCOPY_LEN 128

static const struct proto_ops tap_socket_ops;

#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)

static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev)
{
	return rcu_dereference(dev->rx_handler_data);
}

/*
 * RCU usage:
 * The tap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or rtnl is held.
 *
 * Both the file and the macvlan_dev hold a reference on the tap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * tap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */

static int tap_enable_queue(struct tap_dev *tap, struct file *file,
			    struct tap_queue *q)
{
	int err = -EINVAL;

	ASSERT_RTNL();

	if (q->enabled)
		goto out;

	err = 0;
	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
	q->queue_index = tap->numvtaps;
	q->enabled = true;

	tap->numvtaps++;
out:
	return err;
}

/* Requires RTNL */
static int tap_set_queue(struct tap_dev *tap, struct file *file,
			 struct tap_queue *q)
{
	if (tap->numqueues == MAX_TAP_QUEUES)
		return -EBUSY;

	rcu_assign_pointer(q->tap, tap);
	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
	sock_hold(&q->sk);

	q->file = file;
	q->queue_index = tap->numvtaps;
	q->enabled = true;
	file->private_data = q;
	list_add_tail(&q->next, &tap->queue_list);

	tap->numvtaps++;
	tap->numqueues++;

	return 0;
}

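/*
 * Disabling a queue compacts taps[]: the last enabled queue is swapped
 * into the vacated slot, so enabled queues always occupy indices
 * 0..numvtaps-1 and tap_get_queue() can index by hash modulo numvtaps.
 */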
static int tap_disable_queue(struct tap_queue *q)
{
	struct tap_dev *tap;
	struct tap_queue *nq;

	ASSERT_RTNL();
	if (!q->enabled)
		return -EINVAL;

	tap = rtnl_dereference(q->tap);

	if (tap) {
		int index = q->queue_index;
		BUG_ON(index >= tap->numvtaps);
		nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]);
		nq->queue_index = index;

		rcu_assign_pointer(tap->taps[index], nq);
		RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL);
		q->enabled = false;

		tap->numvtaps--;
	}

	return 0;
}

/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Holding the rtnl lock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void tap_put_queue(struct tap_queue *q)
{
	struct tap_dev *tap;

	rtnl_lock();
	tap = rtnl_dereference(q->tap);

	if (tap) {
		if (q->enabled)
			BUG_ON(tap_disable_queue(q));

		tap->numqueues--;
		RCU_INIT_POINTER(q->tap, NULL);
		sock_put(&q->sk);
		list_del_init(&q->next);
	}

	rtnl_unlock();

	synchronize_rcu();
	sock_put(&q->sk);
}

/*
 * Select a queue based on the flow hash of the skb. If the hash is
 * unset, fall back to the rxq of the device on which the packet
 * arrived; if all else fails, use the first queue. Cache tap->numvtaps
 * since it can become zero during the execution of this function.
 */
static struct tap_queue *tap_get_queue(struct tap_dev *tap,
				       struct sk_buff *skb)
{
	struct tap_queue *queue = NULL;
	/* Access to taps array is protected by rcu, but access to numvtaps
	 * isn't. Below we use it to lookup a queue, but treat it as a hint
	 * and validate that the result isn't NULL - in case we are
	 * racing against queue removal.
	 */
	int numvtaps = READ_ONCE(tap->numvtaps);
	__u32 rxq;

	if (!numvtaps)
		goto out;

	if (numvtaps == 1)
		goto single;

	/* Check if we can use flow to select a queue */
	rxq = skb_get_hash(skb);
	if (rxq) {
		queue = rcu_dereference(tap->taps[rxq % numvtaps]);
		goto out;
	}

	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);

		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;

		queue = rcu_dereference(tap->taps[rxq]);
		goto out;
	}

single:
	queue = rcu_dereference(tap->taps[0]);
out:
	return queue;
}
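
/*
 * Example: with numvtaps == 4 and a flow hash of 0x1d, the lookup above
 * picks taps[0x1d % 4] == taps[1]; frames of one flow keep hitting the
 * same queue for as long as the queue count stays stable.
 */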

/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
void tap_del_queues(struct tap_dev *tap)
{
	struct tap_queue *q, *tmp;

	ASSERT_RTNL();
	list_for_each_entry_safe(q, tmp, &tap->queue_list, next) {
		list_del_init(&q->next);
		RCU_INIT_POINTER(q->tap, NULL);
		if (q->enabled)
			tap->numvtaps--;
		tap->numqueues--;
		sock_put(&q->sk);
	}
	BUG_ON(tap->numvtaps);
	BUG_ON(tap->numqueues);
	/* guarantee that any future tap_set_queue will fail */
	tap->numvtaps = MAX_TAP_QUEUES;
}
EXPORT_SYMBOL_GPL(tap_del_queues);

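/*
 * tap_handle_frame() is the rx_handler that the underlying driver
 * installs on its net_device (macvtap and ipvtap register it via
 * netdev_rx_handler_register()), so it runs under rcu_read_lock() for
 * every frame the lower device receives.
 */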
rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *dev = skb->dev;
	struct tap_dev *tap;
	struct tap_queue *q;
	netdev_features_t features = TAP_FEATURES;
	enum skb_drop_reason drop_reason;

	tap = tap_dev_get_rcu(dev);
	if (!tap)
		return RX_HANDLER_PASS;

	q = tap_get_queue(tap, skb);
	if (!q)
		return RX_HANDLER_PASS;

	skb_push(skb, ETH_HLEN);

	/* Apply the forward feature mask so that we perform segmentation
	 * according to the user's wishes. This only works if VNET_HDR is
	 * enabled.
	 */
	if (q->flags & IFF_VNET_HDR)
		features |= tap->tap_features;
	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
		struct sk_buff *next;

		if (IS_ERR(segs)) {
			drop_reason = SKB_DROP_REASON_SKB_GSO_SEG;
			goto drop;
		}

		if (!segs) {
			if (ptr_ring_produce(&q->ring, skb)) {
				drop_reason = SKB_DROP_REASON_FULL_RING;
				goto drop;
			}
			goto wake_up;
		}

		consume_skb(skb);
		skb_list_walk_safe(segs, skb, next) {
			skb_mark_not_on_list(skb);
			if (ptr_ring_produce(&q->ring, skb)) {
				drop_reason = SKB_DROP_REASON_FULL_RING;
				kfree_skb_reason(skb, drop_reason);
				kfree_skb_list_reason(next, drop_reason);
				break;
			}
		}
	} else {
		/* If we receive a partial checksum and the tap side
		 * doesn't support checksum offload, compute the checksum.
		 * Note: it doesn't matter which checksum feature to
		 * check, we either support them all or none.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    !(features & NETIF_F_CSUM_MASK) &&
		    skb_checksum_help(skb)) {
			drop_reason = SKB_DROP_REASON_SKB_CSUM;
			goto drop;
		}
		if (ptr_ring_produce(&q->ring, skb)) {
			drop_reason = SKB_DROP_REASON_FULL_RING;
			goto drop;
		}
	}

wake_up:
	wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND);
	return RX_HANDLER_CONSUMED;

drop:
	/* Count errors/drops only here, thus don't care about args. */
	if (tap->count_rx_dropped)
		tap->count_rx_dropped(tap);
	kfree_skb_reason(skb, drop_reason);
	return RX_HANDLER_CONSUMED;
}
EXPORT_SYMBOL_GPL(tap_handle_frame);

static struct major_info *tap_get_major(int major)
{
	struct major_info *tap_major;

	list_for_each_entry_rcu(tap_major, &major_list, next) {
		if (tap_major->major == major)
			return tap_major;
	}

	return NULL;
}

int tap_get_minor(dev_t major, struct tap_dev *tap)
{
	int retval = -ENOMEM;
	struct major_info *tap_major;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (!tap_major) {
		retval = -EINVAL;
		goto unlock;
	}

	spin_lock(&tap_major->minor_lock);
	retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC);
	if (retval >= 0) {
		tap->minor = retval;
	} else if (retval == -ENOSPC) {
		netdev_err(tap->dev, "Too many tap devices\n");
		retval = -EINVAL;
	}
	spin_unlock(&tap_major->minor_lock);

unlock:
	rcu_read_unlock();
	return retval < 0 ? retval : 0;
}
EXPORT_SYMBOL_GPL(tap_get_minor);

void tap_free_minor(dev_t major, struct tap_dev *tap)
{
	struct major_info *tap_major;

	rcu_read_lock();
	tap_major = tap_get_major(MAJOR(major));
	if (!tap_major)
		goto unlock;

	spin_lock(&tap_major->minor_lock);
	if (tap->minor) {
		idr_remove(&tap_major->minor_idr, tap->minor);
		tap->minor = 0;
	}
	spin_unlock(&tap_major->minor_lock);

unlock:
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tap_free_minor);

static struct tap_dev *dev_get_by_tap_file(int major, int minor)
{
	struct net_device *dev = NULL;
	struct tap_dev *tap;
	struct major_info *tap_major;

	rcu_read_lock();
	tap_major = tap_get_major(major);
	if (!tap_major) {
		tap = NULL;
		goto unlock;
	}

	spin_lock(&tap_major->minor_lock);
	tap = idr_find(&tap_major->minor_idr, minor);
	if (tap) {
		dev = tap->dev;
		dev_hold(dev);
	}
	spin_unlock(&tap_major->minor_lock);

unlock:
	rcu_read_unlock();
	return tap;
}

static void tap_sock_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
}

static void tap_sock_destruct(struct sock *sk)
{
	struct tap_queue *q = container_of(sk, struct tap_queue, sk);

	ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb);
}

static int tap_open(struct inode *inode, struct file *file)
{
	struct net *net = current->nsproxy->net_ns;
	struct tap_dev *tap;
	struct tap_queue *q;
	int err = -ENODEV;

	rtnl_lock();
	tap = dev_get_by_tap_file(imajor(inode), iminor(inode));
	if (!tap)
		goto err;

	err = -ENOMEM;
	q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					 &tap_proto, 0);
	if (!q)
		goto err;
	if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) {
		sk_free(&q->sk);
		goto err;
	}

	init_waitqueue_head(&q->sock.wq.wait);
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
	q->sock.file = file;
	q->sock.ops = &tap_socket_ops;
	sock_init_data_uid(&q->sock, &q->sk, current_fsuid());
	q->sk.sk_write_space = tap_sock_write_space;
	q->sk.sk_destruct = tap_sock_destruct;
	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

	/*
	 * so far only KVM virtio_net uses tap, enable zero copy between
	 * guest kernel and host kernel when lower device supports zerocopy
	 *
	 * The macvlan supports zerocopy iff the lower device supports zero
	 * copy so we don't have to look at the lower device directly.
	 */
	if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);

	err = tap_set_queue(tap, file, q);
	if (err) {
		/* tap_sock_destruct() will take care of freeing ptr_ring */
		goto err_put;
	}

	/* tap groks IOCB_NOWAIT just fine, mark it as such */
	file->f_mode |= FMODE_NOWAIT;

	dev_put(tap->dev);

	rtnl_unlock();
	return err;

err_put:
	sock_put(&q->sk);
err:
	if (tap)
		dev_put(tap->dev);

	rtnl_unlock();
	return err;
}

static int tap_release(struct inode *inode, struct file *file)
{
	struct tap_queue *q = file->private_data;
	tap_put_queue(q);
	return 0;
}

static __poll_t tap_poll(struct file *file, poll_table *wait)
{
	struct tap_queue *q = file->private_data;
	__poll_t mask = EPOLLERR;

	if (!q)
		goto out;

	mask = 0;
	poll_wait(file, &q->sock.wq.wait, wait);

	if (!ptr_ring_empty(&q->ring))
		mask |= EPOLLIN | EPOLLRDNORM;

	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
	     sock_writeable(&q->sk)))
		mask |= EPOLLOUT | EPOLLWRNORM;

out:
	return mask;
}

static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
					    size_t len, size_t linear,
					    int noblock, int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
		linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, PAGE_ALLOC_COSTLY_ORDER);
	if (!skb)
		return NULL;

	skb_reserve(skb, prepad);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}
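
/*
 * Example of the linear/paged split above (assuming 4 KiB pages): for
 * prepad = TAP_RESERVE, len = 70000 and linear = 128, the skb gets a
 * small kmalloc'ed head of prepad + 128 bytes and the remaining 69872
 * bytes in page frags, subject to the MAX_SKB_FRAGS clamp.
 */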

/* Neighbour code has some assumptions on HH_DATA_MOD alignment */
#define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)

/* Get packet from user space buffer */
static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
			    struct iov_iter *from, int noblock)
{
	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
	struct sk_buff *skb;
	struct tap_dev *tap;
	unsigned long total_len = iov_iter_count(from);
	unsigned long len = total_len;
	int err;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int vnet_hdr_len = 0;
	int copylen = 0;
	int depth;
	bool zerocopy = false;
	size_t linear;
	enum skb_drop_reason drop_reason;

	if (q->flags & IFF_VNET_HDR) {
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto err;
		len -= vnet_hdr_len;

		err = -EFAULT;
		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
			goto err;
		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    tap16_to_cpu(q, vnet_hdr.csum_start) +
		    tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
			     tap16_to_cpu(q, vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = cpu_to_tap16(q,
				 tap16_to_cpu(q, vnet_hdr.csum_start) +
				 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
		err = -EINVAL;
		if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
			goto err;
	}

	err = -EINVAL;
	if (unlikely(len < ETH_HLEN))
		goto err;

	if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
		struct iov_iter i;

		copylen = vnet_hdr.hdr_len ?
			tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
		if (copylen > good_linear)
			copylen = good_linear;
		else if (copylen < ETH_HLEN)
			copylen = ETH_HLEN;
		linear = copylen;
		i = *from;
		iov_iter_advance(&i, copylen);
		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
			zerocopy = true;
	}

	if (!zerocopy) {
		copylen = len;
		linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
		if (linear > good_linear)
			linear = good_linear;
		else if (linear < ETH_HLEN)
			linear = ETH_HLEN;
	}

	skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
			    linear, noblock, &err);
	if (!skb)
		goto err;

	if (zerocopy)
		err = zerocopy_sg_from_iter(skb, from);
	else
		err = skb_copy_datagram_from_iter(skb, 0, from, len);

	if (err) {
		drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
		goto err_kfree;
	}

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	rcu_read_lock();
	tap = rcu_dereference(q->tap);
	if (!tap) {
		kfree_skb(skb);
		rcu_read_unlock();
		return total_len;
	}
	skb->dev = tap->dev;

	if (vnet_hdr_len) {
		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
					    tap_is_little_endian(q));
		if (err) {
			rcu_read_unlock();
			drop_reason = SKB_DROP_REASON_DEV_HDR;
			goto err_kfree;
		}
	}

	skb_probe_transport_header(skb);

	/* Move network header to the right position for VLAN tagged packets */
	if (eth_type_vlan(skb->protocol) &&
	    vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

	/* copy skb_ubuf_info for callback when skb has no error */
	if (zerocopy) {
		skb_zcopy_init(skb, msg_control);
	} else if (msg_control) {
		struct ubuf_info *uarg = msg_control;
		uarg->callback(NULL, uarg, false);
	}

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return total_len;

err_kfree:
	kfree_skb_reason(skb, drop_reason);

err:
	rcu_read_lock();
	tap = rcu_dereference(q->tap);
	if (tap && tap->count_tx_dropped)
		tap->count_tx_dropped(tap);
	rcu_read_unlock();

	return err;
}

static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct tap_queue *q = file->private_data;
	int noblock = 0;

	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
		noblock = 1;

	return tap_get_user(q, NULL, from, noblock);
}

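/*
 * Layout of the buffer handed back to user space (the return value
 * counts both parts):
 *
 *	[ virtio_net_hdr, vnet_hdr_sz bytes, only with IFF_VNET_HDR ]
 *	[ Ethernet frame, with the VLAN tag re-inserted if present  ]
 */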
/* Put packet to the user space buffer */
static ssize_t tap_put_user(struct tap_queue *q,
			    const struct sk_buff *skb,
			    struct iov_iter *iter)
{
	int ret;
	int vnet_hdr_len = 0;
	int vlan_offset = 0;
	int total;

	if (q->flags & IFF_VNET_HDR) {
		int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
		struct virtio_net_hdr vnet_hdr;

		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
		if (iov_iter_count(iter) < vnet_hdr_len)
			return -EINVAL;

		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
					    tap_is_little_endian(q), true,
					    vlan_hlen))
			BUG();

		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
		    sizeof(vnet_hdr))
			return -EFAULT;

		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
	}
	total = vnet_hdr_len;
	total += skb->len;

	if (skb_vlan_tag_present(skb)) {
		struct {
			__be16 h_vlan_proto;
			__be16 h_vlan_TCI;
		} veth;
		veth.h_vlan_proto = skb->vlan_proto;
		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
		total += VLAN_HLEN;

		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
		if (ret || !iov_iter_count(iter))
			goto done;

		ret = copy_to_iter(&veth, sizeof(veth), iter);
		if (ret != sizeof(veth) || !iov_iter_count(iter))
			goto done;
	}

	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
				     skb->len - vlan_offset);

done:
	return ret ? ret : total;
}

static ssize_t tap_do_read(struct tap_queue *q,
			   struct iov_iter *to,
			   int noblock, struct sk_buff *skb)
{
	DEFINE_WAIT(wait);
	ssize_t ret = 0;

	if (!iov_iter_count(to)) {
		kfree_skb(skb);
		return 0;
	}

	if (skb)
		goto put;

	while (1) {
		if (!noblock)
			prepare_to_wait(sk_sleep(&q->sk), &wait,
					TASK_INTERRUPTIBLE);

		/* Read frames from the queue */
		skb = ptr_ring_consume(&q->ring);
		if (skb)
			break;
		if (noblock) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* Nothing to read, let's sleep */
		schedule();
	}
	if (!noblock)
		finish_wait(sk_sleep(&q->sk), &wait);

put:
	if (skb) {
		ret = tap_put_user(q, skb, to);
		if (unlikely(ret < 0))
			kfree_skb(skb);
		else
			consume_skb(skb);
	}
	return ret;
}

static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct tap_queue *q = file->private_data;
	ssize_t len = iov_iter_count(to), ret;
	int noblock = 0;

	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
		noblock = 1;

	ret = tap_do_read(q, to, noblock, NULL);
	ret = min_t(ssize_t, ret, len);
	if (ret > 0)
		iocb->ki_pos = ret;
	return ret;
}

static struct tap_dev *tap_get_tap_dev(struct tap_queue *q)
{
	struct tap_dev *tap;

	ASSERT_RTNL();
	tap = rtnl_dereference(q->tap);
	if (tap)
		dev_hold(tap->dev);

	return tap;
}

static void tap_put_tap_dev(struct tap_dev *tap)
{
	dev_put(tap->dev);
}

static int tap_ioctl_set_queue(struct file *file, unsigned int flags)
{
	struct tap_queue *q = file->private_data;
	struct tap_dev *tap;
	int ret;

	tap = tap_get_tap_dev(q);
	if (!tap)
		return -EINVAL;

	if (flags & IFF_ATTACH_QUEUE)
		ret = tap_enable_queue(tap, file, q);
	else if (flags & IFF_DETACH_QUEUE)
		ret = tap_disable_queue(q);
	else
		ret = -EINVAL;

	tap_put_tap_dev(tap);
	return ret;
}

static int set_offload(struct tap_queue *q, unsigned long arg)
{
	struct tap_dev *tap;
	netdev_features_t features;
	netdev_features_t feature_mask = 0;

	tap = rtnl_dereference(q->tap);
	if (!tap)
		return -ENOLINK;

	features = tap->dev->features;

	if (arg & TUN_F_CSUM) {
		feature_mask = NETIF_F_HW_CSUM;

		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN)
				feature_mask |= NETIF_F_TSO_ECN;
			if (arg & TUN_F_TSO4)
				feature_mask |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				feature_mask |= NETIF_F_TSO6;
		}

		/* TODO: for now USO4 and USO6 should work simultaneously */
		if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
			features |= NETIF_F_GSO_UDP_L4;
	}

	/* tun/tap driver inverts the usage for TSO offloads, where
	 * setting the TSO bit means that the userspace wants to
	 * accept TSO frames and turning it off means that user space
	 * does not support TSO.
	 * For tap, we have to invert it to mean the same thing.
	 * When user space turns off TSO, we turn off GSO/LRO so that
	 * user-space will not receive TSO frames.
	 */
	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) ||
	    (feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6))
		features |= RX_OFFLOADS;
	else
		features &= ~RX_OFFLOADS;

	/* tap_features are the same as features on tun/tap and
	 * reflect user expectations.
	 */
	tap->tap_features = feature_mask;
	if (tap->update_features)
		tap->update_features(tap, features);

	return 0;
}
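
/*
 * A minimal sketch of the user-space side of set_offload(), assuming a
 * queue already open on fd and with error handling elided:
 *
 *	unsigned long offloads = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
 *
 *	if (ioctl(fd, TUNSETOFFLOAD, offloads) < 0)
 *		perror("TUNSETOFFLOAD");
 */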

/*
 * provide compatibility with generic tun/tap interface
 */
static long tap_ioctl(struct file *file, unsigned int cmd,
		      unsigned long arg)
{
	struct tap_queue *q = file->private_data;
	struct tap_dev *tap;
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	unsigned short u;
	int __user *sp = argp;
	struct sockaddr sa;
	int s;
	int ret;

	switch (cmd) {
	case TUNSETIFF:
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;

		ret = 0;
		if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP))
			ret = -EINVAL;
		else
			q->flags = (q->flags & ~TAP_IFFEATURES) | u;

		return ret;

	case TUNGETIFF:
		rtnl_lock();
		tap = tap_get_tap_dev(q);
		if (!tap) {
			rtnl_unlock();
			return -ENOLINK;
		}

		ret = 0;
		u = q->flags;
		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
		    put_user(u, &ifr->ifr_flags))
			ret = -EFAULT;
		tap_put_tap_dev(tap);
		rtnl_unlock();
		return ret;

	case TUNSETQUEUE:
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
		rtnl_lock();
		ret = tap_ioctl_set_queue(file, u);
		rtnl_unlock();
		return ret;

	case TUNGETFEATURES:
		if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up))
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		if (get_user(s, sp))
			return -EFAULT;
		if (s <= 0)
			return -EINVAL;

		q->sk.sk_sndbuf = s;
		return 0;

	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		q->vnet_hdr_sz = s;
		return 0;

	case TUNGETVNETLE:
		s = !!(q->flags & TAP_VNET_LE);
		if (put_user(s, sp))
			return -EFAULT;
		return 0;

	case TUNSETVNETLE:
		if (get_user(s, sp))
			return -EFAULT;
		if (s)
			q->flags |= TAP_VNET_LE;
		else
			q->flags &= ~TAP_VNET_LE;
		return 0;

	case TUNGETVNETBE:
		return tap_get_vnet_be(q, sp);

	case TUNSETVNETBE:
		return tap_set_vnet_be(q, sp);

	case TUNSETOFFLOAD:
		/* let the user check for future flags */
		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
			    TUN_F_TSO_ECN | TUN_F_UFO |
			    TUN_F_USO4 | TUN_F_USO6))
			return -EINVAL;

		rtnl_lock();
		ret = set_offload(q, arg);
		rtnl_unlock();
		return ret;

	case SIOCGIFHWADDR:
		rtnl_lock();
		tap = tap_get_tap_dev(q);
		if (!tap) {
			rtnl_unlock();
			return -ENOLINK;
		}
		ret = 0;
		dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name);
		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
		    copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa)))
			ret = -EFAULT;
		tap_put_tap_dev(tap);
		rtnl_unlock();
		return ret;

	case SIOCSIFHWADDR:
		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
			return -EFAULT;
		rtnl_lock();
		tap = tap_get_tap_dev(q);
		if (!tap) {
			rtnl_unlock();
			return -ENOLINK;
		}
		ret = dev_set_mac_address_user(tap->dev, &sa, NULL);
		tap_put_tap_dev(tap);
		rtnl_unlock();
		return ret;

	default:
		return -EINVAL;
	}
}

static const struct file_operations tap_fops = {
	.owner = THIS_MODULE,
	.open = tap_open,
	.release = tap_release,
	.read_iter = tap_read_iter,
	.write_iter = tap_write_iter,
	.poll = tap_poll,
	.llseek = no_llseek,
	.unlocked_ioctl = tap_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
};

static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
{
	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
	struct virtio_net_hdr *gso = &hdr->gso;
	int buflen = hdr->buflen;
	int vnet_hdr_len = 0;
	struct tap_dev *tap;
	struct sk_buff *skb;
	int err, depth;

	if (q->flags & IFF_VNET_HDR)
		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);

	skb = build_skb(xdp->data_hard_start, buflen);
	if (!skb) {
		err = -ENOMEM;
		goto err;
	}

	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	skb_put(skb, xdp->data_end - xdp->data);

	skb_set_network_header(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb->protocol = eth_hdr(skb)->h_proto;

	if (vnet_hdr_len) {
		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
		if (err)
			goto err_kfree;
	}

	/* Move network header to the right position for VLAN tagged packets */
	if (eth_type_vlan(skb->protocol) &&
	    vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
		skb_set_network_header(skb, depth);

	rcu_read_lock();
	tap = rcu_dereference(q->tap);
	if (tap) {
		skb->dev = tap->dev;
		skb_probe_transport_header(skb);
		dev_queue_xmit(skb);
	} else {
		kfree_skb(skb);
	}
	rcu_read_unlock();

	return 0;

err_kfree:
	kfree_skb(skb);
err:
	rcu_read_lock();
	tap = rcu_dereference(q->tap);
	if (tap && tap->count_tx_dropped)
		tap->count_tx_dropped(tap);
	rcu_read_unlock();
	return err;
}
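
/*
 * vhost-net can hand over a whole batch of prepared buffers in a single
 * sendmsg() call: msg_control then points at a struct tun_msg_ctl of
 * type TUN_MSG_PTR whose ptr field is an array of ctl->num xdp_buffs,
 * each of which is consumed by tap_get_user_xdp() above.
 */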

static int tap_sendmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len)
{
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
	struct tun_msg_ctl *ctl = m->msg_control;
	struct xdp_buff *xdp;
	int i;

	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
	    ctl && ctl->type == TUN_MSG_PTR) {
		for (i = 0; i < ctl->num; i++) {
			xdp = &((struct xdp_buff *)ctl->ptr)[i];
			tap_get_user_xdp(q, xdp);
		}
		return 0;
	}

	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
			    m->msg_flags & MSG_DONTWAIT);
}

static int tap_recvmsg(struct socket *sock, struct msghdr *m,
		       size_t total_len, int flags)
{
	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
	struct sk_buff *skb = m->msg_control;
	int ret;
	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
		kfree_skb(skb);
		return -EINVAL;
	}
	ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);
	if (ret > total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
	return ret;
}

static int tap_peek_len(struct socket *sock)
{
	struct tap_queue *q = container_of(sock, struct tap_queue,
					   sock);
	return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag);
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tap_socket_ops = {
	.sendmsg = tap_sendmsg,
	.recvmsg = tap_recvmsg,
	.peek_len = tap_peek_len,
};

/* Get an underlying socket object from tun file. Returns error unless file is
 * attached to a device. The returned object works like a packet socket, it
 * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
 * holding a reference to the file for as long as the socket is in use. */
struct socket *tap_get_socket(struct file *file)
{
	struct tap_queue *q;
	if (file->f_op != &tap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->sock;
}
EXPORT_SYMBOL_GPL(tap_get_socket);

struct ptr_ring *tap_get_ptr_ring(struct file *file)
{
	struct tap_queue *q;

	if (file->f_op != &tap_fops)
		return ERR_PTR(-EINVAL);
	q = file->private_data;
	if (!q)
		return ERR_PTR(-EBADFD);
	return &q->ring;
}
EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
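
/*
 * tap_get_socket() and tap_get_ptr_ring() exist for in-kernel consumers
 * such as vhost-net, which bypasses read_iter/recvmsg and consumes
 * packets straight from the queue's ptr_ring.
 */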

int tap_queue_resize(struct tap_dev *tap)
{
	struct net_device *dev = tap->dev;
	struct tap_queue *q;
	struct ptr_ring **rings;
	int n = tap->numqueues;
	int ret, i = 0;

	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
	if (!rings)
		return -ENOMEM;

	list_for_each_entry(q, &tap->queue_list, next)
		rings[i++] = &q->ring;

	ret = ptr_ring_resize_multiple(rings, n,
				       dev->tx_queue_len, GFP_KERNEL,
				       __skb_array_destroy_skb);

	kfree(rings);
	return ret;
}
EXPORT_SYMBOL_GPL(tap_queue_resize);

static int tap_list_add(dev_t major, const char *device_name)
{
	struct major_info *tap_major;

	tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC);
	if (!tap_major)
		return -ENOMEM;

	tap_major->major = MAJOR(major);

	idr_init(&tap_major->minor_idr);
	spin_lock_init(&tap_major->minor_lock);

	tap_major->device_name = device_name;

	list_add_tail_rcu(&tap_major->next, &major_list);
	return 0;
}
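
/*
 * Typical use by a driver module (a sketch modelled on macvtap; the
 * names outside this file are the caller's own):
 *
 *	static struct cdev my_cdev;
 *	static dev_t my_major;
 *
 *	err = tap_create_cdev(&my_cdev, &my_major, "mytap", THIS_MODULE);
 *
 * paired with tap_destroy_cdev(my_major, &my_cdev) on module exit.
 */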

int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major,
		    const char *device_name, struct module *module)
{
	int err;

	err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name);
	if (err)
		goto out1;

	cdev_init(tap_cdev, &tap_fops);
	tap_cdev->owner = module;
	err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS);
	if (err)
		goto out2;

	err = tap_list_add(*tap_major, device_name);
	if (err)
		goto out3;

	return 0;

out3:
	cdev_del(tap_cdev);
out2:
	unregister_chrdev_region(*tap_major, TAP_NUM_DEVS);
out1:
	return err;
}
EXPORT_SYMBOL_GPL(tap_create_cdev);

void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
{
	struct major_info *tap_major, *tmp;

	cdev_del(tap_cdev);
	unregister_chrdev_region(major, TAP_NUM_DEVS);
	list_for_each_entry_safe(tap_major, tmp, &major_list, next) {
		if (tap_major->major == MAJOR(major)) {
			idr_destroy(&tap_major->minor_idr);
			list_del_rcu(&tap_major->next);
			kfree_rcu(tap_major, rcu);
		}
	}
}
EXPORT_SYMBOL_GPL(tap_destroy_cdev);

MODULE_DESCRIPTION("Common library for drivers implementing the TAP interface");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
MODULE_LICENSE("GPL");