1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* Copyright (C) 2009 Red Hat, Inc. |
3 | * Author: Michael S. Tsirkin <mst@redhat.com> |
4 | * |
5 | * virtio-net server in host kernel. |
6 | */ |
7 | |
8 | #include <linux/compat.h> |
9 | #include <linux/eventfd.h> |
10 | #include <linux/vhost.h> |
11 | #include <linux/virtio_net.h> |
12 | #include <linux/miscdevice.h> |
13 | #include <linux/module.h> |
14 | #include <linux/moduleparam.h> |
15 | #include <linux/mutex.h> |
16 | #include <linux/workqueue.h> |
17 | #include <linux/file.h> |
18 | #include <linux/slab.h> |
19 | #include <linux/sched/clock.h> |
20 | #include <linux/sched/signal.h> |
21 | #include <linux/vmalloc.h> |
22 | |
23 | #include <linux/net.h> |
24 | #include <linux/if_packet.h> |
25 | #include <linux/if_arp.h> |
26 | #include <linux/if_tun.h> |
27 | #include <linux/if_macvlan.h> |
28 | #include <linux/if_tap.h> |
29 | #include <linux/if_vlan.h> |
30 | #include <linux/skb_array.h> |
31 | #include <linux/skbuff.h> |
32 | |
33 | #include <net/sock.h> |
34 | #include <net/xdp.h> |
35 | |
36 | #include "vhost.h" |
37 | |
38 | static int experimental_zcopytx = 0; |
39 | module_param(experimental_zcopytx, int, 0444); |
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
" 1 - Enable; 0 - Disable");
42 | |
43 | /* Max number of bytes transferred before requeueing the job. |
44 | * Using this limit prevents one virtqueue from starving others. */ |
45 | #define VHOST_NET_WEIGHT 0x80000 |
46 | |
47 | /* Max number of packets transferred before requeueing the job. |
48 | * Using this limit prevents one virtqueue from starving others with small |
49 | * pkts. |
50 | */ |
51 | #define VHOST_NET_PKT_WEIGHT 256 |
52 | |
53 | /* MAX number of TX used buffers for outstanding zerocopy */ |
54 | #define VHOST_MAX_PEND 128 |
55 | #define VHOST_GOODCOPY_LEN 256 |
56 | |
57 | /* |
58 | * For transmit, used buffer len is unused; we override it to track buffer |
59 | * status internally; used for zerocopy tx only. |
60 | */ |
61 | /* Lower device DMA failed */ |
62 | #define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) |
63 | /* Lower device DMA done */ |
64 | #define VHOST_DMA_DONE_LEN ((__force __virtio32)2) |
65 | /* Lower device DMA in progress */ |
66 | #define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) |
67 | /* Buffer unused */ |
68 | #define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) |
69 | |
70 | #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) |
71 | |
72 | enum { |
73 | VHOST_NET_FEATURES = VHOST_FEATURES | |
74 | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | |
75 | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | |
76 | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | |
77 | (1ULL << VIRTIO_F_RING_RESET) |
78 | }; |
79 | |
80 | enum { |
81 | VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) |
82 | }; |
83 | |
84 | enum { |
85 | VHOST_NET_VQ_RX = 0, |
86 | VHOST_NET_VQ_TX = 1, |
87 | VHOST_NET_VQ_MAX = 2, |
88 | }; |
89 | |
90 | struct vhost_net_ubuf_ref { |
91 | /* refcount follows semantics similar to kref: |
92 | * 0: object is released |
93 | * 1: no outstanding ubufs |
94 | * >1: outstanding ubufs |
95 | */ |
96 | atomic_t refcount; |
97 | wait_queue_head_t wait; |
98 | struct vhost_virtqueue *vq; |
99 | }; |
100 | |
101 | #define VHOST_NET_BATCH 64 |
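/* Per-RX-virtqueue cache of pointers consumed in batches from the backend's
 * ptr_ring; entries between head and tail are still valid. */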
102 | struct vhost_net_buf { |
103 | void **queue; |
104 | int tail; |
105 | int head; |
106 | }; |
107 | |
108 | struct vhost_net_virtqueue { |
109 | struct vhost_virtqueue vq; |
110 | size_t vhost_hlen; |
111 | size_t sock_hlen; |
112 | /* vhost zerocopy support fields below: */ |
113 | /* last used idx for outstanding DMA zerocopy buffers */ |
114 | int upend_idx; |
115 | /* For TX, first used idx for DMA done zerocopy buffers |
116 | * For RX, number of batched heads |
117 | */ |
118 | int done_idx; |
119 | /* Number of XDP frames batched */ |
120 | int batched_xdp; |
121 | /* an array of userspace buffers info */ |
122 | struct ubuf_info_msgzc *ubuf_info; |
123 | /* Reference counting for outstanding ubufs. |
124 | * Protected by vq mutex. Writers must also take device mutex. */ |
125 | struct vhost_net_ubuf_ref *ubufs; |
126 | struct ptr_ring *rx_ring; |
127 | struct vhost_net_buf rxq; |
128 | /* Batched XDP buffs */ |
129 | struct xdp_buff *xdp; |
130 | }; |
131 | |
132 | struct vhost_net { |
133 | struct vhost_dev dev; |
134 | struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; |
135 | struct vhost_poll poll[VHOST_NET_VQ_MAX]; |
136 | /* Number of TX recently submitted. |
137 | * Protected by tx vq lock. */ |
138 | unsigned tx_packets; |
139 | /* Number of times zerocopy TX recently failed. |
140 | * Protected by tx vq lock. */ |
141 | unsigned tx_zcopy_err; |
142 | /* Flush in progress. Protected by tx vq lock. */ |
143 | bool tx_flush; |
144 | /* Private page frag */ |
145 | struct page_frag page_frag; |
146 | /* Refcount bias of page frag */ |
147 | int refcnt_bias; |
148 | }; |
149 | |
150 | static unsigned vhost_net_zcopy_mask __read_mostly; |
151 | |
152 | static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) |
153 | { |
154 | if (rxq->tail != rxq->head) |
155 | return rxq->queue[rxq->head]; |
156 | else |
157 | return NULL; |
158 | } |
159 | |
160 | static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) |
161 | { |
162 | return rxq->tail - rxq->head; |
163 | } |
164 | |
165 | static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) |
166 | { |
167 | return rxq->tail == rxq->head; |
168 | } |
169 | |
170 | static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) |
171 | { |
172 | void *ret = vhost_net_buf_get_ptr(rxq); |
173 | ++rxq->head; |
174 | return ret; |
175 | } |
176 | |
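/* Refill the RX buffer cache with up to VHOST_NET_BATCH pointers taken from
 * the backend's ptr_ring. Returns the number of entries obtained. */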
177 | static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) |
178 | { |
179 | struct vhost_net_buf *rxq = &nvq->rxq; |
180 | |
181 | rxq->head = 0; |
rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
VHOST_NET_BATCH);
184 | return rxq->tail; |
185 | } |
186 | |
187 | static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) |
188 | { |
189 | struct vhost_net_buf *rxq = &nvq->rxq; |
190 | |
191 | if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { |
ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
vhost_net_buf_get_size(rxq),
tun_ptr_free);
195 | rxq->head = rxq->tail = 0; |
196 | } |
197 | } |
198 | |
199 | static int vhost_net_buf_peek_len(void *ptr) |
200 | { |
201 | if (tun_is_xdp_frame(ptr)) { |
202 | struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); |
203 | |
204 | return xdpf->len; |
205 | } |
206 | |
return __skb_array_len_with_tag(ptr);
208 | } |
209 | |
210 | static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) |
211 | { |
212 | struct vhost_net_buf *rxq = &nvq->rxq; |
213 | |
214 | if (!vhost_net_buf_is_empty(rxq)) |
215 | goto out; |
216 | |
217 | if (!vhost_net_buf_produce(nvq)) |
218 | return 0; |
219 | |
220 | out: |
return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
222 | } |
223 | |
224 | static void vhost_net_buf_init(struct vhost_net_buf *rxq) |
225 | { |
226 | rxq->head = rxq->tail = 0; |
227 | } |
228 | |
229 | static void vhost_net_enable_zcopy(int vq) |
230 | { |
231 | vhost_net_zcopy_mask |= 0x1 << vq; |
232 | } |
233 | |
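/* Allocate the refcount structure used to track outstanding zerocopy buffers
 * for a TX virtqueue. Returns NULL when zerocopy is not used. */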
234 | static struct vhost_net_ubuf_ref * |
235 | vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) |
236 | { |
237 | struct vhost_net_ubuf_ref *ubufs; |
238 | /* No zero copy backend? Nothing to count. */ |
239 | if (!zcopy) |
240 | return NULL; |
ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
if (!ubufs)
return ERR_PTR(-ENOMEM);
atomic_set(&ubufs->refcount, 1);
245 | init_waitqueue_head(&ubufs->wait); |
246 | ubufs->vq = vq; |
247 | return ubufs; |
248 | } |
249 | |
250 | static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) |
251 | { |
int r = atomic_sub_return(1, &ubufs->refcount);
253 | if (unlikely(!r)) |
254 | wake_up(&ubufs->wait); |
255 | return r; |
256 | } |
257 | |
258 | static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) |
259 | { |
260 | vhost_net_ubuf_put(ubufs); |
261 | wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); |
262 | } |
263 | |
264 | static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) |
265 | { |
266 | vhost_net_ubuf_put_and_wait(ubufs); |
kfree(ubufs);
268 | } |
269 | |
270 | static void vhost_net_clear_ubuf_info(struct vhost_net *n) |
271 | { |
272 | int i; |
273 | |
274 | for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { |
kfree(n->vqs[i].ubuf_info);
276 | n->vqs[i].ubuf_info = NULL; |
277 | } |
278 | } |
279 | |
280 | static int vhost_net_set_ubuf_info(struct vhost_net *n) |
281 | { |
282 | bool zcopy; |
283 | int i; |
284 | |
285 | for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { |
286 | zcopy = vhost_net_zcopy_mask & (0x1 << i); |
287 | if (!zcopy) |
288 | continue; |
289 | n->vqs[i].ubuf_info = |
290 | kmalloc_array(UIO_MAXIOV, |
sizeof(*n->vqs[i].ubuf_info),
292 | GFP_KERNEL); |
293 | if (!n->vqs[i].ubuf_info) |
294 | goto err; |
295 | } |
296 | return 0; |
297 | |
298 | err: |
299 | vhost_net_clear_ubuf_info(n); |
300 | return -ENOMEM; |
301 | } |
302 | |
303 | static void vhost_net_vq_reset(struct vhost_net *n) |
304 | { |
305 | int i; |
306 | |
307 | vhost_net_clear_ubuf_info(n); |
308 | |
309 | for (i = 0; i < VHOST_NET_VQ_MAX; i++) { |
310 | n->vqs[i].done_idx = 0; |
311 | n->vqs[i].upend_idx = 0; |
312 | n->vqs[i].ubufs = NULL; |
313 | n->vqs[i].vhost_hlen = 0; |
314 | n->vqs[i].sock_hlen = 0; |
vhost_net_buf_init(&n->vqs[i].rxq);
316 | } |
317 | |
318 | } |
319 | |
320 | static void vhost_net_tx_packet(struct vhost_net *net) |
321 | { |
322 | ++net->tx_packets; |
323 | if (net->tx_packets < 1024) |
324 | return; |
325 | net->tx_packets = 0; |
326 | net->tx_zcopy_err = 0; |
327 | } |
328 | |
329 | static void vhost_net_tx_err(struct vhost_net *net) |
330 | { |
331 | ++net->tx_zcopy_err; |
332 | } |
333 | |
334 | static bool vhost_net_tx_select_zcopy(struct vhost_net *net) |
335 | { |
336 | /* TX flush waits for outstanding DMAs to be done. |
337 | * Don't start new DMAs. |
338 | */ |
339 | return !net->tx_flush && |
340 | net->tx_packets / 64 >= net->tx_zcopy_err; |
341 | } |
342 | |
343 | static bool vhost_sock_zcopy(struct socket *sock) |
344 | { |
345 | return unlikely(experimental_zcopytx) && |
sock_flag(sock->sk, SOCK_ZEROCOPY);
347 | } |
348 | |
349 | static bool vhost_sock_xdp(struct socket *sock) |
350 | { |
return sock_flag(sock->sk, SOCK_XDP);
352 | } |
353 | |
/* The lower device driver may complete DMAs out of order. upend_idx tracks
 * the end of the used idx, done_idx tracks the head of the used idx. Once
 * the lower device's DMAs have completed contiguously, we signal the used
 * idx to the KVM guest.
 */
359 | static void vhost_zerocopy_signal_used(struct vhost_net *net, |
360 | struct vhost_virtqueue *vq) |
361 | { |
362 | struct vhost_net_virtqueue *nvq = |
363 | container_of(vq, struct vhost_net_virtqueue, vq); |
364 | int i, add; |
365 | int j = 0; |
366 | |
367 | for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { |
368 | if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) |
369 | vhost_net_tx_err(net); |
370 | if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { |
371 | vq->heads[i].len = VHOST_DMA_CLEAR_LEN; |
372 | ++j; |
373 | } else |
374 | break; |
375 | } |
376 | while (j) { |
377 | add = min(UIO_MAXIOV - nvq->done_idx, j); |
378 | vhost_add_used_and_signal_n(vq->dev, vq, |
&vq->heads[nvq->done_idx], add);
380 | nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; |
381 | j -= add; |
382 | } |
383 | } |
384 | |
385 | static void vhost_zerocopy_callback(struct sk_buff *skb, |
386 | struct ubuf_info *ubuf_base, bool success) |
387 | { |
388 | struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); |
389 | struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; |
390 | struct vhost_virtqueue *vq = ubufs->vq; |
391 | int cnt; |
392 | |
393 | rcu_read_lock_bh(); |
394 | |
395 | /* set len to mark this desc buffers done DMA */ |
396 | vq->heads[ubuf->desc].len = success ? |
397 | VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; |
398 | cnt = vhost_net_ubuf_put(ubufs); |
399 | |
400 | /* |
401 | * Trigger polling thread if guest stopped submitting new buffers: |
402 | * in this case, the refcount after decrement will eventually reach 1. |
403 | * We also trigger polling periodically after each 16 packets |
404 | * (the value 16 here is more or less arbitrary, it's tuned to trigger |
405 | * less than 10% of times). |
406 | */ |
407 | if (cnt <= 1 || !(cnt % 16)) |
vhost_poll_queue(&vq->poll);
409 | |
410 | rcu_read_unlock_bh(); |
411 | } |
412 | |
413 | static inline unsigned long busy_clock(void) |
414 | { |
415 | return local_clock() >> 10; |
416 | } |
417 | |
418 | static bool vhost_can_busy_poll(unsigned long endtime) |
419 | { |
420 | return likely(!need_resched() && !time_after(busy_clock(), endtime) && |
421 | !signal_pending(current)); |
422 | } |
423 | |
424 | static void vhost_net_disable_vq(struct vhost_net *n, |
425 | struct vhost_virtqueue *vq) |
426 | { |
427 | struct vhost_net_virtqueue *nvq = |
428 | container_of(vq, struct vhost_net_virtqueue, vq); |
429 | struct vhost_poll *poll = n->poll + (nvq - n->vqs); |
430 | if (!vhost_vq_get_backend(vq)) |
431 | return; |
432 | vhost_poll_stop(poll); |
433 | } |
434 | |
435 | static int vhost_net_enable_vq(struct vhost_net *n, |
436 | struct vhost_virtqueue *vq) |
437 | { |
438 | struct vhost_net_virtqueue *nvq = |
439 | container_of(vq, struct vhost_net_virtqueue, vq); |
440 | struct vhost_poll *poll = n->poll + (nvq - n->vqs); |
441 | struct socket *sock; |
442 | |
443 | sock = vhost_vq_get_backend(vq); |
444 | if (!sock) |
445 | return 0; |
446 | |
return vhost_poll_start(poll, sock->file);
448 | } |
449 | |
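/* Flush any heads batched in vq->heads to the used ring and signal the
 * guest. */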
450 | static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) |
451 | { |
452 | struct vhost_virtqueue *vq = &nvq->vq; |
453 | struct vhost_dev *dev = vq->dev; |
454 | |
455 | if (!nvq->done_idx) |
456 | return; |
457 | |
vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
459 | nvq->done_idx = 0; |
460 | } |
461 | |
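/* Submit all batched XDP buffers to the backend with a single sendmsg() and
 * then report the corresponding used buffers to the guest. */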
462 | static void vhost_tx_batch(struct vhost_net *net, |
463 | struct vhost_net_virtqueue *nvq, |
464 | struct socket *sock, |
465 | struct msghdr *msghdr) |
466 | { |
467 | struct tun_msg_ctl ctl = { |
468 | .type = TUN_MSG_PTR, |
469 | .num = nvq->batched_xdp, |
470 | .ptr = nvq->xdp, |
471 | }; |
472 | int i, err; |
473 | |
474 | if (nvq->batched_xdp == 0) |
475 | goto signal_used; |
476 | |
477 | msghdr->msg_control = &ctl; |
478 | msghdr->msg_controllen = sizeof(ctl); |
479 | err = sock->ops->sendmsg(sock, msghdr, 0); |
480 | if (unlikely(err < 0)) { |
vq_err(&nvq->vq, "Fail to batch sending packets\n");
482 | |
483 | /* free pages owned by XDP; since this is an unlikely error path, |
484 | * keep it simple and avoid more complex bulk update for the |
485 | * used pages |
486 | */ |
487 | for (i = 0; i < nvq->batched_xdp; ++i) |
put_page(virt_to_head_page(nvq->xdp[i].data));
489 | nvq->batched_xdp = 0; |
490 | nvq->done_idx = 0; |
491 | return; |
492 | } |
493 | |
494 | signal_used: |
495 | vhost_net_signal_used(nvq); |
496 | nvq->batched_xdp = 0; |
497 | } |
498 | |
499 | static int sock_has_rx_data(struct socket *sock) |
500 | { |
501 | if (unlikely(!sock)) |
502 | return 0; |
503 | |
504 | if (sock->ops->peek_len) |
505 | return sock->ops->peek_len(sock); |
506 | |
return skb_queue_empty(&sock->sk->sk_receive_queue);
508 | } |
509 | |
510 | static void vhost_net_busy_poll_try_queue(struct vhost_net *net, |
511 | struct vhost_virtqueue *vq) |
512 | { |
513 | if (!vhost_vq_avail_empty(&net->dev, vq)) { |
vhost_poll_queue(&vq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
vhost_poll_queue(&vq->poll);
518 | } |
519 | } |
520 | |
521 | static void vhost_net_busy_poll(struct vhost_net *net, |
522 | struct vhost_virtqueue *rvq, |
523 | struct vhost_virtqueue *tvq, |
524 | bool *busyloop_intr, |
525 | bool poll_rx) |
526 | { |
527 | unsigned long busyloop_timeout; |
528 | unsigned long endtime; |
529 | struct socket *sock; |
530 | struct vhost_virtqueue *vq = poll_rx ? tvq : rvq; |
531 | |
/* Try to hold the vq mutex of the paired virtqueue. We can't
 * use mutex_lock() here since we could not guarantee a
 * consistent lock ordering.
 */
if (!mutex_trylock(&vq->mutex))
537 | return; |
538 | |
539 | vhost_disable_notify(&net->dev, vq); |
sock = vhost_vq_get_backend(rvq);
541 | |
542 | busyloop_timeout = poll_rx ? rvq->busyloop_timeout: |
543 | tvq->busyloop_timeout; |
544 | |
545 | preempt_disable(); |
546 | endtime = busy_clock() + busyloop_timeout; |
547 | |
548 | while (vhost_can_busy_poll(endtime)) { |
549 | if (vhost_vq_has_work(vq)) { |
550 | *busyloop_intr = true; |
551 | break; |
552 | } |
553 | |
554 | if ((sock_has_rx_data(sock) && |
555 | !vhost_vq_avail_empty(&net->dev, rvq)) || |
556 | !vhost_vq_avail_empty(&net->dev, tvq)) |
557 | break; |
558 | |
559 | cpu_relax(); |
560 | } |
561 | |
562 | preempt_enable(); |
563 | |
564 | if (poll_rx || sock_has_rx_data(sock)) |
565 | vhost_net_busy_poll_try_queue(net, vq); |
566 | else if (!poll_rx) /* On tx here, sock has no rx data. */ |
567 | vhost_enable_notify(&net->dev, rvq); |
568 | |
mutex_unlock(&vq->mutex);
570 | } |
571 | |
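/* Fetch the next available TX descriptor. If the ring looks empty and busy
 * polling is enabled, busy poll the paired RX virtqueue/socket for a while
 * before retrying. */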
572 | static int vhost_net_tx_get_vq_desc(struct vhost_net *net, |
573 | struct vhost_net_virtqueue *tnvq, |
574 | unsigned int *out_num, unsigned int *in_num, |
575 | struct msghdr *msghdr, bool *busyloop_intr) |
576 | { |
577 | struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; |
578 | struct vhost_virtqueue *rvq = &rnvq->vq; |
579 | struct vhost_virtqueue *tvq = &tnvq->vq; |
580 | |
int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
582 | out_num, in_num, NULL, NULL); |
583 | |
584 | if (r == tvq->num && tvq->busyloop_timeout) { |
585 | /* Flush batched packets first */ |
if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
vhost_tx_batch(net, tnvq,
vhost_vq_get_backend(tvq),
msghdr);
590 | |
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
592 | |
r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
594 | out_num, in_num, NULL, NULL); |
595 | } |
596 | |
597 | return r; |
598 | } |
599 | |
600 | static bool vhost_exceeds_maxpend(struct vhost_net *net) |
601 | { |
602 | struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; |
603 | struct vhost_virtqueue *vq = &nvq->vq; |
604 | |
605 | return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > |
606 | min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); |
607 | } |
608 | |
609 | static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, |
610 | size_t hdr_size, int out) |
611 | { |
612 | /* Skip header. TODO: support TSO. */ |
size_t len = iov_length(vq->iov, out);

iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len);
iov_iter_advance(iter, hdr_size);

return iov_iter_count(iter);
619 | } |
620 | |
621 | static int get_tx_bufs(struct vhost_net *net, |
622 | struct vhost_net_virtqueue *nvq, |
623 | struct msghdr *msg, |
624 | unsigned int *out, unsigned int *in, |
625 | size_t *len, bool *busyloop_intr) |
626 | { |
627 | struct vhost_virtqueue *vq = &nvq->vq; |
628 | int ret; |
629 | |
ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
631 | |
632 | if (ret < 0 || ret == vq->num) |
633 | return ret; |
634 | |
635 | if (*in) { |
vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n",
*out, *in);
638 | return -EFAULT; |
639 | } |
640 | |
641 | /* Sanity check */ |
*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
643 | if (*len == 0) { |
vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
645 | *len, nvq->vhost_hlen); |
646 | return -EFAULT; |
647 | } |
648 | |
649 | return ret; |
650 | } |
651 | |
652 | static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) |
653 | { |
654 | return total_len < VHOST_NET_WEIGHT && |
655 | !vhost_vq_avail_empty(vq->dev, vq); |
656 | } |
657 | |
658 | static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, |
659 | struct page_frag *pfrag, gfp_t gfp) |
660 | { |
661 | if (pfrag->page) { |
662 | if (pfrag->offset + sz <= pfrag->size) |
663 | return true; |
__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
665 | } |
666 | |
667 | pfrag->offset = 0; |
668 | net->refcnt_bias = 0; |
669 | if (SKB_FRAG_PAGE_ORDER) { |
670 | /* Avoid direct reclaim but allow kswapd to wake */ |
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
672 | __GFP_COMP | __GFP_NOWARN | |
673 | __GFP_NORETRY, |
674 | SKB_FRAG_PAGE_ORDER); |
675 | if (likely(pfrag->page)) { |
676 | pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; |
677 | goto done; |
678 | } |
679 | } |
680 | pfrag->page = alloc_page(gfp); |
681 | if (likely(pfrag->page)) { |
682 | pfrag->size = PAGE_SIZE; |
683 | goto done; |
684 | } |
685 | return false; |
686 | |
687 | done: |
688 | net->refcnt_bias = USHRT_MAX; |
page_ref_add(pfrag->page, USHRT_MAX - 1);
690 | return true; |
691 | } |
692 | |
693 | #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) |
694 | |
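/* Copy a guest TX packet into a page fragment and set up an xdp_buff for it,
 * so that a batch of packets can later be pushed to the backend in a single
 * sendmsg() call (see vhost_tx_batch()). */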
695 | static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, |
696 | struct iov_iter *from) |
697 | { |
698 | struct vhost_virtqueue *vq = &nvq->vq; |
699 | struct vhost_net *net = container_of(vq->dev, struct vhost_net, |
700 | dev); |
701 | struct socket *sock = vhost_vq_get_backend(vq); |
702 | struct page_frag *alloc_frag = &net->page_frag; |
703 | struct virtio_net_hdr *gso; |
704 | struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; |
705 | struct tun_xdp_hdr *hdr; |
size_t len = iov_iter_count(from);
707 | int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0; |
708 | int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); |
709 | int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen); |
710 | int sock_hlen = nvq->sock_hlen; |
711 | void *buf; |
712 | int copied; |
713 | |
714 | if (unlikely(len < nvq->sock_hlen)) |
715 | return -EFAULT; |
716 | |
717 | if (SKB_DATA_ALIGN(len + pad) + |
718 | SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) |
719 | return -ENOSPC; |
720 | |
721 | buflen += SKB_DATA_ALIGN(len + pad); |
722 | alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); |
723 | if (unlikely(!vhost_net_page_frag_refill(net, buflen, |
724 | alloc_frag, GFP_KERNEL))) |
725 | return -ENOMEM; |
726 | |
727 | buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; |
copied = copy_page_from_iter(alloc_frag->page,
alloc_frag->offset +
offsetof(struct tun_xdp_hdr, gso),
sock_hlen, from);
732 | if (copied != sock_hlen) |
733 | return -EFAULT; |
734 | |
735 | hdr = buf; |
736 | gso = &hdr->gso; |
737 | |
738 | if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && |
vhost16_to_cpu(vq, gso->csum_start) +
vhost16_to_cpu(vq, gso->csum_offset) + 2 >
vhost16_to_cpu(vq, gso->hdr_len)) {
gso->hdr_len = cpu_to_vhost16(vq,
vhost16_to_cpu(vq, gso->csum_start) +
vhost16_to_cpu(vq, gso->csum_offset) + 2);

if (vhost16_to_cpu(vq, gso->hdr_len) > len)
747 | return -EINVAL; |
748 | } |
749 | |
750 | len -= sock_hlen; |
copied = copy_page_from_iter(alloc_frag->page,
alloc_frag->offset + pad,
len, from);
754 | if (copied != len) |
755 | return -EFAULT; |
756 | |
xdp_init_buff(xdp, buflen, NULL);
xdp_prepare_buff(xdp, buf, pad, len, true);
759 | hdr->buflen = buflen; |
760 | |
761 | --net->refcnt_bias; |
762 | alloc_frag->offset += buflen; |
763 | |
764 | ++nvq->batched_xdp; |
765 | |
766 | return 0; |
767 | } |
768 | |
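/* TX path used when zerocopy is not in effect: packets are copied (or
 * batched as XDP buffers) and handed to the backend socket. */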
769 | static void handle_tx_copy(struct vhost_net *net, struct socket *sock) |
770 | { |
771 | struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; |
772 | struct vhost_virtqueue *vq = &nvq->vq; |
773 | unsigned out, in; |
774 | int head; |
775 | struct msghdr msg = { |
776 | .msg_name = NULL, |
777 | .msg_namelen = 0, |
778 | .msg_control = NULL, |
779 | .msg_controllen = 0, |
780 | .msg_flags = MSG_DONTWAIT, |
781 | }; |
782 | size_t len, total_len = 0; |
783 | int err; |
784 | int sent_pkts = 0; |
785 | bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); |
786 | |
787 | do { |
788 | bool busyloop_intr = false; |
789 | |
790 | if (nvq->done_idx == VHOST_NET_BATCH) |
vhost_tx_batch(net, nvq, sock, &msg);
792 | |
head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
&busyloop_intr);
795 | /* On error, stop handling until the next kick. */ |
796 | if (unlikely(head < 0)) |
797 | break; |
798 | /* Nothing new? Wait for eventfd to tell us they refilled. */ |
799 | if (head == vq->num) { |
800 | if (unlikely(busyloop_intr)) { |
vhost_poll_queue(&vq->poll);
802 | } else if (unlikely(vhost_enable_notify(&net->dev, |
803 | vq))) { |
804 | vhost_disable_notify(&net->dev, vq); |
805 | continue; |
806 | } |
807 | break; |
808 | } |
809 | |
810 | total_len += len; |
811 | |
812 | /* For simplicity, TX batching is only enabled if |
813 | * sndbuf is unlimited. |
814 | */ |
815 | if (sock_can_batch) { |
err = vhost_net_build_xdp(nvq, &msg.msg_iter);
817 | if (!err) { |
818 | goto done; |
819 | } else if (unlikely(err != -ENOSPC)) { |
vhost_tx_batch(net, nvq, sock, &msg);
vhost_discard_vq_desc(vq, 1);
vhost_net_enable_vq(net, vq);
823 | break; |
824 | } |
825 | |
826 | /* We can't build XDP buff, go for single |
827 | * packet path but let's flush batched |
828 | * packets. |
829 | */ |
vhost_tx_batch(net, nvq, sock, &msg);
831 | msg.msg_control = NULL; |
832 | } else { |
833 | if (tx_can_batch(vq, total_len)) |
834 | msg.msg_flags |= MSG_MORE; |
835 | else |
836 | msg.msg_flags &= ~MSG_MORE; |
837 | } |
838 | |
839 | err = sock->ops->sendmsg(sock, &msg, len); |
840 | if (unlikely(err < 0)) { |
841 | if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { |
vhost_discard_vq_desc(vq, 1);
vhost_net_enable_vq(net, vq);
844 | break; |
845 | } |
pr_debug("Fail to send packet: err %d", err);
} else if (unlikely(err != len))
pr_debug("Truncated TX packet: len %d != %zd\n",
849 | err, len); |
850 | done: |
vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
852 | vq->heads[nvq->done_idx].len = 0; |
853 | ++nvq->done_idx; |
854 | } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); |
855 | |
vhost_tx_batch(net, nvq, sock, &msg);
857 | } |
858 | |
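/* TX path used when the backend supports zerocopy: large packets are pinned
 * and transmitted without copying; completion is reported asynchronously via
 * vhost_zerocopy_callback(). */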
859 | static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) |
860 | { |
861 | struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; |
862 | struct vhost_virtqueue *vq = &nvq->vq; |
863 | unsigned out, in; |
864 | int head; |
865 | struct msghdr msg = { |
866 | .msg_name = NULL, |
867 | .msg_namelen = 0, |
868 | .msg_control = NULL, |
869 | .msg_controllen = 0, |
870 | .msg_flags = MSG_DONTWAIT, |
871 | }; |
872 | struct tun_msg_ctl ctl; |
873 | size_t len, total_len = 0; |
874 | int err; |
875 | struct vhost_net_ubuf_ref *ubufs; |
876 | struct ubuf_info_msgzc *ubuf; |
877 | bool zcopy_used; |
878 | int sent_pkts = 0; |
879 | |
880 | do { |
881 | bool busyloop_intr; |
882 | |
883 | /* Release DMAs done buffers first */ |
884 | vhost_zerocopy_signal_used(net, vq); |
885 | |
886 | busyloop_intr = false; |
head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
&busyloop_intr);
889 | /* On error, stop handling until the next kick. */ |
890 | if (unlikely(head < 0)) |
891 | break; |
892 | /* Nothing new? Wait for eventfd to tell us they refilled. */ |
893 | if (head == vq->num) { |
894 | if (unlikely(busyloop_intr)) { |
vhost_poll_queue(&vq->poll);
896 | } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { |
897 | vhost_disable_notify(&net->dev, vq); |
898 | continue; |
899 | } |
900 | break; |
901 | } |
902 | |
903 | zcopy_used = len >= VHOST_GOODCOPY_LEN |
904 | && !vhost_exceeds_maxpend(net) |
905 | && vhost_net_tx_select_zcopy(net); |
906 | |
907 | /* use msg_control to pass vhost zerocopy ubuf info to skb */ |
908 | if (zcopy_used) { |
909 | ubuf = nvq->ubuf_info + nvq->upend_idx; |
vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
911 | vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; |
912 | ubuf->ctx = nvq->ubufs; |
913 | ubuf->desc = nvq->upend_idx; |
914 | ubuf->ubuf.callback = vhost_zerocopy_callback; |
915 | ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; |
refcount_set(&ubuf->ubuf.refcnt, 1);
917 | msg.msg_control = &ctl; |
918 | ctl.type = TUN_MSG_UBUF; |
919 | ctl.ptr = &ubuf->ubuf; |
920 | msg.msg_controllen = sizeof(ctl); |
921 | ubufs = nvq->ubufs; |
atomic_inc(&ubufs->refcount);
923 | nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; |
924 | } else { |
925 | msg.msg_control = NULL; |
926 | ubufs = NULL; |
927 | } |
928 | total_len += len; |
929 | if (tx_can_batch(vq, total_len) && |
930 | likely(!vhost_exceeds_maxpend(net))) { |
931 | msg.msg_flags |= MSG_MORE; |
932 | } else { |
933 | msg.msg_flags &= ~MSG_MORE; |
934 | } |
935 | |
936 | err = sock->ops->sendmsg(sock, &msg, len); |
937 | if (unlikely(err < 0)) { |
938 | bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS; |
939 | |
940 | if (zcopy_used) { |
941 | if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) |
942 | vhost_net_ubuf_put(ubufs); |
943 | if (retry) |
944 | nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) |
945 | % UIO_MAXIOV; |
946 | else |
947 | vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; |
948 | } |
949 | if (retry) { |
vhost_discard_vq_desc(vq, 1);
vhost_net_enable_vq(net, vq);
952 | break; |
953 | } |
pr_debug("Fail to send packet: err %d", err);
} else if (unlikely(err != len))
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
958 | if (!zcopy_used) |
vhost_add_used_and_signal(&net->dev, vq, head, 0);
960 | else |
961 | vhost_zerocopy_signal_used(net, vq); |
962 | vhost_net_tx_packet(net); |
963 | } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); |
964 | } |
965 | |
966 | /* Expects to be always run from workqueue - which acts as |
 * read-side critical section for our kind of RCU. */
968 | static void handle_tx(struct vhost_net *net) |
969 | { |
970 | struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; |
971 | struct vhost_virtqueue *vq = &nvq->vq; |
972 | struct socket *sock; |
973 | |
mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
975 | sock = vhost_vq_get_backend(vq); |
976 | if (!sock) |
977 | goto out; |
978 | |
979 | if (!vq_meta_prefetch(vq)) |
980 | goto out; |
981 | |
982 | vhost_disable_notify(&net->dev, vq); |
vhost_net_disable_vq(net, vq);
984 | |
985 | if (vhost_sock_zcopy(sock)) |
986 | handle_tx_zerocopy(net, sock); |
987 | else |
988 | handle_tx_copy(net, sock); |
989 | |
990 | out: |
mutex_unlock(&vq->mutex);
992 | } |
993 | |
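/* Return the length of the next buffer queued on the RX backend, or 0 if
 * nothing is pending. */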
994 | static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) |
995 | { |
996 | struct sk_buff *head; |
997 | int len = 0; |
998 | unsigned long flags; |
999 | |
1000 | if (rvq->rx_ring) |
return vhost_net_buf_peek(rvq);
1002 | |
1003 | spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); |
head = skb_peek(&sk->sk_receive_queue);
1005 | if (likely(head)) { |
1006 | len = head->len; |
1007 | if (skb_vlan_tag_present(head)) |
1008 | len += VLAN_HLEN; |
1009 | } |
1010 | |
spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
1012 | return len; |
1013 | } |
1014 | |
1015 | static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, |
1016 | bool *busyloop_intr) |
1017 | { |
1018 | struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; |
1019 | struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; |
1020 | struct vhost_virtqueue *rvq = &rnvq->vq; |
1021 | struct vhost_virtqueue *tvq = &tnvq->vq; |
int len = peek_head_len(rnvq, sk);
1023 | |
1024 | if (!len && rvq->busyloop_timeout) { |
1025 | /* Flush batched heads first */ |
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);

len = peek_head_len(rnvq, sk);
1031 | } |
1032 | |
1033 | return len; |
1034 | } |
1035 | |
1036 | /* This is a multi-buffer version of vhost_get_desc, that works if |
1037 | * vq has read descriptors only. |
1038 | * @vq - the relevant virtqueue |
1039 | * @datalen - data length we'll be reading |
1040 | * @iovcount - returned count of io vectors we fill |
1041 | * @log - vhost log |
1042 | * @log_num - log offset |
1043 | * @quota - headcount quota, 1 for big buffer |
1044 | * returns number of buffer heads allocated, negative on error |
1045 | */ |
1046 | static int get_rx_bufs(struct vhost_virtqueue *vq, |
1047 | struct vring_used_elem *heads, |
1048 | int datalen, |
1049 | unsigned *iovcount, |
1050 | struct vhost_log *log, |
1051 | unsigned *log_num, |
1052 | unsigned int quota) |
1053 | { |
1054 | unsigned int out, in; |
1055 | int seg = 0; |
1056 | int headcount = 0; |
1057 | unsigned d; |
1058 | int r, nlogs = 0; |
1059 | /* len is always initialized before use since we are always called with |
1060 | * datalen > 0. |
1061 | */ |
1062 | u32 len; |
1063 | |
1064 | while (datalen > 0 && headcount < quota) { |
1065 | if (unlikely(seg >= UIO_MAXIOV)) { |
1066 | r = -ENOBUFS; |
1067 | goto err; |
1068 | } |
r = vhost_get_vq_desc(vq, vq->iov + seg,
ARRAY_SIZE(vq->iov) - seg, &out,
&in, log, log_num);
1072 | if (unlikely(r < 0)) |
1073 | goto err; |
1074 | |
1075 | d = r; |
1076 | if (d == vq->num) { |
1077 | r = 0; |
1078 | goto err; |
1079 | } |
1080 | if (unlikely(out || in <= 0)) { |
vq_err(vq, "unexpected descriptor format for RX: "
"out %d, in %d\n", out, in);
1083 | r = -EINVAL; |
1084 | goto err; |
1085 | } |
1086 | if (unlikely(log)) { |
1087 | nlogs += *log_num; |
1088 | log += *log_num; |
1089 | } |
heads[headcount].id = cpu_to_vhost32(vq, d);
len = iov_length(vq->iov + seg, in);
heads[headcount].len = cpu_to_vhost32(vq, len);
1093 | datalen -= len; |
1094 | ++headcount; |
1095 | seg += in; |
1096 | } |
heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
1098 | *iovcount = seg; |
1099 | if (unlikely(log)) |
1100 | *log_num = nlogs; |
1101 | |
1102 | /* Detect overrun */ |
1103 | if (unlikely(datalen > 0)) { |
1104 | r = UIO_MAXIOV + 1; |
1105 | goto err; |
1106 | } |
1107 | return headcount; |
1108 | err: |
vhost_discard_vq_desc(vq, headcount);
1110 | return r; |
1111 | } |
1112 | |
1113 | /* Expects to be always run from workqueue - which acts as |
 * read-side critical section for our kind of RCU. */
1115 | static void handle_rx(struct vhost_net *net) |
1116 | { |
1117 | struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; |
1118 | struct vhost_virtqueue *vq = &nvq->vq; |
1119 | unsigned in, log; |
1120 | struct vhost_log *vq_log; |
1121 | struct msghdr msg = { |
1122 | .msg_name = NULL, |
1123 | .msg_namelen = 0, |
1124 | .msg_control = NULL, /* FIXME: get and handle RX aux data. */ |
1125 | .msg_controllen = 0, |
1126 | .msg_flags = MSG_DONTWAIT, |
1127 | }; |
1128 | struct virtio_net_hdr hdr = { |
1129 | .flags = 0, |
1130 | .gso_type = VIRTIO_NET_HDR_GSO_NONE |
1131 | }; |
1132 | size_t total_len = 0; |
1133 | int err, mergeable; |
1134 | s16 headcount; |
1135 | size_t vhost_hlen, sock_hlen; |
1136 | size_t vhost_len, sock_len; |
1137 | bool busyloop_intr = false; |
1138 | struct socket *sock; |
1139 | struct iov_iter fixup; |
1140 | __virtio16 num_buffers; |
1141 | int recv_pkts = 0; |
1142 | |
mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
1144 | sock = vhost_vq_get_backend(vq); |
1145 | if (!sock) |
1146 | goto out; |
1147 | |
1148 | if (!vq_meta_prefetch(vq)) |
1149 | goto out; |
1150 | |
1151 | vhost_disable_notify(&net->dev, vq); |
vhost_net_disable_vq(net, vq);
1153 | |
1154 | vhost_hlen = nvq->vhost_hlen; |
1155 | sock_hlen = nvq->sock_hlen; |
1156 | |
1157 | vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? |
1158 | vq->log : NULL; |
1159 | mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); |
1160 | |
1161 | do { |
sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
&busyloop_intr);
1164 | if (!sock_len) |
1165 | break; |
1166 | sock_len += sock_hlen; |
1167 | vhost_len = sock_len + vhost_hlen; |
headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
vhost_len, &in, vq_log, &log,
likely(mergeable) ? UIO_MAXIOV : 1);
1171 | /* On error, stop handling until the next kick. */ |
1172 | if (unlikely(headcount < 0)) |
1173 | goto out; |
1174 | /* OK, now we need to know about added descriptors. */ |
1175 | if (!headcount) { |
1176 | if (unlikely(busyloop_intr)) { |
vhost_poll_queue(&vq->poll);
1178 | } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { |
1179 | /* They have slipped one in as we were |
1180 | * doing that: check again. */ |
1181 | vhost_disable_notify(&net->dev, vq); |
1182 | continue; |
1183 | } |
1184 | /* Nothing new? Wait for eventfd to tell us |
1185 | * they refilled. */ |
1186 | goto out; |
1187 | } |
1188 | busyloop_intr = false; |
1189 | if (nvq->rx_ring) |
msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
1191 | /* On overrun, truncate and discard */ |
1192 | if (unlikely(headcount > UIO_MAXIOV)) { |
iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1);
err = sock->ops->recvmsg(sock, &msg,
1, MSG_DONTWAIT | MSG_TRUNC);
pr_debug("Discarded rx packet: len %zd\n", sock_len);
1197 | continue; |
1198 | } |
1199 | /* We don't need to be notified again. */ |
iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len);
1201 | fixup = msg.msg_iter; |
1202 | if (unlikely((vhost_hlen))) { |
1203 | /* We will supply the header ourselves |
1204 | * TODO: support TSO. |
1205 | */ |
iov_iter_advance(&msg.msg_iter, vhost_hlen);
1207 | } |
1208 | err = sock->ops->recvmsg(sock, &msg, |
1209 | sock_len, MSG_DONTWAIT | MSG_TRUNC); |
1210 | /* Userspace might have consumed the packet meanwhile: |
1211 | * it's not supposed to do this usually, but might be hard |
1212 | * to prevent. Discard data we got (if any) and keep going. */ |
1213 | if (unlikely(err != sock_len)) { |
pr_debug("Discarded rx packet: "
" len %d, expected %zd\n", err, sock_len);
vhost_discard_vq_desc(vq, headcount);
1217 | continue; |
1218 | } |
1219 | /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ |
1220 | if (unlikely(vhost_hlen)) { |
if (copy_to_iter(&hdr, sizeof(hdr),
&fixup) != sizeof(hdr)) {
vq_err(vq, "Unable to write vnet_hdr "
"at addr %p\n", vq->iov->iov_base);
1225 | goto out; |
1226 | } |
1227 | } else { |
1228 | /* Header came from socket; we'll need to patch |
1229 | * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF |
1230 | */ |
iov_iter_advance(&fixup, sizeof(hdr));
1232 | } |
1233 | /* TODO: Should check and handle checksum. */ |
1234 | |
num_buffers = cpu_to_vhost16(vq, headcount);
if (likely(mergeable) &&
copy_to_iter(&num_buffers, sizeof num_buffers,
&fixup) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
vhost_discard_vq_desc(vq, headcount);
1241 | goto out; |
1242 | } |
1243 | nvq->done_idx += headcount; |
1244 | if (nvq->done_idx > VHOST_NET_BATCH) |
1245 | vhost_net_signal_used(nvq); |
1246 | if (unlikely(vq_log)) |
vhost_log_write(vq, vq_log, log, vhost_len,
vq->iov, in);
1249 | total_len += vhost_len; |
1250 | } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); |
1251 | |
1252 | if (unlikely(busyloop_intr)) |
vhost_poll_queue(&vq->poll);
else if (!sock_len)
vhost_net_enable_vq(net, vq);
1256 | out: |
1257 | vhost_net_signal_used(nvq); |
mutex_unlock(&vq->mutex);
1259 | } |
1260 | |
1261 | static void handle_tx_kick(struct vhost_work *work) |
1262 | { |
1263 | struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, |
1264 | poll.work); |
1265 | struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); |
1266 | |
1267 | handle_tx(net); |
1268 | } |
1269 | |
1270 | static void handle_rx_kick(struct vhost_work *work) |
1271 | { |
1272 | struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, |
1273 | poll.work); |
1274 | struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); |
1275 | |
1276 | handle_rx(net); |
1277 | } |
1278 | |
1279 | static void handle_tx_net(struct vhost_work *work) |
1280 | { |
1281 | struct vhost_net *net = container_of(work, struct vhost_net, |
1282 | poll[VHOST_NET_VQ_TX].work); |
1283 | handle_tx(net); |
1284 | } |
1285 | |
1286 | static void handle_rx_net(struct vhost_work *work) |
1287 | { |
1288 | struct vhost_net *net = container_of(work, struct vhost_net, |
1289 | poll[VHOST_NET_VQ_RX].work); |
1290 | handle_rx(net); |
1291 | } |
1292 | |
1293 | static int vhost_net_open(struct inode *inode, struct file *f) |
1294 | { |
1295 | struct vhost_net *n; |
1296 | struct vhost_dev *dev; |
1297 | struct vhost_virtqueue **vqs; |
1298 | void **queue; |
1299 | struct xdp_buff *xdp; |
1300 | int i; |
1301 | |
n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1303 | if (!n) |
1304 | return -ENOMEM; |
vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
if (!vqs) {
kvfree(n);
1308 | return -ENOMEM; |
1309 | } |
1310 | |
queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
1312 | GFP_KERNEL); |
1313 | if (!queue) { |
kfree(vqs);
kvfree(n);
1316 | return -ENOMEM; |
1317 | } |
1318 | n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; |
1319 | |
xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
1321 | if (!xdp) { |
kfree(vqs);
kvfree(n);
kfree(queue);
1325 | return -ENOMEM; |
1326 | } |
1327 | n->vqs[VHOST_NET_VQ_TX].xdp = xdp; |
1328 | |
1329 | dev = &n->dev; |
1330 | vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; |
1331 | vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; |
1332 | n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; |
1333 | n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; |
1334 | for (i = 0; i < VHOST_NET_VQ_MAX; i++) { |
1335 | n->vqs[i].ubufs = NULL; |
1336 | n->vqs[i].ubuf_info = NULL; |
1337 | n->vqs[i].upend_idx = 0; |
1338 | n->vqs[i].done_idx = 0; |
1339 | n->vqs[i].batched_xdp = 0; |
1340 | n->vqs[i].vhost_hlen = 0; |
1341 | n->vqs[i].sock_hlen = 0; |
1342 | n->vqs[i].rx_ring = NULL; |
vhost_net_buf_init(&n->vqs[i].rxq);
1344 | } |
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
UIO_MAXIOV + VHOST_NET_BATCH,
VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
NULL);

vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
vqs[VHOST_NET_VQ_TX]);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
vqs[VHOST_NET_VQ_RX]);
1354 | |
1355 | f->private_data = n; |
1356 | n->page_frag.page = NULL; |
1357 | n->refcnt_bias = 0; |
1358 | |
1359 | return 0; |
1360 | } |
1361 | |
1362 | static struct socket *vhost_net_stop_vq(struct vhost_net *n, |
1363 | struct vhost_virtqueue *vq) |
1364 | { |
1365 | struct socket *sock; |
1366 | struct vhost_net_virtqueue *nvq = |
1367 | container_of(vq, struct vhost_net_virtqueue, vq); |
1368 | |
1369 | mutex_lock(&vq->mutex); |
1370 | sock = vhost_vq_get_backend(vq); |
1371 | vhost_net_disable_vq(n, vq); |
1372 | vhost_vq_set_backend(vq, NULL); |
1373 | vhost_net_buf_unproduce(nvq); |
1374 | nvq->rx_ring = NULL; |
mutex_unlock(&vq->mutex);
1376 | return sock; |
1377 | } |
1378 | |
1379 | static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, |
1380 | struct socket **rx_sock) |
1381 | { |
*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
1384 | } |
1385 | |
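/* Flush all pending work and wait for outstanding zerocopy TX DMAs to
 * complete. */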
1386 | static void vhost_net_flush(struct vhost_net *n) |
1387 | { |
vhost_dev_flush(&n->dev);
1389 | if (n->vqs[VHOST_NET_VQ_TX].ubufs) { |
1390 | mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); |
1391 | n->tx_flush = true; |
mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
/* Wait for all lower device DMAs done. */
vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
n->tx_flush = false;
atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
1399 | } |
1400 | } |
1401 | |
1402 | static int vhost_net_release(struct inode *inode, struct file *f) |
1403 | { |
1404 | struct vhost_net *n = f->private_data; |
1405 | struct socket *tx_sock; |
1406 | struct socket *rx_sock; |
1407 | |
vhost_net_stop(n, &tx_sock, &rx_sock);
1409 | vhost_net_flush(n); |
1410 | vhost_dev_stop(&n->dev); |
1411 | vhost_dev_cleanup(&n->dev); |
1412 | vhost_net_vq_reset(n); |
1413 | if (tx_sock) |
1414 | sockfd_put(tx_sock); |
1415 | if (rx_sock) |
1416 | sockfd_put(rx_sock); |
1417 | /* Make sure no callbacks are outstanding */ |
1418 | synchronize_rcu(); |
1419 | /* We do an extra flush before freeing memory, |
1420 | * since jobs can re-queue themselves. */ |
1421 | vhost_net_flush(n); |
kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
kfree(n->dev.vqs);
if (n->page_frag.page)
__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
kvfree(n);
1428 | return 0; |
1429 | } |
1430 | |
1431 | static struct socket *get_raw_socket(int fd) |
1432 | { |
1433 | int r; |
struct socket *sock = sockfd_lookup(fd, &r);
1435 | |
1436 | if (!sock) |
return ERR_PTR(-ENOTSOCK);
1438 | |
1439 | /* Parameter checking */ |
1440 | if (sock->sk->sk_type != SOCK_RAW) { |
1441 | r = -ESOCKTNOSUPPORT; |
1442 | goto err; |
1443 | } |
1444 | |
1445 | if (sock->sk->sk_family != AF_PACKET) { |
1446 | r = -EPFNOSUPPORT; |
1447 | goto err; |
1448 | } |
1449 | return sock; |
1450 | err: |
1451 | sockfd_put(sock); |
return ERR_PTR(r);
1453 | } |
1454 | |
1455 | static struct ptr_ring *get_tap_ptr_ring(struct file *file) |
1456 | { |
1457 | struct ptr_ring *ring; |
1458 | ring = tun_get_tx_ring(file); |
if (!IS_ERR(ring))
goto out;
ring = tap_get_ptr_ring(file);
if (!IS_ERR(ring))
1463 | goto out; |
1464 | ring = NULL; |
1465 | out: |
1466 | return ring; |
1467 | } |
1468 | |
1469 | static struct socket *get_tap_socket(int fd) |
1470 | { |
1471 | struct file *file = fget(fd); |
1472 | struct socket *sock; |
1473 | |
1474 | if (!file) |
return ERR_PTR(-EBADF);
sock = tun_get_socket(file);
if (!IS_ERR(sock))
return sock;
sock = tap_get_socket(file);
if (IS_ERR(sock))
1481 | fput(file); |
1482 | return sock; |
1483 | } |
1484 | |
1485 | static struct socket *get_socket(int fd) |
1486 | { |
1487 | struct socket *sock; |
1488 | |
1489 | /* special case to disable backend */ |
1490 | if (fd == -1) |
1491 | return NULL; |
1492 | sock = get_raw_socket(fd); |
if (!IS_ERR(sock))
return sock;
sock = get_tap_socket(fd);
if (!IS_ERR(sock))
return sock;
return ERR_PTR(-ENOTSOCK);
1499 | } |
1500 | |
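/* VHOST_NET_SET_BACKEND: attach a tap/macvtap or raw packet socket to the
 * given virtqueue (fd == -1 detaches the current backend). */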
1501 | static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) |
1502 | { |
1503 | struct socket *sock, *oldsock; |
1504 | struct vhost_virtqueue *vq; |
1505 | struct vhost_net_virtqueue *nvq; |
1506 | struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; |
1507 | int r; |
1508 | |
1509 | mutex_lock(&n->dev.mutex); |
1510 | r = vhost_dev_check_owner(&n->dev); |
1511 | if (r) |
1512 | goto err; |
1513 | |
1514 | if (index >= VHOST_NET_VQ_MAX) { |
1515 | r = -ENOBUFS; |
1516 | goto err; |
1517 | } |
1518 | vq = &n->vqs[index].vq; |
1519 | nvq = &n->vqs[index]; |
1520 | mutex_lock(&vq->mutex); |
1521 | |
1522 | if (fd == -1) |
vhost_clear_msg(&n->dev);
1524 | |
1525 | /* Verify that ring has been setup correctly. */ |
1526 | if (!vhost_vq_access_ok(vq)) { |
1527 | r = -EFAULT; |
1528 | goto err_vq; |
1529 | } |
1530 | sock = get_socket(fd); |
if (IS_ERR(sock)) {
r = PTR_ERR(sock);
1533 | goto err_vq; |
1534 | } |
1535 | |
1536 | /* start polling new socket */ |
1537 | oldsock = vhost_vq_get_backend(vq); |
1538 | if (sock != oldsock) { |
1539 | ubufs = vhost_net_ubuf_alloc(vq, |
sock && vhost_sock_zcopy(sock));
if (IS_ERR(ubufs)) {
r = PTR_ERR(ubufs);
1543 | goto err_ubufs; |
1544 | } |
1545 | |
1546 | vhost_net_disable_vq(n, vq); |
vhost_vq_set_backend(vq, sock);
1548 | vhost_net_buf_unproduce(nvq); |
1549 | r = vhost_vq_init_access(vq); |
1550 | if (r) |
1551 | goto err_used; |
1552 | r = vhost_net_enable_vq(n, vq); |
1553 | if (r) |
1554 | goto err_used; |
1555 | if (index == VHOST_NET_VQ_RX) { |
1556 | if (sock) |
nvq->rx_ring = get_tap_ptr_ring(sock->file);
1558 | else |
1559 | nvq->rx_ring = NULL; |
1560 | } |
1561 | |
1562 | oldubufs = nvq->ubufs; |
1563 | nvq->ubufs = ubufs; |
1564 | |
1565 | n->tx_packets = 0; |
1566 | n->tx_zcopy_err = 0; |
1567 | n->tx_flush = false; |
1568 | } |
1569 | |
mutex_unlock(&vq->mutex);

if (oldubufs) {
vhost_net_ubuf_put_wait_and_free(oldubufs);
mutex_lock(&vq->mutex);
vhost_zerocopy_signal_used(n, vq);
mutex_unlock(&vq->mutex);
1577 | } |
1578 | |
1579 | if (oldsock) { |
vhost_dev_flush(&n->dev);
1581 | sockfd_put(oldsock); |
1582 | } |
1583 | |
mutex_unlock(&n->dev.mutex);
1585 | return 0; |
1586 | |
1587 | err_used: |
vhost_vq_set_backend(vq, oldsock);
1589 | vhost_net_enable_vq(n, vq); |
1590 | if (ubufs) |
1591 | vhost_net_ubuf_put_wait_and_free(ubufs); |
1592 | err_ubufs: |
1593 | if (sock) |
1594 | sockfd_put(sock); |
1595 | err_vq: |
mutex_unlock(&vq->mutex);
err:
mutex_unlock(&n->dev.mutex);
1599 | return r; |
1600 | } |
1601 | |
1602 | static long vhost_net_reset_owner(struct vhost_net *n) |
1603 | { |
1604 | struct socket *tx_sock = NULL; |
1605 | struct socket *rx_sock = NULL; |
1606 | long err; |
1607 | struct vhost_iotlb *umem; |
1608 | |
1609 | mutex_lock(&n->dev.mutex); |
1610 | err = vhost_dev_check_owner(&n->dev); |
1611 | if (err) |
1612 | goto done; |
1613 | umem = vhost_dev_reset_owner_prepare(); |
1614 | if (!umem) { |
1615 | err = -ENOMEM; |
1616 | goto done; |
1617 | } |
vhost_net_stop(n, &tx_sock, &rx_sock);
1619 | vhost_net_flush(n); |
1620 | vhost_dev_stop(&n->dev); |
vhost_dev_reset_owner(&n->dev, umem);
1622 | vhost_net_vq_reset(n); |
1623 | done: |
mutex_unlock(&n->dev.mutex);
1625 | if (tx_sock) |
1626 | sockfd_put(tx_sock); |
1627 | if (rx_sock) |
1628 | sockfd_put(rx_sock); |
1629 | return err; |
1630 | } |
1631 | |
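/* VHOST_SET_FEATURES: recompute the vnet header layout and propagate the
 * acked features to both virtqueues. */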
1632 | static int vhost_net_set_features(struct vhost_net *n, u64 features) |
1633 | { |
1634 | size_t vhost_hlen, sock_hlen, hdr_len; |
1635 | int i; |
1636 | |
1637 | hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | |
1638 | (1ULL << VIRTIO_F_VERSION_1))) ? |
1639 | sizeof(struct virtio_net_hdr_mrg_rxbuf) : |
1640 | sizeof(struct virtio_net_hdr); |
1641 | if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { |
1642 | /* vhost provides vnet_hdr */ |
1643 | vhost_hlen = hdr_len; |
1644 | sock_hlen = 0; |
1645 | } else { |
1646 | /* socket provides vnet_hdr */ |
1647 | vhost_hlen = 0; |
1648 | sock_hlen = hdr_len; |
1649 | } |
1650 | mutex_lock(&n->dev.mutex); |
1651 | if ((features & (1 << VHOST_F_LOG_ALL)) && |
1652 | !vhost_log_access_ok(&n->dev)) |
1653 | goto out_unlock; |
1654 | |
1655 | if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { |
if (vhost_init_device_iotlb(&n->dev))
1657 | goto out_unlock; |
1658 | } |
1659 | |
1660 | for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { |
1661 | mutex_lock(&n->vqs[i].vq.mutex); |
1662 | n->vqs[i].vq.acked_features = features; |
1663 | n->vqs[i].vhost_hlen = vhost_hlen; |
1664 | n->vqs[i].sock_hlen = sock_hlen; |
mutex_unlock(&n->vqs[i].vq.mutex);
}
mutex_unlock(&n->dev.mutex);
1668 | return 0; |
1669 | |
1670 | out_unlock: |
mutex_unlock(&n->dev.mutex);
1672 | return -EFAULT; |
1673 | } |
1674 | |
1675 | static long vhost_net_set_owner(struct vhost_net *n) |
1676 | { |
1677 | int r; |
1678 | |
1679 | mutex_lock(&n->dev.mutex); |
if (vhost_dev_has_owner(&n->dev)) {
1681 | r = -EBUSY; |
1682 | goto out; |
1683 | } |
1684 | r = vhost_net_set_ubuf_info(n); |
1685 | if (r) |
1686 | goto out; |
r = vhost_dev_set_owner(&n->dev);
1688 | if (r) |
1689 | vhost_net_clear_ubuf_info(n); |
1690 | vhost_net_flush(n); |
1691 | out: |
mutex_unlock(&n->dev.mutex);
1693 | return r; |
1694 | } |
1695 | |
1696 | static long vhost_net_ioctl(struct file *f, unsigned int ioctl, |
1697 | unsigned long arg) |
1698 | { |
1699 | struct vhost_net *n = f->private_data; |
1700 | void __user *argp = (void __user *)arg; |
1701 | u64 __user *featurep = argp; |
1702 | struct vhost_vring_file backend; |
1703 | u64 features; |
1704 | int r; |
1705 | |
1706 | switch (ioctl) { |
1707 | case VHOST_NET_SET_BACKEND: |
if (copy_from_user(&backend, argp, sizeof backend))
return -EFAULT;
return vhost_net_set_backend(n, backend.index, backend.fd);
1711 | case VHOST_GET_FEATURES: |
1712 | features = VHOST_NET_FEATURES; |
if (copy_to_user(featurep, &features, sizeof features))
1714 | return -EFAULT; |
1715 | return 0; |
1716 | case VHOST_SET_FEATURES: |
if (copy_from_user(&features, featurep, sizeof features))
1718 | return -EFAULT; |
1719 | if (features & ~VHOST_NET_FEATURES) |
1720 | return -EOPNOTSUPP; |
1721 | return vhost_net_set_features(n, features); |
1722 | case VHOST_GET_BACKEND_FEATURES: |
1723 | features = VHOST_NET_BACKEND_FEATURES; |
if (copy_to_user(featurep, &features, sizeof(features)))
1725 | return -EFAULT; |
1726 | return 0; |
1727 | case VHOST_SET_BACKEND_FEATURES: |
if (copy_from_user(&features, featurep, sizeof(features)))
1729 | return -EFAULT; |
1730 | if (features & ~VHOST_NET_BACKEND_FEATURES) |
1731 | return -EOPNOTSUPP; |
vhost_set_backend_features(&n->dev, features);
1733 | return 0; |
1734 | case VHOST_RESET_OWNER: |
1735 | return vhost_net_reset_owner(n); |
1736 | case VHOST_SET_OWNER: |
1737 | return vhost_net_set_owner(n); |
1738 | default: |
1739 | mutex_lock(&n->dev.mutex); |
1740 | r = vhost_dev_ioctl(&n->dev, ioctl, argp); |
1741 | if (r == -ENOIOCTLCMD) |
r = vhost_vring_ioctl(&n->dev, ioctl, argp);
else
vhost_net_flush(n);
mutex_unlock(&n->dev.mutex);
1746 | return r; |
1747 | } |
1748 | } |
1749 | |
1750 | static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) |
1751 | { |
1752 | struct file *file = iocb->ki_filp; |
1753 | struct vhost_net *n = file->private_data; |
1754 | struct vhost_dev *dev = &n->dev; |
1755 | int noblock = file->f_flags & O_NONBLOCK; |
1756 | |
1757 | return vhost_chr_read_iter(dev, to, noblock); |
1758 | } |
1759 | |
1760 | static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, |
1761 | struct iov_iter *from) |
1762 | { |
1763 | struct file *file = iocb->ki_filp; |
1764 | struct vhost_net *n = file->private_data; |
1765 | struct vhost_dev *dev = &n->dev; |
1766 | |
1767 | return vhost_chr_write_iter(dev, from); |
1768 | } |
1769 | |
1770 | static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) |
1771 | { |
1772 | struct vhost_net *n = file->private_data; |
1773 | struct vhost_dev *dev = &n->dev; |
1774 | |
1775 | return vhost_chr_poll(file, dev, wait); |
1776 | } |
1777 | |
1778 | static const struct file_operations vhost_net_fops = { |
1779 | .owner = THIS_MODULE, |
1780 | .release = vhost_net_release, |
1781 | .read_iter = vhost_net_chr_read_iter, |
1782 | .write_iter = vhost_net_chr_write_iter, |
1783 | .poll = vhost_net_chr_poll, |
1784 | .unlocked_ioctl = vhost_net_ioctl, |
1785 | .compat_ioctl = compat_ptr_ioctl, |
1786 | .open = vhost_net_open, |
1787 | .llseek = noop_llseek, |
1788 | }; |
1789 | |
1790 | static struct miscdevice vhost_net_misc = { |
1791 | .minor = VHOST_NET_MINOR, |
.name = "vhost-net",
1793 | .fops = &vhost_net_fops, |
1794 | }; |
1795 | |
1796 | static int __init vhost_net_init(void) |
1797 | { |
1798 | if (experimental_zcopytx) |
vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
return misc_register(&vhost_net_misc);
1801 | } |
1802 | module_init(vhost_net_init); |
1803 | |
1804 | static void __exit vhost_net_exit(void) |
1805 | { |
misc_deregister(&vhost_net_misc);
1807 | } |
1808 | module_exit(vhost_net_exit); |
1809 | |
MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");
1816 | |