/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>
#include <linux/slab.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>

#include <net/sock.h>

#include "vhost.h"

static int experimental_zcopytx;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256

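/* The VHOST_GOODCOPY_LEN threshold is a heuristic: for short packets the
 * setup cost of zerocopy (page pinning plus the completion callback) tends
 * to outweigh a plain copy, so such packets are copied instead. */
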
/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	3
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	2
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	1
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	0

#define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)

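/* Note the encoding above is chosen so a single comparison tests for
 * "DMA finished, whether it succeeded or not": FAILED (3) and DONE (2)
 * both satisfy VHOST_DMA_IS_DONE(), while IN_PROGRESS (1) and CLEAR (0)
 * do not. */
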
enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

enum vhost_net_poll_state {
	VHOST_NET_POLL_DISABLED = 0,
	VHOST_NET_POLL_STARTED = 1,
	VHOST_NET_POLL_STOPPED = 2,
};

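/* Roughly: DISABLED means the TX vq has no backend socket attached and must
 * not be polled at all; STOPPED means a backend is attached but we are not
 * currently polling it; STARTED means vhost_poll_start() has armed polling
 * on the socket (done when its send buffer fills up). */
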
struct vhost_net {
	struct vhost_dev dev;
	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Tells us whether we are polling a socket for TX.
	 * We only do this when socket buffer fills up.
	 * Protected by tx vq lock. */
	enum vhost_net_poll_state tx_poll_state;
	/* Number of TX recently submitted.
	 * Protected by tx vq lock. */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock. */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
};

static void vhost_net_tx_packet(struct vhost_net *net)
{
	++net->tx_packets;
	if (net->tx_packets < 1024)
		return;
	net->tx_packets = 0;
	net->tx_zcopy_err = 0;
}

static void vhost_net_tx_err(struct vhost_net *net)
{
	++net->tx_zcopy_err;
}

static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
	/* TX flush waits for outstanding DMAs to be done.
	 * Don't start new DMAs.
	 */
	return !net->tx_flush &&
	       net->tx_packets / 64 >= net->tx_zcopy_err;
}

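/* Together with vhost_net_tx_packet() above, this implements a windowed
 * error-rate check: both counters reset every 1024 packets, and zerocopy
 * stays selected only while at most roughly 1 in 64 recent packets has
 * failed. */
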
static bool vhost_sock_zcopy(struct socket *sock)
{
	return unlikely(experimental_zcopytx) &&
	       sock_flag(sock->sk, SOCK_ZEROCOPY);
}

/* Pop first len bytes from iovec. Return number of segments used. */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
			  size_t len, int iov_count)
{
	int seg = 0;
	size_t size;

	while (len && seg < iov_count) {
		size = min(from->iov_len, len);
		to->iov_base = from->iov_base;
		to->iov_len = size;
		from->iov_len -= size;
		from->iov_base += size;
		len -= size;
		++from;
		++to;
		++seg;
	}
	return seg;
}

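/* Illustrative example: with a single 1500-byte segment and len == 10, one
 * segment descriptor covering the first 10 bytes is written to 'to', and
 * 'from' is advanced to cover the remaining 1490 bytes, so header and
 * payload can be handed off through separate iovecs. */
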
/* Copy iovec entries for len bytes from iovec. */
static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
			   size_t len, int iovcount)
{
	int seg = 0;
	size_t size;

	while (len && seg < iovcount) {
		size = min(from->iov_len, len);
		to->iov_base = from->iov_base;
		to->iov_len = size;
		len -= size;
		++from;
		++to;
		++seg;
	}
}

/* Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
{
	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
		return;
	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
}

/* Caller must have TX VQ lock */
static void tx_poll_start(struct vhost_net *net, struct socket *sock)
{
	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
		return;
	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
	net->tx_poll_state = VHOST_NET_POLL_STARTED;
}

/* The lower device may complete DMAs out of order. upend_idx tracks the
 * end of the pending region and done_idx tracks its head. Once the lower
 * device has finished a contiguous run of DMAs, we signal the used idx
 * to the KVM guest.
 */
static int vhost_zerocopy_signal_used(struct vhost_net *net,
				      struct vhost_virtqueue *vq)
{
	int i;
	int j = 0;

	for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			vhost_add_used_and_signal(vq->dev, vq,
						  vq->heads[i].id, 0);
			++j;
		} else
			break;
	}
	if (j)
		vq->done_idx = i;
	return j;
}

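/* Illustrative example: with done_idx == 5 and upend_idx == 9, entries
 * 5..8 are scanned in order; if 5 and 6 are DONE but 7 is still
 * IN_PROGRESS, only 5 and 6 are flushed to the used ring and done_idx
 * becomes 7. Out-of-order completions thus wait for their predecessors. */
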
static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
	struct vhost_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
	int cnt = atomic_read(&ubufs->kref.refcount);

	/*
	 * Trigger polling thread if guest stopped submitting new buffers:
	 * in this case, the refcount after decrement will eventually reach 1,
	 * so here it is 2.
	 * We also trigger polling periodically after each 16 packets
	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
	 * less than 10% of times).
	 */
	if (cnt <= 2 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);
	/* Set len to mark this desc's buffers as done for DMA. */
	vq->heads[ubuf->desc].len = success ?
		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
	vhost_ubuf_put(ubufs);
}

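/* The callback above runs from the lower device's transmit-completion
 * context, so it only records status in heads[].len and drops its ubuf
 * reference; pushing entries onto the used ring is deferred to
 * vhost_zerocopy_signal_used() on the vhost worker, which holds the vq
 * mutex. */
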
/* Expects to always be run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
	unsigned out, in, s;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err, wmem;
	size_t hdr_size;
	struct socket *sock;
	struct vhost_ubuf_ref *uninitialized_var(ubufs);
	bool zcopy, zcopy_used;

	/* TODO: check that we are running from vhost_worker? */
	sock = rcu_dereference_check(vq->private_data, 1);
	if (!sock)
		return;

	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	if (wmem >= sock->sk->sk_sndbuf) {
		mutex_lock(&vq->mutex);
		tx_poll_start(net, sock);
		mutex_unlock(&vq->mutex);
		return;
	}

	mutex_lock(&vq->mutex);
	vhost_disable_notify(&net->dev, vq);

	if (wmem < sock->sk->sk_sndbuf / 2)
		tx_poll_stop(net);
	hdr_size = vq->vhost_hlen;
	zcopy = vq->ubufs;

	for (;;) {
		/* Release buffers with completed DMA first. */
		if (zcopy)
			vhost_zerocopy_signal_used(net, vq);

		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 NULL, NULL);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			int num_pends;

			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
				tx_poll_start(net, sock);
				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
				break;
			}
			/* If more outstanding DMAs, queue the work.
			 * Handle upend_idx wrap around.
			 */
			num_pends = likely(vq->upend_idx >= vq->done_idx) ?
				    (vq->upend_idx - vq->done_idx) :
				    (vq->upend_idx + UIO_MAXIOV -
				     vq->done_idx);
			if (unlikely(num_pends > VHOST_MAX_PEND)) {
				tx_poll_start(net, sock);
				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
				break;
			}
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, in %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
		msg.msg_iovlen = out;
		len = iov_length(vq->iov, out);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
				       vq->upend_idx != vq->done_idx);

		/* Use msg_control to pass vhost zerocopy ubuf info to skb. */
		if (zcopy_used) {
			vq->heads[vq->upend_idx].id = head;
			if (!vhost_net_tx_select_zcopy(net) ||
			    len < VHOST_GOODCOPY_LEN) {
				/* Copy doesn't need to wait for DMA done. */
				vq->heads[vq->upend_idx].len =
							VHOST_DMA_DONE_LEN;
				msg.msg_control = NULL;
				msg.msg_controllen = 0;
				ubufs = NULL;
			} else {
				struct ubuf_info *ubuf = &vq->ubuf_info[head];

				vq->heads[vq->upend_idx].len =
					VHOST_DMA_IN_PROGRESS;
				ubuf->callback = vhost_zerocopy_callback;
				ubuf->ctx = vq->ubufs;
				ubuf->desc = vq->upend_idx;
				msg.msg_control = ubuf;
				msg.msg_controllen = sizeof(ubuf);
				ubufs = vq->ubufs;
				kref_get(&ubufs->kref);
			}
			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
		}
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(NULL, sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				if (ubufs)
					vhost_ubuf_put(ubufs);
				vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
					UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			if (err == -EAGAIN || err == -ENOBUFS)
				tx_poll_start(net, sock);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		total_len += len;
		vhost_net_tx_packet(net);
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
}

static int peek_head_len(struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (vlan_tx_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 *	vq has read descriptors only.
 * @vq		- the relevant virtqueue
 * @datalen	- data length we'll be reading
 * @iovcount	- returned count of io vectors we fill
 * @log		- vhost log
 * @log_num	- log offset
 * @quota	- headcount quota, 1 for big buffer
 *	returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
			       "out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = d;
		heads[headcount].len = iov_length(vq->iov + seg, in);
		datalen -= heads[headcount].len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len += datalen;
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}

/* Expects to always be run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
	unsigned uninitialized_var(in), log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr_mrg_rxbuf hdr = {
		.hdr.flags = 0,
		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	/* TODO: check that we are running from vhost_worker? */
	struct socket *sock = rcu_dereference_check(vq->private_data, 1);

	if (!sock)
		return;

	mutex_lock(&vq->mutex);
	vhost_disable_notify(&net->dev, vq);
	vhost_hlen = vq->vhost_hlen;
	sock_hlen = vq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);

	while ((sock_len = peek_head_len(sock->sk))) {
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
					&in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			break;
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new? Wait for eventfd to tell us
			 * they refilled. */
			break;
		}
		/* We don't need to be notified again. */
		if (unlikely((vhost_hlen)))
			/* Skip header. TODO: support TSO. */
			move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
		else
			/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
			 * needed because recvmsg can modify msg_iov. */
			copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
		msg.msg_iovlen = in;
		err = sock->ops->recvmsg(NULL, sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		if (unlikely(vhost_hlen) &&
		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
				      vhost_hlen)) {
			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
			       vq->iov->iov_base);
			break;
		}
		/* TODO: Should check and handle checksum. */
		if (likely(mergeable) &&
		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
				      offsetof(typeof(hdr), num_buffers),
				      sizeof hdr.num_buffers)) {
			vq_err(vq, "Failed num_buffers write");
			vhost_discard_vq_desc(vq, headcount);
			break;
		}
		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
					    headcount);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, vhost_len);
		total_len += vhost_len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
}

static void handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

static void handle_tx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_TX].work);

	handle_tx(net);
}

static void handle_rx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_RX].work);

	handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
	struct vhost_dev *dev;
	int r;

	if (!n)
		return -ENOMEM;

	dev = &n->dev;
	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
	if (r < 0) {
		kfree(n);
		return r;
	}

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

	f->private_data = n;

	return 0;
}

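/* Each virtqueue is driven from two sources: a guest kick on the vq's
 * eventfd runs handle_tx_kick/handle_rx_kick, while readiness on the
 * backend socket (POLLOUT for TX, POLLIN for RX, as initialized above)
 * runs handle_tx_net/handle_rx_net. Both paths funnel into the same
 * handle_tx()/handle_rx() under the vq mutex. */
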
static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	if (!vq->private_data)
		return;
	if (vq == n->vqs + VHOST_NET_VQ_TX) {
		tx_poll_stop(n);
		n->tx_poll_state = VHOST_NET_POLL_DISABLED;
	} else
		vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
}

static void vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct socket *sock;

	sock = rcu_dereference_protected(vq->private_data,
					 lockdep_is_held(&vq->mutex));
	if (!sock)
		return;
	if (vq == n->vqs + VHOST_NET_VQ_TX) {
		n->tx_poll_state = VHOST_NET_POLL_STOPPED;
		tx_poll_start(n, sock);
	} else
		vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;

	mutex_lock(&vq->mutex);
	sock = rcu_dereference_protected(vq->private_data,
					 lockdep_is_held(&vq->mutex));
	vhost_net_disable_vq(n, vq);
	rcu_assign_pointer(vq->private_data, NULL);
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->dev.vqs[index].poll);
}

static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
	if (n->dev.vqs[VHOST_NET_VQ_TX].ubufs) {
		mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
		n->tx_flush = true;
		mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
		/* Wait for all lower device DMAs done. */
		vhost_ubuf_put_and_wait(n->dev.vqs[VHOST_NET_VQ_TX].ubufs);
		mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
		n->tx_flush = false;
		kref_init(&n->dev.vqs[VHOST_NET_VQ_TX].ubufs->kref);
		mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
	}
}

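/* After vhost_ubuf_put_and_wait() the ubufs refcount has dropped to zero
 * (all lower-device DMAs completed), so kref_init() above re-arms it to 1
 * for continued use; tx_flush blocks new zerocopy DMAs for the duration. */
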
static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_stop(&n->dev);
	vhost_dev_cleanup(&n->dev, false);
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n);
	return 0;
}

static struct socket *get_raw_socket(int fd)
{
	struct {
		struct sockaddr_ll sa;
		char buf[MAX_ADDR_LEN];
	} uaddr;
	int uaddr_len = sizeof uaddr, r;
	struct socket *sock = sockfd_lookup(fd, &r);

	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		r = -ESOCKTNOSUPPORT;
		goto err;
	}

	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
			       &uaddr_len, 0);
	if (r)
		goto err;

	if (uaddr.sa.sll_family != AF_PACKET) {
		r = -EPFNOSUPPORT;
		goto err;
	}
	return sock;
err:
	fput(sock->file);
	return ERR_PTR(r);
}

static struct socket *get_tap_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;

	if (!file)
		return ERR_PTR(-EBADF);
	sock = tun_get_socket(file);
	if (!IS_ERR(sock))
		return sock;
	sock = macvtap_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	return sock;
}

*get_socket(int fd
)
778 /* special case to disable backend */
781 sock
= get_raw_socket(fd
);
784 sock
= get_tap_socket(fd
);
787 return ERR_PTR(-ENOTSOCK
);
static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = n->vqs + index;
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* Start polling new socket. */
	oldsock = rcu_dereference_protected(vq->private_data,
					    lockdep_is_held(&vq->mutex));
	if (sock != oldsock) {
		ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock));
		if (IS_ERR(ubufs)) {
			r = PTR_ERR(ubufs);
			goto err_ubufs;
		}
		oldubufs = vq->ubufs;
		vq->ubufs = ubufs;
		vhost_net_disable_vq(n, vq);
		rcu_assign_pointer(vq->private_data, sock);
		vhost_net_enable_vq(n, vq);

		r = vhost_init_used(vq);
		if (r)
			goto err_vq;

		n->tx_packets = 0;
		n->tx_zcopy_err = 0;
		n->tx_flush = false;
	}

	mutex_unlock(&vq->mutex);

	if (oldubufs) {
		vhost_ubuf_put_and_wait(oldubufs);
		mutex_lock(&vq->mutex);
		vhost_zerocopy_signal_used(n, vq);
		mutex_unlock(&vq->mutex);
	}

	if (oldsock) {
		vhost_net_flush_vq(n, index);
		fput(oldsock->file);
	}

	mutex_unlock(&n->dev.mutex);
	return 0;

err_ubufs:
	fput(sock->file);
err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *tx_sock = NULL;
	struct socket *rx_sock = NULL;
	long err;

	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (err)
		goto done;
	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	err = vhost_dev_reset_owner(&n->dev);
done:
	mutex_unlock(&n->dev.mutex);
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	return err;
}

static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	size_t vhost_hlen, sock_hlen, hdr_len;
	int i;

	hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
			sizeof(struct virtio_net_hdr);
	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
		/* vhost provides vnet_hdr */
		vhost_hlen = hdr_len;
		sock_hlen = 0;
	} else {
		/* socket provides vnet_hdr */
		vhost_hlen = 0;
		sock_hlen = hdr_len;
	}
	mutex_lock(&n->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev)) {
		mutex_unlock(&n->dev.mutex);
		return -EFAULT;
	}
	n->dev.acked_features = features;
	smp_wmb();
	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		mutex_lock(&n->vqs[i].mutex);
		n->vqs[i].vhost_hlen = vhost_hlen;
		n->vqs[i].sock_hlen = sock_hlen;
		mutex_unlock(&n->vqs[i].mutex);
	}
	vhost_net_flush(n);
	mutex_unlock(&n->dev.mutex);
	return 0;
}

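/*
 * Illustrative userspace sequence (a sketch, not part of this driver):
 * open /dev/vhost-net, issue VHOST_SET_OWNER, negotiate features via
 * VHOST_GET_FEATURES/VHOST_SET_FEATURES, configure the rings with the
 * VHOST_SET_VRING_* ioctls, and finally attach a tap or raw packet socket
 * fd per queue with VHOST_NET_SET_BACKEND (fd == -1 detaches the backend).
 */
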
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;

	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		if (copy_from_user(&backend, argp, sizeof backend))
			return -EFAULT;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_NET_FEATURES;
		if (copy_to_user(featurep, &features, sizeof features))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, featurep, sizeof features))
			return -EFAULT;
		if (features & ~VHOST_NET_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
		else
			vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}

#ifdef CONFIG_COMPAT
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations vhost_net_fops = {
	.owner          = THIS_MODULE,
	.release        = vhost_net_release,
	.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = vhost_net_compat_ioctl,
#endif
	.open           = vhost_net_open,
	.llseek		= noop_llseek,
};

static struct miscdevice vhost_net_misc = {
	.minor = VHOST_NET_MINOR,
	.name = "vhost-net",
	.fops = &vhost_net_fops,
};

static int vhost_net_init(void)
{
	if (experimental_zcopytx)
		vhost_enable_zcopy(VHOST_NET_VQ_TX);
	return misc_register(&vhost_net_misc);
}
module_init(vhost_net_init);

static void vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
MODULE_ALIAS("devname:vhost-net");