From bab632d69ee48a106e779b60cc01adfe80a72807 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 18 Jul 2011 03:48:46 +0000 Subject: vhost: vhost TX zero-copy support >From: Shirley Ma This adds experimental zero copy support in vhost-net, disabled by default. To enable, set experimental_zcopytx module option to 1. This patch maintains the outstanding userspace buffers in the sequence it is delivered to vhost. The outstanding userspace buffers will be marked as done once the lower device buffers DMA has finished. This is monitored through last reference of kfree_skb callback. Two buffer indices are used for this purpose. The vhost-net device passes the userspace buffers info to lower device skb through message control. DMA done status check and guest notification are handled by handle_tx: in the worst case is all buffers in the vq are in pending/done status, so we need to notify guest to release DMA done buffers first before we get any new buffers from the vq. One known problem is that if the guest stops submitting buffers, buffers might never get used until some further action, e.g. device reset. This does not seem to affect linux guests. Signed-off-by: Shirley Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/net.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) (limited to 'drivers/vhost/net.c') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e224a92baa1..f0fd52cdfad 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,10 +29,18 @@ #include "vhost.h" +static int experimental_zcopytx; +module_param(experimental_zcopytx, int, 0444); +MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX"); + /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 +/* MAX number of TX used buffers for outstanding zerocopy */ +#define VHOST_MAX_PEND 128 +#define VHOST_GOODCOPY_LEN 256 + enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, @@ -54,6 +63,12 @@ struct vhost_net { enum vhost_net_poll_state tx_poll_state; }; +static bool vhost_sock_zcopy(struct socket *sock) +{ + return unlikely(experimental_zcopytx) && + sock_flag(sock->sk, SOCK_ZEROCOPY); +} + /* Pop first len bytes from iovec. Return number of segments used. */ static int move_iovec_hdr(struct iovec *from, struct iovec *to, size_t len, int iov_count) @@ -129,6 +144,8 @@ static void handle_tx(struct vhost_net *net) int err, wmem; size_t hdr_size; struct socket *sock; + struct vhost_ubuf_ref *uninitialized_var(ubufs); + bool zcopy; /* TODO: check that we are running from vhost_worker? */ sock = rcu_dereference_check(vq->private_data, 1); @@ -149,8 +166,13 @@ static void handle_tx(struct vhost_net *net) if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); hdr_size = vq->vhost_hlen; + zcopy = vhost_sock_zcopy(sock); for (;;) { + /* Release DMAs done buffers first */ + if (zcopy) + vhost_zerocopy_signal_used(vq); + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, @@ -166,6 +188,13 @@ static void handle_tx(struct vhost_net *net) set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; } + /* If more outstanding DMAs, queue the work */ + if (unlikely(vq->upend_idx - vq->done_idx > + VHOST_MAX_PEND)) { + tx_poll_start(net, sock); + set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + break; + } if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; @@ -188,9 +217,39 @@ static void handle_tx(struct vhost_net *net) iov_length(vq->hdr, s), hdr_size); break; } + /* use msg_control to pass vhost zerocopy ubuf info to skb */ + if (zcopy) { + vq->heads[vq->upend_idx].id = head; + if (len < VHOST_GOODCOPY_LEN) { + /* copy don't need to wait for DMA done */ + vq->heads[vq->upend_idx].len = + VHOST_DMA_DONE_LEN; + msg.msg_control = NULL; + msg.msg_controllen = 0; + ubufs = NULL; + } else { + struct ubuf_info *ubuf = &vq->ubuf_info[head]; + + vq->heads[vq->upend_idx].len = len; + ubuf->callback = vhost_zerocopy_callback; + ubuf->arg = vq->ubufs; + ubuf->desc = vq->upend_idx; + msg.msg_control = ubuf; + msg.msg_controllen = sizeof(ubuf); + ubufs = vq->ubufs; + kref_get(&ubufs->kref); + } + vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV; + } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(NULL, sock, &msg, len); if (unlikely(err < 0)) { + if (zcopy) { + if (ubufs) + vhost_ubuf_put(ubufs); + vq->upend_idx = ((unsigned)vq->upend_idx - 1) % + UIO_MAXIOV; + } vhost_discard_vq_desc(vq, 1); tx_poll_start(net, sock); break; @@ -198,7 +257,8 @@ static void handle_tx(struct vhost_net *net) if (err != len) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); - vhost_add_used_and_signal(&net->dev, vq, head, 0); + if (!zcopy) + vhost_add_used_and_signal(&net->dev, vq, head, 0); total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); @@ -603,6 +663,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) { struct socket *sock, *oldsock; struct vhost_virtqueue *vq; + struct vhost_ubuf_ref *ubufs, *oldubufs = NULL; int r; mutex_lock(&n->dev.mutex); @@ -632,6 +693,13 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) oldsock = rcu_dereference_protected(vq->private_data, lockdep_is_held(&vq->mutex)); if (sock != oldsock) { + ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock)); + if (IS_ERR(ubufs)) { + r = PTR_ERR(ubufs); + goto err_ubufs; + } + oldubufs = vq->ubufs; + vq->ubufs = ubufs; vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, sock); vhost_net_enable_vq(n, vq); @@ -639,6 +707,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) mutex_unlock(&vq->mutex); + if (oldubufs) + vhost_ubuf_put_and_wait(oldubufs); + if (oldsock) { vhost_net_flush_vq(n, index); fput(oldsock->file); @@ -647,6 +718,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) mutex_unlock(&n->dev.mutex); return 0; +err_ubufs: + fput(sock->file); err_vq: mutex_unlock(&vq->mutex); err: @@ -776,6 +849,8 @@ static struct miscdevice vhost_net_misc = { static int vhost_net_init(void) { + if (experimental_zcopytx) + vhost_enable_zcopy(VHOST_NET_VQ_TX); return misc_register(&vhost_net_misc); } module_init(vhost_net_init); -- cgit v1.2.3-70-g09d2 From f59281dafb832b161133743fcf3dc29051e6fdb8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 21 Jun 2011 18:04:27 +0800 Subject: vhost: init used ring after backend was set Move the used ring initialization after backend was set. This makes it possible to disable the backend and tweak the used ring, then restart. This will also make it possible to log the used ring write correctly. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 4 ++++ drivers/vhost/test.c | 5 +++++ drivers/vhost/vhost.c | 14 ++++++-------- drivers/vhost/vhost.h | 1 + 4 files changed, 16 insertions(+), 8 deletions(-) (limited to 'drivers/vhost/net.c') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f0fd52cdfad..70ac60437d1 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -703,6 +703,10 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) vhost_net_disable_vq(n, vq); rcu_assign_pointer(vq->private_data, sock); vhost_net_enable_vq(n, vq); + + r = vhost_init_used(vq); + if (r) + goto err_vq; } mutex_unlock(&vq->mutex); diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 734e1d74ad8..fc9a1d75281 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -195,8 +195,13 @@ static long vhost_test_run(struct vhost_test *n, int test) lockdep_is_held(&vq->mutex)); rcu_assign_pointer(vq->private_data, priv); + r = vhost_init_used(&n->vqs[index]); + mutex_unlock(&vq->mutex); + if (r) + goto err; + if (oldpriv) { vhost_test_flush_vq(n, index); } diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5ef2f62becf..9a108038fe5 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -629,15 +629,17 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) return 0; } -static int init_used(struct vhost_virtqueue *vq, - struct vring_used __user *used) +int vhost_init_used(struct vhost_virtqueue *vq) { - int r = put_user(vq->used_flags, &used->flags); + int r; + if (!vq->private_data) + return 0; + r = put_user(vq->used_flags, &vq->used->flags); if (r) return r; vq->signalled_used_valid = false; - return get_user(vq->last_used_idx, &used->idx); + return get_user(vq->last_used_idx, &vq->used->idx); } static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) @@ -752,10 +754,6 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) } } - r = init_used(vq, (struct vring_used __user *)(unsigned long) - a.used_user_addr); - if (r) - break; vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); vq->desc = (void __user *)(unsigned long)a.desc_user_addr; vq->avail = (void __user *)(unsigned long)a.avail_user_addr; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 1544b782529..14c9abf0d80 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -174,6 +174,7 @@ int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, struct vhost_log *log, unsigned int *log_num); void vhost_discard_vq_desc(struct vhost_virtqueue *, int n); +int vhost_init_used(struct vhost_virtqueue *); int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads, unsigned count); -- cgit v1.2.3-70-g09d2 From c047e5f3170c2595e66ed67f87cec01afd717212 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 20 Jul 2011 13:41:31 +0300 Subject: vhost-net: update used ring on backend change On backend change, we flushed out outstanding skbs but forgot to update the used ring, so that done entries were left in the ubuf_info ring. As a result we lose heads or complete incorrect ones, crashing the guest or leaking memory. Fix by updating the used ring. Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/vhost/net.c') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 70ac60437d1..248b25008d1 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -711,8 +711,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) mutex_unlock(&vq->mutex); - if (oldubufs) + if (oldubufs) { vhost_ubuf_put_and_wait(oldubufs); + mutex_lock(&vq->mutex); + vhost_zerocopy_signal_used(vq); + mutex_unlock(&vq->mutex); + } if (oldsock) { vhost_net_flush_vq(n, index); -- cgit v1.2.3-70-g09d2 From 9e380825ab3f5176f65306c4ac119fd23634ce03 Mon Sep 17 00:00:00 2001 From: Shirley Ma Date: Wed, 20 Jul 2011 10:23:12 -0700 Subject: vhost: handle wrap around in # of bufs math The meth for calculating the # of outstanding buffers gives incorrect results when vq->upend_idx wraps around zero. Fix that. Signed-off-by: Shirley Ma Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/vhost/net.c') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 248b25008d1..882a51fe7b3 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -182,15 +182,21 @@ static void handle_tx(struct vhost_net *net) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { + int num_pends; + wmem = atomic_read(&sock->sk->sk_wmem_alloc); if (wmem >= sock->sk->sk_sndbuf * 3 / 4) { tx_poll_start(net, sock); set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; } - /* If more outstanding DMAs, queue the work */ - if (unlikely(vq->upend_idx - vq->done_idx > - VHOST_MAX_PEND)) { + /* If more outstanding DMAs, queue the work. + * Handle upend_idx wrap around + */ + num_pends = likely(vq->upend_idx >= vq->done_idx) ? + (vq->upend_idx - vq->done_idx) : + (vq->upend_idx + UIO_MAXIOV - vq->done_idx); + if (unlikely(num_pends > VHOST_MAX_PEND)) { tx_poll_start(net, sock); set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; -- cgit v1.2.3-70-g09d2