diff options
Diffstat (limited to 'drivers/vhost/net.c')
-rw-r--r-- | drivers/vhost/net.c | 331 |
1 files changed, 277 insertions, 54 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index df5b6b971f2..29e850a7a2f 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to, } return seg; } +/* Copy iovec entries for len bytes from iovec. */ +static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, + size_t len, int iovcount) +{ + int seg = 0; + size_t size; + while (len && seg < iovcount) { + size = min(from->iov_len, len); + to->iov_base = from->iov_base; + to->iov_len = size; + len -= size; + ++from; + ++to; + ++seg; + } +} /* Caller must have TX VQ lock */ static void tx_poll_stop(struct vhost_net *net) @@ -98,7 +114,8 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock) static void handle_tx(struct vhost_net *net) { struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; - unsigned head, out, in, s; + unsigned out, in, s; + int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, @@ -128,13 +145,16 @@ static void handle_tx(struct vhost_net *net) if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); - hdr_size = vq->hdr_size; + hdr_size = vq->vhost_hlen; for (;;) { head = vhost_get_vq_desc(&net->dev, vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); + /* On error, stop handling until the next kick. */ + if (unlikely(head < 0)) + break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { wmem = atomic_read(&sock->sk->sk_wmem_alloc); @@ -168,13 +188,13 @@ static void handle_tx(struct vhost_net *net) /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(NULL, sock, &msg, len); if (unlikely(err < 0)) { - vhost_discard_vq_desc(vq); + vhost_discard_vq_desc(vq, 1); tx_poll_start(net, sock); break; } if (err != len) - pr_err("Truncated TX packet: " - " len %d != %zd\n", err, len); + pr_debug("Truncated TX packet: " + " len %d != %zd\n", err, len); vhost_add_used_and_signal(&net->dev, vq, head, 0); total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { @@ -187,12 +207,86 @@ static void handle_tx(struct vhost_net *net) unuse_mm(net->dev.mm); } +static int peek_head_len(struct sock *sk) +{ + struct sk_buff *head; + int len = 0; + + lock_sock(sk); + head = skb_peek(&sk->sk_receive_queue); + if (head) + len = head->len; + release_sock(sk); + return len; +} + +/* This is a multi-buffer version of vhost_get_desc, that works if + * vq has read descriptors only. + * @vq - the relevant virtqueue + * @datalen - data length we'll be reading + * @iovcount - returned count of io vectors we fill + * @log - vhost log + * @log_num - log offset + * returns number of buffer heads allocated, negative on error + */ +static int get_rx_bufs(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + int datalen, + unsigned *iovcount, + struct vhost_log *log, + unsigned *log_num) +{ + unsigned int out, in; + int seg = 0; + int headcount = 0; + unsigned d; + int r, nlogs = 0; + + while (datalen > 0) { + if (unlikely(headcount >= VHOST_NET_MAX_SG)) { + r = -ENOBUFS; + goto err; + } + d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg, + ARRAY_SIZE(vq->iov) - seg, &out, + &in, log, log_num); + if (d == vq->num) { + r = 0; + goto err; + } + if (unlikely(out || in <= 0)) { + vq_err(vq, "unexpected descriptor format for RX: " + "out %d, in %d\n", out, in); + r = -EINVAL; + goto err; + } + if (unlikely(log)) { + nlogs += *log_num; + log += *log_num; + } + heads[headcount].id = d; + heads[headcount].len = iov_length(vq->iov + seg, in); + datalen -= heads[headcount].len; + ++headcount; + seg += in; + } + heads[headcount - 1].len += datalen; + *iovcount = seg; + if (unlikely(log)) + *log_num = nlogs; + return headcount; +err: + vhost_discard_vq_desc(vq, headcount); + return r; +} + /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ -static void handle_rx(struct vhost_net *net) +static void handle_rx_big(struct vhost_net *net) { struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; - unsigned head, out, in, log, s; + unsigned out, in, log, s; + int head; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, @@ -218,7 +312,7 @@ static void handle_rx(struct vhost_net *net) use_mm(net->dev.mm); mutex_lock(&vq->mutex); vhost_disable_notify(vq); - hdr_size = vq->hdr_size; + hdr_size = vq->vhost_hlen; vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL; @@ -228,6 +322,9 @@ static void handle_rx(struct vhost_net *net) ARRAY_SIZE(vq->iov), &out, &in, vq_log, &log); + /* On error, stop handling until the next kick. */ + if (unlikely(head < 0)) + break; /* OK, now we need to know about added descriptors. */ if (head == vq->num) { if (unlikely(vhost_enable_notify(vq))) { @@ -262,14 +359,14 @@ static void handle_rx(struct vhost_net *net) len, MSG_DONTWAIT | MSG_TRUNC); /* TODO: Check specific error and bomb out unless EAGAIN? */ if (err < 0) { - vhost_discard_vq_desc(vq); + vhost_discard_vq_desc(vq, 1); break; } /* TODO: Should check and handle checksum. */ if (err > len) { - pr_err("Discarded truncated rx packet: " - " len %d > %zd\n", err, len); - vhost_discard_vq_desc(vq); + pr_debug("Discarded truncated rx packet: " + " len %d > %zd\n", err, len); + vhost_discard_vq_desc(vq, 1); continue; } len = err; @@ -294,54 +391,175 @@ static void handle_rx(struct vhost_net *net) unuse_mm(net->dev.mm); } -static void handle_tx_kick(struct work_struct *work) +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_rx_mergeable(struct vhost_net *net) { - struct vhost_virtqueue *vq; - struct vhost_net *net; - vq = container_of(work, struct vhost_virtqueue, poll.work); - net = container_of(vq->dev, struct vhost_net, dev); + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; + unsigned uninitialized_var(in), log; + struct vhost_log *vq_log; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, /* FIXME: get and handle RX aux data. */ + .msg_controllen = 0, + .msg_iov = vq->iov, + .msg_flags = MSG_DONTWAIT, + }; + + struct virtio_net_hdr_mrg_rxbuf hdr = { + .hdr.flags = 0, + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + + size_t total_len = 0; + int err, headcount; + size_t vhost_hlen, sock_hlen; + size_t vhost_len, sock_len; + struct socket *sock = rcu_dereference(vq->private_data); + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + vhost_disable_notify(vq); + vhost_hlen = vq->vhost_hlen; + sock_hlen = vq->sock_hlen; + + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? + vq->log : NULL; + + while ((sock_len = peek_head_len(sock->sk))) { + sock_len += sock_hlen; + vhost_len = sock_len + vhost_hlen; + headcount = get_rx_bufs(vq, vq->heads, vhost_len, + &in, vq_log, &log); + /* On error, stop handling until the next kick. */ + if (unlikely(headcount < 0)) + break; + /* OK, now we need to know about added descriptors. */ + if (!headcount) { + if (unlikely(vhost_enable_notify(vq))) { + /* They have slipped one in as we were + * doing that: check again. */ + vhost_disable_notify(vq); + continue; + } + /* Nothing new? Wait for eventfd to tell us + * they refilled. */ + break; + } + /* We don't need to be notified again. */ + if (unlikely((vhost_hlen))) + /* Skip header. TODO: support TSO. */ + move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in); + else + /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF: + * needed because sendmsg can modify msg_iov. */ + copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in); + msg.msg_iovlen = in; + err = sock->ops->recvmsg(NULL, sock, &msg, + sock_len, MSG_DONTWAIT | MSG_TRUNC); + /* Userspace might have consumed the packet meanwhile: + * it's not supposed to do this usually, but might be hard + * to prevent. Discard data we got (if any) and keep going. */ + if (unlikely(err != sock_len)) { + pr_debug("Discarded rx packet: " + " len %d, expected %zd\n", err, sock_len); + vhost_discard_vq_desc(vq, headcount); + continue; + } + if (unlikely(vhost_hlen) && + memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0, + vhost_hlen)) { + vq_err(vq, "Unable to write vnet_hdr at addr %p\n", + vq->iov->iov_base); + break; + } + /* TODO: Should check and handle checksum. */ + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) && + memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount, + offsetof(typeof(hdr), num_buffers), + sizeof hdr.num_buffers)) { + vq_err(vq, "Failed num_buffers write"); + vhost_discard_vq_desc(vq, headcount); + break; + } + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, + headcount); + if (unlikely(vq_log)) + vhost_log_write(vq, vq_log, log, vhost_len); + total_len += vhost_len; + if (unlikely(total_len >= VHOST_NET_WEIGHT)) { + vhost_poll_queue(&vq->poll); + break; + } + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +static void handle_rx(struct vhost_net *net) +{ + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) + handle_rx_mergeable(net); + else + handle_rx_big(net); +} + +static void handle_tx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); + handle_tx(net); } -static void handle_rx_kick(struct work_struct *work) +static void handle_rx_kick(struct vhost_work *work) { - struct vhost_virtqueue *vq; - struct vhost_net *net; - vq = container_of(work, struct vhost_virtqueue, poll.work); - net = container_of(vq->dev, struct vhost_net, dev); + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); + handle_rx(net); } -static void handle_tx_net(struct work_struct *work) +static void handle_tx_net(struct vhost_work *work) { - struct vhost_net *net; - net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); + struct vhost_net *net = container_of(work, struct vhost_net, + poll[VHOST_NET_VQ_TX].work); handle_tx(net); } -static void handle_rx_net(struct work_struct *work) +static void handle_rx_net(struct vhost_work *work) { - struct vhost_net *net; - net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); + struct vhost_net *net = container_of(work, struct vhost_net, + poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); + struct vhost_dev *dev; int r; + if (!n) return -ENOMEM; + + dev = &n->dev; n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; - r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); + r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX); if (r < 0) { kfree(n); return r; } - vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); - vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); n->tx_poll_state = VHOST_NET_POLL_DISABLED; f->private_data = n; @@ -519,18 +737,22 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) /* start polling new socket */ oldsock = vq->private_data; - if (sock == oldsock) - goto done; + if (sock != oldsock) { + vhost_net_disable_vq(n, vq); + rcu_assign_pointer(vq->private_data, sock); + vhost_net_enable_vq(n, vq); + } + + mutex_unlock(&vq->mutex); - vhost_net_disable_vq(n, vq); - rcu_assign_pointer(vq->private_data, sock); - vhost_net_enable_vq(n, vq); -done: if (oldsock) { vhost_net_flush_vq(n, index); fput(oldsock->file); } + mutex_unlock(&n->dev.mutex); + return 0; + err_vq: mutex_unlock(&vq->mutex); err: @@ -561,9 +783,21 @@ done: static int vhost_net_set_features(struct vhost_net *n, u64 features) { - size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? - sizeof(struct virtio_net_hdr) : 0; + size_t vhost_hlen, sock_hlen, hdr_len; int i; + + hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? + sizeof(struct virtio_net_hdr_mrg_rxbuf) : + sizeof(struct virtio_net_hdr); + if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { + /* vhost provides vnet_hdr */ + vhost_hlen = hdr_len; + sock_hlen = 0; + } else { + /* socket provides vnet_hdr */ + vhost_hlen = 0; + sock_hlen = hdr_len; + } mutex_lock(&n->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&n->dev)) { @@ -574,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) smp_wmb(); for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].mutex); - n->vqs[i].hdr_size = hdr_size; + n->vqs[i].vhost_hlen = vhost_hlen; + n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].mutex); } vhost_net_flush(n); @@ -626,7 +861,7 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, } #endif -const static struct file_operations vhost_net_fops = { +static const struct file_operations vhost_net_fops = { .owner = THIS_MODULE, .release = vhost_net_release, .unlocked_ioctl = vhost_net_ioctl, @@ -644,25 +879,13 @@ static struct miscdevice vhost_net_misc = { static int vhost_net_init(void) { - int r = vhost_init(); - if (r) - goto err_init; - r = misc_register(&vhost_net_misc); - if (r) - goto err_reg; - return 0; -err_reg: - vhost_cleanup(); -err_init: - return r; - + return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void vhost_net_exit(void) { misc_deregister(&vhost_net_misc); - vhost_cleanup(); } module_exit(vhost_net_exit); |