From 4f0581d25827d5e864bcf07b05d73d0d12a20a5c Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Sun, 22 Sep 2013 19:03:44 +0100 Subject: xen-netback: improve ring effeciency for guest RX There was a bug that netback routines netbk/xenvif_skb_count_slots and netbk/xenvif_gop_frag_copy disagreed with each other, which caused netback to push wrong number of responses to netfront, which caused netfront to eventually crash. The bug was fixed in 6e43fc04a ("xen-netback: count number required slots for an skb more carefully"). Commit 6e43fc04a focused on backport-ability. The drawback with the existing packing scheme is that the ring is not used effeciently, as stated in 6e43fc04a. skb->data like: | 1111|222222222222|3333 | is arranged as: |1111 |222222222222|3333 | If we can do this: |111122222222|22223333 | That would save one ring slot, which improves ring effeciency. This patch effectively reverts 6e43fc04a. That patch made count_slots agree with gop_frag_copy, while this patch goes the other way around -- make gop_frag_copy agree with count_slots. The end result is that they still agree with each other, and the ring is now arranged like: |111122222222|22223333 | The patch that improves packing was first posted by Xi Xong and Matt Wilson. I only rebase it on top of net-next and rewrite commit message, so I retain all their SoBs. For more infomation about the original bug please refer to email listed below and commit message of 6e43fc04a. Original patch: http://lists.xen.org/archives/html/xen-devel/2013-07/msg00760.html Signed-off-by: Xi Xiong Reviewed-by: Matt Wilson [ msw: minor code cleanups, rewrote commit message, adjusted code to count RX slots instead of meta structures ] Signed-off-by: Matt Wilson Cc: Annie Li Cc: Wei Liu Cc: Ian Campbell [ liuw: rebased on top of net-next tree, rewrote commit message, coding style cleanup. ] Signed-off-by: Wei Liu Cc: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 144 ++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 83 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f3e591c611d..d0b0feb035f 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -47,6 +47,14 @@ #include #include +/* SKB control block overlay is used to store useful information when + * doing guest RX. + */ +struct skb_cb_overlay { + int meta_slots_used; + int peek_slots_count; +}; + /* Provide an option to disable split event channels at load time as * event channels are limited resource. Split event channels are * enabled by default. @@ -212,49 +220,6 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) return false; } -struct xenvif_count_slot_state { - unsigned long copy_off; - bool head; -}; - -unsigned int xenvif_count_frag_slots(struct xenvif *vif, - unsigned long offset, unsigned long size, - struct xenvif_count_slot_state *state) -{ - unsigned count = 0; - - offset &= ~PAGE_MASK; - - while (size > 0) { - unsigned long bytes; - - bytes = PAGE_SIZE - offset; - - if (bytes > size) - bytes = size; - - if (start_new_rx_buffer(state->copy_off, bytes, state->head)) { - count++; - state->copy_off = 0; - } - - if (state->copy_off + bytes > MAX_BUFFER_OFFSET) - bytes = MAX_BUFFER_OFFSET - state->copy_off; - - state->copy_off += bytes; - - offset += bytes; - size -= bytes; - - if (offset == PAGE_SIZE) - offset = 0; - - state->head = false; - } - - return count; -} - /* * Figure out how many ring slots we're going to need to send @skb to * the guest. This function is essentially a dry run of @@ -262,40 +227,53 @@ unsigned int xenvif_count_frag_slots(struct xenvif *vif, */ unsigned int xenvif_count_skb_slots(struct xenvif *vif, struct sk_buff *skb) { - struct xenvif_count_slot_state state; unsigned int count; - unsigned char *data; - unsigned i; + int i, copy_off; + struct skb_cb_overlay *sco; - state.head = true; - state.copy_off = 0; + count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE); - /* Slot for the first (partial) page of data. */ - count = 1; + copy_off = skb_headlen(skb) % PAGE_SIZE; - /* Need a slot for the GSO prefix for GSO extra data? */ if (skb_shinfo(skb)->gso_size) count++; - data = skb->data; - while (data < skb_tail_pointer(skb)) { - unsigned long offset = offset_in_page(data); - unsigned long size = PAGE_SIZE - offset; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; + unsigned long bytes; - if (data + size > skb_tail_pointer(skb)) - size = skb_tail_pointer(skb) - data; + offset &= ~PAGE_MASK; - count += xenvif_count_frag_slots(vif, offset, size, &state); + while (size > 0) { + BUG_ON(offset >= PAGE_SIZE); + BUG_ON(copy_off > MAX_BUFFER_OFFSET); - data += size; - } + bytes = PAGE_SIZE - offset; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); - unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; + if (bytes > size) + bytes = size; + + if (start_new_rx_buffer(copy_off, bytes, 0)) { + count++; + copy_off = 0; + } - count += xenvif_count_frag_slots(vif, offset, size, &state); + if (copy_off + bytes > MAX_BUFFER_OFFSET) + bytes = MAX_BUFFER_OFFSET - copy_off; + + copy_off += bytes; + + offset += bytes; + size -= bytes; + + if (offset == PAGE_SIZE) + offset = 0; + } } + + sco = (struct skb_cb_overlay *)skb->cb; + sco->peek_slots_count = count; return count; } @@ -327,14 +305,11 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, return meta; } -/* - * Set up the grant operations for this fragment. If it's a flipping - * interface, we also set up the unmap request from here. - */ +/* Set up the grant operations for this fragment. */ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, struct netrx_pending_operations *npo, struct page *page, unsigned long size, - unsigned long offset, int *head) + unsigned long offset, int head, int *first) { struct gnttab_copy *copy_gop; struct xenvif_rx_meta *meta; @@ -358,12 +333,12 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, if (bytes > size) bytes = size; - if (start_new_rx_buffer(npo->copy_off, bytes, *head)) { + if (start_new_rx_buffer(npo->copy_off, bytes, head)) { /* * Netfront requires there to be some data in the head * buffer. */ - BUG_ON(*head); + BUG_ON(*first); meta = get_next_rx_buffer(vif, npo); } @@ -397,10 +372,10 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, } /* Leave a gap for the GSO descriptor. */ - if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix) + if (*first && skb_shinfo(skb)->gso_size && !vif->gso_prefix) vif->rx.req_cons++; - *head = 0; /* There must be something in this buffer now. */ + *first = 0; /* There must be something in this buffer now. */ } } @@ -426,7 +401,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, struct xen_netif_rx_request *req; struct xenvif_rx_meta *meta; unsigned char *data; - int head = 1; + int first = 1; int old_meta_prod; old_meta_prod = npo->meta_prod; @@ -462,7 +437,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, len = skb_tail_pointer(skb) - data; xenvif_gop_frag_copy(vif, skb, npo, - virt_to_page(data), len, offset, &head); + virt_to_page(data), len, offset, 1, &first); data += len; } @@ -471,7 +446,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, skb_frag_page(&skb_shinfo(skb)->frags[i]), skb_frag_size(&skb_shinfo(skb)->frags[i]), skb_shinfo(skb)->frags[i].page_offset, - &head); + 0, &first); } return npo->meta_prod - old_meta_prod; @@ -529,10 +504,6 @@ static void xenvif_add_frag_responses(struct xenvif *vif, int status, } } -struct skb_cb_overlay { - int meta_slots_used; -}; - static void xenvif_kick_thread(struct xenvif *vif) { wake_up(&vif->wq); @@ -563,19 +534,26 @@ void xenvif_rx_action(struct xenvif *vif) count = 0; while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) { + RING_IDX old_rx_req_cons; + vif = netdev_priv(skb->dev); nr_frags = skb_shinfo(skb)->nr_frags; + old_rx_req_cons = vif->rx.req_cons; sco = (struct skb_cb_overlay *)skb->cb; sco->meta_slots_used = xenvif_gop_skb(skb, &npo); - count += nr_frags + 1; + count += vif->rx.req_cons - old_rx_req_cons; __skb_queue_tail(&rxq, skb); + skb = skb_peek(&vif->rx_queue); + if (skb == NULL) + break; + sco = (struct skb_cb_overlay *)skb->cb; + /* Filled the batch queue? */ - /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ - if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE) + if (count + sco->peek_slots_count >= XEN_NETIF_RX_RING_SIZE) break; } -- cgit v1.2.3-70-g09d2 From 33bc801dddc14f0f96b79e453ec51cecfe5ed612 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 8 Oct 2013 10:54:21 +0100 Subject: Revert "xen-netback: improve ring effeciency for guest RX" This reverts commit 4f0581d25827d5e864bcf07b05d73d0d12a20a5c. The named changeset is causing problem. Let's aim to make this part less fragile before trying to improve things. Signed-off-by: Wei Liu Cc: Ian Campbell Cc: Annie Li Cc: Matt Wilson Cc: Xi Xiong Cc: David Vrabel Cc: Paul Durrant Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 144 ++++++++++++++++++++++---------------- 1 file changed, 83 insertions(+), 61 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index d0b0feb035f..f3e591c611d 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -47,14 +47,6 @@ #include #include -/* SKB control block overlay is used to store useful information when - * doing guest RX. - */ -struct skb_cb_overlay { - int meta_slots_used; - int peek_slots_count; -}; - /* Provide an option to disable split event channels at load time as * event channels are limited resource. Split event channels are * enabled by default. @@ -220,6 +212,49 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) return false; } +struct xenvif_count_slot_state { + unsigned long copy_off; + bool head; +}; + +unsigned int xenvif_count_frag_slots(struct xenvif *vif, + unsigned long offset, unsigned long size, + struct xenvif_count_slot_state *state) +{ + unsigned count = 0; + + offset &= ~PAGE_MASK; + + while (size > 0) { + unsigned long bytes; + + bytes = PAGE_SIZE - offset; + + if (bytes > size) + bytes = size; + + if (start_new_rx_buffer(state->copy_off, bytes, state->head)) { + count++; + state->copy_off = 0; + } + + if (state->copy_off + bytes > MAX_BUFFER_OFFSET) + bytes = MAX_BUFFER_OFFSET - state->copy_off; + + state->copy_off += bytes; + + offset += bytes; + size -= bytes; + + if (offset == PAGE_SIZE) + offset = 0; + + state->head = false; + } + + return count; +} + /* * Figure out how many ring slots we're going to need to send @skb to * the guest. This function is essentially a dry run of @@ -227,53 +262,40 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head) */ unsigned int xenvif_count_skb_slots(struct xenvif *vif, struct sk_buff *skb) { + struct xenvif_count_slot_state state; unsigned int count; - int i, copy_off; - struct skb_cb_overlay *sco; + unsigned char *data; + unsigned i; - count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE); + state.head = true; + state.copy_off = 0; - copy_off = skb_headlen(skb) % PAGE_SIZE; + /* Slot for the first (partial) page of data. */ + count = 1; + /* Need a slot for the GSO prefix for GSO extra data? */ if (skb_shinfo(skb)->gso_size) count++; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); - unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; - unsigned long bytes; - - offset &= ~PAGE_MASK; - - while (size > 0) { - BUG_ON(offset >= PAGE_SIZE); - BUG_ON(copy_off > MAX_BUFFER_OFFSET); - - bytes = PAGE_SIZE - offset; - - if (bytes > size) - bytes = size; + data = skb->data; + while (data < skb_tail_pointer(skb)) { + unsigned long offset = offset_in_page(data); + unsigned long size = PAGE_SIZE - offset; - if (start_new_rx_buffer(copy_off, bytes, 0)) { - count++; - copy_off = 0; - } + if (data + size > skb_tail_pointer(skb)) + size = skb_tail_pointer(skb) - data; - if (copy_off + bytes > MAX_BUFFER_OFFSET) - bytes = MAX_BUFFER_OFFSET - copy_off; + count += xenvif_count_frag_slots(vif, offset, size, &state); - copy_off += bytes; + data += size; + } - offset += bytes; - size -= bytes; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + unsigned long offset = skb_shinfo(skb)->frags[i].page_offset; - if (offset == PAGE_SIZE) - offset = 0; - } + count += xenvif_count_frag_slots(vif, offset, size, &state); } - - sco = (struct skb_cb_overlay *)skb->cb; - sco->peek_slots_count = count; return count; } @@ -305,11 +327,14 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, return meta; } -/* Set up the grant operations for this fragment. */ +/* + * Set up the grant operations for this fragment. If it's a flipping + * interface, we also set up the unmap request from here. + */ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, struct netrx_pending_operations *npo, struct page *page, unsigned long size, - unsigned long offset, int head, int *first) + unsigned long offset, int *head) { struct gnttab_copy *copy_gop; struct xenvif_rx_meta *meta; @@ -333,12 +358,12 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, if (bytes > size) bytes = size; - if (start_new_rx_buffer(npo->copy_off, bytes, head)) { + if (start_new_rx_buffer(npo->copy_off, bytes, *head)) { /* * Netfront requires there to be some data in the head * buffer. */ - BUG_ON(*first); + BUG_ON(*head); meta = get_next_rx_buffer(vif, npo); } @@ -372,10 +397,10 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, } /* Leave a gap for the GSO descriptor. */ - if (*first && skb_shinfo(skb)->gso_size && !vif->gso_prefix) + if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix) vif->rx.req_cons++; - *first = 0; /* There must be something in this buffer now. */ + *head = 0; /* There must be something in this buffer now. */ } } @@ -401,7 +426,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, struct xen_netif_rx_request *req; struct xenvif_rx_meta *meta; unsigned char *data; - int first = 1; + int head = 1; int old_meta_prod; old_meta_prod = npo->meta_prod; @@ -437,7 +462,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, len = skb_tail_pointer(skb) - data; xenvif_gop_frag_copy(vif, skb, npo, - virt_to_page(data), len, offset, 1, &first); + virt_to_page(data), len, offset, &head); data += len; } @@ -446,7 +471,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, skb_frag_page(&skb_shinfo(skb)->frags[i]), skb_frag_size(&skb_shinfo(skb)->frags[i]), skb_shinfo(skb)->frags[i].page_offset, - 0, &first); + &head); } return npo->meta_prod - old_meta_prod; @@ -504,6 +529,10 @@ static void xenvif_add_frag_responses(struct xenvif *vif, int status, } } +struct skb_cb_overlay { + int meta_slots_used; +}; + static void xenvif_kick_thread(struct xenvif *vif) { wake_up(&vif->wq); @@ -534,26 +563,19 @@ void xenvif_rx_action(struct xenvif *vif) count = 0; while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) { - RING_IDX old_rx_req_cons; - vif = netdev_priv(skb->dev); nr_frags = skb_shinfo(skb)->nr_frags; - old_rx_req_cons = vif->rx.req_cons; sco = (struct skb_cb_overlay *)skb->cb; sco->meta_slots_used = xenvif_gop_skb(skb, &npo); - count += vif->rx.req_cons - old_rx_req_cons; + count += nr_frags + 1; __skb_queue_tail(&rxq, skb); - skb = skb_peek(&vif->rx_queue); - if (skb == NULL) - break; - sco = (struct skb_cb_overlay *)skb->cb; - /* Filled the batch queue? */ - if (count + sco->peek_slots_count >= XEN_NETIF_RX_RING_SIZE) + /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ + if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE) break; } -- cgit v1.2.3-70-g09d2 From 146c8a77d27bcbd7722120f70f51e3b287205d0a Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:28 +0100 Subject: xen-netback: add support for IPv6 checksum offload to guest Check xenstore flag feature-ipv6-csum-offload to determine if a guest is happy to accept IPv6 packets with only partial checksum. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Cc: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 3 ++- drivers/net/xen-netback/interface.c | 10 +++++++--- drivers/net/xen-netback/xenbus.c | 7 ++++++- include/xen/interface/io/netif.h | 7 +++++++ 4 files changed, 22 insertions(+), 5 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 5715318d6ba..b4a9a3c844b 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -153,7 +153,8 @@ struct xenvif { u8 can_sg:1; u8 gso:1; u8 gso_prefix:1; - u8 csum:1; + u8 ip_csum:1; + u8 ipv6_csum:1; /* Internal feature information. */ u8 can_queue:1; /* can queue packets for receiver? */ diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 01bb854c7f6..8e927838652 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -216,8 +216,10 @@ static netdev_features_t xenvif_fix_features(struct net_device *dev, features &= ~NETIF_F_SG; if (!vif->gso && !vif->gso_prefix) features &= ~NETIF_F_TSO; - if (!vif->csum) + if (!vif->ip_csum) features &= ~NETIF_F_IP_CSUM; + if (!vif->ipv6_csum) + features &= ~NETIF_F_IPV6_CSUM; return features; } @@ -306,7 +308,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, vif->domid = domid; vif->handle = handle; vif->can_sg = 1; - vif->csum = 1; + vif->ip_csum = 1; vif->dev = dev; vif->credit_bytes = vif->remaining_credit = ~0UL; @@ -316,7 +318,9 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, vif->credit_timeout.expires = jiffies; dev->netdev_ops = &xenvif_netdev_ops; - dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; + dev->hw_features = NETIF_F_SG | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | + NETIF_F_TSO; dev->features = dev->hw_features; SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops); diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 1b08d879837..ad27b15242c 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -574,7 +574,12 @@ static int connect_rings(struct backend_info *be) if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload", "%d", &val) < 0) val = 0; - vif->csum = !val; + vif->ip_csum = !val; + + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-ipv6-csum-offload", + "%d", &val) < 0) + val = 0; + vif->ipv6_csum = !!val; /* Map the shared frame, irq etc. */ err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h index eb262e3324d..c9e81849fcd 100644 --- a/include/xen/interface/io/netif.h +++ b/include/xen/interface/io/netif.h @@ -50,6 +50,13 @@ * node as before. */ +/* + * "feature-no-csum-offload" should be used to turn IPv4 TCP/UDP checksum + * offload off or on. If it is missing then the feature is assumed to be on. + * "feature-ipv6-csum-offload" should be used to turn IPv6 TCP/UDP checksum + * offload on or off. If it is missing then the feature is assumed to be off. + */ + /* * This is the 'wire' format for packets: * Request 1: xen_netif_tx_request -- XEN_NETTXF_* (any flags) -- cgit v1.2.3-70-g09d2 From 2eba61d55e5104d0bf08ba4a9cc609613f52b4c9 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:29 +0100 Subject: xen-netback: add support for IPv6 checksum offload from guest For performance of VM to VM traffic on a single host it is better to avoid calculation of TCP/UDP checksum in the sending frontend. To allow this this patch adds the code necessary to set up partial checksum for IPv6 packets and xenstore flag feature-ipv6-csum-offload to advertise that fact to frontends. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Cc: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 235 +++++++++++++++++++++++++++++++------- drivers/net/xen-netback/xenbus.c | 9 ++ 2 files changed, 205 insertions(+), 39 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f3e591c611d..4271f8d1da7 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -109,15 +109,12 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif, return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx)); } -/* - * This is the amount of packet we copy rather than map, so that the - * guest can't fiddle with the contents of the headers while we do - * packet processing on them (netfilter, routing, etc). +/* This is a miniumum size for the linear area to avoid lots of + * calls to __pskb_pull_tail() as we set up checksum offsets. The + * value 128 was chosen as it covers all IPv4 and most likely + * IPv6 headers. */ -#define PKT_PROT_LEN (ETH_HLEN + \ - VLAN_HLEN + \ - sizeof(struct iphdr) + MAX_IPOPTLEN + \ - sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE) +#define PKT_PROT_LEN 128 static u16 frag_get_pending_idx(skb_frag_t *frag) { @@ -1118,61 +1115,74 @@ static int xenvif_set_skb_gso(struct xenvif *vif, return 0; } -static int checksum_setup(struct xenvif *vif, struct sk_buff *skb) +static inline void maybe_pull_tail(struct sk_buff *skb, unsigned int len) +{ + if (skb_is_nonlinear(skb) && skb_headlen(skb) < len) { + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + int target = min_t(int, skb->len, MAX_TCP_HEADER); + __pskb_pull_tail(skb, target - skb_headlen(skb)); + } +} + +static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, + int recalculate_partial_csum) { - struct iphdr *iph; + struct iphdr *iph = (void *)skb->data; + unsigned int header_size; + unsigned int off; int err = -EPROTO; - int recalculate_partial_csum = 0; - /* - * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy - * peers can fail to set NETRXF_csum_blank when sending a GSO - * frame. In this case force the SKB to CHECKSUM_PARTIAL and - * recalculate the partial checksum. - */ - if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) { - vif->rx_gso_checksum_fixup++; - skb->ip_summed = CHECKSUM_PARTIAL; - recalculate_partial_csum = 1; - } + off = sizeof(struct iphdr); - /* A non-CHECKSUM_PARTIAL SKB does not require setup. */ - if (skb->ip_summed != CHECKSUM_PARTIAL) - return 0; + header_size = skb->network_header + off + MAX_IPOPTLEN; + maybe_pull_tail(skb, header_size); - if (skb->protocol != htons(ETH_P_IP)) - goto out; + off = iph->ihl * 4; - iph = (void *)skb->data; switch (iph->protocol) { case IPPROTO_TCP: - if (!skb_partial_csum_set(skb, 4 * iph->ihl, + if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; if (recalculate_partial_csum) { struct tcphdr *tcph = tcp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct tcphdr); + maybe_pull_tail(skb, header_size); + tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - iph->ihl*4, + skb->len - off, IPPROTO_TCP, 0); } break; case IPPROTO_UDP: - if (!skb_partial_csum_set(skb, 4 * iph->ihl, + if (!skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) goto out; if (recalculate_partial_csum) { struct udphdr *udph = udp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct udphdr); + maybe_pull_tail(skb, header_size); + udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - iph->ihl*4, + skb->len - off, IPPROTO_UDP, 0); } break; default: if (net_ratelimit()) netdev_err(vif->dev, - "Attempting to checksum a non-TCP/UDP packet, dropping a protocol %d packet\n", + "Attempting to checksum a non-TCP/UDP packet, " + "dropping a protocol %d packet\n", iph->protocol); goto out; } @@ -1183,6 +1193,158 @@ out: return err; } +static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, + int recalculate_partial_csum) +{ + int err = -EPROTO; + struct ipv6hdr *ipv6h = (void *)skb->data; + u8 nexthdr; + unsigned int header_size; + unsigned int off; + bool fragment; + bool done; + + done = false; + + off = sizeof(struct ipv6hdr); + + header_size = skb->network_header + off; + maybe_pull_tail(skb, header_size); + + nexthdr = ipv6h->nexthdr; + + while ((off <= sizeof(struct ipv6hdr) + ntohs(ipv6h->payload_len)) && + !done) { + switch (nexthdr) { + case IPPROTO_DSTOPTS: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: { + struct ipv6_opt_hdr *hp = (void *)(skb->data + off); + + header_size = skb->network_header + + off + + sizeof(struct ipv6_opt_hdr); + maybe_pull_tail(skb, header_size); + + nexthdr = hp->nexthdr; + off += ipv6_optlen(hp); + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr *hp = (void *)(skb->data + off); + + header_size = skb->network_header + + off + + sizeof(struct ip_auth_hdr); + maybe_pull_tail(skb, header_size); + + nexthdr = hp->nexthdr; + off += (hp->hdrlen+2)<<2; + break; + } + case IPPROTO_FRAGMENT: + fragment = true; + /* fall through */ + default: + done = true; + break; + } + } + + if (!done) { + if (net_ratelimit()) + netdev_err(vif->dev, "Failed to parse packet header\n"); + goto out; + } + + if (fragment) { + if (net_ratelimit()) + netdev_err(vif->dev, "Packet is a fragment!\n"); + goto out; + } + + switch (nexthdr) { + case IPPROTO_TCP: + if (!skb_partial_csum_set(skb, off, + offsetof(struct tcphdr, check))) + goto out; + + if (recalculate_partial_csum) { + struct tcphdr *tcph = tcp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct tcphdr); + maybe_pull_tail(skb, header_size); + + tcph->check = ~csum_ipv6_magic(&ipv6h->saddr, + &ipv6h->daddr, + skb->len - off, + IPPROTO_TCP, 0); + } + break; + case IPPROTO_UDP: + if (!skb_partial_csum_set(skb, off, + offsetof(struct udphdr, check))) + goto out; + + if (recalculate_partial_csum) { + struct udphdr *udph = udp_hdr(skb); + + header_size = skb->network_header + + off + + sizeof(struct udphdr); + maybe_pull_tail(skb, header_size); + + udph->check = ~csum_ipv6_magic(&ipv6h->saddr, + &ipv6h->daddr, + skb->len - off, + IPPROTO_UDP, 0); + } + break; + default: + if (net_ratelimit()) + netdev_err(vif->dev, + "Attempting to checksum a non-TCP/UDP packet, " + "dropping a protocol %d packet\n", + nexthdr); + goto out; + } + + err = 0; + +out: + return err; +} + +static int checksum_setup(struct xenvif *vif, struct sk_buff *skb) +{ + int err = -EPROTO; + int recalculate_partial_csum = 0; + + /* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy + * peers can fail to set NETRXF_csum_blank when sending a GSO + * frame. In this case force the SKB to CHECKSUM_PARTIAL and + * recalculate the partial checksum. + */ + if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) { + vif->rx_gso_checksum_fixup++; + skb->ip_summed = CHECKSUM_PARTIAL; + recalculate_partial_csum = 1; + } + + /* A non-CHECKSUM_PARTIAL SKB does not require setup. */ + if (skb->ip_summed != CHECKSUM_PARTIAL) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) + err = checksum_setup_ip(vif, skb, recalculate_partial_csum); + else if (skb->protocol == htons(ETH_P_IPV6)) + err = checksum_setup_ipv6(vif, skb, recalculate_partial_csum); + + return err; +} + static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) { unsigned long now = jiffies; @@ -1428,12 +1590,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget) xenvif_fill_frags(vif, skb); - /* - * If the initial fragment was < PKT_PROT_LEN then - * pull through some bytes from the other fragments to - * increase the linear region to PKT_PROT_LEN bytes. - */ - if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) { + if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) { int target = min_t(int, skb->len, PKT_PROT_LEN); __pskb_pull_tail(skb, target - skb_headlen(skb)); } diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index ad27b15242c..108e7523017 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -105,6 +105,15 @@ static int netback_probe(struct xenbus_device *dev, goto abort_transaction; } + /* We support partial checksum setup for IPv6 packets */ + err = xenbus_printf(xbt, dev->nodename, + "feature-ipv6-csum-offload", + "%d", 1); + if (err) { + message = "writing feature-ipv6-csum-offload"; + goto abort_transaction; + } + /* We support rx-copy path. */ err = xenbus_printf(xbt, dev->nodename, "feature-rx-copy", "%d", 1); -- cgit v1.2.3-70-g09d2 From 7365bcfa32d2c9d212c41d52ff3509d70b6a3466 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:30 +0100 Subject: xen-netback: Unconditionally set NETIF_F_RXCSUM There is no mechanism to insist that a guest always generates a packet with good checksum (at least for IPv4) so we must handle checksum offloading from the guest and hence should set NETIF_F_RXCSUM. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Cc: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 8e927838652..cb0d8ea3d9f 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -321,7 +321,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_TSO; - dev->features = dev->hw_features; + dev->features = dev->hw_features | NETIF_F_RXCSUM; SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops); dev->tx_queue_len = XENVIF_QUEUE_LENGTH; -- cgit v1.2.3-70-g09d2 From a94685876859be30446357db6d6c4a9c951305b4 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:31 +0100 Subject: xen-netback: handle IPv6 TCP GSO packets from the guest This patch adds a xenstore feature flag, festure-gso-tcpv6, to advertise that netback can handle IPv6 TCP GSO packets. It creates SKB_GSO_TCPV6 skbs if the frontend passes an extra segment with the new type XEN_NETIF_GSO_TYPE_TCPV6 added to netif.h. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 11 ++++++++--- drivers/net/xen-netback/xenbus.c | 7 +++++++ include/xen/interface/io/netif.h | 10 +++++++++- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 4271f8d1da7..0e327d46a13 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1098,15 +1098,20 @@ static int xenvif_set_skb_gso(struct xenvif *vif, return -EINVAL; } - /* Currently only TCPv4 S.O. is supported. */ - if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { + switch (gso->u.gso.type) { + case XEN_NETIF_GSO_TYPE_TCPV4: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + break; + case XEN_NETIF_GSO_TYPE_TCPV6: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + break; + default: netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type); xenvif_fatal_tx_err(vif); return -EINVAL; } skb_shinfo(skb)->gso_size = gso->u.gso.size; - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 108e7523017..02cb00bebdc 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -105,6 +105,13 @@ static int netback_probe(struct xenbus_device *dev, goto abort_transaction; } + err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv6", + "%d", sg); + if (err) { + message = "writing feature-gso-tcpv6"; + goto abort_transaction; + } + /* We support partial checksum setup for IPv6 packets */ err = xenbus_printf(xbt, dev->nodename, "feature-ipv6-csum-offload", diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h index c9e81849fcd..5e766ebe77f 100644 --- a/include/xen/interface/io/netif.h +++ b/include/xen/interface/io/netif.h @@ -57,6 +57,13 @@ * offload on or off. If it is missing then the feature is assumed to be off. */ +/* + * "feature-gso-tcpv4" and "feature-gso-tcpv6" advertise the capability to + * handle large TCP packets (in IPv4 or IPv6 form respectively). Neither + * frontends nor backends are assumed to be capable unless the flags are + * present. + */ + /* * This is the 'wire' format for packets: * Request 1: xen_netif_tx_request -- XEN_NETTXF_* (any flags) @@ -102,8 +109,9 @@ struct xen_netif_tx_request { #define _XEN_NETIF_EXTRA_FLAG_MORE (0) #define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE) -/* GSO types - only TCPv4 currently supported. */ +/* GSO types */ #define XEN_NETIF_GSO_TYPE_TCPV4 (1) +#define XEN_NETIF_GSO_TYPE_TCPV6 (2) /* * This structure needs to fit within both netif_tx_request and -- cgit v1.2.3-70-g09d2 From 82cada22a0bbec6a7afb573ef5fb6c512aaa2739 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 16 Oct 2013 17:50:32 +0100 Subject: xen-netback: enable IPv6 TCP GSO to the guest This patch adds code to handle SKB_GSO_TCPV6 skbs and construct appropriate extra or prefix segments to pass the large packet to the frontend. New xenstore flags, feature-gso-tcpv6 and feature-gso-tcpv6-prefix, are sampled to determine if the frontend is capable of handling such packets. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: David Vrabel Cc: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 9 +++++-- drivers/net/xen-netback/interface.c | 6 +++-- drivers/net/xen-netback/netback.c | 48 +++++++++++++++++++++++++++++-------- drivers/net/xen-netback/xenbus.c | 29 ++++++++++++++++++++-- include/xen/interface/io/netif.h | 1 + 5 files changed, 77 insertions(+), 16 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index b4a9a3c844b..55b8dec8623 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -87,9 +87,13 @@ struct pending_tx_info { struct xenvif_rx_meta { int id; int size; + int gso_type; int gso_size; }; +#define GSO_BIT(type) \ + (1 << XEN_NETIF_GSO_TYPE_ ## type) + /* Discriminate from any valid pending_idx value. */ #define INVALID_PENDING_IDX 0xFFFF @@ -150,9 +154,10 @@ struct xenvif { u8 fe_dev_addr[6]; /* Frontend feature information. */ + int gso_mask; + int gso_prefix_mask; + u8 can_sg:1; - u8 gso:1; - u8 gso_prefix:1; u8 ip_csum:1; u8 ipv6_csum:1; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index cb0d8ea3d9f..e4aa26748f8 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -214,8 +214,10 @@ static netdev_features_t xenvif_fix_features(struct net_device *dev, if (!vif->can_sg) features &= ~NETIF_F_SG; - if (!vif->gso && !vif->gso_prefix) + if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV4)) features &= ~NETIF_F_TSO; + if (~(vif->gso_mask | vif->gso_prefix_mask) & GSO_BIT(TCPV6)) + features &= ~NETIF_F_TSO6; if (!vif->ip_csum) features &= ~NETIF_F_IP_CSUM; if (!vif->ipv6_csum) @@ -320,7 +322,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, dev->netdev_ops = &xenvif_netdev_ops; dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | - NETIF_F_TSO; + NETIF_F_TSO | NETIF_F_TSO6; dev->features = dev->hw_features | NETIF_F_RXCSUM; SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops); diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 0e327d46a13..828fdab4f1a 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -142,7 +142,7 @@ static int max_required_rx_slots(struct xenvif *vif) int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE); /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */ - if (vif->can_sg || vif->gso || vif->gso_prefix) + if (vif->can_sg || vif->gso_mask || vif->gso_prefix_mask) max += MAX_SKB_FRAGS + 1; /* extra_info + frags */ return max; @@ -314,6 +314,7 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif, req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; + meta->gso_type = XEN_NETIF_GSO_TYPE_NONE; meta->gso_size = 0; meta->size = 0; meta->id = req->id; @@ -336,6 +337,7 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, struct gnttab_copy *copy_gop; struct xenvif_rx_meta *meta; unsigned long bytes; + int gso_type; /* Data must not cross a page boundary. */ BUG_ON(size + offset > PAGE_SIZE<gso_size && !vif->gso_prefix) + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + gso_type = XEN_NETIF_GSO_TYPE_TCPV4; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) + gso_type = XEN_NETIF_GSO_TYPE_TCPV6; + else + gso_type = XEN_NETIF_GSO_TYPE_NONE; + + if (*head && ((1 << gso_type) & vif->gso_mask)) vif->rx.req_cons++; *head = 0; /* There must be something in this buffer now. */ @@ -425,14 +434,28 @@ static int xenvif_gop_skb(struct sk_buff *skb, unsigned char *data; int head = 1; int old_meta_prod; + int gso_type; + int gso_size; old_meta_prod = npo->meta_prod; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + gso_type = XEN_NETIF_GSO_TYPE_TCPV4; + gso_size = skb_shinfo(skb)->gso_size; + } else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { + gso_type = XEN_NETIF_GSO_TYPE_TCPV6; + gso_size = skb_shinfo(skb)->gso_size; + } else { + gso_type = XEN_NETIF_GSO_TYPE_NONE; + gso_size = 0; + } + /* Set up a GSO prefix descriptor, if necessary */ - if (skb_shinfo(skb)->gso_size && vif->gso_prefix) { + if ((1 << skb_shinfo(skb)->gso_type) & vif->gso_prefix_mask) { req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; - meta->gso_size = skb_shinfo(skb)->gso_size; + meta->gso_type = gso_type; + meta->gso_size = gso_size; meta->size = 0; meta->id = req->id; } @@ -440,10 +463,13 @@ static int xenvif_gop_skb(struct sk_buff *skb, req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; - if (!vif->gso_prefix) - meta->gso_size = skb_shinfo(skb)->gso_size; - else + if ((1 << gso_type) & vif->gso_mask) { + meta->gso_type = gso_type; + meta->gso_size = gso_size; + } else { + meta->gso_type = XEN_NETIF_GSO_TYPE_NONE; meta->gso_size = 0; + } meta->size = 0; meta->id = req->id; @@ -589,7 +615,8 @@ void xenvif_rx_action(struct xenvif *vif) vif = netdev_priv(skb->dev); - if (vif->meta[npo.meta_cons].gso_size && vif->gso_prefix) { + if ((1 << vif->meta[npo.meta_cons].gso_type) & + vif->gso_prefix_mask) { resp = RING_GET_RESPONSE(&vif->rx, vif->rx.rsp_prod_pvt++); @@ -626,7 +653,8 @@ void xenvif_rx_action(struct xenvif *vif) vif->meta[npo.meta_cons].size, flags); - if (vif->meta[npo.meta_cons].gso_size && !vif->gso_prefix) { + if ((1 << vif->meta[npo.meta_cons].gso_type) & + vif->gso_mask) { struct xen_netif_extra_info *gso = (struct xen_netif_extra_info *) RING_GET_RESPONSE(&vif->rx, @@ -634,8 +662,8 @@ void xenvif_rx_action(struct xenvif *vif) resp->flags |= XEN_NETRXF_extra_info; + gso->u.gso.type = vif->meta[npo.meta_cons].gso_type; gso->u.gso.size = vif->meta[npo.meta_cons].gso_size; - gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; gso->u.gso.pad = 0; gso->u.gso.features = 0; diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 02cb00bebdc..f0358992b04 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -577,15 +577,40 @@ static int connect_rings(struct backend_info *be) val = 0; vif->can_sg = !!val; + vif->gso_mask = 0; + vif->gso_prefix_mask = 0; + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d", &val) < 0) val = 0; - vif->gso = !!val; + if (val) + vif->gso_mask |= GSO_BIT(TCPV4); if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix", "%d", &val) < 0) val = 0; - vif->gso_prefix = !!val; + if (val) + vif->gso_prefix_mask |= GSO_BIT(TCPV4); + + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6", + "%d", &val) < 0) + val = 0; + if (val) + vif->gso_mask |= GSO_BIT(TCPV6); + + if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv6-prefix", + "%d", &val) < 0) + val = 0; + if (val) + vif->gso_prefix_mask |= GSO_BIT(TCPV6); + + if (vif->gso_mask & vif->gso_prefix_mask) { + xenbus_dev_fatal(dev, err, + "%s: gso and gso prefix flags are not " + "mutually exclusive", + dev->otherend); + return -EOPNOTSUPP; + } if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload", "%d", &val) < 0) diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h index 5e766ebe77f..c50061db609 100644 --- a/include/xen/interface/io/netif.h +++ b/include/xen/interface/io/netif.h @@ -110,6 +110,7 @@ struct xen_netif_tx_request { #define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE) /* GSO types */ +#define XEN_NETIF_GSO_TYPE_NONE (0) #define XEN_NETIF_GSO_TYPE_TCPV4 (1) #define XEN_NETIF_GSO_TYPE_TCPV6 (2) -- cgit v1.2.3-70-g09d2 From 059dfa6a93b779516321e5112db9d7621b1367ba Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 28 Oct 2013 12:07:57 +0000 Subject: xen-netback: use jiffies_64 value to calculate credit timeout time_after_eq() only works if the delta is < MAX_ULONG/2. For a 32bit Dom0, if netfront sends packets at a very low rate, the time between subsequent calls to tx_credit_exceeded() may exceed MAX_ULONG/2 and the test for timer_after_eq() will be incorrect. Credit will not be replenished and the guest may become unable to send packets (e.g., if prior to the long gap, all credit was exhausted). Use jiffies_64 variant to mitigate this problem for 32bit Dom0. Suggested-by: Jan Beulich Signed-off-by: Wei Liu Reviewed-by: David Vrabel Cc: Ian Campbell Cc: Jason Luan Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 1 + drivers/net/xen-netback/interface.c | 3 +-- drivers/net/xen-netback/netback.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 5715318d6ba..400fea1de08 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -163,6 +163,7 @@ struct xenvif { unsigned long credit_usec; unsigned long remaining_credit; struct timer_list credit_timeout; + u64 credit_window_start; /* Statistics */ unsigned long rx_gso_checksum_fixup; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 01bb854c7f6..459935a6bfa 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -312,8 +312,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, vif->credit_bytes = vif->remaining_credit = ~0UL; vif->credit_usec = 0UL; init_timer(&vif->credit_timeout); - /* Initialize 'expires' now: it's used to track the credit window. */ - vif->credit_timeout.expires = jiffies; + vif->credit_window_start = get_jiffies_64(); dev->netdev_ops = &xenvif_netdev_ops; dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f3e591c611d..900da4b243a 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1185,9 +1185,8 @@ out: static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) { - unsigned long now = jiffies; - unsigned long next_credit = - vif->credit_timeout.expires + + u64 now = get_jiffies_64(); + u64 next_credit = vif->credit_window_start + msecs_to_jiffies(vif->credit_usec / 1000); /* Timer could already be pending in rare cases. */ @@ -1195,8 +1194,8 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) return true; /* Passed the point where we can replenish credit? */ - if (time_after_eq(now, next_credit)) { - vif->credit_timeout.expires = now; + if (time_after_eq64(now, next_credit)) { + vif->credit_window_start = now; tx_add_credit(vif); } @@ -1208,6 +1207,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) tx_credit_callback; mod_timer(&vif->credit_timeout, next_credit); + vif->credit_window_start = next_credit; return true; } -- cgit v1.2.3-70-g09d2 From db739ef37f07a1a12e388dbaec225d9d9d5d6ded Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 21 Nov 2013 15:26:09 +0000 Subject: xen-netback: stop the VIF thread before unbinding IRQs If the VIF thread is still running after unbinding the Tx and Rx IRQs in xenvif_disconnect(), the thread may attempt to raise an event which will BUG (as the irq is unbound). Signed-off-by: David Vrabel Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/interface.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index b78ee10a956..2329cccf1fa 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -461,6 +461,9 @@ void xenvif_disconnect(struct xenvif *vif) if (netif_carrier_ok(vif->dev)) xenvif_carrier_off(vif); + if (vif->task) + kthread_stop(vif->task); + if (vif->tx_irq) { if (vif->tx_irq == vif->rx_irq) unbind_from_irqhandler(vif->tx_irq, vif); @@ -471,9 +474,6 @@ void xenvif_disconnect(struct xenvif *vif) vif->tx_irq = 0; } - if (vif->task) - kthread_stop(vif->task); - xenvif_unmap_frontend_rings(vif); } -- cgit v1.2.3-70-g09d2 From ae5e8127b712313ec1b99356019ce9226fea8b88 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 25 Nov 2013 16:52:34 +0000 Subject: xen-netback: include definition of csum_ipv6_magic We are now using csum_ipv6_magic, include the appropriate header. Avoids the following error: drivers/net/xen-netback/netback.c:1313:4: error: implicit declaration of function 'csum_ipv6_magic' [-Werror=implicit-function-declaration] tcph->check = ~csum_ipv6_magic(&ipv6h->saddr, Signed-off-by: Andy Whitcroft Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 919b6509455..64f0e0d18b8 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -39,6 +39,7 @@ #include #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 67fa36609fe0a0a4b4c99120e5093599556e4c5b Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Tue, 3 Dec 2013 14:06:25 +0000 Subject: xen-netback: clear vif->task on disconnect xenvif_start_xmit() relies on checking vif->task for NULL to determine whether the vif is ready to accept packets. The task thread is stopped in xenvif_disconnect() but task is not set to NULL. Thus, on a re-connect the check will give a false positive. Also since commit ea732dff5cfa10789007bf4a5b935388a0bb2a8f (Handle backend state transitions in a more robust way) it should not be possible for xenvif_connect() to be called if the vif is already connected so change the check of vif->tx_irq to a BUG_ON() and also add a BUG_ON(vif->task). Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/interface.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 2329cccf1fa..870f1fa5837 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -368,11 +368,11 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, unsigned long rx_ring_ref, unsigned int tx_evtchn, unsigned int rx_evtchn) { + struct task_struct *task; int err = -ENOMEM; - /* Already connected through? */ - if (vif->tx_irq) - return 0; + BUG_ON(vif->tx_irq); + BUG_ON(vif->task); err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref); if (err < 0) @@ -411,14 +411,16 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref, } init_waitqueue_head(&vif->wq); - vif->task = kthread_create(xenvif_kthread, - (void *)vif, "%s", vif->dev->name); - if (IS_ERR(vif->task)) { + task = kthread_create(xenvif_kthread, + (void *)vif, "%s", vif->dev->name); + if (IS_ERR(task)) { pr_warn("Could not allocate kthread for %s\n", vif->dev->name); - err = PTR_ERR(vif->task); + err = PTR_ERR(task); goto err_rx_unbind; } + vif->task = task; + rtnl_lock(); if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN) dev_set_mtu(vif->dev, ETH_DATA_LEN); @@ -461,8 +463,10 @@ void xenvif_disconnect(struct xenvif *vif) if (netif_carrier_ok(vif->dev)) xenvif_carrier_off(vif); - if (vif->task) + if (vif->task) { kthread_stop(vif->task); + vif->task = NULL; + } if (vif->tx_irq) { if (vif->tx_irq == vif->rx_irq) -- cgit v1.2.3-70-g09d2 From 1431fb31ecbaba1b5718006128f0f2ed0b94e1c3 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Tue, 3 Dec 2013 17:39:29 +0000 Subject: xen-netback: fix fragment detection in checksum setup The code to detect fragments in checksum_setup() was missing for IPv4 and too eager for IPv6. (It transpires that Windows seems to send IPv6 packets with a fragment header even if they are not a fragment - i.e. offset is zero, and M bit is not set). This patch also incorporates a fix to callers of maybe_pull_tail() where skb->network_header was being erroneously added to the length argument. Signed-off-by: Paul Durrant Signed-off-by: Zoltan Kiss Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel cc: David Miller Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 236 ++++++++++++++++++++++---------------- include/linux/ipv6.h | 1 + include/net/ipv6.h | 3 +- 3 files changed, 140 insertions(+), 100 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 64f0e0d18b8..acf13920e6d 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1149,49 +1149,72 @@ static int xenvif_set_skb_gso(struct xenvif *vif, return 0; } -static inline void maybe_pull_tail(struct sk_buff *skb, unsigned int len) +static inline int maybe_pull_tail(struct sk_buff *skb, unsigned int len, + unsigned int max) { - if (skb_is_nonlinear(skb) && skb_headlen(skb) < len) { - /* If we need to pullup then pullup to the max, so we - * won't need to do it again. - */ - int target = min_t(int, skb->len, MAX_TCP_HEADER); - __pskb_pull_tail(skb, target - skb_headlen(skb)); - } + if (skb_headlen(skb) >= len) + return 0; + + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + if (max > skb->len) + max = skb->len; + + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) + return -ENOMEM; + + if (skb_headlen(skb) < len) + return -EPROTO; + + return 0; } +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, int recalculate_partial_csum) { - struct iphdr *iph = (void *)skb->data; - unsigned int header_size; unsigned int off; - int err = -EPROTO; + bool fragment; + int err; + + fragment = false; + + err = maybe_pull_tail(skb, + sizeof(struct iphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; - off = sizeof(struct iphdr); + if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF)) + fragment = true; - header_size = skb->network_header + off + MAX_IPOPTLEN; - maybe_pull_tail(skb, header_size); + off = ip_hdrlen(skb); - off = iph->ihl * 4; + err = -EPROTO; - switch (iph->protocol) { + switch (ip_hdr(skb)->protocol) { case IPPROTO_TCP: if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; if (recalculate_partial_csum) { - struct tcphdr *tcph = tcp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct tcphdr); - maybe_pull_tail(skb, header_size); - - tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - off, - IPPROTO_TCP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + tcp_hdr(skb)->check = + ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + IPPROTO_TCP, 0); } break; case IPPROTO_UDP: @@ -1200,24 +1223,20 @@ static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, goto out; if (recalculate_partial_csum) { - struct udphdr *udph = udp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct udphdr); - maybe_pull_tail(skb, header_size); - - udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - off, - IPPROTO_UDP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + udp_hdr(skb)->check = + ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + IPPROTO_UDP, 0); } break; default: - if (net_ratelimit()) - netdev_err(vif->dev, - "Attempting to checksum a non-TCP/UDP packet, " - "dropping a protocol %d packet\n", - iph->protocol); goto out; } @@ -1227,75 +1246,99 @@ out: return err; } +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. + */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ + (type *)(skb_network_header(skb) + (off)) + static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, int recalculate_partial_csum) { - int err = -EPROTO; - struct ipv6hdr *ipv6h = (void *)skb->data; + int err; u8 nexthdr; - unsigned int header_size; unsigned int off; + unsigned int len; bool fragment; bool done; + fragment = false; done = false; off = sizeof(struct ipv6hdr); - header_size = skb->network_header + off; - maybe_pull_tail(skb, header_size); + err = maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; - nexthdr = ipv6h->nexthdr; + nexthdr = ipv6_hdr(skb)->nexthdr; - while ((off <= sizeof(struct ipv6hdr) + ntohs(ipv6h->payload_len)) && - !done) { + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + while (off <= len && !done) { switch (nexthdr) { case IPPROTO_DSTOPTS: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: { - struct ipv6_opt_hdr *hp = (void *)(skb->data + off); + struct ipv6_opt_hdr *hp; - header_size = skb->network_header + - off + - sizeof(struct ipv6_opt_hdr); - maybe_pull_tail(skb, header_size); + err = maybe_pull_tail(skb, + off + + sizeof(struct ipv6_opt_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); nexthdr = hp->nexthdr; off += ipv6_optlen(hp); break; } case IPPROTO_AH: { - struct ip_auth_hdr *hp = (void *)(skb->data + off); + struct ip_auth_hdr *hp; + + err = maybe_pull_tail(skb, + off + + sizeof(struct ip_auth_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ip_auth_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_authlen(hp); + break; + } + case IPPROTO_FRAGMENT: { + struct frag_hdr *hp; - header_size = skb->network_header + - off + - sizeof(struct ip_auth_hdr); - maybe_pull_tail(skb, header_size); + err = maybe_pull_tail(skb, + off + + sizeof(struct frag_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct frag_hdr, skb, off); + + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) + fragment = true; nexthdr = hp->nexthdr; - off += (hp->hdrlen+2)<<2; + off += sizeof(struct frag_hdr); break; } - case IPPROTO_FRAGMENT: - fragment = true; - /* fall through */ default: done = true; break; } } - if (!done) { - if (net_ratelimit()) - netdev_err(vif->dev, "Failed to parse packet header\n"); - goto out; - } + err = -EPROTO; - if (fragment) { - if (net_ratelimit()) - netdev_err(vif->dev, "Packet is a fragment!\n"); + if (!done || fragment) goto out; - } switch (nexthdr) { case IPPROTO_TCP: @@ -1304,17 +1347,17 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, goto out; if (recalculate_partial_csum) { - struct tcphdr *tcph = tcp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct tcphdr); - maybe_pull_tail(skb, header_size); - - tcph->check = ~csum_ipv6_magic(&ipv6h->saddr, - &ipv6h->daddr, - skb->len - off, - IPPROTO_TCP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + tcp_hdr(skb)->check = + ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, + IPPROTO_TCP, 0); } break; case IPPROTO_UDP: @@ -1323,25 +1366,20 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, goto out; if (recalculate_partial_csum) { - struct udphdr *udph = udp_hdr(skb); - - header_size = skb->network_header + - off + - sizeof(struct udphdr); - maybe_pull_tail(skb, header_size); - - udph->check = ~csum_ipv6_magic(&ipv6h->saddr, - &ipv6h->daddr, - skb->len - off, - IPPROTO_UDP, 0); + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + udp_hdr(skb)->check = + ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, + IPPROTO_UDP, 0); } break; default: - if (net_ratelimit()) - netdev_err(vif->dev, - "Attempting to checksum a non-TCP/UDP packet, " - "dropping a protocol %d packet\n", - nexthdr); goto out; } diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 5d89d1b808a..c56c350324e 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -4,6 +4,7 @@ #include #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) +#define ipv6_authlen(p) (((p)->hdrlen+2) << 2) /* * This structure contains configuration options per IPv6 link. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index eb198acaac1..488316e339a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -110,7 +110,8 @@ struct frag_hdr { __be32 identification; }; -#define IP6_MF 0x0001 +#define IP6_MF 0x0001 +#define IP6_OFFSET 0xFFF8 #include -- cgit v1.2.3-70-g09d2 From d52eb0d46f3606b9de9965cebb2beb2202a0dc62 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 11 Dec 2013 16:37:40 +0000 Subject: xen-netback: make sure skb linear area covers checksum field skb_partial_csum_set requires that the linear area of the skb covers the checksum field. The checksum setup code in netback was only doing that pullup in the case when the pseudo header checksum was being recalculated though. This patch makes that pullup unconditional. (I pullup the whole transport header just for simplicity; the requirement is only for the check field but in the case of UDP this is the last field in the header and in the case of TCP it's the last but one). The lack of pullup manifested as failures running Microsoft HCK network tests on a pair of Windows 8 VMs and it has been verified that this patch fixes the problem. Suggested-by: Jan Beulich Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Reviewed-by: Jan Beulich Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 60 ++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 32 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index acf13920e6d..d158fc40cff 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1199,42 +1199,40 @@ static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, switch (ip_hdr(skb)->protocol) { case IPPROTO_TCP: + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct tcphdr), - MAX_IP_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) tcp_hdr(skb)->check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - off, IPPROTO_TCP, 0); - } break; case IPPROTO_UDP: + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct udphdr), - MAX_IP_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) udp_hdr(skb)->check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - off, IPPROTO_UDP, 0); - } break; default: goto out; @@ -1342,42 +1340,40 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, switch (nexthdr) { case IPPROTO_TCP: + err = maybe_pull_tail(skb, + off + sizeof(struct tcphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct tcphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct tcphdr), - MAX_IPV6_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - off, IPPROTO_TCP, 0); - } break; case IPPROTO_UDP: + err = maybe_pull_tail(skb, + off + sizeof(struct udphdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + if (!skb_partial_csum_set(skb, off, offsetof(struct udphdr, check))) goto out; - if (recalculate_partial_csum) { - err = maybe_pull_tail(skb, - off + sizeof(struct udphdr), - MAX_IPV6_HDR_LEN); - if (err < 0) - goto out; - + if (recalculate_partial_csum) udp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - off, IPPROTO_UDP, 0); - } break; default: goto out; -- cgit v1.2.3-70-g09d2 From 10574059ce0451c6572c85329c772aa15085f8eb Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 11 Dec 2013 10:57:15 +0000 Subject: xen-netback: napi: fix abuse of budget netback seems to be somewhat confused about the napi budget parameter. The parameter is supposed to limit the number of skbs processed in each poll, but netback has this confused with grant operations. This patch fixes that, properly limiting the work done in each poll. Note that this limit makes sure we do not process any more data from the shared ring than we intend to pass back from the poll. This is important to prevent tx_queue potentially growing without bound. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index d158fc40cff..db79e29b3d0 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1445,14 +1445,15 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) return false; } -static unsigned xenvif_tx_build_gops(struct xenvif *vif) +static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) { struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop; struct sk_buff *skb; int ret; while ((nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX - < MAX_PENDING_REQS)) { + < MAX_PENDING_REQS) && + (skb_queue_len(&vif->tx_queue) < budget)) { struct xen_netif_tx_request txreq; struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX]; struct page *page; @@ -1614,14 +1615,13 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif) } -static int xenvif_tx_submit(struct xenvif *vif, int budget) +static int xenvif_tx_submit(struct xenvif *vif) { struct gnttab_copy *gop = vif->tx_copy_ops; struct sk_buff *skb; int work_done = 0; - while (work_done < budget && - (skb = __skb_dequeue(&vif->tx_queue)) != NULL) { + while ((skb = __skb_dequeue(&vif->tx_queue)) != NULL) { struct xen_netif_tx_request *txp; u16 pending_idx; unsigned data_len; @@ -1696,14 +1696,14 @@ int xenvif_tx_action(struct xenvif *vif, int budget) if (unlikely(!tx_work_todo(vif))) return 0; - nr_gops = xenvif_tx_build_gops(vif); + nr_gops = xenvif_tx_build_gops(vif, budget); if (nr_gops == 0) return 0; gnttab_batch_copy(vif->tx_copy_ops, nr_gops); - work_done = xenvif_tx_submit(vif, nr_gops); + work_done = xenvif_tx_submit(vif); return work_done; } -- cgit v1.2.3-70-g09d2 From d9601a36ffdb5c142697bef1228afb5ba4ee4003 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Wed, 11 Dec 2013 10:57:16 +0000 Subject: xen-netback: napi: don't prematurely request a tx event This patch changes the RING_FINAL_CHECK_FOR_REQUESTS in xenvif_build_tx_gops to a check for RING_HAS_UNCONSUMED_REQUESTS as the former call has the side effect of advancing the ring event pointer and therefore inviting another interrupt from the frontend before the napi poll has actually finished, thereby defeating the point of napi. The event pointer is updated by RING_FINAL_CHECK_FOR_REQUESTS in xenvif_poll, the napi poll function, if the work done is less than the budget i.e. when actually transitioning back to interrupt mode. Reported-by: Malcolm Crossley Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index db79e29b3d0..33b8aa612d1 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1475,7 +1475,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) continue; } - RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, work_to_do); + work_to_do = RING_HAS_UNCONSUMED_REQUESTS(&vif->tx); if (!work_to_do) break; -- cgit v1.2.3-70-g09d2 From a3314f3d40215349ab2427800c1e10676691389f Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 12 Dec 2013 14:20:13 +0000 Subject: xen-netback: fix gso_prefix check There is a mistake in checking the gso_prefix mask when passing large packets to a guest. The wrong shift is applied to the bit - the raw skb gso type is used rather then the translated one. This leads to large packets being handed to the guest without the GSO metadata. This patch fixes the check. The mistake manifested as errors whilst running Microsoft HCK large packet offload tests between a pair of Windows 8 VMs. I have verified this patch fixes those errors. Signed-off-by: Paul Durrant Cc: Wei Liu Cc: Ian Campbell Cc: David Vrabel Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 33b8aa612d1..e884ee1fe7e 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -452,7 +452,7 @@ static int xenvif_gop_skb(struct sk_buff *skb, } /* Set up a GSO prefix descriptor, if necessary */ - if ((1 << skb_shinfo(skb)->gso_type) & vif->gso_prefix_mask) { + if ((1 << gso_type) & vif->gso_prefix_mask) { req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); meta = npo->meta + npo->meta_prod++; meta->gso_type = gso_type; -- cgit v1.2.3-70-g09d2 From 7022ef8b2a673db3d12a348331181f579afe9b22 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 16 Dec 2013 10:45:05 +0800 Subject: xen-netback: fix fragments error handling in checksum_setup_ip() Fix to return -EPROTO error if fragments detected in checksum_setup_ip(). Fixes: 1431fb31ecba ('xen-netback: fix fragment detection in checksum setup') Signed-off-by: Wei Yongjun Reviewed-by: Paul Durrant Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index e884ee1fe7e..27bbe58dcbe 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1197,6 +1197,9 @@ static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, err = -EPROTO; + if (fragment) + goto out; + switch (ip_hdr(skb)->protocol) { case IPPROTO_TCP: err = maybe_pull_tail(skb, -- cgit v1.2.3-70-g09d2 From 0c8d087c04cdcef501064552149289866e53aa6c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Dec 2013 10:42:09 +0800 Subject: xen-netback: fix some error return code 'err' is overwrited to 0 after maybe_pull_tail() call, so the error code was not set if skb_partial_csum_set() call failed. Fix to return error -EPROTO from those error handling case instead of 0. Fixes: d52eb0d46f36 ('xen-netback: make sure skb linear area covers checksum field') Signed-off-by: Wei Yongjun Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 27bbe58dcbe..7b4fd93be76 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1209,8 +1209,10 @@ static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, goto out; if (!skb_partial_csum_set(skb, off, - offsetof(struct tcphdr, check))) + offsetof(struct tcphdr, check))) { + err = -EPROTO; goto out; + } if (recalculate_partial_csum) tcp_hdr(skb)->check = @@ -1227,8 +1229,10 @@ static int checksum_setup_ip(struct xenvif *vif, struct sk_buff *skb, goto out; if (!skb_partial_csum_set(skb, off, - offsetof(struct udphdr, check))) + offsetof(struct udphdr, check))) { + err = -EPROTO; goto out; + } if (recalculate_partial_csum) udp_hdr(skb)->check = @@ -1350,8 +1354,10 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, goto out; if (!skb_partial_csum_set(skb, off, - offsetof(struct tcphdr, check))) + offsetof(struct tcphdr, check))) { + err = -EPROTO; goto out; + } if (recalculate_partial_csum) tcp_hdr(skb)->check = @@ -1368,8 +1374,10 @@ static int checksum_setup_ipv6(struct xenvif *vif, struct sk_buff *skb, goto out; if (!skb_partial_csum_set(skb, off, - offsetof(struct udphdr, check))) + offsetof(struct udphdr, check))) { + err = -EPROTO; goto out; + } if (recalculate_partial_csum) udp_hdr(skb)->check = -- cgit v1.2.3-70-g09d2 From ac3d5ac277352fe6e27809286768e9f1f8aa388d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 23 Dec 2013 09:27:17 +0000 Subject: xen-netback: fix guest-receive-side array sizes The sizes chosen for the metadata and grant_copy_op arrays on the guest receive size are wrong; - The meta array is needlessly twice the ring size, when we only ever consume a single array element per RX ring slot - The grant_copy_op array is way too small. It's sized based on a bogus assumption: that at most two copy ops will be used per ring slot. This may have been true at some point in the past but it's clear from looking at start_new_rx_buffer() that a new ring slot is only consumed if a frag would overflow the current slot (plus some other conditions) so the actual limit is MAX_SKB_FRAGS grant_copy_ops per ring slot. This patch fixes those two sizing issues and, because grant_copy_ops grows so much, it pulls it out into a separate chunk of vmalloc()ed memory. Signed-off-by: Paul Durrant Acked-by: Wei Liu Cc: Ian Campbell Cc: David Vrabel Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 19 +++++++++++++------ drivers/net/xen-netback/interface.c | 10 ++++++++++ drivers/net/xen-netback/netback.c | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 08ae01b41c8..c47794b9d42 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -101,6 +101,13 @@ struct xenvif_rx_meta { #define MAX_PENDING_REQS 256 +/* It's possible for an skb to have a maximal number of frags + * but still be less than MAX_BUFFER_OFFSET in size. Thus the + * worst-case number of copy operations is MAX_SKB_FRAGS per + * ring slot. + */ +#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE) + struct xenvif { /* Unique identifier for this interface. */ domid_t domid; @@ -143,13 +150,13 @@ struct xenvif { */ RING_IDX rx_req_cons_peek; - /* Given MAX_BUFFER_OFFSET of 4096 the worst case is that each - * head/fragment page uses 2 copy operations because it - * straddles two buffers in the frontend. - */ - struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE]; - struct xenvif_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE]; + /* This array is allocated seperately as it is large */ + struct gnttab_copy *grant_copy_op; + /* We create one meta structure per ring request we consume, so + * the maximum number is the same as the ring size. + */ + struct xenvif_rx_meta meta[XEN_NETIF_RX_RING_SIZE]; u8 fe_dev_addr[6]; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 870f1fa5837..34ca4e58a43 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -307,6 +307,15 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid, SET_NETDEV_DEV(dev, parent); vif = netdev_priv(dev); + + vif->grant_copy_op = vmalloc(sizeof(struct gnttab_copy) * + MAX_GRANT_COPY_OPS); + if (vif->grant_copy_op == NULL) { + pr_warn("Could not allocate grant copy space for %s\n", name); + free_netdev(dev); + return ERR_PTR(-ENOMEM); + } + vif->domid = domid; vif->handle = handle; vif->can_sg = 1; @@ -487,6 +496,7 @@ void xenvif_free(struct xenvif *vif) unregister_netdev(vif->dev); + vfree(vif->grant_copy_op); free_netdev(vif->dev); module_put(THIS_MODULE); diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 7b4fd93be76..78425554a53 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -608,7 +608,7 @@ void xenvif_rx_action(struct xenvif *vif) if (!npo.copy_prod) return; - BUG_ON(npo.copy_prod > ARRAY_SIZE(vif->grant_copy_op)); + BUG_ON(npo.copy_prod > MAX_GRANT_COPY_OPS); gnttab_batch_copy(vif->grant_copy_op, npo.copy_prod); while ((skb = __skb_dequeue(&rxq)) != NULL) { -- cgit v1.2.3-70-g09d2 From f35f76ee76df008131bbe01a2297de0c55ee2297 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Sun, 5 Jan 2014 10:24:01 -0500 Subject: xen-netback: Include header for vmalloc Commit ac3d5ac27735 ("xen-netback: fix guest-receive-side array sizes") added calls to vmalloc and vfree in the interface.c file without including . This causes build failures if the -Werror=implicit-function-declaration flag is passed. Signed-off-by: Josh Boyer Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/interface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/xen-netback') diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 34ca4e58a43..fff8cddfed8 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3-70-g09d2