From 8f13dd9612286cc0d38d32ff9543763b7c74f6a5 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:23 +0000
Subject: xen-netback: Use skb->cb for pending_idx

Storing the pending_idx at the first byte of the linear buffer never looked
good, skb->cb is a more proper place for this. It also prevents the header to
be directly grant copied there, and we don't have the pending_idx after we
copied the header here, so it's time to change it.
It also introduces helpers for the RX side

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 42 +++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index e5284bca2d9..43ae4bad50c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -455,10 +455,12 @@ static void xenvif_add_frag_responses(struct xenvif *vif, int status,
 	}
 }
 
-struct skb_cb_overlay {
+struct xenvif_rx_cb {
 	int meta_slots_used;
 };
 
+#define XENVIF_RX_CB(skb) ((struct xenvif_rx_cb *)(skb)->cb)
+
 void xenvif_kick_thread(struct xenvif *vif)
 {
 	wake_up(&vif->wq);
@@ -474,7 +476,6 @@ static void xenvif_rx_action(struct xenvif *vif)
 	LIST_HEAD(notify);
 	int ret;
 	unsigned long offset;
-	struct skb_cb_overlay *sco;
 	bool need_to_notify = false;
 
 	struct netrx_pending_operations npo = {
@@ -513,9 +514,8 @@ static void xenvif_rx_action(struct xenvif *vif)
 		} else
 			vif->rx_last_skb_slots = 0;
 
-		sco = (struct skb_cb_overlay *)skb->cb;
-		sco->meta_slots_used = xenvif_gop_skb(skb, &npo);
-		BUG_ON(sco->meta_slots_used > max_slots_needed);
+		XENVIF_RX_CB(skb)->meta_slots_used = xenvif_gop_skb(skb, &npo);
+		BUG_ON(XENVIF_RX_CB(skb)->meta_slots_used > max_slots_needed);
 
 		__skb_queue_tail(&rxq, skb);
 	}
@@ -529,7 +529,6 @@ static void xenvif_rx_action(struct xenvif *vif)
 	gnttab_batch_copy(vif->grant_copy_op, npo.copy_prod);
 
 	while ((skb = __skb_dequeue(&rxq)) != NULL) {
-		sco = (struct skb_cb_overlay *)skb->cb;
 
 		if ((1 << vif->meta[npo.meta_cons].gso_type) &
 		    vif->gso_prefix_mask) {
@@ -540,19 +539,21 @@ static void xenvif_rx_action(struct xenvif *vif)
 
 			resp->offset = vif->meta[npo.meta_cons].gso_size;
 			resp->id = vif->meta[npo.meta_cons].id;
-			resp->status = sco->meta_slots_used;
+			resp->status = XENVIF_RX_CB(skb)->meta_slots_used;
 
 			npo.meta_cons++;
-			sco->meta_slots_used--;
+			XENVIF_RX_CB(skb)->meta_slots_used--;
 		}
 
 
 		vif->dev->stats.tx_bytes += skb->len;
 		vif->dev->stats.tx_packets++;
 
-		status = xenvif_check_gop(vif, sco->meta_slots_used, &npo);
+		status = xenvif_check_gop(vif,
+					  XENVIF_RX_CB(skb)->meta_slots_used,
+					  &npo);
 
-		if (sco->meta_slots_used == 1)
+		if (XENVIF_RX_CB(skb)->meta_slots_used == 1)
 			flags = 0;
 		else
 			flags = XEN_NETRXF_more_data;
@@ -589,13 +590,13 @@ static void xenvif_rx_action(struct xenvif *vif)
 
 		xenvif_add_frag_responses(vif, status,
 					  vif->meta + npo.meta_cons + 1,
-					  sco->meta_slots_used);
+					  XENVIF_RX_CB(skb)->meta_slots_used);
 
 		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->rx, ret);
 
 		need_to_notify |= !!ret;
 
-		npo.meta_cons += sco->meta_slots_used;
+		npo.meta_cons += XENVIF_RX_CB(skb)->meta_slots_used;
 		dev_kfree_skb(skb);
 	}
 
@@ -772,6 +773,13 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
 	return page;
 }
 
+
+struct xenvif_tx_cb {
+	u16 pending_idx;
+};
+
+#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
+
 static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
 					       struct sk_buff *skb,
 					       struct xen_netif_tx_request *txp,
@@ -779,7 +787,7 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
-	u16 pending_idx = *((u16 *)skb->data);
+	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 	u16 head_idx = 0;
 	int slot, start;
 	struct page *page;
@@ -897,7 +905,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 			       struct gnttab_copy **gopp)
 {
 	struct gnttab_copy *gop = *gopp;
-	u16 pending_idx = *((u16 *)skb->data);
+	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
@@ -944,7 +952,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 			continue;
 
 		/* First error: invalidate header and preceding fragments. */
-		pending_idx = *((u16 *)skb->data);
+		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
@@ -1236,7 +1244,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 		memcpy(&vif->pending_tx_info[pending_idx].req,
 		       &txreq, sizeof(txreq));
 		vif->pending_tx_info[pending_idx].head = index;
-		*((u16 *)skb->data) = pending_idx;
+		XENVIF_TX_CB(skb)->pending_idx = pending_idx;
 
 		__skb_put(skb, data_len);
 
@@ -1283,7 +1291,7 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		u16 pending_idx;
 		unsigned data_len;
 
-		pending_idx = *((u16 *)skb->data);
+		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 		txp = &vif->pending_tx_info[pending_idx].req;
 
 		/* Check the remap error code. */
-- 
cgit v1.2.3-70-g09d2


From 121fa4b77775549c3c5eb41eb335d7dcbb801f90 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:24 +0000
Subject: xen-netback: Minor refactoring of netback code

This patch contains a few bits of refactoring before introducing the grant
mapping changes:
- introducing xenvif_tx_pending_slots_available(), as this is used several
  times, and will be used more often
- rename the thread to vifX.Y-guest-rx, to signify it does RX work from the
  guest point of view

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    | 23 ++++++++++++++++++++++-
 drivers/net/xen-netback/interface.c |  4 ++--
 drivers/net/xen-netback/netback.c   | 22 +++-------------------
 3 files changed, 27 insertions(+), 22 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index ae413a2cbee..9d3584545e5 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -108,6 +108,15 @@ struct xenvif_rx_meta {
  */
 #define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
 
+#define NETBACK_INVALID_HANDLE -1
+
+/* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
+ * the maximum slots a valid packet can use. Now this value is defined
+ * to be XEN_NETIF_NR_SLOTS_MIN, which is supposed to be supported by
+ * all backend.
+ */
+#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN
+
 struct xenvif {
 	/* Unique identifier for this interface. */
 	domid_t          domid;
@@ -216,7 +225,7 @@ void xenvif_carrier_off(struct xenvif *vif);
 
 int xenvif_tx_action(struct xenvif *vif, int budget);
 
-int xenvif_kthread(void *data);
+int xenvif_kthread_guest_rx(void *data);
 void xenvif_kick_thread(struct xenvif *vif);
 
 /* Determine whether the needed number of slots (req) are available,
@@ -226,6 +235,18 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
 
 void xenvif_stop_queue(struct xenvif *vif);
 
+static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
+{
+	return MAX_PENDING_REQS -
+		vif->pending_prod + vif->pending_cons;
+}
+
+static inline bool xenvif_tx_pending_slots_available(struct xenvif *vif)
+{
+	return nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
+		< MAX_PENDING_REQS;
+}
+
 extern bool separate_tx_rx_irq;
 
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 7669d49a67e..bc32627a22c 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -421,8 +421,8 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 		disable_irq(vif->rx_irq);
 	}
 
-	task = kthread_create(xenvif_kthread,
-			      (void *)vif, "%s", vif->dev->name);
+	task = kthread_create(xenvif_kthread_guest_rx,
+			      (void *)vif, "%s-guest-rx", vif->dev->name);
 	if (IS_ERR(task)) {
 		pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
 		err = PTR_ERR(task);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 43ae4bad50c..715d810124e 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -62,14 +62,6 @@ module_param(separate_tx_rx_irq, bool, 0644);
 static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
 module_param(fatal_skb_slots, uint, 0444);
 
-/*
- * To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
- * the maximum slots a valid packet can use. Now this value is defined
- * to be XEN_NETIF_NR_SLOTS_MIN, which is supposed to be supported by
- * all backend.
- */
-#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN
-
 /*
  * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
  * one or more merged tx requests, otherwise it is the continuation of
@@ -131,12 +123,6 @@ static inline pending_ring_idx_t pending_index(unsigned i)
 	return i & (MAX_PENDING_REQS-1);
 }
 
-static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
-{
-	return MAX_PENDING_REQS -
-		vif->pending_prod + vif->pending_cons;
-}
-
 bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
 {
 	RING_IDX prod, cons;
@@ -1116,8 +1102,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 	struct sk_buff *skb;
 	int ret;
 
-	while ((nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
-		< MAX_PENDING_REQS) &&
+	while (xenvif_tx_pending_slots_available(vif) &&
 	       (skb_queue_len(&vif->tx_queue) < budget)) {
 		struct xen_netif_tx_request txreq;
 		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
@@ -1487,8 +1472,7 @@ static inline int tx_work_todo(struct xenvif *vif)
 {
 
 	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) &&
-	    (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
-	     < MAX_PENDING_REQS))
+	    xenvif_tx_pending_slots_available(vif))
 		return 1;
 
 	return 0;
@@ -1551,7 +1535,7 @@ static void xenvif_start_queue(struct xenvif *vif)
 		netif_wake_queue(vif->dev);
 }
 
-int xenvif_kthread(void *data)
+int xenvif_kthread_guest_rx(void *data)
 {
 	struct xenvif *vif = data;
 	struct sk_buff *skb;
-- 
cgit v1.2.3-70-g09d2


From 3e2234b3149f66bc4be2343a3a0f637d922e4a36 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:25 +0000
Subject: xen-netback: Handle foreign mapped pages on the guest RX path

RX path need to know if the SKB fragments are stored on pages from another
domain.
Logically this patch should be after introducing the grant mapping itself, as
it makes sense only after that. But to keep bisectability, I moved it here. It
shouldn't change any functionality here. xenvif_zerocopy_callback and
ubuf_to_vif are just stubs here, they will be introduced properly later on.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h  |  3 +++
 drivers/net/xen-netback/netback.c | 48 +++++++++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 9d3584545e5..8f264df8818 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -247,6 +247,9 @@ static inline bool xenvif_tx_pending_slots_available(struct xenvif *vif)
 		< MAX_PENDING_REQS;
 }
 
+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
 extern bool separate_tx_rx_irq;
 
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 715d810124e..e9391badfa4 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -101,6 +101,10 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif,
 	return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx));
 }
 
+static inline struct xenvif* ubuf_to_vif(struct ubuf_info *ubuf)
+{
+	return NULL;
+}
 /* This is a miniumum size for the linear area to avoid lots of
  * calls to __pskb_pull_tail() as we set up checksum offsets. The
  * value 128 was chosen as it covers all IPv4 and most likely
@@ -221,7 +225,9 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif,
 static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 				 struct netrx_pending_operations *npo,
 				 struct page *page, unsigned long size,
-				 unsigned long offset, int *head)
+				 unsigned long offset, int *head,
+				 struct xenvif *foreign_vif,
+				 grant_ref_t foreign_gref)
 {
 	struct gnttab_copy *copy_gop;
 	struct xenvif_rx_meta *meta;
@@ -263,8 +269,15 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 		copy_gop->flags = GNTCOPY_dest_gref;
 		copy_gop->len = bytes;
 
-		copy_gop->source.domid = DOMID_SELF;
-		copy_gop->source.u.gmfn = virt_to_mfn(page_address(page));
+		if (foreign_vif) {
+			copy_gop->source.domid = foreign_vif->domid;
+			copy_gop->source.u.ref = foreign_gref;
+			copy_gop->flags |= GNTCOPY_source_gref;
+		} else {
+			copy_gop->source.domid = DOMID_SELF;
+			copy_gop->source.u.gmfn =
+				virt_to_mfn(page_address(page));
+		}
 		copy_gop->source.offset = offset;
 
 		copy_gop->dest.domid = vif->domid;
@@ -325,6 +338,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	int old_meta_prod;
 	int gso_type;
 	int gso_size;
+	struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
+	grant_ref_t foreign_grefs[MAX_SKB_FRAGS];
+	struct xenvif *foreign_vif = NULL;
 
 	old_meta_prod = npo->meta_prod;
 
@@ -365,6 +381,19 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	npo->copy_off = 0;
 	npo->copy_gref = req->gref;
 
+	if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
+		 (ubuf->callback == &xenvif_zerocopy_callback)) {
+		int i = 0;
+		foreign_vif = ubuf_to_vif(ubuf);
+
+		do {
+			u16 pending_idx = ubuf->desc;
+			foreign_grefs[i++] =
+				foreign_vif->pending_tx_info[pending_idx].req.gref;
+			ubuf = (struct ubuf_info *) ubuf->ctx;
+		} while (ubuf);
+	}
+
 	data = skb->data;
 	while (data < skb_tail_pointer(skb)) {
 		unsigned int offset = offset_in_page(data);
@@ -374,7 +403,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 			len = skb_tail_pointer(skb) - data;
 
 		xenvif_gop_frag_copy(vif, skb, npo,
-				     virt_to_page(data), len, offset, &head);
+				     virt_to_page(data), len, offset, &head,
+				     NULL,
+				     0);
 		data += len;
 	}
 
@@ -383,7 +414,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 				     skb_frag_page(&skb_shinfo(skb)->frags[i]),
 				     skb_frag_size(&skb_shinfo(skb)->frags[i]),
 				     skb_shinfo(skb)->frags[i].page_offset,
-				     &head);
+				     &head,
+				     foreign_vif,
+				     foreign_grefs[i]);
 	}
 
 	return npo->meta_prod - old_meta_prod;
@@ -1351,6 +1384,11 @@ static int xenvif_tx_submit(struct xenvif *vif)
 	return work_done;
 }
 
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
+{
+	return;
+}
+
 /* Called after netfront has transmitted */
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
-- 
cgit v1.2.3-70-g09d2


From f53c3fe8dad725b014e9c7682720d8e3e2a8a5b3 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:26 +0000
Subject: xen-netback: Introduce TX grant mapping

This patch introduces grant mapping on netback TX path. It replaces grant copy
operations, ditching grant copy coalescing along the way. Another solution for
copy coalescing is introduced in "xen-netback: Handle guests with too many
frags", older guests and Windows can broke before that patch applies.
There is a callback (xenvif_zerocopy_callback) from core stack to release the
slots back to the guests when kfree_skb or skb_orphan_frags called. It feeds a
separate dealloc thread, as scheduling NAPI instance from there is inefficient,
therefore we can't do dealloc from the instance.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  39 +++-
 drivers/net/xen-netback/interface.c |  65 +++++-
 drivers/net/xen-netback/netback.c   | 432 +++++++++++++++++++++++-------------
 3 files changed, 371 insertions(+), 165 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 8f264df8818..5a991266a39 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -79,6 +79,17 @@ struct pending_tx_info {
 				  * if it is head of one or more tx
 				  * reqs
 				  */
+	/* Callback data for released SKBs. The callback is always
+	 * xenvif_zerocopy_callback, desc contains the pending_idx, which is
+	 * also an index in pending_tx_info array. It is initialized in
+	 * xenvif_alloc and it never changes.
+	 * skb_shinfo(skb)->destructor_arg points to the first mapped slot's
+	 * callback_struct in this array of struct pending_tx_info's, then ctx
+	 * to the next, or NULL if there is no more slot for this skb.
+	 * ubuf_to_vif is a helper which finds the struct xenvif from a pointer
+	 * to this field.
+	 */
+	struct ubuf_info callback_struct;
 };
 
 #define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
@@ -135,13 +146,31 @@ struct xenvif {
 	pending_ring_idx_t pending_cons;
 	u16 pending_ring[MAX_PENDING_REQS];
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
 	/* Coalescing tx requests before copying makes number of grant
 	 * copy ops greater or equal to number of slots required. In
 	 * worst case a tx request consumes 2 gnttab_copy.
 	 */
 	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
-
+	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+	/* passed to gnttab_[un]map_refs with pages under (un)mapping */
+	struct page *pages_to_map[MAX_PENDING_REQS];
+	struct page *pages_to_unmap[MAX_PENDING_REQS];
+
+	/* This prevents zerocopy callbacks  to race over dealloc_ring */
+	spinlock_t callback_lock;
+	/* This prevents dealloc thread and NAPI instance to race over response
+	 * creation and pending_ring in xenvif_idx_release. In xenvif_tx_err
+	 * it only protect response creation
+	 */
+	spinlock_t response_lock;
+	pending_ring_idx_t dealloc_prod;
+	pending_ring_idx_t dealloc_cons;
+	u16 dealloc_ring[MAX_PENDING_REQS];
+	struct task_struct *dealloc_task;
+	wait_queue_head_t dealloc_wq;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
@@ -228,6 +257,8 @@ int xenvif_tx_action(struct xenvif *vif, int budget);
 int xenvif_kthread_guest_rx(void *data);
 void xenvif_kick_thread(struct xenvif *vif);
 
+int xenvif_dealloc_kthread(void *data);
+
 /* Determine whether the needed number of slots (req) are available,
  * and set req_event if not.
  */
@@ -235,6 +266,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
 
 void xenvif_stop_queue(struct xenvif *vif);
 
+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
+/* Unmap a pending page and release it back to the guest */
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
+
 static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
 {
 	return MAX_PENDING_REQS -
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index bc32627a22c..1fe9fe523cc 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -38,6 +38,7 @@
 
 #include <xen/events.h>
 #include <asm/xen/hypercall.h>
+#include <xen/balloon.h>
 
 #define XENVIF_QUEUE_LENGTH 32
 #define XENVIF_NAPI_WEIGHT  64
@@ -87,7 +88,8 @@ static int xenvif_poll(struct napi_struct *napi, int budget)
 		local_irq_save(flags);
 
 		RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do);
-		if (!more_to_do)
+		if (!(more_to_do &&
+		      xenvif_tx_pending_slots_available(vif)))
 			__napi_complete(napi);
 
 		local_irq_restore(flags);
@@ -121,7 +123,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	BUG_ON(skb->dev != dev);
 
 	/* Drop the packet if vif is not ready */
-	if (vif->task == NULL || !xenvif_schedulable(vif))
+	if (vif->task == NULL ||
+	    vif->dealloc_task == NULL ||
+	    !xenvif_schedulable(vif))
 		goto drop;
 
 	/* At best we'll need one slot for the header and one for each
@@ -343,8 +347,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	vif->pending_prod = MAX_PENDING_REQS;
 	for (i = 0; i < MAX_PENDING_REQS; i++)
 		vif->pending_ring[i] = i;
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		vif->mmap_pages[i] = NULL;
+	spin_lock_init(&vif->callback_lock);
+	spin_lock_init(&vif->response_lock);
+	/* If ballooning is disabled, this will consume real memory, so you
+	 * better enable it. The long term solution would be to use just a
+	 * bunch of valid page descriptors, without dependency on ballooning
+	 */
+	err = alloc_xenballooned_pages(MAX_PENDING_REQS,
+				       vif->mmap_pages,
+				       false);
+	if (err) {
+		netdev_err(dev, "Could not reserve mmap_pages\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	for (i = 0; i < MAX_PENDING_REQS; i++) {
+		vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
+			{ .callback = xenvif_zerocopy_callback,
+			  .ctx = NULL,
+			  .desc = i };
+		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
+	}
 
 	/*
 	 * Initialise a dummy MAC address. We choose the numerically
@@ -382,12 +404,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 
 	BUG_ON(vif->tx_irq);
 	BUG_ON(vif->task);
+	BUG_ON(vif->dealloc_task);
 
 	err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
 	if (err < 0)
 		goto err;
 
 	init_waitqueue_head(&vif->wq);
+	init_waitqueue_head(&vif->dealloc_wq);
 
 	if (tx_evtchn == rx_evtchn) {
 		/* feature-split-event-channels == 0 */
@@ -431,6 +455,16 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 
 	vif->task = task;
 
+	task = kthread_create(xenvif_dealloc_kthread,
+			      (void *)vif, "%s-dealloc", vif->dev->name);
+	if (IS_ERR(task)) {
+		pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
+		err = PTR_ERR(task);
+		goto err_rx_unbind;
+	}
+
+	vif->dealloc_task = task;
+
 	rtnl_lock();
 	if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN)
 		dev_set_mtu(vif->dev, ETH_DATA_LEN);
@@ -441,6 +475,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 	rtnl_unlock();
 
 	wake_up_process(vif->task);
+	wake_up_process(vif->dealloc_task);
 
 	return 0;
 
@@ -478,6 +513,11 @@ void xenvif_disconnect(struct xenvif *vif)
 		vif->task = NULL;
 	}
 
+	if (vif->dealloc_task) {
+		kthread_stop(vif->dealloc_task);
+		vif->dealloc_task = NULL;
+	}
+
 	if (vif->tx_irq) {
 		if (vif->tx_irq == vif->rx_irq)
 			unbind_from_irqhandler(vif->tx_irq, vif);
@@ -493,6 +533,23 @@ void xenvif_disconnect(struct xenvif *vif)
 
 void xenvif_free(struct xenvif *vif)
 {
+	int i, unmap_timeout = 0;
+
+	for (i = 0; i < MAX_PENDING_REQS; ++i) {
+		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
+			unmap_timeout++;
+			schedule_timeout(msecs_to_jiffies(1000));
+			if (unmap_timeout > 9 &&
+			    net_ratelimit())
+				netdev_err(vif->dev,
+					   "Page still granted! Index: %x\n",
+					   i);
+			i = -1;
+		}
+	}
+
+	free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
+
 	netif_napi_del(&vif->napi);
 
 	unregister_netdev(vif->dev);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index e9391badfa4..cb29134147d 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -101,10 +101,18 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif,
 	return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx));
 }
 
+/* Find the containing VIF's structure from a pointer in pending_tx_info array
+ */
 static inline struct xenvif* ubuf_to_vif(struct ubuf_info *ubuf)
 {
-	return NULL;
+	u16 pending_idx = ubuf->desc;
+	struct pending_tx_info *temp =
+		container_of(ubuf, struct pending_tx_info, callback_struct);
+	return container_of(temp - pending_idx,
+			    struct xenvif,
+			    pending_tx_info[0]);
 }
+
 /* This is a miniumum size for the linear area to avoid lots of
  * calls to __pskb_pull_tail() as we set up checksum offsets. The
  * value 128 was chosen as it covers all IPv4 and most likely
@@ -665,9 +673,12 @@ static void xenvif_tx_err(struct xenvif *vif,
 			  struct xen_netif_tx_request *txp, RING_IDX end)
 {
 	RING_IDX cons = vif->tx.req_cons;
+	unsigned long flags;
 
 	do {
+		spin_lock_irqsave(&vif->response_lock, flags);
 		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+		spin_unlock_irqrestore(&vif->response_lock, flags);
 		if (cons == end)
 			break;
 		txp = RING_GET_REQUEST(&vif->tx, cons++);
@@ -799,10 +810,24 @@ struct xenvif_tx_cb {
 
 #define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
 
-static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
-					       struct sk_buff *skb,
-					       struct xen_netif_tx_request *txp,
-					       struct gnttab_copy *gop)
+static inline void xenvif_tx_create_gop(struct xenvif *vif,
+					u16 pending_idx,
+					struct xen_netif_tx_request *txp,
+					struct gnttab_map_grant_ref *gop)
+{
+	vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
+	gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
+			  GNTMAP_host_map | GNTMAP_readonly,
+			  txp->gref, vif->domid);
+
+	memcpy(&vif->pending_tx_info[pending_idx].req, txp,
+	       sizeof(*txp));
+}
+
+static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
+							struct sk_buff *skb,
+							struct xen_netif_tx_request *txp,
+							struct gnttab_map_grant_ref *gop)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
@@ -823,83 +848,12 @@ static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
-	/* Coalesce tx requests, at this point the packet passed in
-	 * should be <= 64K. Any packets larger than 64K have been
-	 * handled in xenvif_count_requests().
-	 */
-	for (shinfo->nr_frags = slot = start; slot < nr_slots;
-	     shinfo->nr_frags++) {
-		struct pending_tx_info *pending_tx_info =
-			vif->pending_tx_info;
-
-		page = alloc_page(GFP_ATOMIC|__GFP_COLD);
-		if (!page)
-			goto err;
-
-		dst_offset = 0;
-		first = NULL;
-		while (dst_offset < PAGE_SIZE && slot < nr_slots) {
-			gop->flags = GNTCOPY_source_gref;
-
-			gop->source.u.ref = txp->gref;
-			gop->source.domid = vif->domid;
-			gop->source.offset = txp->offset;
-
-			gop->dest.domid = DOMID_SELF;
-
-			gop->dest.offset = dst_offset;
-			gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-
-			if (dst_offset + txp->size > PAGE_SIZE) {
-				/* This page can only merge a portion
-				 * of tx request. Do not increment any
-				 * pointer / counter here. The txp
-				 * will be dealt with in future
-				 * rounds, eventually hitting the
-				 * `else` branch.
-				 */
-				gop->len = PAGE_SIZE - dst_offset;
-				txp->offset += gop->len;
-				txp->size -= gop->len;
-				dst_offset += gop->len; /* quit loop */
-			} else {
-				/* This tx request can be merged in the page */
-				gop->len = txp->size;
-				dst_offset += gop->len;
-
+	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
+	     shinfo->nr_frags++, txp++, gop++) {
 				index = pending_index(vif->pending_cons++);
-
 				pending_idx = vif->pending_ring[index];
-
-				memcpy(&pending_tx_info[pending_idx].req, txp,
-				       sizeof(*txp));
-
-				/* Poison these fields, corresponding
-				 * fields for head tx req will be set
-				 * to correct values after the loop.
-				 */
-				vif->mmap_pages[pending_idx] = (void *)(~0UL);
-				pending_tx_info[pending_idx].head =
-					INVALID_PENDING_RING_IDX;
-
-				if (!first) {
-					first = &pending_tx_info[pending_idx];
-					start_idx = index;
-					head_idx = pending_idx;
-				}
-
-				txp++;
-				slot++;
-			}
-
-			gop++;
-		}
-
-		first->req.offset = 0;
-		first->req.size = dst_offset;
-		first->head = start_idx;
-		vif->mmap_pages[head_idx] = page;
-		frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
+		xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
 	}
 
 	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
@@ -919,11 +873,38 @@ err:
 	return NULL;
 }
 
+static inline void xenvif_grant_handle_set(struct xenvif *vif,
+					   u16 pending_idx,
+					   grant_handle_t handle)
+{
+	if (unlikely(vif->grant_tx_handle[pending_idx] !=
+		     NETBACK_INVALID_HANDLE)) {
+		netdev_err(vif->dev,
+			   "Trying to overwrite active handle! pending_idx: %x\n",
+			   pending_idx);
+		BUG();
+	}
+	vif->grant_tx_handle[pending_idx] = handle;
+}
+
+static inline void xenvif_grant_handle_reset(struct xenvif *vif,
+					     u16 pending_idx)
+{
+	if (unlikely(vif->grant_tx_handle[pending_idx] ==
+		     NETBACK_INVALID_HANDLE)) {
+		netdev_err(vif->dev,
+			   "Trying to unmap invalid handle! pending_idx: %x\n",
+			   pending_idx);
+		BUG();
+	}
+	vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
+}
+
 static int xenvif_tx_check_gop(struct xenvif *vif,
 			       struct sk_buff *skb,
-			       struct gnttab_copy **gopp)
+			       struct gnttab_map_grant_ref **gopp)
 {
-	struct gnttab_copy *gop = *gopp;
+	struct gnttab_map_grant_ref *gop = *gopp;
 	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct pending_tx_info *tx_info;
@@ -935,6 +916,8 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	err = gop->status;
 	if (unlikely(err))
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
+	else
+		xenvif_grant_handle_set(vif, pending_idx , gop->handle);
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
@@ -948,18 +931,13 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 		head = tx_info->head;
 
 		/* Check error status: if okay then remember grant handle. */
-		do {
 			newerr = (++gop)->status;
-			if (newerr)
-				break;
-			peek = vif->pending_ring[pending_index(++head)];
-		} while (!pending_tx_is_head(vif, peek));
 
 		if (likely(!newerr)) {
+			xenvif_grant_handle_set(vif, pending_idx , gop->handle);
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
-				xenvif_idx_release(vif, pending_idx,
-						   XEN_NETIF_RSP_OKAY);
+				xenvif_idx_unmap(vif, pending_idx);
 			continue;
 		}
 
@@ -972,11 +950,10 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 
 		/* First error: invalidate header and preceding fragments. */
 		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
+		xenvif_idx_unmap(vif, pending_idx);
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
-			xenvif_idx_release(vif, pending_idx,
-					   XEN_NETIF_RSP_OKAY);
+			xenvif_idx_unmap(vif, pending_idx);
 		}
 
 		/* Remember the error: invalidate all subsequent fragments. */
@@ -992,6 +969,10 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
 	int i;
+	u16 prev_pending_idx = INVALID_PENDING_IDX;
+
+	if (skb_shinfo(skb)->destructor_arg)
+		prev_pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 
 	for (i = 0; i < nr_frags; i++) {
 		skb_frag_t *frag = shinfo->frags + i;
@@ -1001,6 +982,17 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 
 		pending_idx = frag_get_pending_idx(frag);
 
+		/* If this is not the first frag, chain it to the previous*/
+		if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
+			skb_shinfo(skb)->destructor_arg =
+				&vif->pending_tx_info[pending_idx].callback_struct;
+		else if (likely(pending_idx != prev_pending_idx))
+			vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
+				&(vif->pending_tx_info[pending_idx].callback_struct);
+
+		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+		prev_pending_idx = pending_idx;
+
 		txp = &vif->pending_tx_info[pending_idx].req;
 		page = virt_to_page(idx_to_kaddr(vif, pending_idx));
 		__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
@@ -1008,10 +1000,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 		skb->data_len += txp->size;
 		skb->truesize += txp->size;
 
-		/* Take an extra reference to offset xenvif_idx_release */
+		/* Take an extra reference to offset network stack's put_page */
 		get_page(vif->mmap_pages[pending_idx]);
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
 	}
+	/* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
+	 * overlaps with "index", and "mapping" is not set. I think mapping
+	 * should be set. If delivered to local stack, it would drop this
+	 * skb in sk_filter unless the socket has the right to use it.
+	 */
+	skb->pfmemalloc	= false;
 }
 
 static int xenvif_get_extras(struct xenvif *vif,
@@ -1131,7 +1128,7 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
 
 static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 {
-	struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop;
+	struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
 	struct sk_buff *skb;
 	int ret;
 
@@ -1238,30 +1235,10 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			}
 		}
 
-		/* XXX could copy straight to head */
-		page = xenvif_alloc_page(vif, pending_idx);
-		if (!page) {
-			kfree_skb(skb);
-			xenvif_tx_err(vif, &txreq, idx);
-			break;
-		}
-
-		gop->source.u.ref = txreq.gref;
-		gop->source.domid = vif->domid;
-		gop->source.offset = txreq.offset;
-
-		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
-		gop->dest.domid = DOMID_SELF;
-		gop->dest.offset = txreq.offset;
-
-		gop->len = txreq.size;
-		gop->flags = GNTCOPY_source_gref;
+		xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);
 
 		gop++;
 
-		memcpy(&vif->pending_tx_info[pending_idx].req,
-		       &txreq, sizeof(txreq));
-		vif->pending_tx_info[pending_idx].head = index;
 		XENVIF_TX_CB(skb)->pending_idx = pending_idx;
 
 		__skb_put(skb, data_len);
@@ -1290,17 +1267,17 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 
 		vif->tx.req_cons = idx;
 
-		if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops))
+		if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
 			break;
 	}
 
-	return gop - vif->tx_copy_ops;
+	return gop - vif->tx_map_ops;
 }
 
 
 static int xenvif_tx_submit(struct xenvif *vif)
 {
-	struct gnttab_copy *gop = vif->tx_copy_ops;
+	struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
 	struct sk_buff *skb;
 	int work_done = 0;
 
@@ -1324,14 +1301,17 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		memcpy(skb->data,
 		       (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
 		       data_len);
+		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
 			txp->offset += data_len;
 			txp->size -= data_len;
+			skb_shinfo(skb)->destructor_arg =
+				&vif->pending_tx_info[pending_idx].callback_struct;
 		} else {
 			/* Schedule a response immediately. */
-			xenvif_idx_release(vif, pending_idx,
-					   XEN_NETIF_RSP_OKAY);
+			skb_shinfo(skb)->destructor_arg = NULL;
+			xenvif_idx_unmap(vif, pending_idx);
 		}
 
 		if (txp->flags & XEN_NETTXF_csum_blank)
@@ -1353,6 +1333,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		if (checksum_setup(vif, skb)) {
 			netdev_dbg(vif->dev,
 				   "Can't setup checksum in net_tx_action\n");
+			/* We have to set this flag to trigger the callback */
+			if (skb_shinfo(skb)->destructor_arg)
+				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 			kfree_skb(skb);
 			continue;
 		}
@@ -1378,6 +1361,14 @@ static int xenvif_tx_submit(struct xenvif *vif)
 
 		work_done++;
 
+		/* Set this flag right before netif_receive_skb, otherwise
+		 * someone might think this packet already left netback, and
+		 * do a skb_copy_ubufs while we are still in control of the
+		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
+		 */
+		if (skb_shinfo(skb)->destructor_arg)
+			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+
 		netif_receive_skb(skb);
 	}
 
@@ -1386,14 +1377,111 @@ static int xenvif_tx_submit(struct xenvif *vif)
 
 void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
 {
-	return;
+	unsigned long flags;
+	pending_ring_idx_t index;
+	struct xenvif *vif = ubuf_to_vif(ubuf);
+
+	/* This is the only place where we grab this lock, to protect callbacks
+	 * from each other.
+	 */
+	spin_lock_irqsave(&vif->callback_lock, flags);
+	do {
+		u16 pending_idx = ubuf->desc;
+		ubuf = (struct ubuf_info *) ubuf->ctx;
+		BUG_ON(vif->dealloc_prod - vif->dealloc_cons >=
+			MAX_PENDING_REQS);
+		index = pending_index(vif->dealloc_prod);
+		vif->dealloc_ring[index] = pending_idx;
+		/* Sync with xenvif_tx_dealloc_action:
+		 * insert idx then incr producer.
+		 */
+		smp_wmb();
+		vif->dealloc_prod++;
+	} while (ubuf);
+	wake_up(&vif->dealloc_wq);
+	spin_unlock_irqrestore(&vif->callback_lock, flags);
+
+	if (RING_HAS_UNCONSUMED_REQUESTS(&vif->tx) &&
+	    xenvif_tx_pending_slots_available(vif)) {
+		local_bh_disable();
+		napi_schedule(&vif->napi);
+		local_bh_enable();
+	}
+}
+
+static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
+{
+	struct gnttab_unmap_grant_ref *gop;
+	pending_ring_idx_t dc, dp;
+	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
+	unsigned int i = 0;
+
+	dc = vif->dealloc_cons;
+	gop = vif->tx_unmap_ops;
+
+	/* Free up any grants we have finished using */
+	do {
+		dp = vif->dealloc_prod;
+
+		/* Ensure we see all indices enqueued by all
+		 * xenvif_zerocopy_callback().
+		 */
+		smp_rmb();
+
+		while (dc != dp) {
+			BUG_ON(gop - vif->tx_unmap_ops > MAX_PENDING_REQS);
+			pending_idx =
+				vif->dealloc_ring[pending_index(dc++)];
+
+			pending_idx_release[gop-vif->tx_unmap_ops] =
+				pending_idx;
+			vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
+				vif->mmap_pages[pending_idx];
+			gnttab_set_unmap_op(gop,
+					    idx_to_kaddr(vif, pending_idx),
+					    GNTMAP_host_map,
+					    vif->grant_tx_handle[pending_idx]);
+			/* Btw. already unmapped? */
+			xenvif_grant_handle_reset(vif, pending_idx);
+			++gop;
+		}
+
+	} while (dp != vif->dealloc_prod);
+
+	vif->dealloc_cons = dc;
+
+	if (gop - vif->tx_unmap_ops > 0) {
+		int ret;
+		ret = gnttab_unmap_refs(vif->tx_unmap_ops,
+					NULL,
+					vif->pages_to_unmap,
+					gop - vif->tx_unmap_ops);
+		if (ret) {
+			netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
+				   gop - vif->tx_unmap_ops, ret);
+			for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
+				if (gop[i].status != GNTST_okay)
+					netdev_err(vif->dev,
+						   " host_addr: %llx handle: %x status: %d\n",
+						   gop[i].host_addr,
+						   gop[i].handle,
+						   gop[i].status);
+			}
+			BUG();
+		}
+	}
+
+	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
+		xenvif_idx_release(vif, pending_idx_release[i],
+				   XEN_NETIF_RSP_OKAY);
 }
 
+
 /* Called after netfront has transmitted */
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
 	unsigned nr_gops;
-	int work_done;
+	int work_done, ret;
 
 	if (unlikely(!tx_work_todo(vif)))
 		return 0;
@@ -1403,7 +1491,11 @@ int xenvif_tx_action(struct xenvif *vif, int budget)
 	if (nr_gops == 0)
 		return 0;
 
-	gnttab_batch_copy(vif->tx_copy_ops, nr_gops);
+	ret = gnttab_map_refs(vif->tx_map_ops,
+			      NULL,
+			      vif->pages_to_map,
+			      nr_gops);
+	BUG_ON(ret);
 
 	work_done = xenvif_tx_submit(vif);
 
@@ -1414,45 +1506,19 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 			       u8 status)
 {
 	struct pending_tx_info *pending_tx_info;
-	pending_ring_idx_t head;
+	pending_ring_idx_t index;
 	u16 peek; /* peek into next tx request */
+	unsigned long flags;
 
-	BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
-
-	/* Already complete? */
-	if (vif->mmap_pages[pending_idx] == NULL)
-		return;
-
-	pending_tx_info = &vif->pending_tx_info[pending_idx];
-
-	head = pending_tx_info->head;
-
-	BUG_ON(!pending_tx_is_head(vif, head));
-	BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx);
-
-	do {
-		pending_ring_idx_t index;
-		pending_ring_idx_t idx = pending_index(head);
-		u16 info_idx = vif->pending_ring[idx];
-
-		pending_tx_info = &vif->pending_tx_info[info_idx];
+		pending_tx_info = &vif->pending_tx_info[pending_idx];
+		spin_lock_irqsave(&vif->response_lock, flags);
 		make_tx_response(vif, &pending_tx_info->req, status);
-
-		/* Setting any number other than
-		 * INVALID_PENDING_RING_IDX indicates this slot is
-		 * starting a new packet / ending a previous packet.
-		 */
-		pending_tx_info->head = 0;
-
-		index = pending_index(vif->pending_prod++);
-		vif->pending_ring[index] = vif->pending_ring[info_idx];
-
-		peek = vif->pending_ring[pending_index(++head)];
-
-	} while (!pending_tx_is_head(vif, peek));
-
-	put_page(vif->mmap_pages[pending_idx]);
-	vif->mmap_pages[pending_idx] = NULL;
+		index = pending_index(vif->pending_prod);
+		vif->pending_ring[index] = pending_idx;
+		/* TX shouldn't use the index before we give it back here */
+		mb();
+		vif->pending_prod++;
+		spin_unlock_irqrestore(&vif->response_lock, flags);
 }
 
 
@@ -1500,6 +1566,25 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
 	return resp;
 }
 
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
+{
+	int ret;
+	struct gnttab_unmap_grant_ref tx_unmap_op;
+
+	gnttab_set_unmap_op(&tx_unmap_op,
+			    idx_to_kaddr(vif, pending_idx),
+			    GNTMAP_host_map,
+			    vif->grant_tx_handle[pending_idx]);
+	/* Btw. already unmapped? */
+	xenvif_grant_handle_reset(vif, pending_idx);
+
+	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
+				&vif->mmap_pages[pending_idx], 1);
+	BUG_ON(ret);
+
+	xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
+}
+
 static inline int rx_work_todo(struct xenvif *vif)
 {
 	return !skb_queue_empty(&vif->rx_queue) &&
@@ -1516,6 +1601,11 @@ static inline int tx_work_todo(struct xenvif *vif)
 	return 0;
 }
 
+static inline bool tx_dealloc_work_todo(struct xenvif *vif)
+{
+	return vif->dealloc_cons != vif->dealloc_prod;
+}
+
 void xenvif_unmap_frontend_rings(struct xenvif *vif)
 {
 	if (vif->tx.sring)
@@ -1602,6 +1692,28 @@ int xenvif_kthread_guest_rx(void *data)
 	return 0;
 }
 
+int xenvif_dealloc_kthread(void *data)
+{
+	struct xenvif *vif = data;
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(vif->dealloc_wq,
+					 tx_dealloc_work_todo(vif) ||
+					 kthread_should_stop());
+		if (kthread_should_stop())
+			break;
+
+		xenvif_tx_dealloc_action(vif);
+		cond_resched();
+	}
+
+	/* Unmap anything remaining*/
+	if (tx_dealloc_work_todo(vif))
+		xenvif_tx_dealloc_action(vif);
+
+	return 0;
+}
+
 static int __init netback_init(void)
 {
 	int rc = 0;
-- 
cgit v1.2.3-70-g09d2


From 62bad3199a4c20505fc36c169deef20b25e17c5f Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:27 +0000
Subject: xen-netback: Remove old TX grant copy definitons and fix indentations

These became obsolete with grant mapping. I've left intentionally the
indentations in this way, to improve readability of previous patches.

NOTE: if bisect brought you here, you should apply the series up until
"xen-netback: Timeout packets in RX path", otherwise Windows guests can't work
properly and malicious guests can block other guests by not releasing their sent
packets.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h  | 36 +-------------------
 drivers/net/xen-netback/netback.c | 72 ++++++++-------------------------------
 2 files changed, 15 insertions(+), 93 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 5a991266a39..49109afa225 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -48,37 +48,8 @@
 typedef unsigned int pending_ring_idx_t;
 #define INVALID_PENDING_RING_IDX (~0U)
 
-/* For the head field in pending_tx_info: it is used to indicate
- * whether this tx info is the head of one or more coalesced requests.
- *
- * When head != INVALID_PENDING_RING_IDX, it means the start of a new
- * tx requests queue and the end of previous queue.
- *
- * An example sequence of head fields (I = INVALID_PENDING_RING_IDX):
- *
- * ...|0 I I I|5 I|9 I I I|...
- * -->|<-INUSE----------------
- *
- * After consuming the first slot(s) we have:
- *
- * ...|V V V V|5 I|9 I I I|...
- * -----FREE->|<-INUSE--------
- *
- * where V stands for "valid pending ring index". Any number other
- * than INVALID_PENDING_RING_IDX is OK. These entries are considered
- * free and can contain any number other than
- * INVALID_PENDING_RING_IDX. In practice we use 0.
- *
- * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the
- * above example) number is the index into pending_tx_info and
- * mmap_pages arrays.
- */
 struct pending_tx_info {
-	struct xen_netif_tx_request req; /* coalesced tx request */
-	pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
-				  * if it is head of one or more tx
-				  * reqs
-				  */
+	struct xen_netif_tx_request req; /* tx request */
 	/* Callback data for released SKBs. The callback is always
 	 * xenvif_zerocopy_callback, desc contains the pending_idx, which is
 	 * also an index in pending_tx_info array. It is initialized in
@@ -148,11 +119,6 @@ struct xenvif {
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
 	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
-	/* Coalescing tx requests before copying makes number of grant
-	 * copy ops greater or equal to number of slots required. In
-	 * worst case a tx request consumes 2 gnttab_copy.
-	 */
-	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
 	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
 	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
 	/* passed to gnttab_[un]map_refs with pages under (un)mapping */
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index cb29134147d..46a75706cb7 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -62,16 +62,6 @@ module_param(separate_tx_rx_irq, bool, 0644);
 static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
 module_param(fatal_skb_slots, uint, 0444);
 
-/*
- * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
- * one or more merged tx requests, otherwise it is the continuation of
- * previous tx request.
- */
-static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx)
-{
-	return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
-}
-
 static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 			       u8 status);
 
@@ -790,19 +780,6 @@ static int xenvif_count_requests(struct xenvif *vif,
 	return slots;
 }
 
-static struct page *xenvif_alloc_page(struct xenvif *vif,
-				      u16 pending_idx)
-{
-	struct page *page;
-
-	page = alloc_page(GFP_ATOMIC|__GFP_COLD);
-	if (!page)
-		return NULL;
-	vif->mmap_pages[pending_idx] = page;
-
-	return page;
-}
-
 
 struct xenvif_tx_cb {
 	u16 pending_idx;
@@ -832,13 +809,9 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
 	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
-	u16 head_idx = 0;
-	int slot, start;
-	struct page *page;
-	pending_ring_idx_t index, start_idx = 0;
-	uint16_t dst_offset;
+	int start;
+	pending_ring_idx_t index;
 	unsigned int nr_slots;
-	struct pending_tx_info *first = NULL;
 
 	/* At this point shinfo->nr_frags is in fact the number of
 	 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
@@ -850,8 +823,8 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 
 	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
 	     shinfo->nr_frags++, txp++, gop++) {
-				index = pending_index(vif->pending_cons++);
-				pending_idx = vif->pending_ring[index];
+		index = pending_index(vif->pending_cons++);
+		pending_idx = vif->pending_ring[index];
 		xenvif_tx_create_gop(vif, pending_idx, txp, gop);
 		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
 	}
@@ -859,18 +832,6 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
 
 	return gop;
-err:
-	/* Unwind, freeing all pages and sending error responses. */
-	while (shinfo->nr_frags-- > start) {
-		xenvif_idx_release(vif,
-				frag_get_pending_idx(&frags[shinfo->nr_frags]),
-				XEN_NETIF_RSP_ERROR);
-	}
-	/* The head too, if necessary. */
-	if (start)
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
-
-	return NULL;
 }
 
 static inline void xenvif_grant_handle_set(struct xenvif *vif,
@@ -910,7 +871,6 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
-	u16 peek; /* peek into next tx request */
 
 	/* Check status of header. */
 	err = gop->status;
@@ -924,14 +884,12 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
-		pending_ring_idx_t head;
 
 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
 		tx_info = &vif->pending_tx_info[pending_idx];
-		head = tx_info->head;
 
 		/* Check error status: if okay then remember grant handle. */
-			newerr = (++gop)->status;
+		newerr = (++gop)->status;
 
 		if (likely(!newerr)) {
 			xenvif_grant_handle_set(vif, pending_idx , gop->handle);
@@ -1136,7 +1094,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 	       (skb_queue_len(&vif->tx_queue) < budget)) {
 		struct xen_netif_tx_request txreq;
 		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
-		struct page *page;
 		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
 		u16 pending_idx;
 		RING_IDX idx;
@@ -1507,18 +1464,17 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 {
 	struct pending_tx_info *pending_tx_info;
 	pending_ring_idx_t index;
-	u16 peek; /* peek into next tx request */
 	unsigned long flags;
 
-		pending_tx_info = &vif->pending_tx_info[pending_idx];
-		spin_lock_irqsave(&vif->response_lock, flags);
-		make_tx_response(vif, &pending_tx_info->req, status);
-		index = pending_index(vif->pending_prod);
-		vif->pending_ring[index] = pending_idx;
-		/* TX shouldn't use the index before we give it back here */
-		mb();
-		vif->pending_prod++;
-		spin_unlock_irqrestore(&vif->response_lock, flags);
+	pending_tx_info = &vif->pending_tx_info[pending_idx];
+	spin_lock_irqsave(&vif->response_lock, flags);
+	make_tx_response(vif, &pending_tx_info->req, status);
+	index = pending_index(vif->pending_prod);
+	vif->pending_ring[index] = pending_idx;
+	/* TX shouldn't use the index before we give it back here */
+	mb();
+	vif->pending_prod++;
+	spin_unlock_irqrestore(&vif->response_lock, flags);
 }
 
 
-- 
cgit v1.2.3-70-g09d2


From 1bb332af4cd889e4b64dacbf4a793ceb3a70445d Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:28 +0000
Subject: xen-netback: Add stat counters for zerocopy

These counters help determine how often the buffers had to be copied. Also
they help find out if packets are leaked, as if "sent != success + fail",
there are probably packets never freed up properly.

NOTE: if bisect brought you here, you should apply the series up until
"xen-netback: Timeout packets in RX path", otherwise Windows guests can't work
properly and malicious guests can block other guests by not releasing their sent
packets.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  3 +++
 drivers/net/xen-netback/interface.c | 15 +++++++++++++++
 drivers/net/xen-netback/netback.c   |  9 ++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 49109afa225..683d30160a7 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -179,6 +179,9 @@ struct xenvif {
 
 	/* Statistics */
 	unsigned long rx_gso_checksum_fixup;
+	unsigned long tx_zerocopy_sent;
+	unsigned long tx_zerocopy_success;
+	unsigned long tx_zerocopy_fail;
 
 	/* Miscellaneous private stuff. */
 	struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 1fe9fe523cc..44df8581b4d 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -238,6 +238,21 @@ static const struct xenvif_stat {
 		"rx_gso_checksum_fixup",
 		offsetof(struct xenvif, rx_gso_checksum_fixup)
 	},
+	/* If (sent != success + fail), there are probably packets never
+	 * freed up properly!
+	 */
+	{
+		"tx_zerocopy_sent",
+		offsetof(struct xenvif, tx_zerocopy_sent),
+	},
+	{
+		"tx_zerocopy_success",
+		offsetof(struct xenvif, tx_zerocopy_success),
+	},
+	{
+		"tx_zerocopy_fail",
+		offsetof(struct xenvif, tx_zerocopy_fail)
+	},
 };
 
 static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 46a75706cb7..3cb586357df 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1323,8 +1323,10 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		 * do a skb_copy_ubufs while we are still in control of the
 		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
 		 */
-		if (skb_shinfo(skb)->destructor_arg)
+		if (skb_shinfo(skb)->destructor_arg) {
 			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			vif->tx_zerocopy_sent++;
+		}
 
 		netif_receive_skb(skb);
 	}
@@ -1364,6 +1366,11 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
 		napi_schedule(&vif->napi);
 		local_bh_enable();
 	}
+
+	if (likely(zerocopy_success))
+		vif->tx_zerocopy_success++;
+	else
+		vif->tx_zerocopy_fail++;
 }
 
 static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
-- 
cgit v1.2.3-70-g09d2


From e3377f36ca20a034dce56335dc9b89f41094d845 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:29 +0000
Subject: xen-netback: Handle guests with too many frags

Xen network protocol had implicit dependency on MAX_SKB_FRAGS. Netback has to
handle guests sending up to XEN_NETBK_LEGACY_SLOTS_MAX slots. To achieve that:
- create a new skb
- map the leftover slots to its frags (no linear buffer here!)
- chain it to the previous through skb_shinfo(skb)->frag_list
- map them
- copy and coalesce the frags into a brand new one and send it to the stack
- unmap the 2 old skb's pages

It's also introduces new stat counters, which help determine how often the guest
sends a packet with more than MAX_SKB_FRAGS frags.

NOTE: if bisect brought you here, you should apply the series up until
"xen-netback: Timeout packets in RX path", otherwise malicious guests can block
other guests by not releasing their sent packets.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |   1 +
 drivers/net/xen-netback/interface.c |   7 ++
 drivers/net/xen-netback/netback.c   | 164 +++++++++++++++++++++++++++++++++---
 3 files changed, 162 insertions(+), 10 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 683d30160a7..f2f8a02afc3 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -182,6 +182,7 @@ struct xenvif {
 	unsigned long tx_zerocopy_sent;
 	unsigned long tx_zerocopy_success;
 	unsigned long tx_zerocopy_fail;
+	unsigned long tx_frag_overflow;
 
 	/* Miscellaneous private stuff. */
 	struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 44df8581b4d..b646039e539 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -253,6 +253,13 @@ static const struct xenvif_stat {
 		"tx_zerocopy_fail",
 		offsetof(struct xenvif, tx_zerocopy_fail)
 	},
+	/* Number of packets exceeding MAX_SKB_FRAG slots. You should use
+	 * a guest with the same MAX_SKB_FRAG
+	 */
+	{
+		"tx_frag_overflow",
+		offsetof(struct xenvif, tx_frag_overflow)
+	},
 };
 
 static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 3cb586357df..58effc49f52 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -37,6 +37,7 @@
 #include <linux/kthread.h>
 #include <linux/if_vlan.h>
 #include <linux/udp.h>
+#include <linux/highmem.h>
 
 #include <net/tcp.h>
 
@@ -801,6 +802,23 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,
 	       sizeof(*txp));
 }
 
+static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
+{
+	struct sk_buff *skb =
+		alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
+			  GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(skb == NULL))
+		return NULL;
+
+	/* Packets passed to netif_rx() must have some headroom. */
+	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+	/* Initialize it here to avoid later surprises */
+	skb_shinfo(skb)->destructor_arg = NULL;
+
+	return skb;
+}
+
 static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 							struct sk_buff *skb,
 							struct xen_netif_tx_request *txp,
@@ -811,11 +829,16 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 	int start;
 	pending_ring_idx_t index;
-	unsigned int nr_slots;
+	unsigned int nr_slots, frag_overflow = 0;
 
 	/* At this point shinfo->nr_frags is in fact the number of
 	 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
 	 */
+	if (shinfo->nr_frags > MAX_SKB_FRAGS) {
+		frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
+		BUG_ON(frag_overflow > MAX_SKB_FRAGS);
+		shinfo->nr_frags = MAX_SKB_FRAGS;
+	}
 	nr_slots = shinfo->nr_frags;
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
@@ -829,7 +852,29 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
 	}
 
-	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
+	if (frag_overflow) {
+		struct sk_buff *nskb = xenvif_alloc_skb(0);
+		if (unlikely(nskb == NULL)) {
+			if (net_ratelimit())
+				netdev_err(vif->dev,
+					   "Can't allocate the frag_list skb.\n");
+			return NULL;
+		}
+
+		shinfo = skb_shinfo(nskb);
+		frags = shinfo->frags;
+
+		for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+		     shinfo->nr_frags++, txp++, gop++) {
+			index = pending_index(vif->pending_cons++);
+			pending_idx = vif->pending_ring[index];
+			xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+			frag_set_pending_idx(&frags[shinfo->nr_frags],
+					     pending_idx);
+		}
+
+		skb_shinfo(skb)->frag_list = nskb;
+	}
 
 	return gop;
 }
@@ -871,6 +916,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
+	struct sk_buff *first_skb = NULL;
 
 	/* Check status of header. */
 	err = gop->status;
@@ -882,6 +928,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
+check_frags:
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
 
@@ -905,9 +952,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 		/* Not the first error? Preceding frags already invalidated. */
 		if (err)
 			continue;
-
 		/* First error: invalidate header and preceding fragments. */
-		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
+		if (!first_skb)
+			pending_idx = XENVIF_TX_CB(skb)->pending_idx;
+		else
+			pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 		xenvif_idx_unmap(vif, pending_idx);
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
@@ -918,6 +967,30 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 		err = newerr;
 	}
 
+	if (skb_has_frag_list(skb)) {
+		first_skb = skb;
+		skb = shinfo->frag_list;
+		shinfo = skb_shinfo(skb);
+		nr_frags = shinfo->nr_frags;
+		start = 0;
+
+		goto check_frags;
+	}
+
+	/* There was a mapping error in the frag_list skb. We have to unmap
+	 * the first skb's frags
+	 */
+	if (first_skb && err) {
+		int j;
+		shinfo = skb_shinfo(first_skb);
+		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
+		start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+		for (j = start; j < shinfo->nr_frags; j++) {
+			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+			xenvif_idx_unmap(vif, pending_idx);
+		}
+	}
+
 	*gopp = gop + 1;
 	return err;
 }
@@ -1169,8 +1242,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			    ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
 			PKT_PROT_LEN : txreq.size;
 
-		skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
-				GFP_ATOMIC | __GFP_NOWARN);
+		skb = xenvif_alloc_skb(data_len);
 		if (unlikely(skb == NULL)) {
 			netdev_dbg(vif->dev,
 				   "Can't allocate a skb in start_xmit.\n");
@@ -1178,9 +1250,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			break;
 		}
 
-		/* Packets passed to netif_rx() must have some headroom. */
-		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-
 		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
 			struct xen_netif_extra_info *gso;
 			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
@@ -1231,6 +1300,71 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 	return gop - vif->tx_map_ops;
 }
 
+/* Consolidate skb with a frag_list into a brand new one with local pages on
+ * frags. Returns 0 or -ENOMEM if can't allocate new pages.
+ */
+static int xenvif_handle_frag_list(struct xenvif *vif, struct sk_buff *skb)
+{
+	unsigned int offset = skb_headlen(skb);
+	skb_frag_t frags[MAX_SKB_FRAGS];
+	int i;
+	struct ubuf_info *uarg;
+	struct sk_buff *nskb = skb_shinfo(skb)->frag_list;
+
+	vif->tx_zerocopy_sent += 2;
+	vif->tx_frag_overflow++;
+
+	xenvif_fill_frags(vif, nskb);
+	/* Subtract frags size, we will correct it later */
+	skb->truesize -= skb->data_len;
+	skb->len += nskb->len;
+	skb->data_len += nskb->len;
+
+	/* create a brand new frags array and coalesce there */
+	for (i = 0; offset < skb->len; i++) {
+		struct page *page;
+		unsigned int len;
+
+		BUG_ON(i >= MAX_SKB_FRAGS);
+		page = alloc_page(GFP_ATOMIC|__GFP_COLD);
+		if (!page) {
+			int j;
+			skb->truesize += skb->data_len;
+			for (j = 0; j < i; j++)
+				put_page(frags[j].page.p);
+			return -ENOMEM;
+		}
+
+		if (offset + PAGE_SIZE < skb->len)
+			len = PAGE_SIZE;
+		else
+			len = skb->len - offset;
+		if (skb_copy_bits(skb, offset, page_address(page), len))
+			BUG();
+
+		offset += len;
+		frags[i].page.p = page;
+		frags[i].page_offset = 0;
+		skb_frag_size_set(&frags[i], len);
+	}
+	/* swap out with old one */
+	memcpy(skb_shinfo(skb)->frags,
+	       frags,
+	       i * sizeof(skb_frag_t));
+	skb_shinfo(skb)->nr_frags = i;
+	skb->truesize += i * PAGE_SIZE;
+
+	/* remove traces of mapped pages and frag_list */
+	skb_frag_list_init(skb);
+	uarg = skb_shinfo(skb)->destructor_arg;
+	uarg->callback(uarg, true);
+	skb_shinfo(skb)->destructor_arg = NULL;
+
+	skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	kfree_skb(nskb);
+
+	return 0;
+}
 
 static int xenvif_tx_submit(struct xenvif *vif)
 {
@@ -1267,7 +1401,6 @@ static int xenvif_tx_submit(struct xenvif *vif)
 				&vif->pending_tx_info[pending_idx].callback_struct;
 		} else {
 			/* Schedule a response immediately. */
-			skb_shinfo(skb)->destructor_arg = NULL;
 			xenvif_idx_unmap(vif, pending_idx);
 		}
 
@@ -1278,6 +1411,17 @@ static int xenvif_tx_submit(struct xenvif *vif)
 
 		xenvif_fill_frags(vif, skb);
 
+		if (unlikely(skb_has_frag_list(skb))) {
+			if (xenvif_handle_frag_list(vif, skb)) {
+				if (net_ratelimit())
+					netdev_err(vif->dev,
+						   "Not enough memory to consolidate frag_list!\n");
+				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+				kfree_skb(skb);
+				continue;
+			}
+		}
+
 		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
 			int target = min_t(int, skb->len, PKT_PROT_LEN);
 			__pskb_pull_tail(skb, target - skb_headlen(skb));
-- 
cgit v1.2.3-70-g09d2


From 093507885ae5dc0288af07fbb922d2f85b3a88a6 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:30 +0000
Subject: xen-netback: Timeout packets in RX path

A malicious or buggy guest can leave its queue filled indefinitely, in which
case qdisc start to queue packets for that VIF. If those packets came from an
another guest, it can block its slots and prevent shutdown. To avoid that, we
make sure the queue is drained in every 10 seconds.
The QDisc queue in worst case takes 3 round to flush usually.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  6 ++++++
 drivers/net/xen-netback/interface.c | 37 +++++++++++++++++++++++++++++++++++--
 drivers/net/xen-netback/netback.c   | 23 ++++++++++++++++++++---
 3 files changed, 61 insertions(+), 5 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index f2f8a02afc3..0355f8767e3 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -148,6 +148,9 @@ struct xenvif {
 	struct xen_netif_rx_back_ring rx;
 	struct sk_buff_head rx_queue;
 	RING_IDX rx_last_skb_slots;
+	bool rx_queue_purge;
+
+	struct timer_list wake_queue;
 
 	/* This array is allocated seperately as it is large */
 	struct gnttab_copy *grant_copy_op;
@@ -259,4 +262,7 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
 
 extern bool separate_tx_rx_irq;
 
+extern unsigned int rx_drain_timeout_msecs;
+extern unsigned int rx_drain_timeout_jiffies;
+
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index b646039e539..9cc9f638f44 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -115,6 +115,18 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static void xenvif_wake_queue(unsigned long data)
+{
+	struct xenvif *vif = (struct xenvif *)data;
+
+	if (netif_queue_stopped(vif->dev)) {
+		netdev_err(vif->dev, "draining TX queue\n");
+		vif->rx_queue_purge = true;
+		xenvif_kick_thread(vif);
+		netif_wake_queue(vif->dev);
+	}
+}
+
 static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct xenvif *vif = netdev_priv(dev);
@@ -144,8 +156,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * then turn off the queue to give the ring a chance to
 	 * drain.
 	 */
-	if (!xenvif_rx_ring_slots_available(vif, min_slots_needed))
+	if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) {
+		vif->wake_queue.function = xenvif_wake_queue;
+		vif->wake_queue.data = (unsigned long)vif;
 		xenvif_stop_queue(vif);
+		mod_timer(&vif->wake_queue,
+			jiffies + rx_drain_timeout_jiffies);
+	}
 
 	skb_queue_tail(&vif->rx_queue, skb);
 	xenvif_kick_thread(vif);
@@ -353,6 +370,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	init_timer(&vif->credit_timeout);
 	vif->credit_window_start = get_jiffies_64();
 
+	init_timer(&vif->wake_queue);
+
 	dev->netdev_ops	= &xenvif_netdev_ops;
 	dev->hw_features = NETIF_F_SG |
 		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -531,6 +550,7 @@ void xenvif_disconnect(struct xenvif *vif)
 		xenvif_carrier_off(vif);
 
 	if (vif->task) {
+		del_timer_sync(&vif->wake_queue);
 		kthread_stop(vif->task);
 		vif->task = NULL;
 	}
@@ -556,12 +576,25 @@ void xenvif_disconnect(struct xenvif *vif)
 void xenvif_free(struct xenvif *vif)
 {
 	int i, unmap_timeout = 0;
+	/* Here we want to avoid timeout messages if an skb can be legitimatly
+	 * stucked somewhere else. Realisticly this could be an another vif's
+	 * internal or QDisc queue. That another vif also has this
+	 * rx_drain_timeout_msecs timeout, but the timer only ditches the
+	 * internal queue. After that, the QDisc queue can put in worst case
+	 * XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into that another vif's
+	 * internal queue, so we need several rounds of such timeouts until we
+	 * can be sure that no another vif should have skb's from us. We are
+	 * not sending more skb's, so newly stucked packets are not interesting
+	 * for us here.
+	 */
+	unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000) *
+		DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS));
 
 	for (i = 0; i < MAX_PENDING_REQS; ++i) {
 		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
 			unmap_timeout++;
 			schedule_timeout(msecs_to_jiffies(1000));
-			if (unmap_timeout > 9 &&
+			if (unmap_timeout > worst_case_skb_lifetime &&
 			    net_ratelimit())
 				netdev_err(vif->dev,
 					   "Page still granted! Index: %x\n",
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 58effc49f52..8518a0d1f6f 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -55,6 +55,13 @@
 bool separate_tx_rx_irq = 1;
 module_param(separate_tx_rx_irq, bool, 0644);
 
+/* When guest ring is filled up, qdisc queues the packets for us, but we have
+ * to timeout them, otherwise other guests' packets can get stucked there
+ */
+unsigned int rx_drain_timeout_msecs = 10000;
+module_param(rx_drain_timeout_msecs, uint, 0444);
+unsigned int rx_drain_timeout_jiffies;
+
 /*
  * This is the maximum slots a skb can have. If a guest sends a skb
  * which exceeds this limit it is considered malicious.
@@ -1694,8 +1701,9 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
 
 static inline int rx_work_todo(struct xenvif *vif)
 {
-	return !skb_queue_empty(&vif->rx_queue) &&
-	       xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots);
+	return (!skb_queue_empty(&vif->rx_queue) &&
+	       xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots)) ||
+	       vif->rx_queue_purge;
 }
 
 static inline int tx_work_todo(struct xenvif *vif)
@@ -1782,12 +1790,19 @@ int xenvif_kthread_guest_rx(void *data)
 		if (kthread_should_stop())
 			break;
 
+		if (vif->rx_queue_purge) {
+			skb_queue_purge(&vif->rx_queue);
+			vif->rx_queue_purge = false;
+		}
+
 		if (!skb_queue_empty(&vif->rx_queue))
 			xenvif_rx_action(vif);
 
 		if (skb_queue_empty(&vif->rx_queue) &&
-		    netif_queue_stopped(vif->dev))
+		    netif_queue_stopped(vif->dev)) {
+			del_timer_sync(&vif->wake_queue);
 			xenvif_start_queue(vif);
+		}
 
 		cond_resched();
 	}
@@ -1838,6 +1853,8 @@ static int __init netback_init(void)
 	if (rc)
 		goto failed_init;
 
+	rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs);
+
 	return 0;
 
 failed_init:
-- 
cgit v1.2.3-70-g09d2


From e9275f5e2df1b2098a8cc405d87b88b9affd73e6 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 6 Mar 2014 21:48:31 +0000
Subject: xen-netback: Aggregate TX unmap operations

Unmapping causes TLB flushing, therefore we should make it in the largest
possible batches. However we shouldn't starve the guest for too long. So if
the guest has space for at least two big packets and we don't have at least a
quarter ring to unmap, delay it for at most 1 milisec.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  2 ++
 drivers/net/xen-netback/interface.c |  2 ++
 drivers/net/xen-netback/netback.c   | 34 +++++++++++++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 0355f8767e3..bef37be402b 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -137,6 +137,8 @@ struct xenvif {
 	u16 dealloc_ring[MAX_PENDING_REQS];
 	struct task_struct *dealloc_task;
 	wait_queue_head_t dealloc_wq;
+	struct timer_list dealloc_delay;
+	bool dealloc_delay_timed_out;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 9cc9f638f44..83a71ac5b93 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -408,6 +408,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 			  .desc = i };
 		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
 	}
+	init_timer(&vif->dealloc_delay);
 
 	/*
 	 * Initialise a dummy MAC address. We choose the numerically
@@ -556,6 +557,7 @@ void xenvif_disconnect(struct xenvif *vif)
 	}
 
 	if (vif->dealloc_task) {
+		del_timer_sync(&vif->dealloc_delay);
 		kthread_stop(vif->dealloc_task);
 		vif->dealloc_task = NULL;
 	}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 8518a0d1f6f..bc943205a69 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -133,6 +133,11 @@ static inline pending_ring_idx_t pending_index(unsigned i)
 	return i & (MAX_PENDING_REQS-1);
 }
 
+static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
+{
+	return ring->nr_ents -	(ring->sring->req_prod - ring->rsp_prod_pvt);
+}
+
 bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
 {
 	RING_IDX prod, cons;
@@ -1716,9 +1721,36 @@ static inline int tx_work_todo(struct xenvif *vif)
 	return 0;
 }
 
+static void xenvif_dealloc_delay(unsigned long data)
+{
+	struct xenvif *vif = (struct xenvif *)data;
+
+	vif->dealloc_delay_timed_out = true;
+	wake_up(&vif->dealloc_wq);
+}
+
 static inline bool tx_dealloc_work_todo(struct xenvif *vif)
 {
-	return vif->dealloc_cons != vif->dealloc_prod;
+	if (vif->dealloc_cons != vif->dealloc_prod) {
+		if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
+		    (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
+		    !vif->dealloc_delay_timed_out) {
+			if (!timer_pending(&vif->dealloc_delay)) {
+				vif->dealloc_delay.function =
+					xenvif_dealloc_delay;
+				vif->dealloc_delay.data = (unsigned long)vif;
+				mod_timer(&vif->dealloc_delay,
+					  jiffies + msecs_to_jiffies(1));
+
+			}
+			return false;
+		}
+		del_timer_sync(&vif->dealloc_delay);
+		vif->dealloc_delay_timed_out = false;
+		return true;
+	}
+
+	return false;
 }
 
 void xenvif_unmap_frontend_rings(struct xenvif *vif)
-- 
cgit v1.2.3-70-g09d2


From 397dfd9f93ccfe71660eafbaac651a96195c24ed Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Fri, 21 Mar 2014 17:23:04 +0000
Subject: Revert "xen-netback: Aggregate TX unmap operations"

This reverts commit e9275f5e2df1b2098a8cc405d87b88b9affd73e6. This commit is the
last in the netback grant mapping series, and it tries to do more aggressive
aggreagtion of unmap operations. However practical use showed almost no
positive effect, whilst with certain frontends it causes significant performance
regression.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  2 --
 drivers/net/xen-netback/interface.c |  2 --
 drivers/net/xen-netback/netback.c   | 34 +---------------------------------
 3 files changed, 1 insertion(+), 37 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index bef37be402b..0355f8767e3 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -137,8 +137,6 @@ struct xenvif {
 	u16 dealloc_ring[MAX_PENDING_REQS];
 	struct task_struct *dealloc_task;
 	wait_queue_head_t dealloc_wq;
-	struct timer_list dealloc_delay;
-	bool dealloc_delay_timed_out;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index a6a8c1579eb..23bb2f4b18f 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -407,7 +407,6 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 			  .desc = i };
 		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
 	}
-	init_timer(&vif->dealloc_delay);
 
 	/*
 	 * Initialise a dummy MAC address. We choose the numerically
@@ -556,7 +555,6 @@ void xenvif_disconnect(struct xenvif *vif)
 	}
 
 	if (vif->dealloc_task) {
-		del_timer_sync(&vif->dealloc_delay);
 		kthread_stop(vif->dealloc_task);
 		vif->dealloc_task = NULL;
 	}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 5a8c4a43c52..1f595e51791 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -133,11 +133,6 @@ static inline pending_ring_idx_t pending_index(unsigned i)
 	return i & (MAX_PENDING_REQS-1);
 }
 
-static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
-{
-	return ring->nr_ents -	(ring->sring->req_prod - ring->rsp_prod_pvt);
-}
-
 bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
 {
 	RING_IDX prod, cons;
@@ -1718,36 +1713,9 @@ static inline int tx_work_todo(struct xenvif *vif)
 	return 0;
 }
 
-static void xenvif_dealloc_delay(unsigned long data)
-{
-	struct xenvif *vif = (struct xenvif *)data;
-
-	vif->dealloc_delay_timed_out = true;
-	wake_up(&vif->dealloc_wq);
-}
-
 static inline bool tx_dealloc_work_todo(struct xenvif *vif)
 {
-	if (vif->dealloc_cons != vif->dealloc_prod) {
-		if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
-		    (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
-		    !vif->dealloc_delay_timed_out) {
-			if (!timer_pending(&vif->dealloc_delay)) {
-				vif->dealloc_delay.function =
-					xenvif_dealloc_delay;
-				vif->dealloc_delay.data = (unsigned long)vif;
-				mod_timer(&vif->dealloc_delay,
-					  jiffies + msecs_to_jiffies(1));
-
-			}
-			return false;
-		}
-		del_timer_sync(&vif->dealloc_delay);
-		vif->dealloc_delay_timed_out = false;
-		return true;
-	}
-
-	return false;
+	return vif->dealloc_cons != vif->dealloc_prod;
 }
 
 void xenvif_unmap_frontend_rings(struct xenvif *vif)
-- 
cgit v1.2.3-70-g09d2


From 2c5f4f8422e8cf3dd15638226e964f2e13132267 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 25 Mar 2014 19:02:16 -0400
Subject: xen-netback: Proper printf format for ptrdiff_t is 't'.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes:

drivers/net/xen-netback/netback.c: In function ‘xenvif_tx_dealloc_action’:
drivers/net/xen-netback/netback.c:1573:8: warning: format ‘%x’ expects argument of type ‘unsigned int’, but argument 3 has type ‘long int’ [-Wformat=]

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 1f595e51791..1e462872477 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1569,7 +1569,7 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
 					vif->pages_to_unmap,
 					gop - vif->tx_unmap_ops);
 		if (ret) {
-			netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
+			netdev_err(vif->dev, "Unmap fail: nr_ops %tx ret %d\n",
 				   gop - vif->tx_unmap_ops, ret);
 			for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
 				if (gop[i].status != GNTST_okay)
-- 
cgit v1.2.3-70-g09d2


From 869b9b19b3affd81cee853d33c0b124797f3c387 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Mon, 24 Mar 2014 23:59:49 +0000
Subject: xen-netback: Stop using xenvif_tx_pending_slots_available

Since the early days TX stops if there isn't enough free pending slots to
consume a maximum sized (slot-wise) packet. Probably the reason for that is to
avoid the case when we don't have enough free pending slot in the ring to finish
the packet. But if we make sure that the pending ring has the same size as the
shared ring, that shouldn't really happen. The frontend can only post packets
which fit the to the free space of the shared ring. If it doesn't, the frontend
has to stop, as it can only increase the req_prod when the whole packet fits
onto the ring.
This patch avoid using this checking, makes sure the 2 ring has the same size,
and remove a checking from the callback. As now we don't stop the NAPI instance
on this condition, we don't have to wake it up if we free pending slots up.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  8 +-------
 drivers/net/xen-netback/interface.c |  3 +--
 drivers/net/xen-netback/netback.c   | 13 ++-----------
 3 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 0355f8767e3..89b2d429c44 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -81,7 +81,7 @@ struct xenvif_rx_meta {
 
 #define MAX_BUFFER_OFFSET PAGE_SIZE
 
-#define MAX_PENDING_REQS 256
+#define MAX_PENDING_REQS XEN_NETIF_TX_RING_SIZE
 
 /* It's possible for an skb to have a maximal number of frags
  * but still be less than MAX_BUFFER_OFFSET in size. Thus the
@@ -251,12 +251,6 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
 		vif->pending_prod + vif->pending_cons;
 }
 
-static inline bool xenvif_tx_pending_slots_available(struct xenvif *vif)
-{
-	return nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
-		< MAX_PENDING_REQS;
-}
-
 /* Callback from stack when TX packet can be released */
 void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
 
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 23bb2f4b18f..e71fb1ac5c4 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -88,8 +88,7 @@ static int xenvif_poll(struct napi_struct *napi, int budget)
 		local_irq_save(flags);
 
 		RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do);
-		if (!(more_to_do &&
-		      xenvif_tx_pending_slots_available(vif)))
+		if (!more_to_do)
 			__napi_complete(napi);
 
 		local_irq_restore(flags);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 1e462872477..5d2dd1d5417 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1167,8 +1167,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 	struct sk_buff *skb;
 	int ret;
 
-	while (xenvif_tx_pending_slots_available(vif) &&
-	       (skb_queue_len(&vif->tx_queue) < budget)) {
+	while (skb_queue_len(&vif->tx_queue) < budget) {
 		struct xen_netif_tx_request txreq;
 		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
 		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
@@ -1508,13 +1507,6 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
 	wake_up(&vif->dealloc_wq);
 	spin_unlock_irqrestore(&vif->callback_lock, flags);
 
-	if (RING_HAS_UNCONSUMED_REQUESTS(&vif->tx) &&
-	    xenvif_tx_pending_slots_available(vif)) {
-		local_bh_disable();
-		napi_schedule(&vif->napi);
-		local_bh_enable();
-	}
-
 	if (likely(zerocopy_success))
 		vif->tx_zerocopy_success++;
 	else
@@ -1706,8 +1698,7 @@ static inline int rx_work_todo(struct xenvif *vif)
 static inline int tx_work_todo(struct xenvif *vif)
 {
 
-	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) &&
-	    xenvif_tx_pending_slots_available(vif))
+	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)))
 		return 1;
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 0e59a4a553df312b5308c75085f7f02b12680d12 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Mon, 24 Mar 2014 23:59:50 +0000
Subject: xen-netback: Non-functional follow-up patch for grant mapping series

Ian made some late comments about the grant mapping series, I incorporated the
non-functional outcomes into this patch:

- typo fixes in a comment of xenvif_free(), and add another one there as well
- typo fix for comment of rx_drain_timeout_msecs
- remove stale comment before calling xenvif_grant_handle_reset()

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/interface.c | 13 ++++++++++---
 drivers/net/xen-netback/netback.c   |  4 +---
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index e71fb1ac5c4..cdc298e3b74 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -574,15 +574,15 @@ void xenvif_disconnect(struct xenvif *vif)
 void xenvif_free(struct xenvif *vif)
 {
 	int i, unmap_timeout = 0;
-	/* Here we want to avoid timeout messages if an skb can be legitimatly
-	 * stucked somewhere else. Realisticly this could be an another vif's
+	/* Here we want to avoid timeout messages if an skb can be legitimately
+	 * stuck somewhere else. Realistically this could be an another vif's
 	 * internal or QDisc queue. That another vif also has this
 	 * rx_drain_timeout_msecs timeout, but the timer only ditches the
 	 * internal queue. After that, the QDisc queue can put in worst case
 	 * XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into that another vif's
 	 * internal queue, so we need several rounds of such timeouts until we
 	 * can be sure that no another vif should have skb's from us. We are
-	 * not sending more skb's, so newly stucked packets are not interesting
+	 * not sending more skb's, so newly stuck packets are not interesting
 	 * for us here.
 	 */
 	unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000) *
@@ -597,6 +597,13 @@ void xenvif_free(struct xenvif *vif)
 				netdev_err(vif->dev,
 					   "Page still granted! Index: %x\n",
 					   i);
+			/* If there are still unmapped pages, reset the loop to
+			 * start checking again. We shouldn't exit here until
+			 * dealloc thread and NAPI instance release all the
+			 * pages. If a kernel bug causes the skbs to stall
+			 * somewhere, the interface cannot be brought down
+			 * properly.
+			 */
 			i = -1;
 		}
 	}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 5d2dd1d5417..d3172fe0306 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -56,7 +56,7 @@ bool separate_tx_rx_irq = 1;
 module_param(separate_tx_rx_irq, bool, 0644);
 
 /* When guest ring is filled up, qdisc queues the packets for us, but we have
- * to timeout them, otherwise other guests' packets can get stucked there
+ * to timeout them, otherwise other guests' packets can get stuck there
  */
 unsigned int rx_drain_timeout_msecs = 10000;
 module_param(rx_drain_timeout_msecs, uint, 0444);
@@ -1545,7 +1545,6 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
 					    idx_to_kaddr(vif, pending_idx),
 					    GNTMAP_host_map,
 					    vif->grant_tx_handle[pending_idx]);
-			/* Btw. already unmapped? */
 			xenvif_grant_handle_reset(vif, pending_idx);
 			++gop;
 		}
@@ -1678,7 +1677,6 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
 			    idx_to_kaddr(vif, pending_idx),
 			    GNTMAP_host_map,
 			    vif->grant_tx_handle[pending_idx]);
-	/* Btw. already unmapped? */
 	xenvif_grant_handle_reset(vif, pending_idx);
 
 	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
-- 
cgit v1.2.3-70-g09d2


From 7aceb47a9df3383b24824c3e4bd4f029e4598fda Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Mon, 24 Mar 2014 23:59:51 +0000
Subject: xen-netback: Functional follow-up patch for grant mapping series

Ian made some late comments about the grant mapping series, I incorporated the
functional outcomes into this patch:

- use callback_param macro to shorten access to pending_tx_info in
  xenvif_fill_frags() and xenvif_tx_submit()
- print an error message in xenvif_idx_unmap() before panic

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index d3172fe0306..cb784fe5220 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -99,6 +99,9 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif,
 	return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx));
 }
 
+#define callback_param(vif, pending_idx) \
+	(vif->pending_tx_info[pending_idx].callback_struct)
+
 /* Find the containing VIF's structure from a pointer in pending_tx_info array
  */
 static inline struct xenvif* ubuf_to_vif(struct ubuf_info *ubuf)
@@ -1020,12 +1023,12 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 		/* If this is not the first frag, chain it to the previous*/
 		if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
 			skb_shinfo(skb)->destructor_arg =
-				&vif->pending_tx_info[pending_idx].callback_struct;
+				&callback_param(vif, pending_idx);
 		else if (likely(pending_idx != prev_pending_idx))
-			vif->pending_tx_info[prev_pending_idx].callback_struct.ctx =
-				&(vif->pending_tx_info[pending_idx].callback_struct);
+			callback_param(vif, prev_pending_idx).ctx =
+				&callback_param(vif, pending_idx);
 
-		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+		callback_param(vif, pending_idx).ctx = NULL;
 		prev_pending_idx = pending_idx;
 
 		txp = &vif->pending_tx_info[pending_idx].req;
@@ -1395,13 +1398,13 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		memcpy(skb->data,
 		       (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
 		       data_len);
-		vif->pending_tx_info[pending_idx].callback_struct.ctx = NULL;
+		callback_param(vif, pending_idx).ctx = NULL;
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
 			txp->offset += data_len;
 			txp->size -= data_len;
 			skb_shinfo(skb)->destructor_arg =
-				&vif->pending_tx_info[pending_idx].callback_struct;
+				&callback_param(vif, pending_idx);
 		} else {
 			/* Schedule a response immediately. */
 			xenvif_idx_unmap(vif, pending_idx);
@@ -1681,7 +1684,16 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
 
 	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
 				&vif->mmap_pages[pending_idx], 1);
-	BUG_ON(ret);
+	if (ret) {
+		netdev_err(vif->dev,
+			   "Unmap fail: ret: %d pending_idx: %d host_addr: %llx handle: %x status: %d\n",
+			   ret,
+			   pending_idx,
+			   tx_unmap_op.host_addr,
+			   tx_unmap_op.handle,
+			   tx_unmap_op.status);
+		BUG();
+	}
 
 	xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
 }
-- 
cgit v1.2.3-70-g09d2


From 0576eddf24df716d8570ef8ca11452a9f98eaab2 Mon Sep 17 00:00:00 2001
From: Paul Durrant <Paul.Durrant@citrix.com>
Date: Fri, 28 Mar 2014 11:39:05 +0000
Subject: xen-netback: remove pointless clause from if statement

This patch removes a test in start_new_rx_buffer() that checks whether
a copy operation is less than MAX_BUFFER_OFFSET in length, since
MAX_BUFFER_OFFSET is defined to be PAGE_SIZE and the only caller of
start_new_rx_buffer() already limits copy operations to PAGE_SIZE or less.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Reported-By: Sander Eikelenboom <linux@eikelenboom.it>
Tested-By: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 438d0c09b7e..72314c7998f 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -192,8 +192,8 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head)
 	 * into multiple copies tend to give large frags their
 	 * own buffers as before.
 	 */
-	if ((offset + size > MAX_BUFFER_OFFSET) &&
-	    (size <= MAX_BUFFER_OFFSET) && offset && !head)
+	BUG_ON(size > MAX_BUFFER_OFFSET);
+	if ((offset + size > MAX_BUFFER_OFFSET) && offset && !head)
 		return true;
 
 	return false;
-- 
cgit v1.2.3-70-g09d2


From a02eb4732cf975d7fc71b6d1a71c058c9988b949 Mon Sep 17 00:00:00 2001
From: Paul Durrant <Paul.Durrant@citrix.com>
Date: Fri, 28 Mar 2014 11:39:06 +0000
Subject: xen-netback: worse-case estimate in xenvif_rx_action is
 underestimating

The worse-case estimate for skb ring slot usage in xenvif_rx_action()
fails to take fragment page_offset into account. The page_offset does,
however, affect the number of times the fragmentation code calls
start_new_rx_buffer() (i.e. consume another slot) and the worse-case
should assume that will always return true. This patch adds the page_offset
into the DIV_ROUND_UP for each frag.

Unfortunately some frontends aggressively limit the number of requests
they post into the shared ring so to avoid an estimate that is 'too'
pessimal it is capped at MAX_SKB_FRAGS.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 72314c7998f..573f3e81e5d 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -493,9 +493,28 @@ static void xenvif_rx_action(struct xenvif *vif)
 						PAGE_SIZE);
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			unsigned int size;
+			unsigned int offset;
+
 			size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
-			max_slots_needed += DIV_ROUND_UP(size, PAGE_SIZE);
+			offset = skb_shinfo(skb)->frags[i].page_offset;
+
+			/* For a worse-case estimate we need to factor in
+			 * the fragment page offset as this will affect the
+			 * number of times xenvif_gop_frag_copy() will
+			 * call start_new_rx_buffer().
+			 */
+			max_slots_needed += DIV_ROUND_UP(offset + size,
+							 PAGE_SIZE);
 		}
+
+		/* To avoid the estimate becoming too pessimal for some
+		 * frontends that limit posted rx requests, cap the estimate
+		 * at MAX_SKB_FRAGS.
+		 */
+		if (max_slots_needed > MAX_SKB_FRAGS)
+			max_slots_needed = MAX_SKB_FRAGS;
+
+		/* We may need one more slot for GSO metadata */
 		if (skb_is_gso(skb) &&
 		   (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4 ||
 		    skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6))
-- 
cgit v1.2.3-70-g09d2


From 1425c7a4e8d3d2eebf308bcbdc3fa3c1247686b4 Mon Sep 17 00:00:00 2001
From: Paul Durrant <Paul.Durrant@citrix.com>
Date: Fri, 28 Mar 2014 11:39:07 +0000
Subject: xen-netback: BUG_ON in xenvif_rx_action() not catching overflow

The BUG_ON to catch ring overflow in xenvif_rx_action() makes the assumption
that meta_slots_used == ring slots used. This is not necessarily the case
for GSO packets, because the non-prefix GSO protocol consumes one more ring
slot than meta-slot for the 'extra_info'. This patch changes the test to
actually check ring slots.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 573f3e81e5d..cd0bd95ccc1 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -482,6 +482,8 @@ static void xenvif_rx_action(struct xenvif *vif)
 
 	while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) {
 		RING_IDX max_slots_needed;
+		RING_IDX old_req_cons;
+		RING_IDX ring_slots_used;
 		int i;
 
 		/* We need a cheap worse case estimate for the number of
@@ -530,8 +532,12 @@ static void xenvif_rx_action(struct xenvif *vif)
 			vif->rx_last_skb_slots = 0;
 
 		sco = (struct skb_cb_overlay *)skb->cb;
+
+		old_req_cons = vif->rx.req_cons;
 		sco->meta_slots_used = xenvif_gop_skb(skb, &npo);
-		BUG_ON(sco->meta_slots_used > max_slots_needed);
+		ring_slots_used = vif->rx.req_cons - old_req_cons;
+
+		BUG_ON(ring_slots_used > max_slots_needed);
 
 		__skb_queue_tail(&rxq, skb);
 	}
-- 
cgit v1.2.3-70-g09d2


From e9d8b2c2968499c1f96563e6522c56958d5a1d0d Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 1 Apr 2014 12:46:12 +0100
Subject: xen-netback: disable rogue vif in kthread context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When netback discovers frontend is sending malformed packet it will
disables the interface which serves that frontend.

However disabling a network interface involving taking a mutex which
cannot be done in softirq context, so we need to defer this process to
kthread context.

This patch does the following:
1. introduce a flag to indicate the interface is disabled.
2. check that flag in TX path, don't do any work if it's true.
3. check that flag in RX path, turn off that interface if it's true.

The reason to disable it in RX path is because RX uses kthread. After
this change the behavior of netback is still consistent -- it won't do
any TX work for a rogue frontend, and the interface will be eventually
turned off.

Also change a "continue" to "break" after xenvif_fatal_tx_err, as it
doesn't make sense to continue processing packets if frontend is rogue.

This is a fix for XSA-90.

Reported-by: Török Edwin <edwin@etorok.net>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  5 +++++
 drivers/net/xen-netback/interface.c | 11 +++++++++++
 drivers/net/xen-netback/netback.c   | 16 ++++++++++++++--
 3 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 89b2d429c44..89d1d0556b6 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -104,6 +104,11 @@ struct xenvif {
 	domid_t          domid;
 	unsigned int     handle;
 
+	/* Is this interface disabled? True when backend discovers
+	 * frontend is rogue.
+	 */
+	bool disabled;
+
 	/* Use NAPI for guest TX */
 	struct napi_struct napi;
 	/* When feature-split-event-channels = 0, tx_irq = rx_irq. */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index cdc298e3b74..ef05c5c49d4 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -63,6 +63,15 @@ static int xenvif_poll(struct napi_struct *napi, int budget)
 	struct xenvif *vif = container_of(napi, struct xenvif, napi);
 	int work_done;
 
+	/* This vif is rogue, we pretend we've there is nothing to do
+	 * for this vif to deschedule it from NAPI. But this interface
+	 * will be turned off in thread context later.
+	 */
+	if (unlikely(vif->disabled)) {
+		napi_complete(napi);
+		return 0;
+	}
+
 	work_done = xenvif_tx_action(vif, budget);
 
 	if (work_done < budget) {
@@ -363,6 +372,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	vif->ip_csum = 1;
 	vif->dev = dev;
 
+	vif->disabled = false;
+
 	vif->credit_bytes = vif->remaining_credit = ~0UL;
 	vif->credit_usec  = 0UL;
 	init_timer(&vif->credit_timeout);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index ae34f5fc7fb..3f021e054ba 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -711,7 +711,8 @@ static void xenvif_tx_err(struct xenvif *vif,
 static void xenvif_fatal_tx_err(struct xenvif *vif)
 {
 	netdev_err(vif->dev, "fatal error; disabling device\n");
-	xenvif_carrier_off(vif);
+	vif->disabled = true;
+	xenvif_kick_thread(vif);
 }
 
 static int xenvif_count_requests(struct xenvif *vif,
@@ -1212,7 +1213,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 				   vif->tx.sring->req_prod, vif->tx.req_cons,
 				   XEN_NETIF_TX_RING_SIZE);
 			xenvif_fatal_tx_err(vif);
-			continue;
+			break;
 		}
 
 		work_to_do = RING_HAS_UNCONSUMED_REQUESTS(&vif->tx);
@@ -1808,7 +1809,18 @@ int xenvif_kthread_guest_rx(void *data)
 	while (!kthread_should_stop()) {
 		wait_event_interruptible(vif->wq,
 					 rx_work_todo(vif) ||
+					 vif->disabled ||
 					 kthread_should_stop());
+
+		/* This frontend is found to be rogue, disable it in
+		 * kthread context. Currently this is only set when
+		 * netback finds out frontend sends malformed packet,
+		 * but we cannot disable the interface in softirq
+		 * context so we defer it here.
+		 */
+		if (unlikely(vif->disabled && netif_carrier_ok(vif->dev)))
+			xenvif_carrier_off(vif);
+
 		if (kthread_should_stop())
 			break;
 
-- 
cgit v1.2.3-70-g09d2


From 9074ce249321861e535cdf8de9af0930a174dda9 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Wed, 2 Apr 2014 18:04:57 +0100
Subject: xen-netback: Rename map ops

Rename identifiers to state explicitly that they refer to map ops.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 46 ++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 3f021e054ba..4bb78862afc 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -820,13 +820,13 @@ struct xenvif_tx_cb {
 
 #define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
 
-static inline void xenvif_tx_create_gop(struct xenvif *vif,
-					u16 pending_idx,
-					struct xen_netif_tx_request *txp,
-					struct gnttab_map_grant_ref *gop)
+static inline void xenvif_tx_create_map_op(struct xenvif *vif,
+					  u16 pending_idx,
+					  struct xen_netif_tx_request *txp,
+					  struct gnttab_map_grant_ref *mop)
 {
-	vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
-	gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
+	vif->pages_to_map[mop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
+	gnttab_set_map_op(mop, idx_to_kaddr(vif, pending_idx),
 			  GNTMAP_host_map | GNTMAP_readonly,
 			  txp->gref, vif->domid);
 
@@ -880,7 +880,7 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	     shinfo->nr_frags++, txp++, gop++) {
 		index = pending_index(vif->pending_cons++);
 		pending_idx = vif->pending_ring[index];
-		xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+		xenvif_tx_create_map_op(vif, pending_idx, txp, gop);
 		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
 	}
 
@@ -900,7 +900,7 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 		     shinfo->nr_frags++, txp++, gop++) {
 			index = pending_index(vif->pending_cons++);
 			pending_idx = vif->pending_ring[index];
-			xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+			xenvif_tx_create_map_op(vif, pending_idx, txp, gop);
 			frag_set_pending_idx(&frags[shinfo->nr_frags],
 					     pending_idx);
 		}
@@ -940,9 +940,9 @@ static inline void xenvif_grant_handle_reset(struct xenvif *vif,
 
 static int xenvif_tx_check_gop(struct xenvif *vif,
 			       struct sk_buff *skb,
-			       struct gnttab_map_grant_ref **gopp)
+			       struct gnttab_map_grant_ref **gopp_map)
 {
-	struct gnttab_map_grant_ref *gop = *gopp;
+	struct gnttab_map_grant_ref *gop_map = *gopp_map;
 	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct pending_tx_info *tx_info;
@@ -951,11 +951,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	struct sk_buff *first_skb = NULL;
 
 	/* Check status of header. */
-	err = gop->status;
+	err = gop_map->status;
 	if (unlikely(err))
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
 	else
-		xenvif_grant_handle_set(vif, pending_idx , gop->handle);
+		xenvif_grant_handle_set(vif, pending_idx , gop_map->handle);
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
@@ -968,10 +968,12 @@ check_frags:
 		tx_info = &vif->pending_tx_info[pending_idx];
 
 		/* Check error status: if okay then remember grant handle. */
-		newerr = (++gop)->status;
+		newerr = (++gop_map)->status;
 
 		if (likely(!newerr)) {
-			xenvif_grant_handle_set(vif, pending_idx , gop->handle);
+			xenvif_grant_handle_set(vif,
+						pending_idx,
+						gop_map->handle);
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
 				xenvif_idx_unmap(vif, pending_idx);
@@ -1023,7 +1025,7 @@ check_frags:
 		}
 	}
 
-	*gopp = gop + 1;
+	*gopp_map = gop_map + 1;
 	return err;
 }
 
@@ -1292,7 +1294,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			}
 		}
 
-		xenvif_tx_create_gop(vif, pending_idx, &txreq, gop);
+		xenvif_tx_create_map_op(vif, pending_idx, &txreq, gop);
 
 		gop++;
 
@@ -1399,7 +1401,7 @@ static int xenvif_handle_frag_list(struct xenvif *vif, struct sk_buff *skb)
 
 static int xenvif_tx_submit(struct xenvif *vif)
 {
-	struct gnttab_map_grant_ref *gop = vif->tx_map_ops;
+	struct gnttab_map_grant_ref *gop_map = vif->tx_map_ops;
 	struct sk_buff *skb;
 	int work_done = 0;
 
@@ -1412,7 +1414,7 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		txp = &vif->pending_tx_info[pending_idx].req;
 
 		/* Check the remap error code. */
-		if (unlikely(xenvif_tx_check_gop(vif, skb, &gop))) {
+		if (unlikely(xenvif_tx_check_gop(vif, skb, &gop_map))) {
 			netdev_dbg(vif->dev, "netback grant failed.\n");
 			skb_shinfo(skb)->nr_frags = 0;
 			kfree_skb(skb);
@@ -1611,21 +1613,21 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
 /* Called after netfront has transmitted */
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
-	unsigned nr_gops;
+	unsigned nr_mops;
 	int work_done, ret;
 
 	if (unlikely(!tx_work_todo(vif)))
 		return 0;
 
-	nr_gops = xenvif_tx_build_gops(vif, budget);
+	nr_mops = xenvif_tx_build_gops(vif, budget);
 
-	if (nr_gops == 0)
+	if (nr_mops == 0)
 		return 0;
 
 	ret = gnttab_map_refs(vif->tx_map_ops,
 			      NULL,
 			      vif->pages_to_map,
-			      nr_gops);
+			      nr_mops);
 	BUG_ON(ret);
 
 	work_done = xenvif_tx_submit(vif);
-- 
cgit v1.2.3-70-g09d2


From bdab82759b8e3620096d6db46dc1cac38a52d779 Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Wed, 2 Apr 2014 18:04:58 +0100
Subject: xen-netback: Grant copy the header instead of map and memcpy

An old inefficiency of the TX path that we are grant mapping the first slot,
and then copy the header part to the linear area. Instead, doing a grant copy
for that header straight on is more reasonable. Especially because there are
ongoing efforts to make Xen avoiding TLB flush after unmap when the page were
not touched in Dom0. In the original way the memcpy ruined that.
The key changes:
- the vif has a tx_copy_ops array again
- xenvif_tx_build_gops sets up the grant copy operations
- we don't have to figure out whether the header and first frag are on the same
  grant mapped page or not
Note, we only grant copy PKT_PROT_LEN bytes from the first slot, the rest (if
any) will be on the first frag, which is grant mapped. If the first slot is
smaller than PKT_PROT_LEN, then we grant copy that, and later __pskb_pull_tail
will pull more from the frags (if any)

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h  |   1 +
 drivers/net/xen-netback/netback.c | 122 +++++++++++++++++++++-----------------
 2 files changed, 70 insertions(+), 53 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 89d1d0556b6..630a3fcf65b 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -124,6 +124,7 @@ struct xenvif {
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
 	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
+	struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
 	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
 	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
 	/* passed to gnttab_[un]map_refs with pages under (un)mapping */
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 4bb78862afc..99c8f09b465 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -940,35 +940,37 @@ static inline void xenvif_grant_handle_reset(struct xenvif *vif,
 
 static int xenvif_tx_check_gop(struct xenvif *vif,
 			       struct sk_buff *skb,
-			       struct gnttab_map_grant_ref **gopp_map)
+			       struct gnttab_map_grant_ref **gopp_map,
+			       struct gnttab_copy **gopp_copy)
 {
 	struct gnttab_map_grant_ref *gop_map = *gopp_map;
 	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
-	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
-	int i, err, start;
+	int i, err;
 	struct sk_buff *first_skb = NULL;
 
 	/* Check status of header. */
-	err = gop_map->status;
-	if (unlikely(err))
+	err = (*gopp_copy)->status;
+	(*gopp_copy)++;
+	if (unlikely(err)) {
+		if (net_ratelimit())
+			netdev_dbg(vif->dev,
+				   "Grant copy of header failed! status: %d pending_idx% %u ref: %u\n",
+				   (*gopp_copy)->status,
+				   pending_idx,
+				   (*gopp_copy)->source.u.ref);
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
-	else
-		xenvif_grant_handle_set(vif, pending_idx , gop_map->handle);
-
-	/* Skip first skb fragment if it is on same page as header fragment. */
-	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+	}
 
 check_frags:
-	for (i = start; i < nr_frags; i++) {
+	for (i = 0; i < nr_frags; i++, gop_map++) {
 		int j, newerr;
 
 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
-		tx_info = &vif->pending_tx_info[pending_idx];
 
 		/* Check error status: if okay then remember grant handle. */
-		newerr = (++gop_map)->status;
+		newerr = gop_map->status;
 
 		if (likely(!newerr)) {
 			xenvif_grant_handle_set(vif,
@@ -981,18 +983,20 @@ check_frags:
 		}
 
 		/* Error on this fragment: respond to client with an error. */
+		if (net_ratelimit())
+			netdev_dbg(vif->dev,
+				   "Grant map of %d. frag failed! status: %d pending_idx% %u ref: %u\n",
+				   i,
+				   gop_map->status,
+				   pending_idx,
+				   gop_map->ref);
 		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
 
 		/* Not the first error? Preceding frags already invalidated. */
 		if (err)
 			continue;
-		/* First error: invalidate header and preceding fragments. */
-		if (!first_skb)
-			pending_idx = XENVIF_TX_CB(skb)->pending_idx;
-		else
-			pending_idx = XENVIF_TX_CB(skb)->pending_idx;
-		xenvif_idx_unmap(vif, pending_idx);
-		for (j = start; j < i; j++) {
+		/* First error: invalidate preceding fragments. */
+		for (j = 0; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
 			xenvif_idx_unmap(vif, pending_idx);
 		}
@@ -1006,7 +1010,6 @@ check_frags:
 		skb = shinfo->frag_list;
 		shinfo = skb_shinfo(skb);
 		nr_frags = shinfo->nr_frags;
-		start = 0;
 
 		goto check_frags;
 	}
@@ -1017,15 +1020,13 @@ check_frags:
 	if (first_skb && err) {
 		int j;
 		shinfo = skb_shinfo(first_skb);
-		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
-		start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
-		for (j = start; j < shinfo->nr_frags; j++) {
+		for (j = 0; j < shinfo->nr_frags; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
 			xenvif_idx_unmap(vif, pending_idx);
 		}
 	}
 
-	*gopp_map = gop_map + 1;
+	*gopp_map = gop_map;
 	return err;
 }
 
@@ -1036,9 +1037,6 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 	int i;
 	u16 prev_pending_idx = INVALID_PENDING_IDX;
 
-	if (skb_shinfo(skb)->destructor_arg)
-		prev_pending_idx = XENVIF_TX_CB(skb)->pending_idx;
-
 	for (i = 0; i < nr_frags; i++) {
 		skb_frag_t *frag = shinfo->frags + i;
 		struct xen_netif_tx_request *txp;
@@ -1048,10 +1046,10 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
 		pending_idx = frag_get_pending_idx(frag);
 
 		/* If this is not the first frag, chain it to the previous*/
-		if (unlikely(prev_pending_idx == INVALID_PENDING_IDX))
+		if (prev_pending_idx == INVALID_PENDING_IDX)
 			skb_shinfo(skb)->destructor_arg =
 				&callback_param(vif, pending_idx);
-		else if (likely(pending_idx != prev_pending_idx))
+		else
 			callback_param(vif, prev_pending_idx).ctx =
 				&callback_param(vif, pending_idx);
 
@@ -1191,7 +1189,10 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
 	return false;
 }
 
-static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
+static void xenvif_tx_build_gops(struct xenvif *vif,
+				     int budget,
+				     unsigned *copy_ops,
+				     unsigned *map_ops)
 {
 	struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
 	struct sk_buff *skb;
@@ -1294,22 +1295,36 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			}
 		}
 
-		xenvif_tx_create_map_op(vif, pending_idx, &txreq, gop);
-
-		gop++;
-
 		XENVIF_TX_CB(skb)->pending_idx = pending_idx;
 
 		__skb_put(skb, data_len);
+		vif->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref;
+		vif->tx_copy_ops[*copy_ops].source.domid = vif->domid;
+		vif->tx_copy_ops[*copy_ops].source.offset = txreq.offset;
+
+		vif->tx_copy_ops[*copy_ops].dest.u.gmfn =
+			virt_to_mfn(skb->data);
+		vif->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
+		vif->tx_copy_ops[*copy_ops].dest.offset =
+			offset_in_page(skb->data);
+
+		vif->tx_copy_ops[*copy_ops].len = data_len;
+		vif->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;
+
+		(*copy_ops)++;
 
 		skb_shinfo(skb)->nr_frags = ret;
 		if (data_len < txreq.size) {
 			skb_shinfo(skb)->nr_frags++;
 			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
 					     pending_idx);
+			xenvif_tx_create_map_op(vif, pending_idx, &txreq, gop);
+			gop++;
 		} else {
 			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
 					     INVALID_PENDING_IDX);
+			memcpy(&vif->pending_tx_info[pending_idx].req, &txreq,
+			       sizeof(txreq));
 		}
 
 		vif->pending_cons++;
@@ -1326,11 +1341,13 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 
 		vif->tx.req_cons = idx;
 
-		if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops))
+		if (((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops)) ||
+		    (*copy_ops >= ARRAY_SIZE(vif->tx_copy_ops)))
 			break;
 	}
 
-	return gop - vif->tx_map_ops;
+	(*map_ops) = gop - vif->tx_map_ops;
+	return;
 }
 
 /* Consolidate skb with a frag_list into a brand new one with local pages on
@@ -1402,6 +1419,7 @@ static int xenvif_handle_frag_list(struct xenvif *vif, struct sk_buff *skb)
 static int xenvif_tx_submit(struct xenvif *vif)
 {
 	struct gnttab_map_grant_ref *gop_map = vif->tx_map_ops;
+	struct gnttab_copy *gop_copy = vif->tx_copy_ops;
 	struct sk_buff *skb;
 	int work_done = 0;
 
@@ -1414,27 +1432,22 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		txp = &vif->pending_tx_info[pending_idx].req;
 
 		/* Check the remap error code. */
-		if (unlikely(xenvif_tx_check_gop(vif, skb, &gop_map))) {
-			netdev_dbg(vif->dev, "netback grant failed.\n");
+		if (unlikely(xenvif_tx_check_gop(vif, skb, &gop_map, &gop_copy))) {
 			skb_shinfo(skb)->nr_frags = 0;
 			kfree_skb(skb);
 			continue;
 		}
 
 		data_len = skb->len;
-		memcpy(skb->data,
-		       (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
-		       data_len);
 		callback_param(vif, pending_idx).ctx = NULL;
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
 			txp->offset += data_len;
 			txp->size -= data_len;
-			skb_shinfo(skb)->destructor_arg =
-				&callback_param(vif, pending_idx);
 		} else {
 			/* Schedule a response immediately. */
-			xenvif_idx_unmap(vif, pending_idx);
+			xenvif_idx_release(vif, pending_idx,
+					   XEN_NETIF_RSP_OKAY);
 		}
 
 		if (txp->flags & XEN_NETTXF_csum_blank)
@@ -1613,22 +1626,25 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
 /* Called after netfront has transmitted */
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
-	unsigned nr_mops;
+	unsigned nr_mops, nr_cops = 0;
 	int work_done, ret;
 
 	if (unlikely(!tx_work_todo(vif)))
 		return 0;
 
-	nr_mops = xenvif_tx_build_gops(vif, budget);
+	xenvif_tx_build_gops(vif, budget, &nr_cops, &nr_mops);
 
-	if (nr_mops == 0)
+	if (nr_cops == 0)
 		return 0;
 
-	ret = gnttab_map_refs(vif->tx_map_ops,
-			      NULL,
-			      vif->pages_to_map,
-			      nr_mops);
-	BUG_ON(ret);
+	gnttab_batch_copy(vif->tx_copy_ops, nr_cops);
+	if (nr_mops != 0) {
+		ret = gnttab_map_refs(vif->tx_map_ops,
+				      NULL,
+				      vif->pages_to_map,
+				      nr_mops);
+		BUG_ON(ret);
+	}
 
 	work_done = xenvif_tx_submit(vif);
 
-- 
cgit v1.2.3-70-g09d2


From 00aefceb2fffcf4ea2fbc97ef5d4f79ef2668ecc Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Fri, 4 Apr 2014 15:45:24 +0100
Subject: xen-netback: Trivial format string fix

There is a "%" after pending_idx instead of ":".

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 99c8f09b465..76665405c5a 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -956,7 +956,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	if (unlikely(err)) {
 		if (net_ratelimit())
 			netdev_dbg(vif->dev,
-				   "Grant copy of header failed! status: %d pending_idx% %u ref: %u\n",
+				   "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
 				   (*gopp_copy)->status,
 				   pending_idx,
 				   (*gopp_copy)->source.u.ref);
@@ -985,7 +985,7 @@ check_frags:
 		/* Error on this fragment: respond to client with an error. */
 		if (net_ratelimit())
 			netdev_dbg(vif->dev,
-				   "Grant map of %d. frag failed! status: %d pending_idx% %u ref: %u\n",
+				   "Grant map of %d. frag failed! status: %d pending_idx: %u ref: %u\n",
 				   i,
 				   gop_map->status,
 				   pending_idx,
-- 
cgit v1.2.3-70-g09d2


From 583757446ba6850eff96cef6565d729266da9c5b Mon Sep 17 00:00:00 2001
From: Zoltan Kiss <zoltan.kiss@citrix.com>
Date: Thu, 15 May 2014 11:08:34 +0100
Subject: xen-netback: Fix grant ref resolution in RX path

The original series for reintroducing grant mapping for netback had a patch [1]
to handle receiving of packets from an another VIF. Grant copy on the receiving
side needs the grant ref of the page to set up the op.
The original patch assumed (wrongly) that the frags array haven't changed. In
the case reported by Sander, the sending guest sent a packet where the linear
buffer and the first frag were under PKT_PROT_LEN (=128) bytes.
xenvif_tx_submit() then pulled up the linear area to 128 bytes, and ditched the
first frag. The receiving side had an off-by-one problem when gathered the grant
refs.
This patch fixes that by checking whether the actual frag's page pointer is the
same as the page in the original frag list. It can handle any kind of changes on
the original frags array, like:
- removing granted frags from the array at any point
- adding local pages to the frags list anywhere
- reordering the frags
It's optimized to the most common case, when there is 1:1 relation between the
frags and the list, plus works optimal when frags are removed from the end or
the beginning.

[1]: 3e2234: xen-netback: Handle foreign mapped pages on the guest RX path

Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/netback.c | 98 ++++++++++++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 18 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 76665405c5a..64ab1d141f1 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -104,7 +104,7 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif,
 
 /* Find the containing VIF's structure from a pointer in pending_tx_info array
  */
-static inline struct xenvif* ubuf_to_vif(struct ubuf_info *ubuf)
+static inline struct xenvif *ubuf_to_vif(const struct ubuf_info *ubuf)
 {
 	u16 pending_idx = ubuf->desc;
 	struct pending_tx_info *temp =
@@ -322,6 +322,35 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
 	}
 }
 
+/*
+ * Find the grant ref for a given frag in a chain of struct ubuf_info's
+ * skb: the skb itself
+ * i: the frag's number
+ * ubuf: a pointer to an element in the chain. It should not be NULL
+ *
+ * Returns a pointer to the element in the chain where the page were found. If
+ * not found, returns NULL.
+ * See the definition of callback_struct in common.h for more details about
+ * the chain.
+ */
+static const struct ubuf_info *xenvif_find_gref(const struct sk_buff *const skb,
+						const int i,
+						const struct ubuf_info *ubuf)
+{
+	struct xenvif *foreign_vif = ubuf_to_vif(ubuf);
+
+	do {
+		u16 pending_idx = ubuf->desc;
+
+		if (skb_shinfo(skb)->frags[i].page.p ==
+		    foreign_vif->mmap_pages[pending_idx])
+			break;
+		ubuf = (struct ubuf_info *) ubuf->ctx;
+	} while (ubuf);
+
+	return ubuf;
+}
+
 /*
  * Prepare an SKB to be transmitted to the frontend.
  *
@@ -346,9 +375,8 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	int head = 1;
 	int old_meta_prod;
 	int gso_type;
-	struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
-	grant_ref_t foreign_grefs[MAX_SKB_FRAGS];
-	struct xenvif *foreign_vif = NULL;
+	const struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
+	const struct ubuf_info *const head_ubuf = ubuf;
 
 	old_meta_prod = npo->meta_prod;
 
@@ -386,19 +414,6 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	npo->copy_off = 0;
 	npo->copy_gref = req->gref;
 
-	if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
-		 (ubuf->callback == &xenvif_zerocopy_callback)) {
-		int i = 0;
-		foreign_vif = ubuf_to_vif(ubuf);
-
-		do {
-			u16 pending_idx = ubuf->desc;
-			foreign_grefs[i++] =
-				foreign_vif->pending_tx_info[pending_idx].req.gref;
-			ubuf = (struct ubuf_info *) ubuf->ctx;
-		} while (ubuf);
-	}
-
 	data = skb->data;
 	while (data < skb_tail_pointer(skb)) {
 		unsigned int offset = offset_in_page(data);
@@ -415,13 +430,60 @@ static int xenvif_gop_skb(struct sk_buff *skb,
 	}
 
 	for (i = 0; i < nr_frags; i++) {
+		/* This variable also signals whether foreign_gref has a real
+		 * value or not.
+		 */
+		struct xenvif *foreign_vif = NULL;
+		grant_ref_t foreign_gref;
+
+		if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
+			(ubuf->callback == &xenvif_zerocopy_callback)) {
+			const struct ubuf_info *const startpoint = ubuf;
+
+			/* Ideally ubuf points to the chain element which
+			 * belongs to this frag. Or if frags were removed from
+			 * the beginning, then shortly before it.
+			 */
+			ubuf = xenvif_find_gref(skb, i, ubuf);
+
+			/* Try again from the beginning of the list, if we
+			 * haven't tried from there. This only makes sense in
+			 * the unlikely event of reordering the original frags.
+			 * For injected local pages it's an unnecessary second
+			 * run.
+			 */
+			if (unlikely(!ubuf) && startpoint != head_ubuf)
+				ubuf = xenvif_find_gref(skb, i, head_ubuf);
+
+			if (likely(ubuf)) {
+				u16 pending_idx = ubuf->desc;
+
+				foreign_vif = ubuf_to_vif(ubuf);
+				foreign_gref = foreign_vif->pending_tx_info[pending_idx].req.gref;
+				/* Just a safety measure. If this was the last
+				 * element on the list, the for loop will
+				 * iterate again if a local page were added to
+				 * the end. Using head_ubuf here prevents the
+				 * second search on the chain. Or the original
+				 * frags changed order, but that's less likely.
+				 * In any way, ubuf shouldn't be NULL.
+				 */
+				ubuf = ubuf->ctx ?
+					(struct ubuf_info *) ubuf->ctx :
+					head_ubuf;
+			} else
+				/* This frag was a local page, added to the
+				 * array after the skb left netback.
+				 */
+				ubuf = head_ubuf;
+		}
 		xenvif_gop_frag_copy(vif, skb, npo,
 				     skb_frag_page(&skb_shinfo(skb)->frags[i]),
 				     skb_frag_size(&skb_shinfo(skb)->frags[i]),
 				     skb_shinfo(skb)->frags[i].page_offset,
 				     &head,
 				     foreign_vif,
-				     foreign_grefs[i]);
+				     foreign_vif ? foreign_gref : UINT_MAX);
 	}
 
 	return npo->meta_prod - old_meta_prod;
-- 
cgit v1.2.3-70-g09d2


From 0d08fceb2e21c30ca3e1e462e678723f806acf18 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@citrix.com>
Date: Fri, 16 May 2014 12:26:04 +0100
Subject: xen-netback: fix race between napi_complete() and interrupt handler

When the NAPI budget was not all used, xenvif_poll() would call
napi_complete() /after/ enabling the interrupt.  This resulted in a
race between the napi_complete() and the napi_schedule() in the
interrupt handler.  The use of local_irq_save/restore() avoided by
race iff the handler is running on the same CPU but not if it was
running on a different CPU.

Fix this properly by calling napi_complete() before reenabling
interrupts (in the xenvif_napi_schedule_or_enable_irq() call).

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  2 +-
 drivers/net/xen-netback/interface.c | 30 +++---------------------------
 drivers/net/xen-netback/netback.c   |  4 ++--
 3 files changed, 6 insertions(+), 30 deletions(-)

(limited to 'drivers/net/xen-netback/netback.c')

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 630a3fcf65b..0d4a285cbd7 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -226,7 +226,7 @@ int xenvif_map_frontend_rings(struct xenvif *vif,
 			      grant_ref_t rx_ring_ref);
 
 /* Check for SKBs from frontend and schedule backend processing */
-void xenvif_check_rx_xenvif(struct xenvif *vif);
+void xenvif_napi_schedule_or_enable_events(struct xenvif *vif);
 
 /* Prevent the device from generating any further traffic. */
 void xenvif_carrier_off(struct xenvif *vif);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ef05c5c49d4..20e9defa106 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -75,32 +75,8 @@ static int xenvif_poll(struct napi_struct *napi, int budget)
 	work_done = xenvif_tx_action(vif, budget);
 
 	if (work_done < budget) {
-		int more_to_do = 0;
-		unsigned long flags;
-
-		/* It is necessary to disable IRQ before calling
-		 * RING_HAS_UNCONSUMED_REQUESTS. Otherwise we might
-		 * lose event from the frontend.
-		 *
-		 * Consider:
-		 *   RING_HAS_UNCONSUMED_REQUESTS
-		 *   <frontend generates event to trigger napi_schedule>
-		 *   __napi_complete
-		 *
-		 * This handler is still in scheduled state so the
-		 * event has no effect at all. After __napi_complete
-		 * this handler is descheduled and cannot get
-		 * scheduled again. We lose event in this case and the ring
-		 * will be completely stalled.
-		 */
-
-		local_irq_save(flags);
-
-		RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do);
-		if (!more_to_do)
-			__napi_complete(napi);
-
-		local_irq_restore(flags);
+		napi_complete(napi);
+		xenvif_napi_schedule_or_enable_events(vif);
 	}
 
 	return work_done;
@@ -194,7 +170,7 @@ static void xenvif_up(struct xenvif *vif)
 	enable_irq(vif->tx_irq);
 	if (vif->tx_irq != vif->rx_irq)
 		enable_irq(vif->rx_irq);
-	xenvif_check_rx_xenvif(vif);
+	xenvif_napi_schedule_or_enable_events(vif);
 }
 
 static void xenvif_down(struct xenvif *vif)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 64ab1d141f1..7367208ee8c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -716,7 +716,7 @@ done:
 		notify_remote_via_irq(vif->rx_irq);
 }
 
-void xenvif_check_rx_xenvif(struct xenvif *vif)
+void xenvif_napi_schedule_or_enable_events(struct xenvif *vif)
 {
 	int more_to_do;
 
@@ -750,7 +750,7 @@ static void tx_credit_callback(unsigned long data)
 {
 	struct xenvif *vif = (struct xenvif *)data;
 	tx_add_credit(vif);
-	xenvif_check_rx_xenvif(vif);
+	xenvif_napi_schedule_or_enable_events(vif);
 }
 
 static void xenvif_tx_err(struct xenvif *vif,
-- 
cgit v1.2.3-70-g09d2