From d4828d85d188dc70ed172802e798d3978bb6e29e Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 22 Jun 2006 02:28:18 -0700
Subject: [NET]: Prevent transmission after dev_deactivate

The dev_deactivate function has bit-rotted since the introduction of
lockless drivers.  In particular, the spin_unlock_wait call at the end
has no effect on the xmit routine of lockless drivers.

With a little bit of work, we can make it much more useful by providing
the guarantee that when it returns, no more calls to the xmit routine
of the underlying driver will be made.

The idea is simple.  There are two entry points in to the xmit routine.
The first comes from dev_queue_xmit.  That one is easily stopped by
using synchronize_rcu.  This works because we set the qdisc to noop_qdisc
before the synchronize_rcu call.  That in turn causes all subsequent
packets sent to dev_queue_xmit to be dropped.  The synchronize_rcu call
also ensures all outstanding calls leave their critical section.

The other entry point is from qdisc_run.  Since we now have a bit that
indicates whether it's running, all we have to do is to wait until the
bit is off.

I've removed the loop to wait for __LINK_STATE_SCHED to clear.  This is
useless because netif_wake_queue can cause it to be set again.  It is
also harmless because we've disarmed qdisc_run.

I've also removed the spin_unlock_wait on xmit_lock because its only
purpose of making sure that all outstanding xmit_lock holders have
exited is also given by dev_watchdog_down.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index ab39fe17cb5..29e3888102b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1295,7 +1295,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	/* Disable soft irqs for various locks below. Also 
 	 * stops preemption for RCU. 
 	 */
-	local_bh_disable(); 
+	rcu_read_lock_bh(); 
 
 	/* Updates of qdisc are serialized by queue_lock. 
 	 * The struct Qdisc which is pointed to by qdisc is now a 
@@ -1369,13 +1369,13 @@ int dev_queue_xmit(struct sk_buff *skb)
 	}
 
 	rc = -ENETDOWN;
-	local_bh_enable();
+	rcu_read_unlock_bh();
 
 out_kfree_skb:
 	kfree_skb(skb);
 	return rc;
 out:
-	local_bh_enable();
+	rcu_read_unlock_bh();
 	return rc;
 }
 
-- 
cgit v1.2.3-70-g09d2


From f6a78bfcb141f963187464bac838d46a81c3882a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 22 Jun 2006 02:57:17 -0700
Subject: [NET]: Add generic segmentation offload

This patch adds the infrastructure for generic segmentation offload.
The idea is to tap into the potential savings of TSO without hardware
support by postponing the allocation of segmented skb's until just
before the entry point into the NIC driver.

The same structure can be used to support software IPv6 TSO, as well as
UFO and segmentation offload for other relevant protocols, e.g., DCCP.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   8 ++-
 net/core/dev.c            | 127 ++++++++++++++++++++++++++++++++++++++++++++--
 net/sched/sch_generic.c   |  19 +++++--
 3 files changed, 143 insertions(+), 11 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa5671307b9..b4eae18390c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -405,6 +405,9 @@ struct net_device
 	struct list_head	qdisc_list;
 	unsigned long		tx_queue_len;	/* Max frames per queue allowed */
 
+	/* Partially transmitted GSO packet. */
+	struct sk_buff		*gso_skb;
+
 	/* ingress path synchronizer */
 	spinlock_t		ingress_lock;
 	struct Qdisc		*qdisc_ingress;
@@ -539,6 +542,7 @@ struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
+	struct sk_buff		*(*gso_segment)(struct sk_buff *skb, int sg);
 	void			*af_packet_priv;
 	struct list_head	list;
 };
@@ -689,7 +693,8 @@ extern int		dev_change_name(struct net_device *, char *);
 extern int		dev_set_mtu(struct net_device *, int);
 extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
-extern void		dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
+extern int		dev_hard_start_xmit(struct sk_buff *skb,
+					    struct net_device *dev);
 
 extern void		dev_init(void);
 
@@ -963,6 +968,7 @@ extern int		netdev_max_backlog;
 extern int		weight_p;
 extern int		netdev_set_master(struct net_device *dev, struct net_device *master);
 extern int skb_checksum_help(struct sk_buff *skb, int inward);
+extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg);
 #ifdef CONFIG_BUG
 extern void netdev_rx_csum_fault(struct net_device *dev);
 #else
diff --git a/net/core/dev.c b/net/core/dev.c
index 29e3888102b..d293e0f90a0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -116,6 +116,7 @@
 #include <asm/current.h>
 #include <linux/audit.h>
 #include <linux/dmaengine.h>
+#include <linux/err.h>
 
 /*
  *	The list of packet types we will receive (as opposed to discard)
@@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct sk_buff *skb)
  *	taps currently in use.
  */
 
-void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
 
@@ -1186,6 +1187,40 @@ out:
 	return ret;
 }
 
+/**
+ *	skb_gso_segment - Perform segmentation on skb.
+ *	@skb: buffer to segment
+ *	@sg: whether scatter-gather is supported on the target.
+ *
+ *	This function segments the given skb and returns a list of segments.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg)
+{
+	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+	struct packet_type *ptype;
+	int type = skb->protocol;
+
+	BUG_ON(skb_shinfo(skb)->frag_list);
+	BUG_ON(skb->ip_summed != CHECKSUM_HW);
+
+	skb->mac.raw = skb->data;
+	skb->mac_len = skb->nh.raw - skb->data;
+	__skb_pull(skb, skb->mac_len);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
+		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+			segs = ptype->gso_segment(skb, sg);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return segs;
+}
+
+EXPORT_SYMBOL(skb_gso_segment);
+
 /* Take action when hardware reception checksum errors are detected. */
 #ifdef CONFIG_BUG
 void netdev_rx_csum_fault(struct net_device *dev)
@@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #define illegal_highdma(dev, skb)	(0)
 #endif
 
+struct dev_gso_cb {
+	void (*destructor)(struct sk_buff *skb);
+};
+
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+	struct dev_gso_cb *cb;
+
+	do {
+		struct sk_buff *nskb = skb->next;
+
+		skb->next = nskb->next;
+		nskb->next = NULL;
+		kfree_skb(nskb);
+	} while (skb->next);
+
+	cb = DEV_GSO_CB(skb);
+	if (cb->destructor)
+		cb->destructor(skb);
+}
+
+/**
+ *	dev_gso_segment - Perform emulated hardware segmentation on skb.
+ *	@skb: buffer to segment
+ *
+ *	This function segments the given skb and stores the list of segments
+ *	in skb->next.
+ */
+static int dev_gso_segment(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct sk_buff *segs;
+
+	segs = skb_gso_segment(skb, dev->features & NETIF_F_SG &&
+				    !illegal_highdma(dev, skb));
+	if (unlikely(IS_ERR(segs)))
+		return PTR_ERR(segs);
+
+	skb->next = segs;
+	DEV_GSO_CB(skb)->destructor = skb->destructor;
+	skb->destructor = dev_gso_skb_destructor;
+
+	return 0;
+}
+
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	if (likely(!skb->next)) {
+		if (netdev_nit)
+			dev_queue_xmit_nit(skb, dev);
+
+		if (!netif_needs_gso(dev, skb))
+			return dev->hard_start_xmit(skb, dev);
+
+		if (unlikely(dev_gso_segment(skb)))
+			goto out_kfree_skb;
+	}
+
+	do {
+		struct sk_buff *nskb = skb->next;
+		int rc;
+
+		skb->next = nskb->next;
+		nskb->next = NULL;
+		rc = dev->hard_start_xmit(nskb, dev);
+		if (unlikely(rc)) {
+			skb->next = nskb;
+			return rc;
+		}
+	} while (skb->next);
+	
+	skb->destructor = DEV_GSO_CB(skb)->destructor;
+
+out_kfree_skb:
+	kfree_skb(skb);
+	return 0;
+}
+
 #define HARD_TX_LOCK(dev, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		netif_tx_lock(dev);			\
@@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb)
 	struct Qdisc *q;
 	int rc = -ENOMEM;
 
+	/* GSO will handle the following emulations directly. */
+	if (netif_needs_gso(dev, skb))
+		goto gso;
+
 	if (skb_shinfo(skb)->frag_list &&
 	    !(dev->features & NETIF_F_FRAGLIST) &&
 	    __skb_linearize(skb))
@@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	      	if (skb_checksum_help(skb, 0))
 	      		goto out_kfree_skb;
 
+gso:
 	spin_lock_prefetch(&dev->queue_lock);
 
 	/* Disable soft irqs for various locks below. Also 
@@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 			HARD_TX_LOCK(dev, cpu);
 
 			if (!netif_queue_stopped(dev)) {
-				if (netdev_nit)
-					dev_queue_xmit_nit(skb, dev);
-
 				rc = 0;
-				if (!dev->hard_start_xmit(skb, dev)) {
+				if (!dev_hard_start_xmit(skb, dev)) {
 					HARD_TX_UNLOCK(dev);
 					goto out;
 				}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7aad0121232..74d4a1dceee 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -96,8 +96,11 @@ static inline int qdisc_restart(struct net_device *dev)
 	struct sk_buff *skb;
 
 	/* Dequeue packet */
-	if ((skb = q->dequeue(q)) != NULL) {
+	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
 		unsigned nolock = (dev->features & NETIF_F_LLTX);
+
+		dev->gso_skb = NULL;
+
 		/*
 		 * When the driver has LLTX set it does its own locking
 		 * in start_xmit. No need to add additional overhead by
@@ -134,10 +137,8 @@ static inline int qdisc_restart(struct net_device *dev)
 
 			if (!netif_queue_stopped(dev)) {
 				int ret;
-				if (netdev_nit)
-					dev_queue_xmit_nit(skb, dev);
 
-				ret = dev->hard_start_xmit(skb, dev);
+				ret = dev_hard_start_xmit(skb, dev);
 				if (ret == NETDEV_TX_OK) { 
 					if (!nolock) {
 						netif_tx_unlock(dev);
@@ -171,7 +172,10 @@ static inline int qdisc_restart(struct net_device *dev)
 		 */
 
 requeue:
-		q->ops->requeue(skb, q);
+		if (skb->next)
+			dev->gso_skb = skb;
+		else
+			q->ops->requeue(skb, q);
 		netif_schedule(dev);
 		return 1;
 	}
@@ -593,6 +597,11 @@ void dev_deactivate(struct net_device *dev)
 	/* Wait for outstanding qdisc_run calls. */
 	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 		yield();
+
+	if (dev->gso_skb) {
+		kfree_skb(dev->gso_skb);
+		dev->gso_skb = NULL;
+	}
 }
 
 void dev_init_scheduler(struct net_device *dev)
-- 
cgit v1.2.3-70-g09d2


From f4b8ea7849544114e9d3d682df4d400180854677 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 22 Jun 2006 16:00:11 -0700
Subject: [NET]: fix net-core kernel-doc

Warning(/var/linsrc/linux-2617-g4//include/linux/skbuff.h:304): No description found for parameter 'dma_cookie'
Warning(/var/linsrc/linux-2617-g4//include/net/sock.h:1274): No description found for parameter 'copied_early'
Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'chan'
Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'event'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 ++
 include/net/sock.h     | 1 +
 net/core/dev.c         | 4 ++--
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a45bba9b8cb..16eef03ce0e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -215,6 +215,8 @@ enum {
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
+ *	@dma_cookie: a cookie to one of several possible DMA operations
+ *		done by skb DMA functions
  *	@secmark: security marking
  */
 
diff --git a/include/net/sock.h b/include/net/sock.h
index a897f05de3b..2d8d6adf161 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1269,6 +1269,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
  * sk_eat_skb - Release a skb if it is no longer needed
  * @sk: socket to eat this skb from
  * @skb: socket buffer to eat
+ * @copied_early: flag indicating whether DMA operations copied this data early
  *
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
diff --git a/net/core/dev.c b/net/core/dev.c
index d293e0f90a0..9b8f0f22c81 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3418,8 +3418,8 @@ static void net_dma_rebalance(void)
 /**
  * netdev_dma_event - event callback for the net_dma_client
  * @client: should always be net_dma_client
- * @chan:
- * @event:
+ * @chan: DMA channel for the event
+ * @event: event type
  */
 static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
 	enum dma_event event)
-- 
cgit v1.2.3-70-g09d2


From 626ab0e69d376fa07599af669af8ba92d58e87c1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 23 Jun 2006 02:05:55 -0700
Subject: [PATCH] list: use list_replace_init() instead of list_splice_init()

list_splice_init(list, head) does unneeded job if it is known that
list_empty(head) == 1.  We can use list_replace_init() instead.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pageattr.c | 8 ++++----
 block/ll_rw_blk.c       | 5 ++---
 fs/aio.c                | 4 ++--
 kernel/timer.c          | 8 ++++----
 kernel/workqueue.c      | 4 ++--
 net/core/dev.c          | 6 +++---
 net/core/link_watch.c   | 5 ++---
 7 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 92c3d9f0e73..0887b34bc59 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -209,19 +209,19 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot)
 }
 
 void global_flush_tlb(void)
-{ 
-	LIST_HEAD(l);
+{
+	struct list_head l;
 	struct page *pg, *next;
 
 	BUG_ON(irqs_disabled());
 
 	spin_lock_irq(&cpa_lock);
-	list_splice_init(&df_list, &l);
+	list_replace_init(&df_list, &l);
 	spin_unlock_irq(&cpa_lock);
 	flush_map();
 	list_for_each_entry_safe(pg, next, &l, lru)
 		__free_page(pg);
-} 
+}
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 7eb36c53f4b..465b54312c5 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3359,12 +3359,11 @@ EXPORT_SYMBOL(end_that_request_chunk);
  */
 static void blk_done_softirq(struct softirq_action *h)
 {
-	struct list_head *cpu_list;
-	LIST_HEAD(local_list);
+	struct list_head *cpu_list, local_list;
 
 	local_irq_disable();
 	cpu_list = &__get_cpu_var(blk_cpu_done);
-	list_splice_init(cpu_list, &local_list);
+	list_replace_init(cpu_list, &local_list);
 	local_irq_enable();
 
 	while (!list_empty(&local_list)) {
diff --git a/fs/aio.c b/fs/aio.c
index e41e932ba48..8c34a62df7d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -777,11 +777,11 @@ out:
 static int __aio_run_iocbs(struct kioctx *ctx)
 {
 	struct kiocb *iocb;
-	LIST_HEAD(run_list);
+	struct list_head run_list;
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	list_splice_init(&ctx->run_list, &run_list);
+	list_replace_init(&ctx->run_list, &run_list);
 	while (!list_empty(&run_list)) {
 		iocb = list_entry(run_list.next, struct kiocb,
 			ki_run_list);
diff --git a/kernel/timer.c b/kernel/timer.c
index 9e49deed468..3bf0e9ed2db 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -419,10 +419,10 @@ static inline void __run_timers(tvec_base_t *base)
 
 	spin_lock_irq(&base->lock);
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
-		struct list_head work_list = LIST_HEAD_INIT(work_list);
+		struct list_head work_list;
 		struct list_head *head = &work_list;
  		int index = base->timer_jiffies & TVR_MASK;
- 
+
 		/*
 		 * Cascade timers:
 		 */
@@ -431,8 +431,8 @@ static inline void __run_timers(tvec_base_t *base)
 				(!cascade(base, &base->tv3, INDEX(1))) &&
 					!cascade(base, &base->tv4, INDEX(2)))
 			cascade(base, &base->tv5, INDEX(3));
-		++base->timer_jiffies; 
-		list_splice_init(base->tv1.vec + index, &work_list);
+		++base->timer_jiffies;
+		list_replace_init(base->tv1.vec + index, &work_list);
 		while (!list_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 880fb415a8f..740c5abceb0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -531,11 +531,11 @@ int current_is_keventd(void)
 static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 {
 	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-	LIST_HEAD(list);
+	struct list_head list;
 	struct work_struct *work;
 
 	spin_lock_irq(&cwq->lock);
-	list_splice_init(&cwq->worklist, &list);
+	list_replace_init(&cwq->worklist, &list);
 
 	while (!list_empty(&list)) {
 		printk("Taking work for %s\n", wq->name);
diff --git a/net/core/dev.c b/net/core/dev.c
index ab39fe17cb5..195a5e96b2d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2980,7 +2980,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 static DEFINE_MUTEX(net_todo_run_mutex);
 void netdev_run_todo(void)
 {
-	struct list_head list = LIST_HEAD_INIT(list);
+	struct list_head list;
 
 	/* Need to guard against multiple cpu's getting out of order. */
 	mutex_lock(&net_todo_run_mutex);
@@ -2995,9 +2995,9 @@ void netdev_run_todo(void)
 
 	/* Snapshot list, allow later requests */
 	spin_lock(&net_todo_list_lock);
-	list_splice_init(&net_todo_list, &list);
+	list_replace_init(&net_todo_list, &list);
 	spin_unlock(&net_todo_list_lock);
-		
+
 	while (!list_empty(&list)) {
 		struct net_device *dev
 			= list_entry(list.next, struct net_device, todo_list);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 646937cc2d8..0f37266411b 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -91,11 +91,10 @@ static void rfc2863_policy(struct net_device *dev)
 /* Must be called with the rtnl semaphore held */
 void linkwatch_run_queue(void)
 {
-	LIST_HEAD(head);
-	struct list_head *n, *next;
+	struct list_head head, *n, *next;
 
 	spin_lock_irq(&lweventlist_lock);
-	list_splice_init(&lweventlist, &head);
+	list_replace_init(&lweventlist, &head);
 	spin_unlock_irq(&lweventlist_lock);
 
 	list_for_each_safe(n, next, &head) {
-- 
cgit v1.2.3-70-g09d2