From db21733488f84a596faaad0d05430b3f51804692 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Sat, 17 Jun 2006 21:24:58 -0700
Subject: [I/OAT]: Setup the networking subsystem as a DMA client

Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netdma.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 include/net/netdma.h

(limited to 'include/net')

diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 00000000000..cbfe89d7e5d
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef NETDMA_H
+#define NETDMA_H
+#include <linux/config.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+	struct dma_chan *chan;
+	rcu_read_lock();
+	chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+	if (chan)
+		dma_chan_get(chan);
+	rcu_read_unlock();
+	return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
-- 
cgit v1.2.3-70-g09d2


From de5506e155276d385712c2aa1c2d9a27cd4ed947 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:50:37 -0700
Subject: [I/OAT]: Utility functions for offloading sk_buff to iovec copies

Provides for pinning user space pages in memory, copying to iovecs,
and copying from sk_buffs including fragmented and chained sk_buffs.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/Makefile      |   3 +-
 drivers/dma/iovlock.c     | 301 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |  22 ++++
 include/net/netdma.h      |   6 +
 net/core/Makefile         |   1 +
 net/core/user_dma.c       | 127 +++++++++++++++++++
 6 files changed, 459 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dma/iovlock.c
 create mode 100644 net/core/user_dma.c

(limited to 'include/net')

diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index c8a5f567731..bdcfdbdb1ae 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,2 +1,3 @@
-obj-y += dmaengine.o
+obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644
index 00000000000..5ed327e453a
--- /dev/null
+++ b/drivers/dma/iovlock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+int num_pages_spanned(struct iovec *iov)
+{
+	return
+	((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
+	((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len)
+{
+	struct dma_pinned_list *local_list;
+	struct page **pages;
+	int i;
+	int ret;
+	int nr_iovecs = 0;
+	int iovec_len_used = 0;
+	int iovec_pages_used = 0;
+	long err;
+
+	/* don't pin down non-user-based iovecs */
+	if (segment_eq(get_fs(), KERNEL_DS))
+		return NULL;
+
+	/* determine how many iovecs/pages there are, up front */
+	do {
+		iovec_len_used += iov[nr_iovecs].iov_len;
+		iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]);
+		nr_iovecs++;
+	} while (iovec_len_used < len);
+
+	/* single kmalloc for pinned list, page_list[], and the page arrays */
+	local_list = kmalloc(sizeof(*local_list)
+		+ (nr_iovecs * sizeof (struct dma_page_list))
+		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+	if (!local_list) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* list of pages starts right after the page list array */
+	pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+	for (i = 0; i < nr_iovecs; i++) {
+		struct dma_page_list *page_list = &local_list->page_list[i];
+
+		len -= iov[i].iov_len;
+
+		if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+			err = -EFAULT;
+			goto unpin;
+		}
+
+		page_list->nr_pages = num_pages_spanned(&iov[i]);
+		page_list->base_address = iov[i].iov_base;
+
+		page_list->pages = pages;
+		pages += page_list->nr_pages;
+
+		/* pin pages down */
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(
+			current,
+			current->mm,
+			(unsigned long) iov[i].iov_base,
+			page_list->nr_pages,
+			1,	/* write */
+			0,	/* force */
+			page_list->pages,
+			NULL);
+		up_read(&current->mm->mmap_sem);
+
+		if (ret != page_list->nr_pages) {
+			err = -ENOMEM;
+			goto unpin;
+		}
+
+		local_list->nr_iovecs = i + 1;
+	}
+
+	return local_list;
+
+unpin:
+	dma_unpin_iovec_pages(local_list);
+out:
+	return ERR_PTR(err);
+}
+
+void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list)
+{
+	int i, j;
+
+	if (!pinned_list)
+		return;
+
+	for (i = 0; i < pinned_list->nr_iovecs; i++) {
+		struct dma_page_list *page_list = &pinned_list->page_list[i];
+		for (j = 0; j < page_list->nr_pages; j++) {
+			set_page_dirty_lock(page_list->pages[j]);
+			page_cache_release(page_list->pages[j]);
+		}
+	}
+
+	kfree(pinned_list);
+}
+
+static dma_cookie_t dma_memcpy_to_kernel_iovec(struct dma_chan *chan, struct
+	iovec *iov, unsigned char *kdata, size_t len)
+{
+	dma_cookie_t dma_cookie = 0;
+
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			dma_cookie = dma_async_memcpy_buf_to_buf(
+					chan,
+					iov->iov_base,
+					kdata,
+					copy);
+			kdata += copy;
+			len -= copy;
+			iov->iov_len -= copy;
+			iov->iov_base += copy;
+		}
+		iov++;
+	}
+
+	return dma_cookie;
+}
+
+/*
+ * We have already pinned down the pages we will be using in the iovecs.
+ * Each entry in iov array has corresponding entry in pinned_list->page_list.
+ * Using array indexing to keep iov[] and page_list[] in sync.
+ * Initial elements in iov array's iov->iov_len will be 0 if already copied into
+ *   by another call.
+ * iov array length remaining guaranteed to be bigger than len.
+ */
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+
+	if (!chan)
+		return memcpy_toiovec(iov, kdata, len);
+
+	/* -> kernel copies (e.g. smbfs) */
+	if (!pinned_list)
+		return dma_memcpy_to_kernel_iovec(chan, iov, kdata, len);
+
+	iovec_idx = 0;
+	while (iovec_idx < pinned_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					kdata,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			kdata += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, struct page *page,
+	unsigned int offset, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	int err;
+
+	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
+	/* TODO: use dma for this */
+	if (!chan || !pinned_list) {
+		u8 *vaddr = kmap(page);
+		err = memcpy_toiovec(iov, vaddr + offset, len);
+		kunmap(page);
+		return err;
+	}
+
+	iovec_idx = 0;
+	while (iovec_idx < pinned_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_pg_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					page,
+					offset,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			offset += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 30781546ac9..78b236ca04f 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -333,5 +333,27 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
 int dma_async_device_register(struct dma_device *device);
 void dma_async_device_unregister(struct dma_device *device);
 
+/* --- Helper iov-locking functions --- */
+
+struct dma_page_list {
+	char *base_address;
+	int nr_pages;
+	struct page **pages;
+};
+
+struct dma_pinned_list {
+	int nr_iovecs;
+	struct dma_page_list page_list[0];
+};
+
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len);
+void dma_unpin_iovec_pages(struct dma_pinned_list* pinned_list);
+
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len);
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, struct page *page,
+	unsigned int offset, size_t len);
+
 #endif /* CONFIG_DMA_ENGINE */
 #endif /* DMAENGINE_H */
diff --git a/include/net/netdma.h b/include/net/netdma.h
index cbfe89d7e5d..19760eb131a 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -23,6 +23,7 @@
 #include <linux/config.h>
 #ifdef CONFIG_NET_DMA
 #include <linux/dmaengine.h>
+#include <linux/skbuff.h>
 
 static inline struct dma_chan *get_softnet_dma(void)
 {
@@ -34,5 +35,10 @@ static inline struct dma_chan *get_softnet_dma(void)
 	rcu_read_unlock();
 	return chan;
 }
+
+int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
+		const struct sk_buff *skb, int offset, struct iovec *to,
+		size_t len, struct dma_pinned_list *pinned_list);
+
 #endif /* CONFIG_NET_DMA */
 #endif /* NETDMA_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 79fe12cced2..e9bd2467d5a 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_WIRELESS_EXT) += wireless.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 00000000000..9eee91bcbf3
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h> /* for BUG_TRAP */
+#include <net/tcp.h>
+
+/**
+ *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@skb - buffer to copy
+ *	@offset - offset in the buffer to start copying from
+ *	@iovec - io vector to copy to
+ *	@len - amount of data to copy from buffer to iovec
+ *	@pinned_list - locked iovec buffer data
+ *
+ *	Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_pinned_list *pinned_list)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	dma_cookie_t cookie = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+		                            skb->data + offset, copy);
+		if (cookie < 0)
+			goto fault;
+		len -= copy;
+		if (len == 0)
+			goto end;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		copy = end - offset;
+		if ((copy = end - offset) > 0) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+
+			cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+					frag->page_offset + offset - start, copy);
+			if (cookie < 0)
+				goto fault;
+			len -= copy;
+			if (len == 0)
+				goto end;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list; list = list->next) {
+			int end;
+
+			BUG_TRAP(start <= offset + len);
+
+			end = start + list->len;
+			copy = end - offset;
+			if (copy > 0) {
+				if (copy > len)
+					copy = len;
+				cookie = dma_skb_copy_datagram_iovec(chan, list,
+				                offset - start, to, copy,
+				                pinned_list);
+				if (cookie < 0)
+					goto fault;
+				len -= copy;
+				if (len == 0)
+					goto end;
+				offset += copy;
+			}
+			start = end;
+		}
+	}
+
+end:
+	if (!len) {
+		skb->dma_cookie = cookie;
+		return cookie;
+	}
+
+fault:
+ 	return -EFAULT;
+}
-- 
cgit v1.2.3-70-g09d2


From 97fc2f0848c928c63c2ae619deee61a0b1107b69 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:55:33 -0700
Subject: [I/OAT]: Structure changes for TCP recv offload to I/OAT

Adds an async_wait_queue and some additional fields to tcp_sock, and a
dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++++
 include/linux/tcp.h    | 8 ++++++++
 include/net/sock.h     | 2 ++
 include/net/tcp.h      | 7 +++++++
 net/core/sock.c        | 6 ++++++
 5 files changed, 27 insertions(+)

(limited to 'include/net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f234708b9..23bad3bf3c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
 #include <linux/net.h>
 #include <linux/textsearch.h>
 #include <net/checksum.h>
+#include <linux/dmaengine.h>
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
@@ -285,6 +286,9 @@ struct sk_buff {
 	__u16			tc_verd;	/* traffic control verdict */
 #endif
 #endif
+#ifdef CONFIG_NET_DMA
+	dma_cookie_t		dma_cookie;
+#endif
 
 
 	/* These elements must be at the end, see alloc_skb() for details.  */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d39596bd..c90daa5da6c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,7 @@
 #define _LINUX_TCP_H
 
 #include <linux/types.h>
+#include <linux/dmaengine.h>
 #include <asm/byteorder.h>
 
 struct tcphdr {
@@ -233,6 +234,13 @@ struct tcp_sock {
 		struct iovec		*iov;
 		int			memory;
 		int			len;
+#ifdef CONFIG_NET_DMA
+		/* members for async copy */
+		struct dma_chan		*dma_chan;
+		int			wakeup;
+		struct dma_pinned_list	*pinned_list;
+		dma_cookie_t		dma_cookie;
+#endif
 	} ucopy;
 
 	__u32	snd_wl1;	/* Sequence for window update		*/
diff --git a/include/net/sock.h b/include/net/sock.h
index c9fad6fb629..90c65cb091a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
   *	@sk_write_queue: Packet sending queue
+  *	@sk_async_wait_queue: DMA copied packets
   *	@sk_omem_alloc: "o" is "option" or "other"
   *	@sk_wmem_queued: persistent queue size
   *	@sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
 	atomic_t		sk_omem_alloc;
 	struct sk_buff_head	sk_receive_queue;
 	struct sk_buff_head	sk_write_queue;
+	struct sk_buff_head	sk_async_wait_queue;
 	int			sk_wmem_queued;
 	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c989db8a7a..d0c2c2fa758 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,7 @@
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/skbuff.h>
+#include <linux/dmaengine.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp)
 	tp->ucopy.len = 0;
 	tp->ucopy.memory = 0;
 	skb_queue_head_init(&tp->ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	tp->ucopy.wakeup = 0;
+	tp->ucopy.pinned_list = NULL;
+	tp->ucopy.dma_cookie = 0;
+#endif
 }
 
 /* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb9ea2..5d820c37665 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		atomic_set(&newsk->sk_omem_alloc, 0);
 		skb_queue_head_init(&newsk->sk_receive_queue);
 		skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+		skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
 
 		rwlock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	skb_queue_head_init(&sk->sk_receive_queue);
 	skb_queue_head_init(&sk->sk_write_queue);
 	skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+	skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
 
 	sk->sk_send_head	=	NULL;
 
-- 
cgit v1.2.3-70-g09d2


From 0e4b4992b8007c6b62ec143cbbb292f98813ca11 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 18:00:16 -0700
Subject: [I/OAT]: Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static

Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h |  2 ++
 net/ipv4/tcp.c    | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0c2c2fa758..578cccf275d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -294,6 +294,8 @@ extern int			tcp_rcv_established(struct sock *sk,
 
 extern void			tcp_rcv_space_adjust(struct sock *sk);
 
+extern void			tcp_cleanup_rbuf(struct sock *sk, int copied);
+
 extern int			tcp_twsk_unique(struct sock *sk,
 						struct sock *sktw, void *twp);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b805503..1c0cfd7a8bb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time_to_ack = 0;
@@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 	/* Clean up data we have read: This will do ACK frames. */
 	if (copied)
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 	return copied;
 }
 
@@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			}
 		}
 
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 
 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 			/* Install new reader */
@@ -1391,7 +1391,7 @@ skip_copy:
 	 */
 
 	/* Clean up data we have read: This will do ACK frames. */
-	cleanup_rbuf(sk, copied);
+	tcp_cleanup_rbuf(sk, copied);
 
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
@@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
 			    inet_csk_ack_scheduled(sk)) {
 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-				cleanup_rbuf(sk, 1);
+				tcp_cleanup_rbuf(sk, 1);
 				if (!(val & 1))
 					icsk->icsk_ack.pingpong = 1;
 			}
-- 
cgit v1.2.3-70-g09d2


From 624d1164730d58a494cc5aa4afa37d02c41e83a7 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 18:01:28 -0700
Subject: [I/OAT]: Make sk_eat_skb I/OAT aware.

Add an extra argument to sk_eat_skb, and make it move early copied
packets to the async_wait_queue instead of freeing them.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 13 ++++++++++++-
 net/dccp/proto.c   |  4 ++--
 net/ipv4/tcp.c     |  8 ++++----
 net/llc/af_llc.c   |  2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 90c65cb091a..75b0e97ed93 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
 */
-static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
+#ifdef CONFIG_NET_DMA
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
+{
+	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (!copied_early)
+		__kfree_skb(skb);
+	else
+		__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+}
+#else
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
 	__kfree_skb(skb);
 }
+#endif
 
 extern void sock_enable_timestamp(struct sock *sk);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2e0ee8355c4..5317fd3e669 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		}
 		dccp_pr_debug("packet_type=%s\n",
 			      dccp_packet_name(dh->dccph_type));
-		sk_eat_skb(sk, skb);
+		sk_eat_skb(sk, skb, 0);
 verify_sock_status:
 		if (sock_flag(sk, SOCK_DONE)) {
 			len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
 		}
 	found_fin_ok:
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 		break;
 	} while (1);
 out:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1c0cfd7a8bb..4e067d25a63 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				break;
 		}
 		if (skb->h.th->fin) {
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb);
+		sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
 	}
@@ -1356,14 +1356,14 @@ skip_copy:
 		if (skb->h.th->fin)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 		continue;
 
 	found_fin_ok:
 		/* Process the FIN. */
 		++*seq;
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 		break;
 	} while (len > 0);
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db745c8..7465170a36c 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 			continue;
 
 		if (!(flags & MSG_PEEK)) {
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 			*seq = 0;
 		}
 	} while (len > 0);
-- 
cgit v1.2.3-70-g09d2


From 9593782585e0cf70babe787a8463d492a68b1744 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 18:02:55 -0700
Subject: [I/OAT]: Add a sysctl for tuning the I/OAT offloaded I/O threshold

Any socket recv of less than this ammount will not be offloaded

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     |  1 +
 include/net/tcp.h          |  1 +
 net/core/user_dma.c        |  4 ++++
 net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
 4 files changed, 16 insertions(+)

(limited to 'include/net')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 76eaeff76f8..cd9e7c0825a 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -403,6 +403,7 @@ enum
  	NET_TCP_MTU_PROBING=113,
 	NET_TCP_BASE_MSS=114,
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
+	NET_TCP_DMA_COPYBREAK=116,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 578cccf275d..f1f472746e6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 9eee91bcbf3..b7c98dbcdb8 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -30,6 +30,10 @@
 #include <linux/rtnetlink.h> /* for BUG_TRAP */
 #include <net/tcp.h>
 
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
 /**
  *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3adfcf0..6a6aa537b7a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -688,6 +688,16 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_NET_DMA
+	{
+		.ctl_name	= NET_TCP_DMA_COPYBREAK,
+		.procname	= "tcp_dma_copybreak",
+		.data		= &sysctl_tcp_dma_copybreak,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3-70-g09d2


From aecbd4e45c2e469e0452ffb2c0b0d881e2815bb8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 25 May 2006 15:08:30 -0700
Subject: [LLC]: use more efficient ether address routines

Use more cache efficient Ethernet address manipulation functions
in etherdevice.h.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
 include/net/llc_if.h | 13 ++++++-------
 net/llc/llc_if.c     |  2 --
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/llc_if.h b/include/net/llc_if.h
index 090eaa0d71f..a05d04ac451 100644
--- a/include/net/llc_if.h
+++ b/include/net/llc_if.h
@@ -16,6 +16,7 @@
 #include <linux/if.h>
 #include <linux/if_arp.h>
 #include <linux/llc.h>
+#include <linux/etherdevice.h>
 #include <net/llc.h>
 
 #define LLC_DATAUNIT_PRIM	1
@@ -61,8 +62,6 @@
 #define LLC_STATUS_CONFLICT	7 /* disconnect conn */
 #define LLC_STATUS_RESET_DONE	8 /*  */
 
-extern u8 llc_mac_null_var[IFHWADDRLEN];
-
 /**
  *      llc_mac_null - determines if a address is a null mac address
  *      @mac: Mac address to test if null.
@@ -70,12 +69,12 @@ extern u8 llc_mac_null_var[IFHWADDRLEN];
  *      Determines if a given address is a null mac address.  Returns 0 if the
  *      address is not a null mac, 1 if the address is a null mac.
  */
-static __inline__ int llc_mac_null(u8 *mac)
+static inline int llc_mac_null(const u8 *mac)
 {
-	return !memcmp(mac, llc_mac_null_var, IFHWADDRLEN);
+	return is_zero_ether_addr(mac);
 }
 
-static __inline__ int llc_addrany(struct llc_addr *addr)
+static inline int llc_addrany(const struct llc_addr *addr)
 {
 	return llc_mac_null(addr->mac) && !addr->lsap;
 }
@@ -89,9 +88,9 @@ static __inline__ int llc_addrany(struct llc_addr *addr)
  *	is not a complete match up to len, 1 if a complete match up to len is
  *	found.
  */
-static __inline__ int llc_mac_match(u8 *mac1, u8 *mac2)
+static inline int llc_mac_match(const u8 *mac1, const u8 *mac2)
 {
-	return !memcmp(mac1, mac2, IFHWADDRLEN);
+	return !compare_ether_addr(mac1, mac2);
 }
 
 extern int llc_establish_connection(struct sock *sk, u8 *lmac,
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index ba90f7f0801..5ae47be7dde 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -26,8 +26,6 @@
 #include <net/llc_c_st.h>
 #include <net/tcp_states.h>
 
-u8 llc_mac_null_var[IFHWADDRLEN];
-
 /**
  *	llc_build_and_send_pkt - Connection data sending for upper layers.
  *	@sk: connection
-- 
cgit v1.2.3-70-g09d2


From bc0e646796928918e45b6465e02616f2fe65c3c1 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 25 May 2006 15:10:37 -0700
Subject: [LLC]: add multicast support for datagrams

Allow mulitcast reception of datagrams (similar to UDP).
All sockets bound to the same SAP receive a clone.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/llc_if.h |  4 ++++
 net/llc/llc_sap.c    | 59 +++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/llc_if.h b/include/net/llc_if.h
index a05d04ac451..c608812a8e8 100644
--- a/include/net/llc_if.h
+++ b/include/net/llc_if.h
@@ -79,6 +79,10 @@ static inline int llc_addrany(const struct llc_addr *addr)
 	return llc_mac_null(addr->mac) && !addr->lsap;
 }
 
+static inline int llc_mac_multicast(const u8 *mac)
+{
+	return is_multicast_ether_addr(mac);
+}
 /**
  *	llc_mac_match - determines if two mac addresses are the same
  *	@mac1: First mac address to compare.
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 4029ceee9b9..20c4eb5c1ac 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb)
  *	mac, and local sap. Returns pointer for socket found, %NULL otherwise.
  */
 static struct sock *llc_lookup_dgram(struct llc_sap *sap,
-				     struct llc_addr *laddr)
+				     const struct llc_addr *laddr)
 {
 	struct sock *rc;
 	struct hlist_node *node;
@@ -304,19 +304,62 @@ found:
 	return rc;
 }
 
+/**
+ * 	llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets.
+ *	@sap: SAP
+ *	@laddr: address of local LLC (MAC + SAP)
+ *
+ *	Search socket list of the SAP and finds connections with same sap.
+ *	Deliver clone to each.
+ */
+static void llc_sap_mcast(struct llc_sap *sap,
+			  const struct llc_addr *laddr,
+			  struct sk_buff *skb)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	read_lock_bh(&sap->sk_list.lock);
+	sk_for_each(sk, node, &sap->sk_list.list) {
+		struct llc_sock *llc = llc_sk(sk);
+		struct sk_buff *skb1;
+
+		if (sk->sk_type != SOCK_DGRAM)
+			continue;
+
+		if (llc->laddr.lsap != laddr->lsap)
+			continue;
+
+		skb1 = skb_clone(skb, GFP_ATOMIC);
+		if (!skb1)
+			break;
+
+		sock_hold(sk);
+		skb_set_owner_r(skb1, sk);
+		llc_sap_rcv(sap, skb1);
+		sock_put(sk);
+	}
+	read_unlock_bh(&sap->sk_list.lock);
+}
+
+
 void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
 {
 	struct llc_addr laddr;
-	struct sock *sk;
 
 	llc_pdu_decode_da(skb, laddr.mac);
 	llc_pdu_decode_dsap(skb, &laddr.lsap);
 
-	sk = llc_lookup_dgram(sap, &laddr);
-	if (sk) {
-		skb_set_owner_r(skb, sk);
-		llc_sap_rcv(sap, skb);
-		sock_put(sk);
-	} else
+	if (llc_mac_multicast(laddr.mac)) {
+		llc_sap_mcast(sap, &laddr, skb);
 		kfree_skb(skb);
+	} else {
+		struct sock *sk = llc_lookup_dgram(sap, &laddr);
+		if (sk) {
+			skb_set_owner_r(skb, sk);
+			llc_sap_rcv(sap, skb);
+			sock_put(sk);
+		} else
+			kfree_skb(skb);
+	}
 }
-- 
cgit v1.2.3-70-g09d2


From 546be2405be119ef55467aace45f337a16e5d424 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 27 May 2006 23:03:58 -0700
Subject: [IPSEC] xfrm: Undo afinfo lock proliferation

The number of locks used to manage afinfo structures can easily be reduced
down to one each for policy and state respectively.  This is based on the
observation that the write locks are only held by module insertion/removal
which are very rare events so there is no need to further differentiate
between the insertion of modules like ipv6 versus esp6.

The removal of the read locks in xfrm4_policy.c/xfrm6_policy.c might look
suspicious at first.  However, after you realise that nobody ever takes
the corresponding write lock you'll feel better :)

As far as I can gather it's an attempt to guard against the removal of
the corresponding modules.  Since neither module can be unloaded at all
we can leave it to whoever fixes up IPv6 unloading :)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h      |  9 +-------
 net/ipv4/xfrm4_policy.c |  6 -----
 net/ipv4/xfrm4_state.c  |  1 -
 net/ipv6/xfrm6_policy.c |  6 -----
 net/ipv6/xfrm6_state.c  |  1 -
 net/xfrm/xfrm_policy.c  | 58 +++++++++++++++++++++++++++++--------------------
 net/xfrm/xfrm_state.c   |  9 +++-----
 7 files changed, 38 insertions(+), 52 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index afa508d92c9..ed7c9747059 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -204,8 +204,7 @@ struct xfrm_type;
 struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
-	rwlock_t		lock;
-	struct xfrm_type_map	*type_map;
+	struct xfrm_type	*type_map[256];
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
 	int			(*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl);
@@ -232,7 +231,6 @@ extern int __xfrm_state_delete(struct xfrm_state *x);
 
 struct xfrm_state_afinfo {
 	unsigned short		family;
-	rwlock_t		lock;
 	struct list_head	*state_bydst;
 	struct list_head	*state_byspi;
 	int			(*init_flags)(struct xfrm_state *x);
@@ -264,11 +262,6 @@ struct xfrm_type
 	u32			(*get_max_size)(struct xfrm_state *, int size);
 };
 
-struct xfrm_type_map {
-	rwlock_t		lock;
-	struct xfrm_type	*map[256];
-};
-
 extern int xfrm_register_type(struct xfrm_type *type, unsigned short family);
 extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family);
 extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8604c747bca..c0465284dfa 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
 static struct dst_ops xfrm4_dst_ops;
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
-static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
-
 static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
 	return __ip_route_output_key((struct rtable**)dst, fl);
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
 
 static inline int xfrm4_garbage_collect(void)
 {
-	read_lock(&xfrm4_policy_afinfo.lock);
 	xfrm4_policy_afinfo.garbage_collect();
-	read_unlock(&xfrm4_policy_afinfo.lock);
 	return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
 }
 
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = {
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.family = 		AF_INET,
-	.lock = 		RW_LOCK_UNLOCKED,
-	.type_map = 		&xfrm4_type_map,
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.find_bundle = 		__xfrm4_find_bundle,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index dbabf81a9b7..81e1751c966 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
 
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
-	.lock			= RW_LOCK_UNLOCKED,
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 88c840f1beb..ee715f2691e 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -23,8 +23,6 @@
 static struct dst_ops xfrm6_dst_ops;
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
 
-static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED };
-
 static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
 	int err = 0;
@@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl)
 
 static inline int xfrm6_garbage_collect(void)
 {
-	read_lock(&xfrm6_policy_afinfo.lock);
 	xfrm6_policy_afinfo.garbage_collect();
-	read_unlock(&xfrm6_policy_afinfo.lock);
 	return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2);
 }
 
@@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = {
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.family =		AF_INET6,
-	.lock = 		RW_LOCK_UNLOCKED,
-	.type_map = 		&xfrm6_type_map,
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.find_bundle =		__xfrm6_find_bundle,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index a5723024d3b..b33296b3f6d 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto,
 
 static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.family			= AF_INET6,
-	.lock			= RW_LOCK_UNLOCKED,
 	.init_tempsel		= __xfrm6_init_tempsel,
 	.state_lookup		= __xfrm6_state_lookup,
 	.find_acq		= __xfrm6_find_acq,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b469c8b5461..44b64a593c0 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
+static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family);
+static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo);
 
 int xfrm_register_type(struct xfrm_type *type, unsigned short family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	struct xfrm_type_map *typemap;
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
+	struct xfrm_type **typemap;
 	int err = 0;
 
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
 	typemap = afinfo->type_map;
 
-	write_lock_bh(&typemap->lock);
-	if (likely(typemap->map[type->proto] == NULL))
-		typemap->map[type->proto] = type;
+	if (likely(typemap[type->proto] == NULL))
+		typemap[type->proto] = type;
 	else
 		err = -EEXIST;
-	write_unlock_bh(&typemap->lock);
-	xfrm_policy_put_afinfo(afinfo);
+	xfrm_policy_unlock_afinfo(afinfo);
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_type);
 
 int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	struct xfrm_type_map *typemap;
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
+	struct xfrm_type **typemap;
 	int err = 0;
 
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
 	typemap = afinfo->type_map;
 
-	write_lock_bh(&typemap->lock);
-	if (unlikely(typemap->map[type->proto] != type))
+	if (unlikely(typemap[type->proto] != type))
 		err = -ENOENT;
 	else
-		typemap->map[type->proto] = NULL;
-	write_unlock_bh(&typemap->lock);
-	xfrm_policy_put_afinfo(afinfo);
+		typemap[type->proto] = NULL;
+	xfrm_policy_unlock_afinfo(afinfo);
 	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_type);
@@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type);
 struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
 {
 	struct xfrm_policy_afinfo *afinfo;
-	struct xfrm_type_map *typemap;
+	struct xfrm_type **typemap;
 	struct xfrm_type *type;
 	int modload_attempted = 0;
 
@@ -102,11 +100,9 @@ retry:
 		return NULL;
 	typemap = afinfo->type_map;
 
-	read_lock(&typemap->lock);
-	type = typemap->map[proto];
+	type = typemap[proto];
 	if (unlikely(type && !try_module_get(type->owner)))
 		type = NULL;
-	read_unlock(&typemap->lock);
 	if (!type && !modload_attempted) {
 		xfrm_policy_put_afinfo(afinfo);
 		request_module("xfrm-type-%d-%d",
@@ -1306,17 +1302,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
 		return NULL;
 	read_lock(&xfrm_policy_afinfo_lock);
 	afinfo = xfrm_policy_afinfo[family];
-	if (likely(afinfo != NULL))
-		read_lock(&afinfo->lock);
-	read_unlock(&xfrm_policy_afinfo_lock);
+	if (unlikely(!afinfo))
+		read_unlock(&xfrm_policy_afinfo_lock);
 	return afinfo;
 }
 
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
-	if (unlikely(afinfo == NULL))
-		return;
-	read_unlock(&afinfo->lock);
+	read_unlock(&xfrm_policy_afinfo_lock);
+}
+
+static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	if (unlikely(family >= NPROTO))
+		return NULL;
+	write_lock_bh(&xfrm_policy_afinfo_lock);
+	afinfo = xfrm_policy_afinfo[family];
+	if (unlikely(!afinfo))
+		write_unlock_bh(&xfrm_policy_afinfo_lock);
+	return afinfo;
+}
+
+static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+	write_unlock_bh(&xfrm_policy_afinfo_lock);
 }
 
 static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 93a2f36ad3d..ee62c239a7e 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1103,17 +1103,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
 		return NULL;
 	read_lock(&xfrm_state_afinfo_lock);
 	afinfo = xfrm_state_afinfo[family];
-	if (likely(afinfo != NULL))
-		read_lock(&afinfo->lock);
-	read_unlock(&xfrm_state_afinfo_lock);
+	if (unlikely(!afinfo))
+		read_unlock(&xfrm_state_afinfo_lock);
 	return afinfo;
 }
 
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
 {
-	if (unlikely(afinfo == NULL))
-		return;
-	read_unlock(&afinfo->lock);
+	read_unlock(&xfrm_state_afinfo_lock);
 }
 
 /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
-- 
cgit v1.2.3-70-g09d2


From b59f45d0b2878ab76f8053b0973654e6621828ee Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 27 May 2006 23:05:54 -0700
Subject: [IPSEC] xfrm: Abstract out encapsulation modes

This patch adds the structure xfrm_mode.  It is meant to represent
the operations carried out by transport/tunnel modes.

By doing this we allow additional encapsulation modes to be added
without clogging up the xfrm_input/xfrm_output paths.

Candidate modes include 4-to-6 tunnel mode, 6-to-4 tunnel mode, and
BEET modes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h            |   4 ++
 include/net/xfrm.h              |  17 ++++++
 net/ipv4/Kconfig                |  18 ++++++
 net/ipv4/Makefile               |   2 +
 net/ipv4/xfrm4_input.c          |  28 +--------
 net/ipv4/xfrm4_mode_transport.c |  69 ++++++++++++++++++++++
 net/ipv4/xfrm4_mode_tunnel.c    | 125 ++++++++++++++++++++++++++++++++++++++++
 net/ipv4/xfrm4_output.c         |  61 +-------------------
 net/ipv6/Kconfig                |  20 +++++++
 net/ipv6/Makefile               |   2 +
 net/ipv6/ip6_output.c           |   2 +
 net/ipv6/xfrm6_input.c          |  29 +---------
 net/ipv6/xfrm6_mode_transport.c |  73 +++++++++++++++++++++++
 net/ipv6/xfrm6_mode_tunnel.c    | 121 ++++++++++++++++++++++++++++++++++++++
 net/ipv6/xfrm6_output.c         |  63 +-------------------
 net/xfrm/xfrm_policy.c          |  83 ++++++++++++++++++++++++++
 net/xfrm/xfrm_state.c           |   6 ++
 17 files changed, 553 insertions(+), 170 deletions(-)
 create mode 100644 net/ipv4/xfrm4_mode_transport.c
 create mode 100644 net/ipv4/xfrm4_mode_tunnel.c
 create mode 100644 net/ipv6/xfrm6_mode_transport.c
 create mode 100644 net/ipv6/xfrm6_mode_tunnel.c

(limited to 'include/net')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 6b42cc474c0..46a15c7a1a1 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -118,6 +118,10 @@ enum
 	XFRM_SHARE_UNIQUE	/* Use once */
 };
 
+#define XFRM_MODE_TRANSPORT 0
+#define XFRM_MODE_TUNNEL 1
+#define XFRM_MODE_MAX 2
+
 /* Netlink configuration messages.  */
 enum {
 	XFRM_MSG_BASE = 0x10,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ed7c9747059..ed5bb34f817 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -20,6 +20,8 @@
 #include <net/ip6_fib.h>
 
 #define XFRM_ALIGN8(len)	(((len) + 7) & ~7)
+#define MODULE_ALIAS_XFRM_MODE(family, encap) \
+	MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap))
 
 extern struct sock *xfrm_nl;
 extern u32 sysctl_xfrm_aevent_etime;
@@ -164,6 +166,7 @@ struct xfrm_state
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	struct xfrm_type	*type;
+	struct xfrm_mode	*mode;
 
 	/* Security context */
 	struct xfrm_sec_ctx	*security;
@@ -205,6 +208,7 @@ struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
 	struct xfrm_type	*type_map[256];
+	struct xfrm_mode	*mode_map[XFRM_MODE_MAX];
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
 	int			(*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl);
@@ -267,6 +271,19 @@ extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family);
 extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family);
 extern void xfrm_put_type(struct xfrm_type *type);
 
+struct xfrm_mode {
+	int (*input)(struct xfrm_state *x, struct sk_buff *skb);
+	int (*output)(struct sk_buff *skb);
+
+	struct module *owner;
+	unsigned int encap;
+};
+
+extern int xfrm_register_mode(struct xfrm_mode *mode, int family);
+extern int xfrm_unregister_mode(struct xfrm_mode *mode, int family);
+extern struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family);
+extern void xfrm_put_mode(struct xfrm_mode *mode);
+
 struct xfrm_tmpl
 {
 /* id in template is interpreted as:
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f7532237..35eb70b1448 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,24 @@ config INET_TUNNEL
 	tristate
 	default n
 
+config INET_XFRM_MODE_TRANSPORT
+	tristate "IP: IPsec transport mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+	tristate "IP: IPsec tunnel mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0b9d2..4cc94482eac 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 3e174c83bfe..817ed84511a 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -13,7 +13,6 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb)
 
 EXPORT_SYMBOL(xfrm4_rcv);
 
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct iphdr *outer_iph = skb->nh.iph;
-	struct iphdr *inner_iph = skb->h.ipiph;
-
-	if (INET_ECN_is_ce(outer_iph->tos))
-		IP_ECN_set_ce(inner_iph);
-}
-
 static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
 {
 	switch (nexthdr) {
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
 
 		xfrm_vec[xfrm_nr++] = x;
 
-		iph = skb->nh.iph;
+		if (x->mode->input(x, skb))
+			goto drop;
 
 		if (x->props.mode) {
-			if (iph->protocol != IPPROTO_IPIP)
-				goto drop;
-			if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-				goto drop;
-			if (skb_cloned(skb) &&
-			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-				goto drop;
-			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-				ipv4_copy_dscp(iph, skb->h.ipiph);
-			if (!(x->props.flags & XFRM_STATE_NOECN))
-				ipip_ecn_decapsulate(skb);
-			skb->mac.raw = memmove(skb->data - skb->mac_len,
-					       skb->mac.raw, skb->mac_len);
-			skb->nh.raw = skb->data;
-			memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 00000000000..e46d9a4ccc5
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,69 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_transport_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x;
+	struct iphdr *iph;
+	int ihl;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	ihl = iph->ihl * 4;
+	skb->h.raw += ihl;
+
+	x = skb->dst->xfrm;
+	skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
+	return 0;
+}
+
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static struct xfrm_mode xfrm4_transport_mode = {
+	.input = xfrm4_transport_input,
+	.output = xfrm4_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm4_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+}
+
+static void __exit xfrm4_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 00000000000..f8d880beb12
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct iphdr *outer_iph = skb->nh.iph;
+	struct iphdr *inner_iph = skb->h.ipiph;
+
+	if (INET_ECN_is_ce(outer_iph->tos))
+		IP_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *      tot_len
+ *      check
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_tunnel_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct iphdr *iph, *top_iph;
+	int flags;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	skb->nh.raw = skb_push(skb, x->props.header_len);
+	top_iph = skb->nh.iph;
+
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+
+	/* DS disclosed */
+	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
+		IP_ECN_clear(top_iph);
+
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (iph->frag_off & htons(IP_DF));
+	if (!top_iph->frag_off)
+		__ip_select_ident(top_iph, dst->child, 0);
+
+	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	top_iph->protocol = IPPROTO_IPIP;
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	return 0;
+}
+
+static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	int err = -EINVAL;
+
+	if (iph->protocol != IPPROTO_IPIP)
+		goto out;
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv4_copy_dscp(iph, skb->h.ipiph);
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip_ecn_decapsulate(skb);
+	skb->mac.raw = memmove(skb->data - skb->mac_len,
+			       skb->mac.raw, skb->mac_len);
+	skb->nh.raw = skb->data;
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm4_tunnel_mode = {
+	.input = xfrm4_tunnel_input,
+	.output = xfrm4_tunnel_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+};
+
+static int __init xfrm4_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+}
+
+static void __exit xfrm4_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_tunnel_init);
+module_exit(xfrm4_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4ef8efaf6a6..ac9d91d4bb0 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,67 +12,10 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/icmp.h>
 
-/* Add encapsulation header.
- *
- * In transport mode, the IP header will be moved forward to make space
- * for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *	tot_len
- *	check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static void xfrm4_encap(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
-	struct iphdr *iph, *top_iph;
-	int flags;
-
-	iph = skb->nh.iph;
-	skb->h.ipiph = iph;
-
-	skb->nh.raw = skb_push(skb, x->props.header_len);
-	top_iph = skb->nh.iph;
-
-	if (!x->props.mode) {
-		skb->h.raw += iph->ihl*4;
-		memmove(top_iph, iph, iph->ihl*4);
-		return;
-	}
-
-	top_iph->ihl = 5;
-	top_iph->version = 4;
-
-	/* DS disclosed */
-	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-
-	flags = x->props.flags;
-	if (flags & XFRM_STATE_NOECN)
-		IP_ECN_clear(top_iph);
-
-	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
-		0 : (iph->frag_off & htons(IP_DF));
-	if (!top_iph->frag_off)
-		__ip_select_ident(top_iph, dst->child, 0);
-
-	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
-
-	top_iph->saddr = x->props.saddr.a4;
-	top_iph->daddr = x->id.daddr.a4;
-	top_iph->protocol = IPPROTO_IPIP;
-
-	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
-}
-
 static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		xfrm4_encap(skb);
+		err = x->mode->output(skb);
+		if (err)
+			goto error;
 
 		err = x->type->output(x, skb);
 		if (err)
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index f8a107ab559..e923d4dea41 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -106,6 +106,26 @@ config INET6_TUNNEL
 	tristate
 	default n
 
+config INET6_XFRM_MODE_TRANSPORT
+	tristate "IPv6: IPsec transport mode"
+	depends on IPV6
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET6_XFRM_MODE_TUNNEL
+	tristate "IPv6: IPsec tunnel mode"
+	depends on IPV6
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
 config IPV6_TUNNEL
 	tristate "IPv6: IPv6-in-IPv6 tunnel"
 	select INET6_TUNNEL
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a760b0988fb..386e0a62694 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
+obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
+obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e46048974f3..416f6e428a0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
 #include <linux/in6.h>
 #include <linux/tcp.h>
 #include <linux/route.h>
+#include <linux/module.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
@@ -488,6 +489,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 
 	return offset;
 }
+EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
 
 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00cfdee18dc..0405d74ff91 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -13,21 +13,9 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
-static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct ipv6hdr *outer_iph = skb->nh.ipv6h;
-	struct ipv6hdr *inner_iph = skb->h.ipv6h;
-
-	if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
-		IP6_ECN_set_ce(inner_iph);
-}
-
 int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 {
 	int err;
@@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 
 		xfrm_vec[xfrm_nr++] = x;
 
+		if (x->mode->input(x, skb))
+			goto drop;
+
 		if (x->props.mode) { /* XXX */
-			if (nexthdr != IPPROTO_IPV6)
-				goto drop;
-			if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-				goto drop;
-			if (skb_cloned(skb) &&
-			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-				goto drop;
-			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-				ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
-			if (!(x->props.flags & XFRM_STATE_NOECN))
-				ipip6_ecn_decapsulate(skb);
-			skb->mac.raw = memmove(skb->data - skb->mac_len,
-					       skb->mac.raw, skb->mac_len);
-			skb->nh.raw = skb->data;
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
new file mode 100644
index 00000000000..5efbbae08ef
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -0,0 +1,73 @@
+/*
+ * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the encapsulation header.
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ */
+static int xfrm6_transport_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x = skb->dst->xfrm;
+	struct ipv6hdr *iph;
+	u8 *prevhdr;
+	int hdr_len;
+
+	skb_push(skb, x->props.header_len);
+	iph = skb->nh.ipv6h;
+
+	hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
+	skb->nh.raw = prevhdr - x->props.header_len;
+	skb->h.raw = skb->data + hdr_len;
+	memmove(skb->data, iph, hdr_len);
+	return 0;
+}
+
+static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static struct xfrm_mode xfrm6_transport_mode = {
+	.input = xfrm6_transport_input,
+	.output = xfrm6_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm6_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6);
+}
+
+static void __exit xfrm6_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+module_init(xfrm6_transport_init);
+module_exit(xfrm6_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
new file mode 100644
index 00000000000..8af79be2edc
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -0,0 +1,121 @@
+/*
+ * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct ipv6hdr *outer_iph = skb->nh.ipv6h;
+	struct ipv6hdr *inner_iph = skb->h.ipv6h;
+
+	if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
+		IP6_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *	payload_len
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ */
+static int xfrm6_tunnel_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct ipv6hdr *iph, *top_iph;
+	int dsfield;
+
+	skb_push(skb, x->props.header_len);
+	iph = skb->nh.ipv6h;
+
+	skb->nh.raw = skb->data;
+	top_iph = skb->nh.ipv6h;
+	skb->nh.raw = &top_iph->nexthdr;
+	skb->h.ipv6h = top_iph + 1;
+
+	top_iph->version = 6;
+	top_iph->priority = iph->priority;
+	top_iph->flow_lbl[0] = iph->flow_lbl[0];
+	top_iph->flow_lbl[1] = iph->flow_lbl[1];
+	top_iph->flow_lbl[2] = iph->flow_lbl[2];
+	dsfield = ipv6_get_dsfield(top_iph);
+	dsfield = INET_ECN_encapsulate(dsfield, dsfield);
+	if (x->props.flags & XFRM_STATE_NOECN)
+		dsfield &= ~INET_ECN_MASK;
+	ipv6_change_dsfield(top_iph, 0, dsfield);
+	top_iph->nexthdr = IPPROTO_IPV6; 
+	top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
+	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
+	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
+	return 0;
+}
+
+static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6)
+		goto out;
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip6_ecn_decapsulate(skb);
+	skb->mac.raw = memmove(skb->data - skb->mac_len,
+			       skb->mac.raw, skb->mac_len);
+	skb->nh.raw = skb->data;
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm6_tunnel_mode = {
+	.input = xfrm6_tunnel_input,
+	.output = xfrm6_tunnel_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+};
+
+static int __init xfrm6_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6);
+}
+
+static void __exit xfrm6_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+module_init(xfrm6_tunnel_init);
+module_exit(xfrm6_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 80242172a5d..16e84254a25 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -14,68 +14,9 @@
 #include <linux/spinlock.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
-/* Add encapsulation header.
- *
- * In transport mode, the IP header and mutable extension headers will be moved
- * forward to make space for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *	payload_len
- *
- * On exit, skb->h will be set to the start of the encapsulation header to be
- * filled in by x->type->output and skb->nh will be set to the nextheader field
- * of the extension header directly preceding the encapsulation header, or in
- * its absence, that of the top IP header.  The value of skb->data will always
- * point to the top IP header.
- */
-static void xfrm6_encap(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
-	struct ipv6hdr *iph, *top_iph;
-	int dsfield;
-
-	skb_push(skb, x->props.header_len);
-	iph = skb->nh.ipv6h;
-
-	if (!x->props.mode) {
-		u8 *prevhdr;
-		int hdr_len;
-
-		hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
-		skb->nh.raw = prevhdr - x->props.header_len;
-		skb->h.raw = skb->data + hdr_len;
-		memmove(skb->data, iph, hdr_len);
-		return;
-	}
-
-	skb->nh.raw = skb->data;
-	top_iph = skb->nh.ipv6h;
-	skb->nh.raw = &top_iph->nexthdr;
-	skb->h.ipv6h = top_iph + 1;
-
-	top_iph->version = 6;
-	top_iph->priority = iph->priority;
-	top_iph->flow_lbl[0] = iph->flow_lbl[0];
-	top_iph->flow_lbl[1] = iph->flow_lbl[1];
-	top_iph->flow_lbl[2] = iph->flow_lbl[2];
-	dsfield = ipv6_get_dsfield(top_iph);
-	dsfield = INET_ECN_encapsulate(dsfield, dsfield);
-	if (x->props.flags & XFRM_STATE_NOECN)
-		dsfield &= ~INET_ECN_MASK;
-	ipv6_change_dsfield(top_iph, 0, dsfield);
-	top_iph->nexthdr = IPPROTO_IPV6; 
-	top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
-	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
-	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
-}
-
 static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
@@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		xfrm6_encap(skb);
+		err = x->mode->output(skb);
+		if (err)
+			goto error;
 
 		err = x->type->output(x, skb);
 		if (err)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 44b64a593c0..b8936926c24 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -138,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type)
 	module_put(type->owner);
 }
 
+int xfrm_register_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_policy_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	err = -EEXIST;
+	modemap = afinfo->mode_map;
+	if (likely(modemap[mode->encap] == NULL)) {
+		modemap[mode->encap] = mode;
+		err = 0;
+	}
+
+	xfrm_policy_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_register_mode);
+
+int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_policy_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	err = -ENOENT;
+	modemap = afinfo->mode_map;
+	if (likely(modemap[mode->encap] == mode)) {
+		modemap[mode->encap] = NULL;
+		err = 0;
+	}
+
+	xfrm_policy_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_mode);
+
+struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode *mode;
+	int modload_attempted = 0;
+
+	if (unlikely(encap >= XFRM_MODE_MAX))
+		return NULL;
+
+retry:
+	afinfo = xfrm_policy_get_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return NULL;
+
+	mode = afinfo->mode_map[encap];
+	if (unlikely(mode && !try_module_get(mode->owner)))
+		mode = NULL;
+	if (!mode && !modload_attempted) {
+		xfrm_policy_put_afinfo(afinfo);
+		request_module("xfrm-mode-%d-%d", family, encap);
+		modload_attempted = 1;
+		goto retry;
+	}
+
+	xfrm_policy_put_afinfo(afinfo);
+	return mode;
+}
+
+void xfrm_put_mode(struct xfrm_mode *mode)
+{
+	module_put(mode->owner);
+}
+
 static inline unsigned long make_jiffies(long secs)
 {
 	if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index ee62c239a7e..17b29ec3c41 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 	kfree(x->ealg);
 	kfree(x->calg);
 	kfree(x->encap);
+	if (x->mode)
+		xfrm_put_mode(x->mode);
 	if (x->type) {
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
@@ -1193,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x)
 	if (err)
 		goto error;
 
+	x->mode = xfrm_get_mode(x->props.mode, family);
+	if (x->mode == NULL)
+		goto error;
+
 	x->km.state = XFRM_STATE_VALID;
 
 error:
-- 
cgit v1.2.3-70-g09d2


From 73654d61e556483ad324b90989eae26b22df6ef6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 27 May 2006 23:06:33 -0700
Subject: [IPSEC] xfrm: Use IPPROTO_MAX instead of 256

The size of the type_map array (256) comes from the number of IP protocols,
i.e., IPPROTO_MAX.  This patch is based on a suggestion from Ingo Oeser.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ed5bb34f817..9c5ee9f20b6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -207,7 +207,7 @@ struct xfrm_type;
 struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
-	struct xfrm_type	*type_map[256];
+	struct xfrm_type	*type_map[IPPROTO_MAX];
 	struct xfrm_mode	*mode_map[XFRM_MODE_MAX];
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
-- 
cgit v1.2.3-70-g09d2


From 39a27a35c5c1b5be499a0576a35c45a011788bf8 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:23:54 -0700
Subject: [NETFILTER]: conntrack: add sysctl to disable checksumming

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h    |  1 +
 include/linux/sysctl.h                         |  2 ++
 include/net/netfilter/nf_conntrack.h           |  1 +
 net/ipv4/netfilter/ip_conntrack_proto_icmp.c   |  2 +-
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c    |  2 +-
 net/ipv4/netfilter/ip_conntrack_proto_udp.c    |  2 +-
 net/ipv4/netfilter/ip_conntrack_standalone.c   | 11 +++++++++++
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         |  5 +++--
 net/netfilter/nf_conntrack_proto_udp.c         |  3 ++-
 net/netfilter/nf_conntrack_standalone.c        | 11 +++++++++++
 12 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index d54d7b278e9..5473c01f69e 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -293,6 +293,7 @@ static inline int is_dying(struct ip_conntrack *ct)
 }
 
 extern unsigned int ip_conntrack_htable_size;
+extern int ip_conntrack_checksum;
  
 #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index cd9e7c0825a..98338ed2c0b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -313,6 +313,7 @@ enum
 	NET_NF_CONNTRACK_FRAG6_TIMEOUT=29,
 	NET_NF_CONNTRACK_FRAG6_LOW_THRESH=30,
 	NET_NF_CONNTRACK_FRAG6_HIGH_THRESH=31,
+	NET_NF_CONNTRACK_CHECKSUM=32,
 };
 
 /* /proc/sys/net/ipv4 */
@@ -492,6 +493,7 @@ enum
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25,
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26,
 	NET_IPV4_NF_CONNTRACK_COUNT=27,
+	NET_IPV4_NF_CONNTRACK_CHECKSUM=28,
 };
  
 /* /proc/sys/net/ipv6 */
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 916013ca4a5..dbe7a114d0c 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -285,6 +285,7 @@ static inline int nf_ct_is_dying(struct nf_conn *ct)
 }
 
 extern unsigned int nf_conntrack_htable_size;
+extern int nf_conntrack_checksum;
 
 #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++)
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index d8b14a9010a..23f1c504586 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 062b252b58a..c5c2ce5cdeb 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb,
 	 * and moreover root might send raw packets.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 70899868783..9b2c16b4d2f 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	 * because the semantic of CHECKSUM_HW is different there 
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
 		if (LOG_INVALID(IPPROTO_UDP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index f0cc7feb0da..6cb9b989d14 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -564,6 +564,8 @@ extern unsigned int ip_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
+int ip_conntrack_checksum = 1;
+
 static struct ctl_table_header *ip_ct_sysctl_header;
 
 static ctl_table ip_ct_sysctl_table[] = {
@@ -591,6 +593,14 @@ static ctl_table ip_ct_sysctl_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_CHECKSUM,
+		.procname	= "ip_conntrack_checksum",
+		.data		= &ip_conntrack_checksum,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
 		.procname	= "ip_conntrack_tcp_timeout_syn_sent",
@@ -946,6 +956,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
 EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b0d361cc6e..663a73ee3f2 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 86c6703265d..ef18a7b7014 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
 		return -NF_ACCEPT;
 	}
 
-	if (hooknum == NF_IP6_PRE_ROUTING &&
+	if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING &&
 	    nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
 		nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
 			      "nf_ct_icmpv6: ICMPv6 checksum failed\n");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69899f27d26..12fb7c0a150 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb,
 	 * and moreover root might send raw packets.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
-	     (pf == PF_INET6 && hooknum  == NF_IP6_PRE_ROUTING)) &&
+	if (nf_conntrack_checksum &&
+	    ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+	     (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index d93edbfde9e..ae07ebe3ab3 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
 	 * because the semantic of CHECKSUM_HW is different there
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
-	if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+	if (nf_conntrack_checksum &&
+	    ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
 	     (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
 		if (LOG_INVALID(IPPROTO_UDP))
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 408960c6a54..e01d20d8e28 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -455,6 +455,8 @@ extern unsigned int nf_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
+int nf_conntrack_checksum = 1;
+
 static struct ctl_table_header *nf_ct_sysctl_header;
 
 static ctl_table nf_ct_sysctl_table[] = {
@@ -482,6 +484,14 @@ static ctl_table nf_ct_sysctl_table[] = {
 		.mode           = 0444,
 		.proc_handler   = &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_NF_CONNTRACK_CHECKSUM,
+		.procname	= "nf_conntrack_checksum",
+		.data		= &nf_conntrack_checksum,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
 		.procname	= "nf_conntrack_tcp_timeout_syn_sent",
@@ -851,6 +861,7 @@ EXPORT_SYMBOL(nf_ct_proto_put);
 EXPORT_SYMBOL(nf_ct_l3proto_find_get);
 EXPORT_SYMBOL(nf_ct_l3proto_put);
 EXPORT_SYMBOL(nf_ct_l3protos);
+EXPORT_SYMBOL_GPL(nf_conntrack_checksum);
 EXPORT_SYMBOL(nf_conntrack_expect_alloc);
 EXPORT_SYMBOL(nf_conntrack_expect_put);
 EXPORT_SYMBOL(nf_conntrack_expect_related);
-- 
cgit v1.2.3-70-g09d2


From 72dc5b9225c53310c010b68a70ea97c8c8e24bdf Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 5 Jun 2006 17:30:08 -0700
Subject: [TCP]: Minimum congestion window consolidation.

Many of the TCP congestion methods all just use ssthresh
as the minimum congestion window on decrease.  Rather than
duplicating the code, just have that be the default if that
handle in the ops structure is not set.

Minor behaviour change to TCP compound.  It probably wants
to use this (ssthresh) as lower bound, rather than ssthresh/2
because the latter causes undershoot on loss.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  4 ++--
 net/ipv4/tcp_bic.c      |  7 -------
 net/ipv4/tcp_compound.c |  1 -
 net/ipv4/tcp_cong.c     |  6 +++---
 net/ipv4/tcp_cubic.c    |  6 ------
 net/ipv4/tcp_htcp.c     |  9 ---------
 net/ipv4/tcp_input.c    | 13 +++++++++++--
 net/ipv4/tcp_veno.c     |  7 -------
 net/ipv4/tcp_westwood.c | 18 +++++++-----------
 9 files changed, 23 insertions(+), 48 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f1f472746e6..de88c5472bf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -632,7 +632,7 @@ struct tcp_congestion_ops {
 	/* return slow start threshold (required) */
 	u32 (*ssthresh)(struct sock *sk);
 	/* lower bound for congestion window (optional) */
-	u32 (*min_cwnd)(struct sock *sk);
+	u32 (*min_cwnd)(const struct sock *sk);
 	/* do new cwnd calculation (required) */
 	void (*cong_avoid)(struct sock *sk, u32 ack,
 			   u32 rtt, u32 in_flight, int good_ack);
@@ -667,7 +667,7 @@ extern struct tcp_congestion_ops tcp_init_congestion_ops;
 extern u32 tcp_reno_ssthresh(struct sock *sk);
 extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack,
 				u32 rtt, u32 in_flight, int flag);
-extern u32 tcp_reno_min_cwnd(struct sock *sk);
+extern u32 tcp_reno_min_cwnd(const struct sock *sk);
 extern struct tcp_congestion_ops tcp_reno;
 
 static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 035f2092d73..b2d9021ad22 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
 	return max(tp->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = {
 	.cong_avoid	= bictcp_cong_avoid,
 	.set_state	= bictcp_state,
 	.undo_cwnd	= bictcp_undo_cwnd,
-	.min_cwnd	= bictcp_min_cwnd,
 	.pkts_acked     = bictcp_acked,
 	.owner		= THIS_MODULE,
 	.name		= "bic",
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c
index ec68cb8081c..bc54f7e9aea 100644
--- a/net/ipv4/tcp_compound.c
+++ b/net/ipv4/tcp_compound.c
@@ -419,7 +419,6 @@ static struct tcp_congestion_ops tcp_compound = {
 	.init		= tcp_compound_init,
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_compound_cong_avoid,
-	.min_cwnd	= tcp_reno_min_cwnd,
 	.rtt_sample	= tcp_compound_rtt_calc,
 	.set_state	= tcp_compound_state,
 	.cwnd_event	= tcp_compound_cwnd_event,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 91c2f41c7f5..857eefc52aa 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 	int ret = 0;
 
 	/* all algorithms must implement ssthresh and cong_avoid ops */
-	if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+	if (!ca->ssthresh || !ca->cong_avoid) {
 		printk(KERN_ERR "TCP %s does not implement required ops\n",
 		       ca->name);
 		return -EINVAL;
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
-/* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct sock *sk)
+/* Lower bound on congestion window with halving. */
+u32 tcp_reno_min_cwnd(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh/2;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 31a4986dfbf..78b7a6b9e4d 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
 	return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-	return tcp_sk(sk)->snd_ssthresh;
-}
-
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = {
 	.cong_avoid	= bictcp_cong_avoid,
 	.set_state	= bictcp_state,
 	.undo_cwnd	= bictcp_undo_cwnd,
-	.min_cwnd	= bictcp_min_cwnd,
 	.pkts_acked     = bictcp_acked,
 	.owner		= THIS_MODULE,
 	.name		= "cubic",
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 1b2ff53f98e..3d92c185926 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 	}
 }
 
-/* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
-
 static void htcp_init(struct sock *sk)
 {
 	struct htcp *ca = inet_csk_ca(sk);
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state)
 static struct tcp_congestion_ops htcp = {
 	.init		= htcp_init,
 	.ssthresh	= htcp_recalc_ssthresh,
-	.min_cwnd	= htcp_min_cwnd,
 	.cong_avoid	= htcp_cong_avoid,
 	.set_state	= htcp_state,
 	.undo_cwnd	= htcp_cwnd_undo,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6d167889a4b..e08245bdda3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1689,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* Lower bound on congestion window is slow start threshold
+ * unless congestion avoidance choice decides to overide it.
+ */
+static inline u32 tcp_cwnd_min(const struct sock *sk)
+{
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
+	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
+}
+
 /* Decrease cwnd each second ack. */
 static void tcp_cwnd_down(struct sock *sk)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int decr = tp->snd_cwnd_cnt + 1;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
+	if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 1091671751c..11b42a7135c 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -199,17 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
 		return max(tp->snd_cwnd >> 1U, 2U);
 }
 
-static u32 tcp_veno_min_cwnd(struct sock * sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
 static struct tcp_congestion_ops tcp_veno = {
 	.init		= tcp_veno_init,
 	.ssthresh	= tcp_veno_ssthresh,
 	.cong_avoid	= tcp_veno_cong_avoid,
-	.min_cwnd 	= tcp_veno_min_cwnd,
 	.rtt_sample	= tcp_veno_rtt_calc,
 	.set_state	= tcp_veno_state,
 	.cwnd_event	= tcp_veno_cwnd_event,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 0c340c3756c..29eb258b6d8 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -162,12 +162,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
 	return w->cumul_ack;
 }
 
-static inline u32 westwood_bw_rttmin(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct westwood *w = inet_csk_ca(sk);
-	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
-}
 
 /*
  * TCP Westwood
@@ -175,9 +169,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk)
  * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
  * so avoids ever returning 0.
  */
-static u32 tcp_westwood_cwnd_min(struct sock *sk)
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
 {
-	return westwood_bw_rttmin(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct westwood *w = inet_csk_ca(sk);
+	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 
 static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
@@ -191,11 +187,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 		break;
 
 	case CA_EVENT_COMPLETE_CWR:
-		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
+		tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_FRTO:
-		tp->snd_ssthresh = westwood_bw_rttmin(sk);
+		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_SLOW_ACK:
@@ -235,7 +231,7 @@ static struct tcp_congestion_ops tcp_westwood = {
 	.init		= tcp_westwood_init,
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
-	.min_cwnd	= tcp_westwood_cwnd_min,
+	.min_cwnd	= tcp_westwood_bw_rttmin,
 	.cwnd_event	= tcp_westwood_event,
 	.get_info	= tcp_westwood_info,
 	.pkts_acked	= tcp_westwood_pkts_acked,
-- 
cgit v1.2.3-70-g09d2


From 6d7416535097ed0943bdae8e69c14ba43061cab1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 5 Jun 2006 21:06:41 -0700
Subject: [IPV4]: Right prototype of __raw_v4_lookup()

All users pass 32-bit values as addresses and internally they're
compared with 32-bit entities. So, change "laddr" and "raddr" types to
__be32.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h | 2 +-
 net/ipv4/raw.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/raw.h b/include/net/raw.h
index e67b28a0248..d83571fe4c6 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -36,7 +36,7 @@ extern rwlock_t raw_v4_lock;
 
 
 extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
-				    unsigned long raddr, unsigned long laddr,
+				    __be32 raddr, __be32 laddr,
 				    int dif);
 
 extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fc256241555..bd221ec3f81 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk)
 }
 
 struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
-			     unsigned long raddr, unsigned long laddr,
+			     __be32 raddr, __be32 laddr,
 			     int dif)
 {
 	struct hlist_node *node;
-- 
cgit v1.2.3-70-g09d2


From 7c9728c393dceb724d66d696cfabce82151a78e5 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:31:46 -0700
Subject: [SECMARK]: Add secmark support to conntrack

Add a secmark field to IP and NF conntracks, so that security markings
on packets can be copied to their associated connections, and also
copied back to packets as required.  This is similar to the network
mark field currently used with conntrack, although it is intended for
enforcement of security policy rather than network policy.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h  |  4 ++++
 include/net/netfilter/nf_conntrack.h         |  4 ++++
 include/net/netfilter/nf_conntrack_compat.h  | 26 ++++++++++++++++++++++++++
 net/ipv4/netfilter/Kconfig                   | 12 ++++++++++++
 net/ipv4/netfilter/ip_conntrack_core.c       |  3 +++
 net/ipv4/netfilter/ip_conntrack_standalone.c |  5 +++++
 net/netfilter/Kconfig                        | 12 ++++++++++++
 net/netfilter/nf_conntrack_core.c            |  3 +++
 net/netfilter/nf_conntrack_standalone.c      |  5 +++++
 9 files changed, 74 insertions(+)

(limited to 'include/net')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 17d7ef938a0..e0e9951eb8c 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -121,6 +121,10 @@ struct ip_conntrack
 	u_int32_t mark;
 #endif
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+	u_int32_t secmark;
+#endif
+
 	/* Traversed often, so hopefully in different cacheline to top */
 	/* These are my tuples; original and reply */
 	struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index dbe7a114d0c..41111781580 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -114,6 +114,10 @@ struct nf_conn
 	u_int32_t mark;
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	u_int32_t secmark;
+#endif
+
 	/* Storage reserved for other modules: */
 	union nf_conntrack_proto proto;
 
diff --git a/include/net/netfilter/nf_conntrack_compat.h b/include/net/netfilter/nf_conntrack_compat.h
index 3cac19fb364..f1b1482d720 100644
--- a/include/net/netfilter/nf_conntrack_compat.h
+++ b/include/net/netfilter/nf_conntrack_compat.h
@@ -20,6 +20,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb,
 }
 #endif /* CONFIG_IP_NF_CONNTRACK_MARK */
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb,
+					   u_int32_t *ctinfo)
+{
+	struct ip_conntrack *ct = ip_conntrack_get(skb, ctinfo);
+
+	if (ct)
+		return &ct->secmark;
+	else
+		return NULL;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_SECMARK */
+
 #ifdef CONFIG_IP_NF_CT_ACCT
 static inline struct ip_conntrack_counter *
 nf_ct_get_counters(const struct sk_buff *skb)
@@ -70,6 +83,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb,
 }
 #endif /* CONFIG_NF_CONNTRACK_MARK */
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb,
+					   u_int32_t *ctinfo)
+{
+	struct nf_conn *ct = nf_ct_get(skb, ctinfo);
+
+	if (ct)
+		return &ct->secmark;
+	else
+		return NULL;
+}
+#endif /* CONFIG_NF_CONNTRACK_MARK */
+
 #ifdef CONFIG_NF_CT_ACCT
 static inline struct ip_conntrack_counter *
 nf_ct_get_counters(const struct sk_buff *skb)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ff4b118f14a..e1d7f5fbc52 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 	
+config IP_NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on IP_NF_CONNTRACK && NETWORK_SECMARK
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
 config IP_NF_CONNTRACK_EVENTS
 	bool "Connection tracking events (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && IP_NF_CONNTRACK
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 4fe9e69378d..7e4cf9a4d15 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -723,6 +723,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
 		/* this is ugly, but there is no other place where to put it */
 		conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+		conntrack->secmark = exp->master->secmark;
 #endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 6cb9b989d14..88445aac3f2 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 #endif
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+	if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
 		return -ENOSPC;
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 10eccdd4d6e..023f81e5f96 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 
+config NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on NF_CONNTRACK && NETWORK_SECMARK
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
 config NF_CONNTRACK_EVENTS
 	bool "Connection tracking events (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && NF_CONNTRACK
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index bc2bd4c3859..cd299f4b7db 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -989,6 +989,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
 		conntrack->master = exp->master;
 #ifdef CONFIG_NF_CONNTRACK_MARK
 		conntrack->mark = exp->master->mark;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+		conntrack->secmark = exp->master->secmark;
 #endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		NF_CT_STAT_INC(expect_new);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index e01d20d8e28..e34c574f035 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
 		return -ENOSPC;
 	
-- 
cgit v1.2.3-70-g09d2


From b38dfee3d616ffadb58d4215e3ff9d1d7921031e Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 16:13:01 -0700
Subject: [NET]: skb_trim audit

I found a few more spots where pskb_trim_rcsum could be used but were not.
This patch changes them to use it.

Also, sk_filter can get paged skb data.  Therefore we must use pskb_trim
instead of skb_trim.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h                      |  5 +----
 net/bridge/br_netfilter.c               | 14 +++-----------
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++-------
 3 files changed, 7 insertions(+), 22 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 75b0e97ed93..96565ff0de6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -873,10 +873,7 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
 		if (filter) {
 			unsigned int pkt_len = sk_run_filter(skb, filter->insns,
 							     filter->len);
-			if (!pkt_len)
-				err = -EPERM;
-			else
-				skb_trim(skb, pkt_len);
+			err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
 		}
 
 		if (needlock)
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 3da9264449f..3e41f9d6d51 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -407,12 +407,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
 		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
 			goto inhdr_error;
-		if (pkt_len + sizeof(struct ipv6hdr) < skb->len) {
-			if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr)))
-				goto inhdr_error;
-			if (skb->ip_summed == CHECKSUM_HW)
-				skb->ip_summed = CHECKSUM_NONE;
-		}
+		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+			goto inhdr_error;
 	}
 	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
 		goto inhdr_error;
@@ -495,11 +491,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 	if (skb->len < len || len < 4 * iph->ihl)
 		goto inhdr_error;
 
-	if (skb->len > len) {
-		__pskb_trim(skb, len);
-		if (skb->ip_summed == CHECKSUM_HW)
-			skb->ip_summed = CHECKSUM_NONE;
-	}
+	pskb_trim_rcsum(skb, len);
 
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3e319035f82..c32a029e43f 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -456,13 +456,9 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 		DEBUGP("queue: message is too short.\n");
 		goto err;
 	}
-	if (end-offset < skb->len) {
-		if (pskb_trim(skb, end - offset)) {
-			DEBUGP("Can't trim\n");
-			goto err;
-		}
-		if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-			skb->ip_summed = CHECKSUM_NONE;
+	if (pskb_trim_rcsum(skb, end - offset)) {
+		DEBUGP("Can't trim\n");
+		goto err;
 	}
 
 	/* Find out which fragments are in front and at the back of us
-- 
cgit v1.2.3-70-g09d2


From bdeb04c6d9a957ae2a51c3033414467b82b2a736 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 11 Jun 2006 21:20:38 -0700
Subject: [NET]: net.ipv4.ip_autoconfig sysctl removal

The sysctl net.ipv4.ip_autoconfig is a legacy value that is not used.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h           | 1 -
 net/ipv4/sysctl_net_ipv4.c | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 3d2e5ca62a5..ead233c9540 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -147,7 +147,6 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 struct ipv4_config
 {
 	int	log_martians;
-	int	autoconfig;
 	int	no_pmtu_disc;
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6a6aa537b7a..e7fb665873c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -181,14 +181,6 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &ipv4_doint_and_flush,
 		.strategy	= &ipv4_doint_and_flush_strategy,
 	},
-        {
-		.ctl_name	= NET_IPV4_AUTOCONFIG,
-		.procname	= "ip_autoconfig",
-		.data		= &ipv4_config.autoconfig,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
         {
 		.ctl_name	= NET_IPV4_NO_PMTU_DISC,
 		.procname	= "ip_no_pmtu_disc",
-- 
cgit v1.2.3-70-g09d2


From 35089bb203f44e33b6bbb6c4de0b0708f9a48921 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 13 Jun 2006 22:33:04 -0700
Subject: [TCP]: Add tcp_slow_start_after_idle sysctl.

A lot of people have asked for a way to disable tcp_cwnd_restart(),
and it seems reasonable to add a sysctl to do that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/linux/sysctl.h                 | 1 +
 include/net/tcp.h                      | 1 +
 net/ipv4/sysctl_net_ipv4.c             | 8 ++++++++
 net/ipv4/tcp_output.c                  | 6 +++++-
 5 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f12007b80a4..d46338af600 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -362,6 +362,13 @@ tcp_workaround_signed_windows - BOOLEAN
 	not receive a window scaling option from them.
 	Default: 0
 
+tcp_slow_start_after_idle - BOOLEAN
+	If set, provide RFC2861 behavior and time out the congestion
+	window after an idle period.  An idle period is defined at
+	the current RTO.  If unset, the congestion window will not
+	be timed out after an idle period.
+	Default: 1
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 98338ed2c0b..cee944dbdcd 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -405,6 +405,7 @@ enum
 	NET_TCP_BASE_MSS=114,
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
+	NET_TCP_SLOW_START_AFTER_IDLE=117,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de88c5472bf..bfc71f954bb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -227,6 +227,7 @@ extern int sysctl_tcp_abc;
 extern int sysctl_tcp_mtu_probing;
 extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
+extern int sysctl_tcp_slow_start_after_idle;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e7fb665873c..ce4cd5f3551 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -690,6 +690,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec
 	},
 #endif
+	{
+		.ctl_name	= NET_TCP_SLOW_START_AFTER_IDLE,
+		.procname	= "tcp_slow_start_after_idle",
+		.data		= &sysctl_tcp_slow_start_after_idle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f33c9dddaa1..07bb5a2b375 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3;
 int sysctl_tcp_mtu_probing = 0;
 int sysctl_tcp_base_mss = 512;
 
+/* By default, RFC2861 behavior.  */
+int sysctl_tcp_slow_start_after_idle = 1;
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
 
-	if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+	if (sysctl_tcp_slow_start_after_idle &&
+	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
 		tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
 	tp->lsndtime = now;
-- 
cgit v1.2.3-70-g09d2


From 5636bef7324f49e36f05ec8a5f6284e11b1bcca4 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Sat, 17 Jun 2006 22:55:35 -0700
Subject: [SCTP]: Reject sctp packets with broadcast addresses.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 3 ++-
 net/sctp/input.c           | 3 ++-
 net/sctp/ipv6.c            | 6 ++++--
 net/sctp/protocol.c        | 8 +++++++-
 net/sctp/socket.c          | 2 +-
 5 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 7f4fea173fb..5f69158c100 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -555,7 +555,8 @@ struct sctp_af {
 	int		(*to_addr_param) (const union sctp_addr *,
 					  union sctp_addr_param *); 
 	int		(*addr_valid)	(union sctp_addr *,
-					 struct sctp_sock *);
+					 struct sctp_sock *,
+					 const struct sk_buff *);
 	sctp_scope_t	(*scope) (union sctp_addr *);
 	void		(*inaddr_any)	(union sctp_addr *, unsigned short);
 	int		(*is_any)	(const union sctp_addr *);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 1662f9cc869..70d6606e281 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -170,7 +170,8 @@ int sctp_rcv(struct sk_buff *skb)
 	 * IP broadcast addresses cannot be used in an SCTP transport
 	 * address."
 	 */
-	if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL))
+	if (!af->addr_valid(&src, NULL, skb) ||
+	    !af->addr_valid(&dest, NULL, skb))
 		goto discard_it;
 
 	asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c20d282fac0..8ef08070c8b 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -523,7 +523,9 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
  * Return 0 - If the address is a non-unicast or an illegal address.
  * Return 1 - If the address is a unicast.
  */
-static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
+static int sctp_v6_addr_valid(union sctp_addr *addr,
+			      struct sctp_sock *sp,
+			      const struct sk_buff *skb)
 {
 	int ret = ipv6_addr_type(&addr->v6.sin6_addr);
 
@@ -537,7 +539,7 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
 		if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
 			return 0;
 		sctp_v6_map_v4(addr);
-		return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp);
+		return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb);
 	}
 
 	/* Is this a non-unicast address */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2088aa992b7..816c033d788 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -365,12 +365,18 @@ static int sctp_v4_is_any(const union sctp_addr *addr)
  * Return 0 - If the address is a non-unicast or an illegal address.
  * Return 1 - If the address is a unicast.
  */
-static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
+static int sctp_v4_addr_valid(union sctp_addr *addr,
+			      struct sctp_sock *sp,
+			      const struct sk_buff *skb)
 {
 	/* Is this a non-unicast address or a unusable SCTP address? */
 	if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr))
 		return 0;
 
+ 	/* Is this a broadcast address? */
+ 	if (skb && ((struct rtable *)skb->dst)->rt_flags & RTCF_BROADCAST)
+ 		return 0;
+
 	return 1;
 }
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b41dcbb8968..b811691c35b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -172,7 +172,7 @@ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
 		return -EINVAL;
 
 	/* Is this a valid SCTP address?  */
-	if (!af->addr_valid(addr, sctp_sk(sk)))
+	if (!af->addr_valid(addr, sctp_sk(sk), NULL))
 		return -EINVAL;
 
 	if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr)))
-- 
cgit v1.2.3-70-g09d2


From 4c9f5d5305a23851e67471b147e0d459a7166717 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Sat, 17 Jun 2006 22:56:08 -0700
Subject: [SCTP] Reset rtt_in_progress for the chunk when processing its sack.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 2 +-
 net/sctp/outqueue.c     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index aa6033ca7cd..b2b40f951ae 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -255,7 +255,7 @@ extern int sctp_debug_flag;
 #define SCTP_DEBUG_PRINTK_IPADDR(whatever...)
 #define SCTP_ENABLE_DEBUG
 #define SCTP_DISABLE_DEBUG
-#define SCTP_ASSERT(expr, str, func)
+#define SCTP_ASSERT(expr, str, func) BUG_ON(!(expr))
 
 #endif /* SCTP_DEBUG */
 
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f148f9576dd..e5faa351aaa 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1262,6 +1262,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 			   	if (!tchunk->tsn_gap_acked &&
 				    !tchunk->resent &&
 				    tchunk->rtt_in_progress) {
+					tchunk->rtt_in_progress = 0;
 					rtt = jiffies - tchunk->sent_at;
 					sctp_transport_update_rto(transport,
 								  rtt);
-- 
cgit v1.2.3-70-g09d2