Merge HEAD from /spare/repo/linux-2.6/.git

author: Jeff Garzik <jgarzik@pobox.com> 2005-08-30 13:32:29 -0400
committer: Jeff Garzik <jgarzik@pobox.com> 2005-08-30 13:32:29 -0400
commit: ed735ccbefaf7e5e3ef61418f7e209b8c59308a7 (patch)
tree: b8cc69814d2368b08d0a84c8da0c12028bd04867 /net/ipv4
parent: 39fbe47377062200acc26ea0ccef223b4399a82c (diff)
parent: d8971fcb702e24d1e22c77fd1772f182ffee87e3 (diff)
105 files changed, 7415 insertions, 4538 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0b3d9f1d806..e55136ae09f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -413,20 +413,19 @@ config INET_TUNNEL
 	  
 	  If unsure, say Y.
 
-config IP_TCPDIAG
-	tristate "IP: TCP socket monitoring interface"
+config INET_DIAG
+	tristate "INET: socket monitoring interface"
 	default y
 	---help---
-	  Support for TCP socket monitoring interface used by native Linux
-	  tools such as ss. ss is included in iproute2, currently downloadable
-	  at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
-	  and have selected IPv6 as a module, you need to build this as a
-	  module too.
+	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+	  native Linux tools such as ss. ss is included in iproute2, currently
+	  downloadable at <http://developer.osdl.org/dev/iproute2>. 
 	  
 	  If unsure, say Y.
 
-config IP_TCPDIAG_IPV6
-	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+config INET_TCP_DIAG
+	depends on INET_DIAG
+	def_tristate INET_DIAG
 
 config TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 55dc6cca1e7..f0435d00db6 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -4,11 +4,12 @@
 
 obj-y     := route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
-	     ip_output.o ip_sockglue.o \
+	     ip_output.o ip_sockglue.o inet_hashtables.o \
+	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
+	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
 
 obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
 obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
-obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 
+obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 163ae4068b5..bf147f8db39 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
 #include <net/arp.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <linux/skbuff.h>
@@ -112,11 +113,7 @@
 #include <linux/mroute.h>
 #endif
 
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
-
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet_sock_nr;
-#endif
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
 
 extern void ip_mc_drop_socket(struct sock *sk);
 
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
 	if (inet->opt)
 		kfree(inet->opt);
 	dst_release(sk->sk_dst_cache);
-#ifdef INET_REFCNT_DEBUG
-	atomic_dec(&inet_sock_nr);
-	printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
-	       sk, atomic_read(&inet_sock_nr));
-#endif
+	sk_refcnt_debug_dec(sk);
 }
 
 /*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
-		err = tcp_listen_start(sk);
+		err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
 		if (err)
 			goto out;
 	}
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
 	struct proto *answer_prot;
 	unsigned char answer_flags;
 	char answer_no_check;
-	int err;
+	int try_loading_module = 0;
+	int err = -ESOCKTNOSUPPORT;
 
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
 	answer = NULL;
+lookup_protocol:
 	rcu_read_lock();
 	list_for_each_rcu(p, &inetsw[sock->type]) {
 		answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
 		answer = NULL;
 	}
 
-	err = -ESOCKTNOSUPPORT;
-	if (!answer)
-		goto out_rcu_unlock;
+	if (unlikely(answer == NULL)) {
+		if (try_loading_module < 2) {
+			rcu_read_unlock();
+			/*
+			 * Be more specific, e.g. net-pf-2-proto-132-type-1
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+			 */
+			if (++try_loading_module == 1)
+				request_module("net-pf-%d-proto-%d-type-%d",
+					       PF_INET, protocol, sock->type);
+			/*
+			 * Fall back to generic, e.g. net-pf-2-proto-132
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+			 */
+			else
+				request_module("net-pf-%d-proto-%d",
+					       PF_INET, protocol);
+			goto lookup_protocol;
+		} else
+			goto out_rcu_unlock;
+	}
+
 	err = -EPERM;
 	if (answer->capability > 0 && !capable(answer->capability))
 		goto out_rcu_unlock;
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
 	inet->mc_index	= 0;
 	inet->mc_list	= NULL;
 
-#ifdef INET_REFCNT_DEBUG
-	atomic_inc(&inet_sock_nr);
-#endif
+	sk_refcnt_debug_inc(sk);
 
 	if (inet->num) {
 		/* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-
-extern void tcp_init(void);
-extern void tcp_v4_init(struct net_proto_family *);
-
 /* Upon startup we insert all the elements in inetsw_array[] into
  * the linked list inetsw.
  */
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
 	}
 }
 
+/*
+ *      Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int err;
+	struct rtable *rt;
+	__u32 old_saddr = inet->saddr;
+	__u32 new_saddr;
+	__u32 daddr = inet->daddr;
+
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+
+	/* Query new route. */
+	err = ip_route_connect(&rt, daddr, 0,
+			       RT_CONN_FLAGS(sk),
+			       sk->sk_bound_dev_if,
+			       sk->sk_protocol,
+			       inet->sport, inet->dport, sk);
+	if (err)
+		return err;
+
+	sk_setup_caps(sk, &rt->u.dst);
+
+	new_saddr = rt->rt_src;
+
+	if (new_saddr == old_saddr)
+		return 0;
+
+	if (sysctl_ip_dynaddr > 1) {
+		printk(KERN_INFO "%s(): shifting inet->"
+				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+		       __FUNCTION__,
+		       NIPQUAD(old_saddr),
+		       NIPQUAD(new_saddr));
+	}
+
+	inet->saddr = inet->rcv_saddr = new_saddr;
+
+	/*
+	 * XXX The one ugly spot where we need to
+	 * XXX really change the socket's identity after
+	 * XXX it has entered the hashes. -DaveM
+	 *
+	 * Besides that, it does not check for connection
+	 * uniqueness. Wait for troubles.
+	 */
+	__sk_prot_rehash(sk);
+	return 0;
+}
+
+int inet_sk_rebuild_header(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+	u32 daddr;
+	int err;
+
+	/* Route is OK, nothing to do. */
+	if (rt)
+		return 0;
+
+	/* Reroute. */
+	daddr = inet->daddr;
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+{
+	struct flowi fl = {
+		.oif = sk->sk_bound_dev_if,
+		.nl_u = {
+			.ip4_u = {
+				.daddr	= daddr,
+				.saddr	= inet->saddr,
+				.tos	= RT_CONN_FLAGS(sk),
+			},
+		},
+		.proto = sk->sk_protocol,
+		.uli_u = {
+			.ports = {
+				.sport = inet->sport,
+				.dport = inet->dport,
+			},
+		},
+	};
+						
+	err = ip_route_output_flow(&rt, &fl, sk, 0);
+}
+	if (!err)
+		sk_setup_caps(sk, &rt->u.dst);
+	else {
+		/* Routing failed... */
+		sk->sk_route_caps = 0;
+		/*
+		 * Other protocols have to map their equivalent state to TCP_SYN_SENT.
+		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+		 */
+		if (!sysctl_ip_dynaddr ||
+		    sk->sk_state != TCP_SYN_SENT ||
+		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+		    (err = inet_sk_reselect_saddr(sk)) != 0)
+			sk->sk_err_soft = -err;
+	}
+
+	return err;
+}
+
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
 #ifdef CONFIG_IP_MULTICAST
 static struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void)
 }
 
 static int ipv4_proc_init(void);
-extern void ipfrag_init(void);
 
 /*
  *	IP protocol layer initialiser
@@ -1128,19 +1248,10 @@ module_init(inet_init);
 /* ------------------------------------------------------------------------ */
 
 #ifdef CONFIG_PROC_FS
-extern int  fib_proc_init(void);
-extern void fib_proc_exit(void);
 #ifdef CONFIG_IP_FIB_TRIE
 extern int  fib_stat_proc_init(void);
 extern void fib_stat_proc_exit(void);
 #endif
-extern int  ip_misc_proc_init(void);
-extern int  raw_proc_init(void);
-extern void raw_proc_exit(void);
-extern int  tcp4_proc_init(void);
-extern void tcp4_proc_exit(void);
-extern int  udp4_proc_init(void);
-extern void udp4_proc_exit(void);
 
 static int __init ipv4_proc_init(void)
 {
@@ -1205,7 +1316,3 @@ EXPORT_SYMBOL(inet_stream_ops);
 EXPORT_SYMBOL(inet_unregister_protosw);
 EXPORT_SYMBOL(net_statistics);
 EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
-
-#ifdef INET_REFCNT_DEBUG
-EXPORT_SYMBOL(inet_sock_nr);
-#endif
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd61285..8bf312bdea1 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip,
 static void parp_redo(struct sk_buff *skb)
 {
 	nf_reset(skb);
-	arp_rcv(skb, skb->dev, NULL);
+	arp_rcv(skb, skb->dev, NULL, skb->dev);
 }
 
 /*
@@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb)
 				if (n)
 					neigh_release(n);
 
-				if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || 
+				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || 
 				    skb->pkt_type == PACKET_HOST ||
 				    in_dev->arp_parms->proxy_delay == 0) {
 					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -927,7 +927,7 @@ out:
  *	Receive an arp request from the device layer.
  */
 
-int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct arphdr *arp;
 
@@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
 		goto out_of_mem;
 
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
 	return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
 
 freeskb:
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561f254..c1b42b5257f 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
 #include <linux/module.h>
 #include <linux/ip.h>
 #include <linux/in.h>
+#include <net/ip.h>
 #include <net/sock.h>
-#include <net/tcp.h>
 #include <net/route.h>
+#include <net/tcp_states.h>
 
 int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d8a10e3dd77..ba2895ae815 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
 	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
 
 	if (!skb)
-		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
 	else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
 	} else {
-		NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
-		netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+		netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
 	}
 }
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba57446d5d1..b31ffc5053d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
 		return;
-	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
-			ntohl(esph->spi), ntohl(iph->daddr)));
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+		 ntohl(esph->spi), ntohl(iph->daddr));
 	xfrm_state_put(x);
 }
 
@@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x)
 
 		if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
 		    crypto_tfm_alg_digestsize(esp->auth.tfm)) {
-			NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
-			       x->aalg->alg_name,
-			       crypto_tfm_alg_digestsize(esp->auth.tfm),
-			       aalg_desc->uinfo.auth.icv_fullbits/8));
+			NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+				 x->aalg->alg_name,
+				 crypto_tfm_alg_digestsize(esp->auth.tfm),
+				 aalg_desc->uinfo.auth.icv_fullbits/8);
 			goto error;
 		}
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cd8e45ab958..4e1379f7126 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len)
 	nl_fib_lookup(frn, tb);
 	
 	pid = nlh->nlmsg_pid;           /*pid of sending process */
-	NETLINK_CB(skb).groups = 0;     /* not in mcast group */
 	NETLINK_CB(skb).pid = 0;         /* from kernel */
 	NETLINK_CB(skb).dst_pid = pid;
-	NETLINK_CB(skb).dst_groups = 0;  /* unicast */
+	NETLINK_CB(skb).dst_group = 0;  /* unicast */
 	netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
 }    
 
 static void nl_fib_lookup_init(void)
 {
-      netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
+      netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
 }
 
 static void fib_disable_ip(struct net_device *dev, int force)
@@ -662,5 +661,4 @@ void __init ip_fib_init(void)
 }
 
 EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
 EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b10d6bb5ef3..2a8c9afc369 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
 
 #include "fib_lookup.h"
 
-static kmem_cache_t *fn_hash_kmem;
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_hash_kmem __read_mostly;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
 
 struct fib_node {
 	struct hlist_node	fn_hash;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b729d97cfa9..ef6609ea0eb 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
 
 struct fib_alias {
 	struct list_head	fa_list;
+	struct rcu_head rcu;
 	struct fib_info		*fa_info;
 	u8			fa_tos;
 	u8			fa_type;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e278cb9d007..d41219e8037 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
 		kfree_skb(skb);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+	NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
 	if (n->nlmsg_flags&NLM_F_ECHO)
 		atomic_inc(&skb->users);
-	netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+	netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
 	if (n->nlmsg_flags&NLM_F_ECHO)
 		netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
 }
@@ -854,6 +854,7 @@ failure:
 	return NULL;
 }
 
+/* Note! fib_semantic_match intentionally uses RCU list functions. */
 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 		       struct fib_result *res, __u32 zone, __u32 mask, 
 			int prefixlen)
@@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 	struct fib_alias *fa;
 	int nh_sel = 0;
 
-	list_for_each_entry(fa, head, fa_list) {
+	list_for_each_entry_rcu(fa, head, fa_list) {
 		int err;
 
 		if (fa->fa_tos &&
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 45efd5f4741..b2dea4e5da7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  *		2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.325"
+#define VERSION "0.402"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -62,6 +62,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
@@ -77,56 +78,55 @@
 #undef CONFIG_IP_FIB_TRIE_STATS
 #define MAX_CHILDS 16384
 
-#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
 #define KEYLENGTH (8*sizeof(t_key))
 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
 
-static DEFINE_RWLOCK(fib_lock);
-
 typedef unsigned int t_key;
 
 #define T_TNODE 0
 #define T_LEAF  1
 #define NODE_TYPE_MASK	0x1UL
-#define NODE_PARENT(_node) \
-	((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
-#define NODE_SET_PARENT(_node, _ptr) \
-	((_node)->_parent = (((unsigned long)(_ptr)) | \
-                     ((_node)->_parent & NODE_TYPE_MASK)))
-#define NODE_INIT_PARENT(_node, _type) \
-	((_node)->_parent = (_type))
-#define NODE_TYPE(_node) \
-	((_node)->_parent & NODE_TYPE_MASK)
-
-#define IS_TNODE(n) (!(n->_parent & T_LEAF))
-#define IS_LEAF(n) (n->_parent & T_LEAF)
+#define NODE_PARENT(node) \
+	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
+
+#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+
+#define NODE_SET_PARENT(node, ptr)		\
+	rcu_assign_pointer((node)->parent,	\
+			   ((unsigned long)(ptr)) | NODE_TYPE(node))
+
+#define IS_TNODE(n) (!(n->parent & T_LEAF))
+#define IS_LEAF(n) (n->parent & T_LEAF)
 
 struct node {
-        t_key key;
-	unsigned long _parent;
+	t_key key;
+	unsigned long parent;
 };
 
 struct leaf {
-        t_key key;
-	unsigned long _parent;
+	t_key key;
+	unsigned long parent;
 	struct hlist_head list;
+	struct rcu_head rcu;
 };
 
 struct leaf_info {
 	struct hlist_node hlist;
+	struct rcu_head rcu;
 	int plen;
 	struct list_head falh;
 };
 
 struct tnode {
-        t_key key;
-	unsigned long _parent;
-        unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
-        unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
-        unsigned short full_children;  /* KEYLENGTH bits needed */
-        unsigned short empty_children; /* KEYLENGTH bits needed */
-        struct node *child[0];
+	t_key key;
+	unsigned long parent;
+	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
+	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
+	unsigned short full_children;	/* KEYLENGTH bits needed */
+	unsigned short empty_children;	/* KEYLENGTH bits needed */
+	struct rcu_head rcu;
+	struct node *child[0];
 };
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -150,77 +150,45 @@ struct trie_stat {
 };
 
 struct trie {
-        struct node *trie;
+	struct node *trie;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats stats;
 #endif
-        int size;
+	int size;
 	unsigned int revision;
 };
 
-static int trie_debug = 0;
-
-static int tnode_full(struct tnode *tn, struct node *n);
 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
-static int tnode_child_length(struct tnode *tn);
 static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
 static void tnode_free(struct tnode *tn);
 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
-extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
-                            struct fib_info **last_resort, int *last_idx, int *dflt);
-
-extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
-               struct nlmsghdr *n, struct netlink_skb_parms *req);
 
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
 static struct trie *trie_local = NULL, *trie_main = NULL;
 
-static void trie_bug(char *err)
-{
-	printk("Trie Bug: %s\n", err);
-	BUG();
-}
+
+/* rcu_read_lock needs to be held by the caller on the read side */
 
 static inline struct node *tnode_get_child(struct tnode *tn, int i)
 {
-        if (i >= 1<<tn->bits)
-                trie_bug("tnode_get_child");
+	BUG_ON(i >= 1 << tn->bits);
 
-        return tn->child[i];
+	return rcu_dereference(tn->child[i]);
 }
 
-static inline int tnode_child_length(struct tnode *tn)
+static inline int tnode_child_length(const struct tnode *tn)
 {
-        return 1<<tn->bits;
+	return 1 << tn->bits;
 }
 
-/*
-  _________________________________________________________________
-  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
-  ----------------------------------------------------------------
-    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
-
-  _________________________________________________________________
-  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
-  -----------------------------------------------------------------
-   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
-
-  tp->pos = 7
-  tp->bits = 3
-  n->pos = 15
-  n->bits=4
-  KEYLENGTH=32
-*/
-
 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 {
-        if (offset < KEYLENGTH)
+	if (offset < KEYLENGTH)
 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
-        else
+	else
 		return 0;
 }
 
@@ -233,8 +201,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 {
 	if (bits == 0 || offset >= KEYLENGTH)
 		return 1;
-        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
-        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 }
 
 static inline int tkey_mismatch(t_key a, int offset, t_key b)
@@ -249,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
 	return i;
 }
 
-/* Candiate for fib_semantics */
-
-static void fn_free_alias(struct fib_alias *fa)
-{
-	fib_release_info(fa->fa_info);
-	kmem_cache_free(fn_alias_kmem, fa);
-}
-
 /*
   To understand this stuff, an understanding of keys and all their bits is 
   necessary. Every node in the trie has a key associated with it, but not 
@@ -295,7 +255,7 @@ static void fn_free_alias(struct fib_alias *fa)
   tp->pos = 7
   tp->bits = 3
   n->pos = 15
-  n->bits=4
+  n->bits = 4
 
   First, let's just ignore the bits that come before the parent tp, that is 
   the bits from 0 to (tp->pos-1). They are *known* but at this point we do 
@@ -320,60 +280,65 @@ static void fn_free_alias(struct fib_alias *fa)
 
 */
 
-static void check_tnode(struct tnode *tn)
+static inline void check_tnode(const struct tnode *tn)
 {
-	if (tn && tn->pos+tn->bits > 32) {
-		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
-	}
+	WARN_ON(tn && tn->pos+tn->bits > 32);
 }
 
 static int halve_threshold = 25;
 static int inflate_threshold = 50;
 
-static struct leaf *leaf_new(void)
+
+static void __alias_free_mem(struct rcu_head *head)
 {
-	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
-	if (l) {
-		NODE_INIT_PARENT(l, T_LEAF);
-		INIT_HLIST_HEAD(&l->list);
-	}
-	return l;
+	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+	kmem_cache_free(fn_alias_kmem, fa);
 }
 
-static struct leaf_info *leaf_info_new(int plen)
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
 {
-	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
-	if (li) {
-		li->plen = plen;
-		INIT_LIST_HEAD(&li->falh);
-	}
-	return li;
+	call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct leaf, rcu));
+}
+
+static inline void free_leaf(struct leaf *leaf)
+{
+	call_rcu(&leaf->rcu, __leaf_free_rcu);
 }
 
-static inline void free_leaf(struct leaf *l)
+static void __leaf_info_free_rcu(struct rcu_head *head)
 {
-	kfree(l);
+	kfree(container_of(head, struct leaf_info, rcu));
 }
 
-static inline void free_leaf_info(struct leaf_info *li)
+static inline void free_leaf_info(struct leaf_info *leaf)
 {
-	kfree(li);
+	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 }
 
 static struct tnode *tnode_alloc(unsigned int size)
 {
-	if (size <= PAGE_SIZE) {
-		return kmalloc(size, GFP_KERNEL);
-	} else {
-		return (struct tnode *)
-			__get_free_pages(GFP_KERNEL, get_order(size));
-	}
+	struct page *pages;
+
+	if (size <= PAGE_SIZE)
+		return kcalloc(size, 1, GFP_KERNEL);
+
+	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+	if (!pages)
+		return NULL;
+
+	return page_address(pages);
 }
 
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
 {
+	struct tnode *tn = container_of(head, struct tnode, rcu);
 	unsigned int size = sizeof(struct tnode) +
-	                    (1<<tn->bits) * sizeof(struct node *);
+		(1 << tn->bits) * sizeof(struct node *);
 
 	if (size <= PAGE_SIZE)
 		kfree(tn);
@@ -381,15 +346,40 @@ static void __tnode_free(struct tnode *tn)
 		free_pages((unsigned long)tn, get_order(size));
 }
 
+static inline void tnode_free(struct tnode *tn)
+{
+	call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
+	if (l) {
+		l->parent = T_LEAF;
+		INIT_HLIST_HEAD(&l->list);
+	}
+	return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
+	if (li) {
+		li->plen = plen;
+		INIT_LIST_HEAD(&li->falh);
+	}
+	return li;
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
 	int nchildren = 1<<bits;
 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
 	struct tnode *tn = tnode_alloc(sz);
 
-	if (tn)  {
+	if (tn) {
 		memset(tn, 0, sz);
-		NODE_INIT_PARENT(tn, T_TNODE);
+		tn->parent = T_TNODE;
 		tn->pos = pos;
 		tn->bits = bits;
 		tn->key = key;
@@ -397,38 +387,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
 		tn->empty_children = 1<<bits;
 	}
 
-	if (trie_debug > 0)
-		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
-		       (unsigned int) (sizeof(struct node) * 1<<bits));
+	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
+		 (unsigned int) (sizeof(struct node) * 1<<bits));
 	return tn;
 }
 
-static void tnode_free(struct tnode *tn)
-{
-	if (!tn) {
-		trie_bug("tnode_free\n");
-	}
-	if (IS_LEAF(tn)) {
-		free_leaf((struct leaf *)tn);
-		if (trie_debug > 0 )
-			printk("FL %p \n", tn);
-	}
-	else if (IS_TNODE(tn)) {
-		__tnode_free(tn);
-		if (trie_debug > 0 )
-			printk("FT %p \n", tn);
-	}
-	else {
-		trie_bug("tnode_free\n");
-	}
-}
-
 /*
  * Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
  */
 
-static inline int tnode_full(struct tnode *tn, struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct node *n)
 {
 	if (n == NULL || IS_LEAF(n))
 		return 0;
@@ -448,15 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
 
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
 {
-	struct node *chi;
+	struct node *chi = tn->child[i];
 	int isfull;
 
-	if (i >= 1<<tn->bits) {
-		printk("bits=%d, i=%d\n", tn->bits, i);
-		trie_bug("tnode_put_child_reorg bits");
-	}
-	write_lock_bh(&fib_lock);
-	chi = tn->child[i];
+	BUG_ON(i >= 1<<tn->bits);
+
 
 	/* update emptyChildren */
 	if (n == NULL && chi != NULL)
@@ -465,33 +430,32 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
 		tn->empty_children--;
 
 	/* update fullChildren */
-        if (wasfull == -1)
+	if (wasfull == -1)
 		wasfull = tnode_full(tn, chi);
 
 	isfull = tnode_full(tn, n);
 	if (wasfull && !isfull)
 		tn->full_children--;
-
 	else if (!wasfull && isfull)
 		tn->full_children++;
+
 	if (n)
 		NODE_SET_PARENT(n, tn);
 
-	tn->child[i] = n;
-	write_unlock_bh(&fib_lock);
+	rcu_assign_pointer(tn->child[i], n);
 }
 
 static struct node *resize(struct trie *t, struct tnode *tn)
 {
 	int i;
 	int err = 0;
+	struct tnode *old_tn;
 
  	if (!tn)
 		return NULL;
 
-	if (trie_debug)
-		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
-		      tn, inflate_threshold, halve_threshold);
+	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+		 tn, inflate_threshold, halve_threshold);
 
 	/* No children */
 	if (tn->empty_children == tnode_child_length(tn)) {
@@ -501,20 +465,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	/* One child */
 	if (tn->empty_children == tnode_child_length(tn) - 1)
 		for (i = 0; i < tnode_child_length(tn); i++) {
+			struct node *n;
 
-			write_lock_bh(&fib_lock);
-			if (tn->child[i] != NULL) {
-
-				/* compress one level */
-				struct node *n = tn->child[i];
-				if (n)
-					NODE_INIT_PARENT(n, NODE_TYPE(n));
+			n = tn->child[i];
+			if (!n)
+				continue;
 
-				write_unlock_bh(&fib_lock);
-				tnode_free(tn);
-				return n;
-			}
-			write_unlock_bh(&fib_lock);
+			/* compress one level */
+			NODE_SET_PARENT(n, NULL);
+			tnode_free(tn);
+			return n;
 		}
 	/*
 	 * Double as long as the resulting node has a number of
@@ -566,16 +526,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 *
 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children ) >= inflate_threshold * new_child_length
+	 *    tn->full_children) >= inflate_threshold * new_child_length
 	 *
 	 * expand new_child_length:
 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children ) >=
+	 *    tn->full_children) >=
 	 *      inflate_threshold * tnode_child_length(tn) * 2
 	 *
 	 * shorten again:
 	 * 50 * (tn->full_children + tnode_child_length(tn) -
-	 *    tn->empty_children ) >= inflate_threshold *
+	 *    tn->empty_children) >= inflate_threshold *
 	 *    tnode_child_length(tn)
 	 *
 	 */
@@ -587,9 +547,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 				inflate_threshold * tnode_child_length(tn))) {
 
-		tn = inflate(t, tn, &err);
-
-		if (err) {
+		old_tn = tn;
+		tn = inflate(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			t->stats.resize_node_skipped++;
 #endif
@@ -609,9 +570,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
 	       halve_threshold * tnode_child_length(tn)) {
 
-		tn = halve(t, tn, &err);
-
-		if (err) {
+		old_tn = tn;
+		tn = halve(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			t->stats.resize_node_skipped++;
 #endif
@@ -621,44 +583,37 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 
 	/* Only one child remains */
-
 	if (tn->empty_children == tnode_child_length(tn) - 1)
 		for (i = 0; i < tnode_child_length(tn); i++) {
-		
-			write_lock_bh(&fib_lock);
-			if (tn->child[i] != NULL) {
-				/* compress one level */
-				struct node *n = tn->child[i];
-
-				if (n)
-					NODE_INIT_PARENT(n, NODE_TYPE(n));
-
-				write_unlock_bh(&fib_lock);
-				tnode_free(tn);
-				return n;
-			}
-			write_unlock_bh(&fib_lock);
+			struct node *n;
+
+			n = tn->child[i];
+			if (!n)
+				continue;
+
+			/* compress one level */
+
+			NODE_SET_PARENT(n, NULL);
+			tnode_free(tn);
+			return n;
 		}
 
 	return (struct node *) tn;
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
 {
 	struct tnode *inode;
 	struct tnode *oldtnode = tn;
 	int olen = tnode_child_length(tn);
 	int i;
 
-  	if (trie_debug)
-		printk("In inflate\n");
+	pr_debug("In inflate\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 
-	if (!tn) {
-		*err = -ENOMEM;
-		return oldtnode;
-	}
+	if (!tn)
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -666,8 +621,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 	 * fails. In case of failure we return the oldnode and  inflate
 	 * of tnode is ignored.
 	 */
-		
-	for(i = 0; i < olen; i++) {
+
+	for (i = 0; i < olen; i++) {
 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
 
 		if (inode &&
@@ -675,46 +630,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 		    inode->pos == oldtnode->pos + oldtnode->bits &&
 		    inode->bits > 1) {
 			struct tnode *left, *right;
-
 			t_key m = TKEY_GET_MASK(inode->pos, 1);
 
 			left = tnode_new(inode->key&(~m), inode->pos + 1,
 					 inode->bits - 1);
+			if (!left)
+				goto nomem;
 
-			if (!left) {
-				*err = -ENOMEM;
-				break;
-			}
-		
 			right = tnode_new(inode->key|m, inode->pos + 1,
 					  inode->bits - 1);
 
-			if (!right) {
-				*err = -ENOMEM;
-				break;
-			}
+                        if (!right) {
+				tnode_free(left);
+				goto nomem;
+                        }
 
 			put_child(t, tn, 2*i, (struct node *) left);
 			put_child(t, tn, 2*i+1, (struct node *) right);
 		}
 	}
 
-	if (*err) {
-		int size = tnode_child_length(tn);
-		int j;
-
-		for(j = 0; j < size; j++)
-			if (tn->child[j])
-				tnode_free((struct tnode *)tn->child[j]);
-
-		tnode_free(tn);
-	
-		*err = -ENOMEM;
-		return oldtnode;
-	}
-
-	for(i = 0; i < olen; i++) {
+	for (i = 0; i < olen; i++) {
 		struct node *node = tnode_get_child(oldtnode, i);
+		struct tnode *left, *right;
+		int size, j;
 
 		/* An empty child */
 		if (node == NULL)
@@ -740,76 +679,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 			put_child(t, tn, 2*i+1, inode->child[1]);
 
 			tnode_free(inode);
+			continue;
 		}
 
-			/* An internal node with more than two children */
-		else {
-			struct tnode *left, *right;
-			int size, j;
-
-			/* We will replace this node 'inode' with two new
-			 * ones, 'left' and 'right', each with half of the
-			 * original children. The two new nodes will have
-			 * a position one bit further down the key and this
-			 * means that the "significant" part of their keys
-			 * (see the discussion near the top of this file)
-			 * will differ by one bit, which will be "0" in
-			 * left's key and "1" in right's key. Since we are
-			 * moving the key position by one step, the bit that
-			 * we are moving away from - the bit at position
-			 * (inode->pos) - is the one that will differ between
-			 * left and right. So... we synthesize that bit in the
-			 * two  new keys.
-			 * The mask 'm' below will be a single "one" bit at
-			 * the position (inode->pos)
-			 */
-
-			/* Use the old key, but set the new significant
-			 *   bit to zero.
-			 */
+		/* An internal node with more than two children */
+
+		/* We will replace this node 'inode' with two new
+		 * ones, 'left' and 'right', each with half of the
+		 * original children. The two new nodes will have
+		 * a position one bit further down the key and this
+		 * means that the "significant" part of their keys
+		 * (see the discussion near the top of this file)
+		 * will differ by one bit, which will be "0" in
+		 * left's key and "1" in right's key. Since we are
+		 * moving the key position by one step, the bit that
+		 * we are moving away from - the bit at position
+		 * (inode->pos) - is the one that will differ between
+		 * left and right. So... we synthesize that bit in the
+		 * two  new keys.
+		 * The mask 'm' below will be a single "one" bit at
+		 * the position (inode->pos)
+		 */
 
-			left = (struct tnode *) tnode_get_child(tn, 2*i);
-			put_child(t, tn, 2*i, NULL);
+		/* Use the old key, but set the new significant
+		 *   bit to zero.
+		 */
 
-			if (!left)
-				BUG();
+		left = (struct tnode *) tnode_get_child(tn, 2*i);
+		put_child(t, tn, 2*i, NULL);
 
-			right = (struct tnode *) tnode_get_child(tn, 2*i+1);
-			put_child(t, tn, 2*i+1, NULL);
+		BUG_ON(!left);
 
-			if (!right)
-				BUG();
+		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+		put_child(t, tn, 2*i+1, NULL);
 
-			size = tnode_child_length(left);
-			for(j = 0; j < size; j++) {
-				put_child(t, left, j, inode->child[j]);
-				put_child(t, right, j, inode->child[j + size]);
-			}
-			put_child(t, tn, 2*i, resize(t, left));
-			put_child(t, tn, 2*i+1, resize(t, right));
+		BUG_ON(!right);
 
-			tnode_free(inode);
+		size = tnode_child_length(left);
+		for (j = 0; j < size; j++) {
+			put_child(t, left, j, inode->child[j]);
+			put_child(t, right, j, inode->child[j + size]);
 		}
+		put_child(t, tn, 2*i, resize(t, left));
+		put_child(t, tn, 2*i+1, resize(t, right));
+
+		tnode_free(inode);
 	}
 	tnode_free(oldtnode);
 	return tn;
+nomem:
+	{
+		int size = tnode_child_length(tn);
+		int j;
+
+		for (j = 0; j < size; j++)
+			if (tn->child[j])
+				tnode_free((struct tnode *)tn->child[j]);
+
+		tnode_free(tn);
+
+		return ERR_PTR(-ENOMEM);
+	}
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *halve(struct trie *t, struct tnode *tn)
 {
 	struct tnode *oldtnode = tn;
 	struct node *left, *right;
 	int i;
 	int olen = tnode_child_length(tn);
 
-	if (trie_debug) printk("In halve\n");
+	pr_debug("In halve\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 
-	if (!tn) {
-		*err = -ENOMEM;
-		return oldtnode;
-	}
+	if (!tn)
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -818,38 +763,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
 	 * of tnode is ignored.
 	 */
 
-	for(i = 0; i < olen; i += 2) {
+	for (i = 0; i < olen; i += 2) {
 		left = tnode_get_child(oldtnode, i);
 		right = tnode_get_child(oldtnode, i+1);
 
 		/* Two nonempty children */
-		if (left && right)  {
-			struct tnode *newBinNode =
-				tnode_new(left->key, tn->pos + tn->bits, 1);
+		if (left && right) {
+			struct tnode *newn;
 
-			if (!newBinNode) {
-				*err = -ENOMEM;
-				break;
-			}
-			put_child(t, tn, i/2, (struct node *)newBinNode);
-		}
-	}
+			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 
-	if (*err) {
-		int size = tnode_child_length(tn);
-		int j;
+			if (!newn)
+				goto nomem;
 
-		for(j = 0; j < size; j++)
-			if (tn->child[j])
-				tnode_free((struct tnode *)tn->child[j]);
+			put_child(t, tn, i/2, (struct node *)newn);
+		}
 
-		tnode_free(tn);
-	
-		*err = -ENOMEM;
-		return oldtnode;
 	}
 
-	for(i = 0; i < olen; i += 2) {
+	for (i = 0; i < olen; i += 2) {
+		struct tnode *newBinNode;
+
 		left = tnode_get_child(oldtnode, i);
 		right = tnode_get_child(oldtnode, i+1);
 
@@ -858,88 +792,99 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
 			if (right == NULL)    /* Both are empty */
 				continue;
 			put_child(t, tn, i/2, right);
-		} else if (right == NULL)
+			continue;
+		}
+
+		if (right == NULL) {
 			put_child(t, tn, i/2, left);
+			continue;
+		}
 
 		/* Two nonempty children */
-		else {
-			struct tnode *newBinNode =
-				(struct tnode *) tnode_get_child(tn, i/2);
-			put_child(t, tn, i/2, NULL);
-
-			if (!newBinNode)
-				BUG();
-
-			put_child(t, newBinNode, 0, left);
-			put_child(t, newBinNode, 1, right);
-			put_child(t, tn, i/2, resize(t, newBinNode));
-		}
+		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+		put_child(t, tn, i/2, NULL);
+		put_child(t, newBinNode, 0, left);
+		put_child(t, newBinNode, 1, right);
+		put_child(t, tn, i/2, resize(t, newBinNode));
 	}
 	tnode_free(oldtnode);
 	return tn;
+nomem:
+	{
+		int size = tnode_child_length(tn);
+		int j;
+
+		for (j = 0; j < size; j++)
+			if (tn->child[j])
+				tnode_free((struct tnode *)tn->child[j]);
+
+		tnode_free(tn);
+
+		return ERR_PTR(-ENOMEM);
+	}
 }
 
-static void *trie_init(struct trie *t)
+static void trie_init(struct trie *t)
 {
-	if (t) {
-		t->size = 0;
-		t->trie = NULL;
-		t->revision = 0;
+	if (!t)
+		return;
+
+	t->size = 0;
+	rcu_assign_pointer(t->trie, NULL);
+	t->revision = 0;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-       		memset(&t->stats, 0, sizeof(struct trie_use_stats));
+	memset(&t->stats, 0, sizeof(struct trie_use_stats));
 #endif
-	}
-	return t;
 }
 
+/* readside must use rcu_read_lock; currently dump routines
+ via get_fa_head and dump */
+
 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
 {
 	struct hlist_node *node;
 	struct leaf_info *li;
 
-	hlist_for_each_entry(li, node, head, hlist) {
+	hlist_for_each_entry_rcu(li, node, head, hlist)
 		if (li->plen == plen)
 			return li;
-	}
+
 	return NULL;
 }
 
 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
 {
-	struct list_head *fa_head = NULL;
 	struct leaf_info *li = find_leaf_info(&l->list, plen);
 
-	if (li)
-		fa_head = &li->falh;
+	if (!li)
+		return NULL;
 
-	return fa_head;
+	return &li->falh;
 }
 
 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 {
-	struct leaf_info *li = NULL, *last = NULL;
-	struct hlist_node *node, *tmp;
-
-	write_lock_bh(&fib_lock);
-
-	if (hlist_empty(head))
-		hlist_add_head(&new->hlist, head);
-	else {
-		hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
-		
-			if (new->plen > li->plen)
-				break;
-		
-			last = li;
-		}
-		if (last)
-			hlist_add_after(&last->hlist, &new->hlist);
-		else
-			hlist_add_before(&new->hlist, &li->hlist);
-	}
-	write_unlock_bh(&fib_lock);
+        struct leaf_info *li = NULL, *last = NULL;
+        struct hlist_node *node;
+
+        if (hlist_empty(head)) {
+                hlist_add_head_rcu(&new->hlist, head);
+        } else {
+                hlist_for_each_entry(li, node, head, hlist) {
+                        if (new->plen > li->plen)
+                                break;
+
+                        last = li;
+                }
+                if (last)
+                        hlist_add_after_rcu(&last->hlist, &new->hlist);
+                else
+                        hlist_add_before_rcu(&new->hlist, &li->hlist);
+        }
 }
 
+/* rcu_read_lock needs to be held by caller from readside */
+
 static struct leaf *
 fib_find_node(struct trie *t, u32 key)
 {
@@ -948,61 +893,43 @@ fib_find_node(struct trie *t, u32 key)
 	struct node *n;
 
 	pos = 0;
-	n = t->trie;
+	n = rcu_dereference(t->trie);
 
 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 		tn = (struct tnode *) n;
-		
+
 		check_tnode(tn);
-		
+
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
-			pos=tn->pos + tn->bits;
+			pos = tn->pos + tn->bits;
 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
-		}
-		else
+		} else
 			break;
 	}
 	/* Case we have found a leaf. Compare prefixes */
 
-	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
-		struct leaf *l = (struct leaf *) n;
-		return l;
-	}
+	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
+		return (struct leaf *)n;
+
 	return NULL;
 }
 
 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 {
-	int i = 0;
 	int wasfull;
 	t_key cindex, key;
 	struct tnode *tp = NULL;
 
-	if (!tn)
-		BUG();
-
 	key = tn->key;
-	i = 0;
 
 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
 
-		if (i > 10) {
-			printk("Rebalance tn=%p \n", tn);
-			if (tn) 		printk("tn->parent=%p \n", NODE_PARENT(tn));
-		
-			printk("Rebalance tp=%p \n", tp);
-			if (tp) 		printk("tp->parent=%p \n", NODE_PARENT(tp));
-		}
-
-		if (i > 12) BUG();
-		i++;
-
 		tp = NODE_PARENT(tn);
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
-	
+
 		if (!NODE_PARENT(tn))
 			break;
 
@@ -1015,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	return (struct node*) tn;
 }
 
+/* only used from updater-side */
+
 static  struct list_head *
 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 {
@@ -1050,20 +979,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 
 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 		tn = (struct tnode *) n;
-		
+
 		check_tnode(tn);
-	
+
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 			tp = tn;
-			pos=tn->pos + tn->bits;
+			pos = tn->pos + tn->bits;
 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
 
-			if (n && NODE_PARENT(n) != tn) {
-				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
-				BUG();
-			}
-		}
-		else
+			BUG_ON(n && NODE_PARENT(n) != tn);
+		} else
 			break;
 	}
 
@@ -1073,17 +998,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 	 * tp is n's (parent) ----> NULL or TNODE
 	 */
 
-	if (tp && IS_LEAF(tp))
-		BUG();
-
+	BUG_ON(tp && IS_LEAF(tp));
 
 	/* Case 1: n is a leaf. Compare prefixes */
 
 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
-		struct leaf *l = ( struct leaf *)  n;
-	
+		struct leaf *l = (struct leaf *) n;
+
 		li = leaf_info_new(plen);
-	
+
 		if (!li) {
 			*err = -ENOMEM;
 			goto err;
@@ -1113,35 +1036,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 	fa_head = &li->falh;
 	insert_leaf_info(&l->list, li);
 
-	/* Case 2: n is NULL, and will just insert a new leaf */
 	if (t->trie && n == NULL) {
+		/* Case 2: n is NULL, and will just insert a new leaf */
 
 		NODE_SET_PARENT(l, tp);
-	
-		if (!tp)
-			BUG();
 
-		else {
-			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
-		}
-	}
-	/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
-	else {
+		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+	} else {
+		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
 		/*
 		 *  Add a new tnode here
 		 *  first tnode need some special handling
 		 */
 
 		if (tp)
-			pos=tp->pos+tp->bits;
+			pos = tp->pos+tp->bits;
 		else
-			pos=0;
+			pos = 0;
+
 		if (n) {
 			newpos = tkey_mismatch(key, pos, n->key);
 			tn = tnode_new(n->key, newpos, 1);
-		}
-		else {
+		} else {
 			newpos = 0;
 			tn = tnode_new(key, newpos, 1); /* First tnode */
 		}
@@ -1151,32 +1068,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 			tnode_free((struct tnode *) l);
 			*err = -ENOMEM;
 			goto err;
-		}		
-		
+		}
+
 		NODE_SET_PARENT(tn, tp);
 
-		missbit=tkey_extract_bits(key, newpos, 1);
+		missbit = tkey_extract_bits(key, newpos, 1);
 		put_child(t, tn, missbit, (struct node *)l);
 		put_child(t, tn, 1-missbit, n);
 
 		if (tp) {
 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
-		}
-		else {
-			t->trie = (struct node*) tn; /* First tnode */
+		} else {
+			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
 			tp = tn;
 		}
 	}
-	if (tp && tp->pos+tp->bits > 32) {
+
+	if (tp && tp->pos + tp->bits > 32)
 		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
 		       tp, tp->pos, tp->bits, key, plen);
-	}
+
 	/* Rebalance the trie */
-	t->trie = trie_rebalance(t, tp);
+
+	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
 done:
 	t->revision++;
-err:;
+err:
 	return fa_head;
 }
 
@@ -1204,17 +1122,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 
 	key = ntohl(key);
 
-	if (trie_debug)
-		printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
+	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
 
-	mask = ntohl( inet_make_mask(plen) );
+	mask = ntohl(inet_make_mask(plen));
 
 	if (key & ~mask)
 		return -EINVAL;
 
 	key = key & mask;
 
-	if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
+	fi = fib_create_info(r, rta, nlhdr, &err);
+
+	if (!fi)
 		goto err;
 
 	l = fib_find_node(t, key);
@@ -1236,8 +1155,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	 * and we need to allocate a new one of those as well.
 	 */
 
-	if (fa &&
-	    fa->fa_info->fib_priority == fi->fib_priority) {
+	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
 		struct fib_alias *fa_orig;
 
 		err = -EEXIST;
@@ -1248,22 +1166,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			struct fib_info *fi_drop;
 			u8 state;
 
-			write_lock_bh(&fib_lock);
+			err = -ENOBUFS;
+			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+			if (new_fa == NULL)
+				goto out;
 
 			fi_drop = fa->fa_info;
-			fa->fa_info = fi;
-			fa->fa_type = type;
-			fa->fa_scope = r->rtm_scope;
+			new_fa->fa_tos = fa->fa_tos;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = type;
+			new_fa->fa_scope = r->rtm_scope;
 			state = fa->fa_state;
-			fa->fa_state &= ~FA_S_ACCESSED;
+			new_fa->fa_state &= ~FA_S_ACCESSED;
 
-			write_unlock_bh(&fib_lock);
+			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+			alias_free_mem_rcu(fa);
 
 			fib_release_info(fi_drop);
 			if (state & FA_S_ACCESSED)
-			  rt_cache_flush(-1);
+				rt_cache_flush(-1);
 
-			    goto succeeded;
+			goto succeeded;
 		}
 		/* Error if we find a perfect match which
 		 * uses the same scope, type, and nexthop
@@ -1285,7 +1208,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			fa = fa_orig;
 	}
 	err = -ENOENT;
-	if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
+	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
 		goto out;
 
 	err = -ENOBUFS;
@@ -1298,9 +1221,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	new_fa->fa_type = type;
 	new_fa->fa_scope = r->rtm_scope;
 	new_fa->fa_state = 0;
-#if 0
-	new_fa->dst = NULL;
-#endif
 	/*
 	 * Insert new entry to the list.
 	 */
@@ -1312,12 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			goto out_free_new_fa;
 	}
 
-	write_lock_bh(&fib_lock);
-
-	list_add_tail(&new_fa->fa_list,
-		 (fa ? &fa->fa_list : fa_head));
-
-	write_unlock_bh(&fib_lock);
+	list_add_tail_rcu(&new_fa->fa_list,
+			  (fa ? &fa->fa_list : fa_head));
 
 	rt_cache_flush(-1);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
@@ -1328,11 +1244,14 @@ out_free_new_fa:
 	kmem_cache_free(fn_alias_kmem, new_fa);
 out:
 	fib_release_info(fi);
-err:;
+err:
 	return err;
 }
 
-static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
+
+/* should be called with rcu_read_lock */
+static inline int check_leaf(struct trie *t, struct leaf *l,
+			     t_key key, int *plen, const struct flowi *flp,
 			     struct fib_result *res)
 {
 	int err, i;
@@ -1341,8 +1260,7 @@ static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *pl
 	struct hlist_head *hhead = &l->list;
 	struct hlist_node *node;
 
-	hlist_for_each_entry(li, node, hhead, hlist) {
-
+	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
 		i = li->plen;
 		mask = ntohl(inet_make_mask(i));
 		if (l->key != (key & mask))
@@ -1370,13 +1288,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
 	struct node *n;
 	struct tnode *pn;
 	int pos, bits;
-	t_key key=ntohl(flp->fl4_dst);
+	t_key key = ntohl(flp->fl4_dst);
 	int chopped_off;
 	t_key cindex = 0;
 	int current_prefix_length = KEYLENGTH;
-	n = t->trie;
+	struct tnode *cn;
+	t_key node_prefix, key_prefix, pref_mismatch;
+	int mp;
+
+	rcu_read_lock();
 
-	read_lock(&fib_lock);
+	n = rcu_dereference(t->trie);
 	if (!n)
 		goto failed;
 
@@ -1393,8 +1315,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
 	pn = (struct tnode *) n;
 	chopped_off = 0;
 
-        while (pn) {
-
+	while (pn) {
 		pos = pn->pos;
 		bits = pn->bits;
 
@@ -1410,130 +1331,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
 			goto backtrace;
 		}
 
-		if (IS_TNODE(n)) {
+		if (IS_LEAF(n)) {
+			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
+				goto found;
+			else
+				goto backtrace;
+		}
+
 #define HL_OPTIMIZE
 #ifdef HL_OPTIMIZE
-			struct tnode *cn = (struct tnode *)n;
-			t_key node_prefix, key_prefix, pref_mismatch;
-			int mp;
+		cn = (struct tnode *)n;
 
-			/*
-			 * It's a tnode, and we can do some extra checks here if we
-			 * like, to avoid descending into a dead-end branch.
-			 * This tnode is in the parent's child array at index
-			 * key[p_pos..p_pos+p_bits] but potentially with some bits
-			 * chopped off, so in reality the index may be just a
-			 * subprefix, padded with zero at the end.
-			 * We can also take a look at any skipped bits in this
-			 * tnode - everything up to p_pos is supposed to be ok,
-			 * and the non-chopped bits of the index (se previous
-			 * paragraph) are also guaranteed ok, but the rest is
-			 * considered unknown.
-			 *
-			 * The skipped bits are key[pos+bits..cn->pos].
-			 */
-		
-			/* If current_prefix_length < pos+bits, we are already doing
-			 * actual prefix  matching, which means everything from
-			 * pos+(bits-chopped_off) onward must be zero along some
-			 * branch of this subtree - otherwise there is *no* valid
-			 * prefix present. Here we can only check the skipped
-			 * bits. Remember, since we have already indexed into the
-			 * parent's child array, we know that the bits we chopped of
-			 * *are* zero.
-			 */
+		/*
+		 * It's a tnode, and we can do some extra checks here if we
+		 * like, to avoid descending into a dead-end branch.
+		 * This tnode is in the parent's child array at index
+		 * key[p_pos..p_pos+p_bits] but potentially with some bits
+		 * chopped off, so in reality the index may be just a
+		 * subprefix, padded with zero at the end.
+		 * We can also take a look at any skipped bits in this
+		 * tnode - everything up to p_pos is supposed to be ok,
+		 * and the non-chopped bits of the index (see previous
+		 * paragraph) are also guaranteed ok, but the rest is
+		 * considered unknown.
+		 *
+		 * The skipped bits are key[pos+bits..cn->pos].
+		 */
 
-			/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
-		
-			if (current_prefix_length < pos+bits) {
-				if (tkey_extract_bits(cn->key, current_prefix_length,
-						      cn->pos - current_prefix_length) != 0 ||
-				    !(cn->child[0]))
-					goto backtrace;
-			}
+		/* If current_prefix_length < pos+bits, we are already doing
+		 * actual prefix  matching, which means everything from
+		 * pos+(bits-chopped_off) onward must be zero along some
+		 * branch of this subtree - otherwise there is *no* valid
+		 * prefix present. Here we can only check the skipped
+		 * bits. Remember, since we have already indexed into the
+		 * parent's child array, we know that the bits we chopped off
+		 * *are* zero.
+		 */
 
-			/*
-			 * If chopped_off=0, the index is fully validated and we
-			 * only need to look at the skipped bits for this, the new,
-			 * tnode. What we actually want to do is to find out if
-			 * these skipped bits match our key perfectly, or if we will
-			 * have to count on finding a matching prefix further down,
-			 * because if we do, we would like to have some way of
-			 * verifying the existence of such a prefix at this point.
-			 */
+		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
 
-			/* The only thing we can do at this point is to verify that
-			 * any such matching prefix can indeed be a prefix to our
-			 * key, and if the bits in the node we are inspecting that
-			 * do not match our key are not ZERO, this cannot be true.
-			 * Thus, find out where there is a mismatch (before cn->pos)
-			 * and verify that all the mismatching bits are zero in the
-			 * new tnode's key.
-			 */
+		if (current_prefix_length < pos+bits) {
+			if (tkey_extract_bits(cn->key, current_prefix_length,
+						cn->pos - current_prefix_length) != 0 ||
+			    !(cn->child[0]))
+				goto backtrace;
+		}
 
-			/* Note: We aren't very concerned about the piece of the key
-			 * that precede pn->pos+pn->bits, since these have already been
-			 * checked. The bits after cn->pos aren't checked since these are
-			 * by definition "unknown" at this point. Thus, what we want to
-			 * see is if we are about to enter the "prefix matching" state,
-			 * and in that case verify that the skipped bits that will prevail
-			 * throughout this subtree are zero, as they have to be if we are
-			 * to find a matching prefix.
-			 */
+		/*
+		 * If chopped_off=0, the index is fully validated and we
+		 * only need to look at the skipped bits for this, the new,
+		 * tnode. What we actually want to do is to find out if
+		 * these skipped bits match our key perfectly, or if we will
+		 * have to count on finding a matching prefix further down,
+		 * because if we do, we would like to have some way of
+		 * verifying the existence of such a prefix at this point.
+		 */
 
-			node_prefix = MASK_PFX(cn->key, cn->pos);
-			key_prefix = MASK_PFX(key, cn->pos);
-			pref_mismatch = key_prefix^node_prefix;
-			mp = 0;
+		/* The only thing we can do at this point is to verify that
+		 * any such matching prefix can indeed be a prefix to our
+		 * key, and if the bits in the node we are inspecting that
+		 * do not match our key are not ZERO, this cannot be true.
+		 * Thus, find out where there is a mismatch (before cn->pos)
+		 * and verify that all the mismatching bits are zero in the
+		 * new tnode's key.
+		 */
 
-			/* In short: If skipped bits in this node do not match the search
-			 * key, enter the "prefix matching" state.directly.
-			 */
-			if (pref_mismatch) {
-				while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
-					mp++;
-					pref_mismatch = pref_mismatch <<1;
-				}
-				key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
-			
-				if (key_prefix != 0)
-					goto backtrace;
-
-				if (current_prefix_length >= cn->pos)
-					current_prefix_length=mp;
-		       }
-#endif
-		       pn = (struct tnode *)n; /* Descend */
-		       chopped_off = 0;
-		       continue;
+		/* Note: We aren't very concerned about the piece of the key
+		 * that precede pn->pos+pn->bits, since these have already been
+		 * checked. The bits after cn->pos aren't checked since these are
+		 * by definition "unknown" at this point. Thus, what we want to
+		 * see is if we are about to enter the "prefix matching" state,
+		 * and in that case verify that the skipped bits that will prevail
+		 * throughout this subtree are zero, as they have to be if we are
+		 * to find a matching prefix.
+		 */
+
+		node_prefix = MASK_PFX(cn->key, cn->pos);
+		key_prefix = MASK_PFX(key, cn->pos);
+		pref_mismatch = key_prefix^node_prefix;
+		mp = 0;
+
+		/* In short: If skipped bits in this node do not match the search
+		 * key, enter the "prefix matching" state directly.
+		 */
+		if (pref_mismatch) {
+			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
+				mp++;
+				pref_mismatch = pref_mismatch <<1;
+			}
+			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+
+			if (key_prefix != 0)
+				goto backtrace;
+
+			if (current_prefix_length >= cn->pos)
+				current_prefix_length = mp;
 		}
-		if (IS_LEAF(n)) {
-			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
-				goto found;
-	       }
+#endif
+		pn = (struct tnode *)n; /* Descend */
+		chopped_off = 0;
+		continue;
+
 backtrace:
 		chopped_off++;
 
 		/* As zero don't change the child key (cindex) */
-		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
+		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
 			chopped_off++;
-		}
 
 		/* Decrease current_... with bits chopped off */
 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
 			current_prefix_length = pn->pos + pn->bits - chopped_off;
-	
+
 		/*
 		 * Either we do the actual chop off according or if we have
 		 * chopped off all bits in this tnode walk up to our parent.
 		 */
 
-		if (chopped_off <= pn->bits)
+		if (chopped_off <= pn->bits) {
 			cindex &= ~(1 << (chopped_off-1));
-		else {
+		} else {
 			if (NODE_PARENT(pn) == NULL)
 				goto failed;
-		
+
 			/* Get Child's index */
 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
 			pn = NODE_PARENT(pn);
@@ -1548,10 +1468,11 @@ backtrace:
 failed:
 	ret = 1;
 found:
-	read_unlock(&fib_lock);
+	rcu_read_unlock();
 	return ret;
 }
 
+/* only called from updater side */
 static int trie_leaf_remove(struct trie *t, t_key key)
 {
 	t_key cindex;
@@ -1559,24 +1480,20 @@ static int trie_leaf_remove(struct trie *t, t_key key)
 	struct node *n = t->trie;
 	struct leaf *l;
 
-	if (trie_debug)
-		printk("entering trie_leaf_remove(%p)\n", n);
+	pr_debug("entering trie_leaf_remove(%p)\n", n);
 
 	/* Note that in the case skipped bits, those bits are *not* checked!
 	 * When we finish this, we will have NULL or a T_LEAF, and the
 	 * T_LEAF may or may not match our key.
 	 */
 
-        while (n != NULL && IS_TNODE(n)) {
+	while (n != NULL && IS_TNODE(n)) {
 		struct tnode *tn = (struct tnode *) n;
 		check_tnode(tn);
 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
 
-			if (n && NODE_PARENT(n) != tn) {
-				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
-				BUG();
-			}
-        }
+		BUG_ON(n && NODE_PARENT(n) != tn);
+	}
 	l = (struct leaf *) n;
 
 	if (!n || !tkey_equals(l->key, key))
@@ -1590,23 +1507,24 @@ static int trie_leaf_remove(struct trie *t, t_key key)
 	t->revision++;
 	t->size--;
 
+	preempt_disable();
 	tp = NODE_PARENT(n);
 	tnode_free((struct tnode *) n);
 
 	if (tp) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		put_child(t, (struct tnode *)tp, cindex, NULL);
-		t->trie = trie_rebalance(t, tp);
-	}
-	else
-		t->trie = NULL;
+		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+	} else
+		rcu_assign_pointer(t->trie, NULL);
+	preempt_enable();
 
 	return 1;
 }
 
 static int
 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	u32 key, mask;
@@ -1615,6 +1533,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	struct fib_alias *fa, *fa_to_delete;
 	struct list_head *fa_head;
 	struct leaf *l;
+	struct leaf_info *li;
+
 
 	if (plen > 32)
 		return -EINVAL;
@@ -1624,7 +1544,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		memcpy(&key, rta->rta_dst, 4);
 
 	key = ntohl(key);
-	mask = ntohl( inet_make_mask(plen) );
+	mask = ntohl(inet_make_mask(plen));
 
 	if (key & ~mask)
 		return -EINVAL;
@@ -1641,11 +1561,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	if (!fa)
 		return -ESRCH;
 
-	if (trie_debug)
-		printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
 
 	fa_to_delete = NULL;
 	fa_head = fa->fa_list.prev;
+
 	list_for_each_entry(fa, fa_head, fa_list) {
 		struct fib_info *fi = fa->fa_info;
 
@@ -1664,39 +1584,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		}
 	}
 
-	if (fa_to_delete) {
-		int kill_li = 0;
-		struct leaf_info *li;
-
-		fa = fa_to_delete;
-		rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
+	if (!fa_to_delete)
+		return -ESRCH;
 
-		l = fib_find_node(t, key);
-		li = find_leaf_info(&l->list, plen);
+	fa = fa_to_delete;
+	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
 
-		write_lock_bh(&fib_lock);
+	l = fib_find_node(t, key);
+	li = find_leaf_info(&l->list, plen);
 
-		list_del(&fa->fa_list);
+	list_del_rcu(&fa->fa_list);
 
-		if (list_empty(fa_head)) {
-			hlist_del(&li->hlist);
-			kill_li = 1;
-		}
-		write_unlock_bh(&fib_lock);
-	
-		if (kill_li)
-			free_leaf_info(li);
+	if (list_empty(fa_head)) {
+		hlist_del_rcu(&li->hlist);
+		free_leaf_info(li);
+	}
 
-		if (hlist_empty(&l->list))
-			trie_leaf_remove(t, key);
+	if (hlist_empty(&l->list))
+		trie_leaf_remove(t, key);
 
-		if (fa->fa_state & FA_S_ACCESSED)
-			rt_cache_flush(-1);
+	if (fa->fa_state & FA_S_ACCESSED)
+		rt_cache_flush(-1);
 
-		fn_free_alias(fa);
-		return 0;
-	}
-	return -ESRCH;
+	fib_release_info(fa->fa_info);
+	alias_free_mem_rcu(fa);
+	return 0;
 }
 
 static int trie_flush_list(struct trie *t, struct list_head *head)
@@ -1706,14 +1618,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
 
 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
 		struct fib_info *fi = fa->fa_info;
-	
-		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-
- 			write_lock_bh(&fib_lock);
-			list_del(&fa->fa_list);
-			write_unlock_bh(&fib_lock);
 
-			fn_free_alias(fa);
+		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+			list_del_rcu(&fa->fa_list);
+			fib_release_info(fa->fa_info);
+			alias_free_mem_rcu(fa);
 			found++;
 		}
 	}
@@ -1728,37 +1637,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
 	struct leaf_info *li = NULL;
 
 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
-		
 		found += trie_flush_list(t, &li->falh);
 
 		if (list_empty(&li->falh)) {
-
- 			write_lock_bh(&fib_lock);
-			hlist_del(&li->hlist);
-			write_unlock_bh(&fib_lock);
-
+			hlist_del_rcu(&li->hlist);
 			free_leaf_info(li);
 		}
 	}
 	return found;
 }
 
+/* rcu_read_lock needs to be hold by caller from readside */
+
 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
 {
 	struct node *c = (struct node *) thisleaf;
 	struct tnode *p;
 	int idx;
+	struct node *trie = rcu_dereference(t->trie);
 
 	if (c == NULL) {
-		if (t->trie == NULL)
+		if (trie == NULL)
 			return NULL;
 
-		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
-			return (struct leaf *) t->trie;
+		if (IS_LEAF(trie))          /* trie w. just a leaf */
+			return (struct leaf *) trie;
 
-		p = (struct tnode*) t->trie;  /* Start */
-	}
-	else
+		p = (struct tnode*) trie;  /* Start */
+	} else
 		p = (struct tnode *) NODE_PARENT(c);
 
 	while (p) {
@@ -1771,29 +1677,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
 			pos = 0;
 
 		last = 1 << p->bits;
-		for(idx = pos; idx < last ; idx++) {
-			if (p->child[idx]) {
-
-				/* Decend if tnode */
-
-				while (IS_TNODE(p->child[idx])) {
-					p = (struct tnode*) p->child[idx];
-					idx = 0;
-				
-					/* Rightmost non-NULL branch */
-					if (p && IS_TNODE(p))
-						while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
-
-					/* Done with this tnode? */
-					if (idx >= (1 << p->bits) || p->child[idx] == NULL )
-						goto up;
-				}
-				return (struct leaf*) p->child[idx];
+		for (idx = pos; idx < last ; idx++) {
+			c = rcu_dereference(p->child[idx]);
+
+			if (!c)
+				continue;
+
+			/* Decend if tnode */
+			while (IS_TNODE(c)) {
+				p = (struct tnode *) c;
+  				idx = 0;
+
+				/* Next non-NULL child; bound idx before reading */
+				if (p && IS_TNODE(p))
+					while (idx < (1 << p->bits) &&
+					       !(c = rcu_dereference(p->child[idx]))) idx++;
+
+				/* Done with this tnode? */
+				if (idx >= (1 << p->bits) || !c)
+					goto up;
 			}
+			return (struct leaf *) c;
 		}
 up:
 		/* No more children go up one step  */
-		c = (struct node*) p;
+		c = (struct node *) p;
 		p = (struct tnode *) NODE_PARENT(p);
 	}
 	return NULL; /* Ready. Root of trie */
@@ -1807,23 +1715,24 @@ static int fn_trie_flush(struct fib_table *tb)
 
 	t->revision++;
 
-	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+	rcu_read_lock();
+	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
 		found += trie_flush_leaf(t, l);
 
 		if (ll && hlist_empty(&ll->list))
 			trie_leaf_remove(t, ll->key);
 		ll = l;
 	}
+	rcu_read_unlock();  
 
 	if (ll && hlist_empty(&ll->list))
 		trie_leaf_remove(t, ll->key);
 
-	if (trie_debug)
-		printk("trie_flush found=%d\n", found);
+	pr_debug("trie_flush found=%d\n", found);
 	return found;
 }
 
-static int trie_last_dflt=-1;
+static int trie_last_dflt = -1;
 
 static void
 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
@@ -1840,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
 	last_resort = NULL;
 	order = -1;
 
-	read_lock(&fib_lock);
+	rcu_read_lock();
 
 	l = fib_find_node(t, 0);
 	if (!l)
@@ -1853,20 +1762,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
 	if (list_empty(fa_head))
 		goto out;
 
-	list_for_each_entry(fa, fa_head, fa_list) {
+	list_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
-	
+
 		if (fa->fa_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-	
+
 		if (next_fi->fib_priority > res->fi->fib_priority)
 			break;
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
 		fa->fa_state |= FA_S_ACCESSED;
-	
+
 		if (fi == NULL) {
 			if (next_fi != res->fi)
 				break;
@@ -1904,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
 	}
 	trie_last_dflt = last_idx;
  out:;
-	read_unlock(&fib_lock);
+	rcu_read_unlock();
 }
 
 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
@@ -1913,12 +1822,14 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
 	int i, s_i;
 	struct fib_alias *fa;
 
-	u32 xkey=htonl(key);
+	u32 xkey = htonl(key);
 
-	s_i=cb->args[3];
+	s_i = cb->args[3];
 	i = 0;
 
-	list_for_each_entry(fa, fah, fa_list) {
+	/* rcu_read_lock is hold by caller */
+
+	list_for_each_entry_rcu(fa, fah, fa_list) {
 		if (i < s_i) {
 			i++;
 			continue;
@@ -1946,10 +1857,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
 				  fa->fa_info, 0) < 0) {
 			cb->args[3] = i;
 			return -1;
-			}
+		}
 		i++;
 	}
-	cb->args[3]=i;
+	cb->args[3] = i;
 	return skb->len;
 }
 
@@ -1959,10 +1870,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 	int h, s_h;
 	struct list_head *fa_head;
 	struct leaf *l = NULL;
-	s_h=cb->args[2];
 
-	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+	s_h = cb->args[2];
 
+	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)
@@ -1970,7 +1881,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 
 		fa_head = get_fa_head(l, plen);
-	
+
 		if (!fa_head)
 			continue;
 
@@ -1978,11 +1889,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 			continue;
 
 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
-			cb->args[2]=h;
+			cb->args[2] = h;
 			return -1;
 		}
 	}
-	cb->args[2]=h;
+	cb->args[2] = h;
 	return skb->len;
 }
 
@@ -1993,25 +1904,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
 
 	s_m = cb->args[1];
 
-	read_lock(&fib_lock);
-	for (m=0; m<=32; m++) {
-
+	rcu_read_lock();
+	for (m = 0; m <= 32; m++) {
 		if (m < s_m)
 			continue;
 		if (m > s_m)
 			memset(&cb->args[2], 0,
-			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
+				sizeof(cb->args) - 2*sizeof(cb->args[0]));
 
 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
 			cb->args[1] = m;
 			goto out;
 		}
 	}
-	read_unlock(&fib_lock);
+	rcu_read_unlock();
 	cb->args[1] = m;
 	return skb->len;
- out:
-	read_unlock(&fib_lock);
+out:
+	rcu_read_unlock();
 	return -1;
 }
 
@@ -2051,9 +1961,9 @@ struct fib_table * __init fib_hash_init(int id)
 	trie_init(t);
 
 	if (id == RT_TABLE_LOCAL)
-                trie_local = t;
+		trie_local = t;
 	else if (id == RT_TABLE_MAIN)
-                trie_main = t;
+		trie_main = t;
 
 	if (id == RT_TABLE_LOCAL)
 		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
@@ -2065,7 +1975,8 @@ struct fib_table * __init fib_hash_init(int id)
 
 static void putspace_seq(struct seq_file *seq, int n)
 {
-	while (n--) seq_printf(seq, " ");
+	while (n--)
+		seq_printf(seq, " ");
 }
 
 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
@@ -2086,29 +1997,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 		seq_printf(seq, "%d/", cindex);
 		printbin_seq(seq, cindex, bits);
 		seq_printf(seq, ": ");
-	}
-	else
+	} else
 		seq_printf(seq, "<root>: ");
 	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
 
-	if (IS_LEAF(n))
-		seq_printf(seq, "key=%d.%d.%d.%d\n",
-			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
-	else {
-		int plen = ((struct tnode *)n)->pos;
-		t_key prf=MASK_PFX(n->key, plen);
-		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
-			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
-	}
 	if (IS_LEAF(n)) {
-		struct leaf *l=(struct leaf *)n;
+		struct leaf *l = (struct leaf *)n;
 		struct fib_alias *fa;
 		int i;
-		for (i=32; i>=0; i--)
-		  if (find_leaf_info(&l->list, i)) {
-		
+
+		seq_printf(seq, "key=%d.%d.%d.%d\n",
+			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
+
+		for (i = 32; i >= 0; i--)
+			if (find_leaf_info(&l->list, i)) {
 				struct list_head *fa_head = get_fa_head(l, i);
-			
+
 				if (!fa_head)
 					continue;
 
@@ -2118,17 +2022,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 				putspace_seq(seq, indent+2);
 				seq_printf(seq, "{/%d...dumping}\n", i);
 
-
-				list_for_each_entry(fa, fa_head, fa_list) {
+				list_for_each_entry_rcu(fa, fa_head, fa_list) {
 					putspace_seq(seq, indent+2);
-					if (fa->fa_info->fib_nh == NULL) {
-						seq_printf(seq, "Error _fib_nh=NULL\n");
-						continue;
-					}
 					if (fa->fa_info == NULL) {
 						seq_printf(seq, "Error fa_info=NULL\n");
 						continue;
 					}
+					if (fa->fa_info->fib_nh == NULL) {
+						seq_printf(seq, "Error _fib_nh=NULL\n");
+						continue;
+					}
 
 					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
 					      fa->fa_type,
@@ -2136,11 +2039,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 					      fa->fa_tos);
 				}
 			}
-	}
-	else if (IS_TNODE(n)) {
+	} else {
 		struct tnode *tn = (struct tnode *)n;
+		int plen = ((struct tnode *)n)->pos;
+		t_key prf = MASK_PFX(n->key, plen);
+
+		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
+			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
+
 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
-		seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
+		seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
 		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
 		seq_printf(seq, "}\n");
 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
@@ -2154,194 +2062,196 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 
 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
 {
-	struct node *n = t->trie;
-	int cindex=0;
-	int indent=1;
-	int pend=0;
+	struct node *n;
+	int cindex = 0;
+	int indent = 1;
+	int pend = 0;
 	int depth = 0;
+	struct tnode *tn;
 
-  	read_lock(&fib_lock);
-
+	rcu_read_lock();
+	n = rcu_dereference(t->trie);
 	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
-	if (n) {
-		printnode_seq(seq, indent, n, pend, cindex, 0);
-		if (IS_TNODE(n)) {
-			struct tnode *tn = (struct tnode *)n;
-			pend = tn->pos+tn->bits;
-			putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
-			indent += 3;
-			depth++;
-
-			while (tn && cindex < (1 << tn->bits)) {
-				if (tn->child[cindex]) {
-				
-					/* Got a child */
-				
-					printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
-					if (IS_LEAF(tn->child[cindex])) {
-						cindex++;
-					
-					}
-					else {
-						/*
-						 * New tnode. Decend one level
-						 */
-					
-						depth++;
-						n = tn->child[cindex];
-						tn = (struct tnode *)n;
-						pend = tn->pos+tn->bits;
-						putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
-						indent+=3;
-						cindex=0;
-					}
-				}
-				else
-					cindex++;
 
+	if (!n) {
+		seq_printf(seq, "------ trie is empty\n");
+
+		rcu_read_unlock();
+		return;
+	}
+
+	printnode_seq(seq, indent, n, pend, cindex, 0);
+
+	if (!IS_TNODE(n)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	tn = (struct tnode *)n;
+	pend = tn->pos+tn->bits;
+	putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
+	indent += 3;
+	depth++;
+
+	while (tn && cindex < (1 << tn->bits)) {
+		struct node *child = rcu_dereference(tn->child[cindex]);
+		if (!child)
+			cindex++;
+		else {
+			/* Got a child */
+			printnode_seq(seq, indent, child, pend,
+				      cindex, tn->bits);
+
+			if (IS_LEAF(child))
+				cindex++;
+
+			else {
 				/*
-				 * Test if we are done
+				 * New tnode. Decend one level
 				 */
-			
-				while (cindex >= (1 << tn->bits)) {
 
-					/*
-					 * Move upwards and test for root
-					 * pop off all traversed  nodes
-					 */
-				
-					if (NODE_PARENT(tn) == NULL) {
-						tn = NULL;
-						n = NULL;
-						break;
-					}
-					else {
-						cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
-						tn = NODE_PARENT(tn);
-						cindex++;
-						n = (struct node *)tn;
-						pend = tn->pos+tn->bits;
-						indent-=3;
-						depth--;
-					}
-				}
+				depth++;
+				n = child;
+				tn = (struct tnode *)n;
+				pend = tn->pos+tn->bits;
+				putspace_seq(seq, indent);
+				seq_printf(seq, "\\--\n");
+				indent += 3;
+				cindex = 0;
 			}
 		}
-		else n = NULL;
-	}
-	else seq_printf(seq, "------ trie is empty\n");
 
-  	read_unlock(&fib_lock);
+		/*
+		 * Test if we are done
+		 */
+
+		while (cindex >= (1 << tn->bits)) {
+			/*
+			 * Move upwards and test for root
+			 * pop off all traversed  nodes
+			 */
+
+			if (NODE_PARENT(tn) == NULL) {
+				tn = NULL;
+				break;
+			}
+
+			cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+			cindex++;
+			tn = NODE_PARENT(tn);
+			pend = tn->pos + tn->bits;
+			indent -= 3;
+			depth--;
+		}
+	}
+	rcu_read_unlock();
 }
 
 static struct trie_stat *trie_stat_new(void)
 {
-	struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
+	struct trie_stat *s;
 	int i;
 
-	if (s) {
-		s->totdepth = 0;
-		s->maxdepth = 0;
-		s->tnodes = 0;
-		s->leaves = 0;
-		s->nullpointers = 0;
-	
-		for(i=0; i< MAX_CHILDS; i++)
-			s->nodesizes[i] = 0;
-	}
+	s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
+	if (!s)
+		return NULL;
+
+	s->totdepth = 0;
+	s->maxdepth = 0;
+	s->tnodes = 0;
+	s->leaves = 0;
+	s->nullpointers = 0;
+
+	for (i = 0; i < MAX_CHILDS; i++)
+		s->nodesizes[i] = 0;
+
 	return s;
 }
 
 static struct trie_stat *trie_collect_stats(struct trie *t)
 {
-	struct node *n = t->trie;
+	struct node *n;
 	struct trie_stat *s = trie_stat_new();
 	int cindex = 0;
-	int indent = 1;
 	int pend = 0;
 	int depth = 0;
 
-	read_lock(&fib_lock);	
+	if (!s)
+		return NULL;
 
-	if (s) {
-		if (n) {
-			if (IS_TNODE(n)) {
-				struct tnode *tn = (struct tnode *)n;
-				pend = tn->pos+tn->bits;
-				indent += 3;
-				s->nodesizes[tn->bits]++;
-				depth++;
+	rcu_read_lock();
+	n = rcu_dereference(t->trie);
 
-				while (tn && cindex < (1 << tn->bits)) {
-					if (tn->child[cindex]) {
-						/* Got a child */
-				
-						if (IS_LEAF(tn->child[cindex])) {
-							cindex++;
-					
-							/* stats */
-							if (depth > s->maxdepth)
-								s->maxdepth = depth;
-							s->totdepth += depth;
-							s->leaves++;
-						}
-				
-						else {
-							/*
-							 * New tnode. Decend one level
-							 */
-					
-							s->tnodes++;
-							s->nodesizes[tn->bits]++;
-							depth++;
-					
-							n = tn->child[cindex];
-							tn = (struct tnode *)n;
-							pend = tn->pos+tn->bits;
-
-							indent += 3;
-							cindex = 0;
-						}
-					}
-					else {
-						cindex++;
-						s->nullpointers++;
-					}
+	/* An empty trie has nothing to walk, but must still reach the
+	 * rcu_read_unlock() below; returning here would leak the read lock.
+	 */
+	if (n && IS_TNODE(n)) {
+		struct tnode *tn = (struct tnode *)n;
+		pend = tn->pos+tn->bits;
+		s->nodesizes[tn->bits]++;
+		depth++;
+
+		while (tn && cindex < (1 << tn->bits)) {
+			struct node *ch = rcu_dereference(tn->child[cindex]);
+			if (ch) {
 
+				/* Got a child */
+				/* Test the snapshot: the slot may change under RCU */
+				if (IS_LEAF(ch)) {
+					cindex++;
+
+					/* stats */
+					if (depth > s->maxdepth)
+						s->maxdepth = depth;
+					s->totdepth += depth;
+					s->leaves++;
+				} else {
 					/*
-					 * Test if we are done
+					 * New tnode. Decend one level
 					 */
-			
-					while (cindex >= (1 << tn->bits)) {
-
-						/*
-						 * Move upwards and test for root
-						 * pop off all traversed  nodes
-						 */
-
-					
-						if (NODE_PARENT(tn) == NULL) {
-							tn = NULL;
-							n = NULL;
-							break;
-						}
-						else {
-							cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
-							tn = NODE_PARENT(tn);
-							cindex++;
-							n = (struct node *)tn;
-							pend = tn->pos+tn->bits;
-							indent -= 3;
-							depth--;
-						}
- 					}
+
+					s->tnodes++;
+					s->nodesizes[tn->bits]++;
+					depth++;
+
+					n = ch;
+					tn = (struct tnode *)n;
+					pend = tn->pos+tn->bits;
+
+					cindex = 0;
 				}
+			} else {
+				cindex++;
+				s->nullpointers++;
 			}
-			else n = NULL;
+
+			/*
+			 * Test if we are done
+			 */
+
+			while (cindex >= (1 << tn->bits)) {
+				/*
+				 * Move upwards and test for root
+				 * pop off all traversed  nodes
+				 */
+
+				if (NODE_PARENT(tn) == NULL) {
+					tn = NULL;
+					n = NULL;
+					break;
+				}
+
+				cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+				tn = NODE_PARENT(tn);
+				cindex++;
+				n = (struct node *)tn;
+				pend = tn->pos+tn->bits;
+				depth--;
+ 			}
 		}
 	}
 
-	read_unlock(&fib_lock);	
+	rcu_read_unlock();
 	return s;
 }
 
@@ -2359,17 +2269,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
 
 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	void *v = NULL;
+	if (!ip_fib_main_table)
+		return NULL;
 
-	if (ip_fib_main_table)
-		v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
-	return v;
+	if (*pos)
+		return fib_triestat_get_next(seq);
+	else
+		return SEQ_START_TOKEN;
 }
 
 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	++*pos;
-	return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
+	if (v == SEQ_START_TOKEN)
+		return fib_triestat_get_first(seq);
+	else
+		return fib_triestat_get_next(seq);
 }
 
 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
@@ -2388,22 +2303,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
 {
 	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
 	int i, max, pointers;
-        struct trie_stat *stat;
+	struct trie_stat *stat;
 	int avdepth;
 
 	stat = trie_collect_stats(t);
 
-	bytes=0;
+	bytes = 0;
 	seq_printf(seq, "trie=%p\n", t);
 
 	if (stat) {
 		if (stat->leaves)
-			avdepth=stat->totdepth*100 / stat->leaves;
+			avdepth = stat->totdepth*100 / stat->leaves;
 		else
-			avdepth=0;
-		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+			avdepth = 0;
+		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
 		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
-			
+
 		seq_printf(seq, "Leaves: %d\n", stat->leaves);
 		bytes += sizeof(struct leaf) * stat->leaves;
 		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
@@ -2455,11 +2370,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 
 		if (trie_main)
 			collect_and_show(trie_main, seq);
-	}
-	else {
-		snprintf(bf, sizeof(bf),
-			 "*\t%08X\t%08X", 200, 400);
-	
+	} else {
+		snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
+
 		seq_printf(seq, "%-127s\n", bf);
 	}
 	return 0;
@@ -2520,22 +2433,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
 
 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	void *v = NULL;
+	if (!ip_fib_main_table)
+		return NULL;
 
-	if (ip_fib_main_table)
-		v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
-	return v;
+	if (*pos)
+		return fib_trie_get_next(seq);
+	else
+		return SEQ_START_TOKEN;
 }
 
 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	++*pos;
-	return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
+	if (v == SEQ_START_TOKEN)
+		return fib_trie_get_first(seq);
+	else
+		return fib_trie_get_next(seq);
+
 }
 
 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
 {
-
 }
 
 /*
@@ -2555,9 +2473,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 
 		if (trie_main)
 			trie_dump_seq(seq, trie_main);
-	}
-
-	else {
+	} else {
 		snprintf(bf, sizeof(bf),
 			 "*\t%08X\t%08X", 200, 400);
 		seq_printf(seq, "%-127s\n", bf);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index badfc584997..24eb56ae1b5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -114,7 +114,7 @@ struct icmp_bxm {
 /*
  *	Statistics
  */
-DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
 
 /* An array of errno for error messages from dest unreach. */
 /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb)
 			break;
 		case ICMP_FRAG_NEEDED:
 			if (ipv4_config.no_pmtu_disc) {
-				LIMIT_NETDEBUG(
-					printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+				LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
 							 "fragmentation needed "
 							 "and DF set.\n",
-					       NIPQUAD(iph->daddr)));
+					       NIPQUAD(iph->daddr));
 			} else {
 				info = ip_rt_frag_needed(iph,
 						     ntohs(icmph->un.frag.mtu));
@@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb)
 			}
 			break;
 		case ICMP_SR_FAILED:
-			LIMIT_NETDEBUG(
-				printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+			LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
 						 "Route Failed.\n",
-				       NIPQUAD(iph->daddr)));
+				       NIPQUAD(iph->daddr));
 			break;
 		default:
 			break;
@@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb)
 	case CHECKSUM_HW:
 		if (!(u16)csum_fold(skb->csum))
 			break;
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
 	case CHECKSUM_NONE:
 		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
 			goto error;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5088f90835a..44607f4767b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb)
 	case IGMP_MTRACE_RESP:
 		break;
 	default:
-		NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type));
+		NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
 	}
 	in_dev_put(in_dev);
 	kfree_skb(skb);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 00000000000..fe3c6d3d0c9
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Support for INET connection oriented protocols.
+ *
+ * Authors:	See the TCP sources
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+
+/* Return non-zero when binding @sk into bucket @tb clashes with an owner. */
+static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+{
+	const u32 my_saddr = inet_rcv_saddr(sk);
+	const int reuse = sk->sk_reuse;
+	struct hlist_node *node;
+	struct sock *sk2;
+	u32 other_saddr;
+
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		if (sk == sk2 || inet_v6_ipv6only(sk2))
+			continue;
+		if (sk->sk_bound_dev_if && sk2->sk_bound_dev_if &&
+		    sk->sk_bound_dev_if != sk2->sk_bound_dev_if)
+			continue;	/* bound to different devices */
+		if (reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN)
+			continue;	/* both permit reuse, owner not listening */
+		other_saddr = inet_rcv_saddr(sk2);
+		if (!other_saddr || !my_saddr || other_saddr == my_saddr)
+			break;		/* wildcard or identical address */
+	}
+	/* assumes the iterator leaves node NULL after a complete walk */
+	return node != NULL;
+}
+
+/* Obtain a reference to a local port for the given sock; if snum is zero
+ * select any available local port.  Returns 0 on success, 1 on failure.
+ */
+int inet_csk_get_port(struct inet_hashinfo *hashinfo,
+		      struct sock *sk, unsigned short snum)
+{
+	struct inet_bind_hashbucket *head;
+	struct hlist_node *node;
+	struct inet_bind_bucket *tb;
+	int ret;
+
+	local_bh_disable();
+	if (!snum) {
+		int low = sysctl_local_port_range[0];
+		int high = sysctl_local_port_range[1];
+		int remaining = (high - low) + 1;	/* candidates left to try */
+		int rover;
+
+		spin_lock(&hashinfo->portalloc_lock);
+		if (hashinfo->port_rover < low)
+			rover = low;
+		else
+			rover = hashinfo->port_rover;
+		do {
+			rover++;
+			if (rover > high)
+				rover = low;
+			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
+			spin_lock(&head->lock);
+			inet_bind_bucket_for_each(tb, node, &head->chain)
+				if (tb->port == rover)
+					goto next;
+			break;	/* no bucket for rover: exit with head->lock held */
+		next:
+			spin_unlock(&head->lock);
+		} while (--remaining > 0);
+		hashinfo->port_rover = rover;	/* next search resumes from here */
+		spin_unlock(&hashinfo->portalloc_lock);
+
+		/* Exhausted local port range during search?  It is not
+		 * possible for us to be holding one of the bind hash
+		 * locks if this test triggers, because if 'remaining'
+		 * drops to zero, we broke out of the do/while loop at
+		 * the top level, not from the 'break;' statement.
+		 */
+		ret = 1;
+		if (remaining <= 0)
+			goto fail;
+
+		/* OK, here is the one we will use.  HEAD is
+		 * non-NULL and we hold its lock.
+		 */
+		snum = rover;
+	} else {
+		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
+		spin_lock(&head->lock);
+		inet_bind_bucket_for_each(tb, node, &head->chain)
+			if (tb->port == snum)
+				goto tb_found;
+	}
+	tb = NULL;	/* no existing bucket for snum */
+	goto tb_not_found;
+tb_found:
+	if (!hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse > 1)
+			goto success;
+		if (tb->fastreuse > 0 &&
+		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+			goto success;
+		} else {
+			ret = 1;
+			if (inet_csk_bind_conflict(sk, tb))
+				goto fail_unlock;
+		}
+	}
+tb_not_found:
+	ret = 1;	/* reported if the bucket cannot be created */
+	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
+		goto fail_unlock;
+	if (hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+			tb->fastreuse = 1;
+		else
+			tb->fastreuse = 0;
+	} else if (tb->fastreuse &&
+		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+		tb->fastreuse = 0;
+success:
+	if (!inet_csk(sk)->icsk_bind_hash)
+		inet_bind_hash(sk, tb, snum);
+	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
+ 	ret = 0;
+
+fail_unlock:
+	spin_unlock(&head->lock);	/* also reached on success */
+fail:
+	local_bh_enable();
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	DEFINE_WAIT(wait);
+	int err;
+
+	/*
+	 * True wake-one mechanism for incoming connections: only
+	 * one process gets woken up, not the 'whole herd'.
+	 * Since we do not 'race & poll' for established sockets
+	 * anymore, the common case will execute the loop only once.
+	 *
+	 * Subtle issue: "add_wait_queue_exclusive()" will be added
+	 * after any current non-exclusive waiters, and we know that
+	 * it will always _stay_ after any new non-exclusive waiters
+	 * because all non-exclusive waiters are added at the
+	 * beginning of the wait-queue. As such, it's ok to "drop"
+	 * our exclusiveness temporarily when we get woken up without
+	 * having to remove and re-insert us on the wait queue.
+	 */
+	for (;;) {
+		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+					  TASK_INTERRUPTIBLE);
+		release_sock(sk);	/* socket lock is dropped across the sleep */
+		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+			timeo = schedule_timeout(timeo);	/* keep what is left */
+		lock_sock(sk);
+		err = 0;		/* a connection is queued */
+		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+			break;
+		err = -EINVAL;		/* socket is no longer listening */
+		if (sk->sk_state != TCP_LISTEN)
+			break;
+		err = sock_intr_errno(timeo);	/* returned only if a signal is pending */
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN;		/* timeout ran out */
+		if (!timeo)
+			break;
+	}
+	finish_wait(sk->sk_sleep, &wait);
+	return err;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sock *child;
+	int rc = -EINVAL;
+
+	lock_sock(sk);
+
+	/* Only a listening socket can hand out connections; anything
+	 * else fails with -EINVAL.
+	 */
+	if (sk->sk_state != TCP_LISTEN)
+		goto fail;
+
+	/* Nothing queued yet: either bail out or sleep for a connection */
+	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+		const long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+		/* A zero timeout means the caller must not sleep */
+		rc = -EAGAIN;
+		if (!timeo)
+			goto fail;
+
+		rc = inet_csk_wait_for_connect(sk, timeo);
+		if (rc)
+			goto fail;
+	}
+
+	child = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
+	BUG_TRAP(child->sk_state != TCP_SYN_RECV);
+	release_sock(sk);
+	return child;
+
+fail:
+	/* *err is written only on failure, before the lock is dropped */
+	*err = rc;
+	release_sock(sk);
+	return NULL;
+}
+
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Using different timers for retransmit, delayed acks and probes.
+ * We may wish to use just one timer maintaining a list of expire jiffies
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+			       void (*retransmit_handler)(unsigned long),
+			       void (*delack_handler)(unsigned long),
+			       void (*keepalive_handler)(unsigned long))
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const unsigned long cookie = (unsigned long)sk;
+
+	/* Each timer's data field carries the socket pointer */
+	init_timer(&icsk->icsk_retransmit_timer);
+	icsk->icsk_retransmit_timer.function = retransmit_handler;
+	icsk->icsk_retransmit_timer.data = cookie;
+	init_timer(&icsk->icsk_delack_timer);
+	icsk->icsk_delack_timer.function = delack_handler;
+	icsk->icsk_delack_timer.data = cookie;
+	init_timer(&sk->sk_timer);
+	sk->sk_timer.function = keepalive_handler;
+	sk->sk_timer.data = cookie;
+
+	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+}
+
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	/* Drop the pending/blocked bookkeeping before stopping the timers. */
+	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+	/* The same three timers that inet_csk_init_xmit_timers() sets up. */
+	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer(sk, &icsk->icsk_delack_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+	sk_stop_timer(sk, &sk->sk_timer);	/* sk_timer carries the keepalive handler */
+}
+
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);	/* len is relative, in jiffies */
+}
+
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry* inet_csk_route_req(struct sock *sk,
+				     const struct request_sock *req)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct ip_options *opt = ireq->opt;
+	/* With a source route, aim at its first hop instead of the peer. */
+	const u32 daddr = (opt && opt->srr) ? opt->faddr : ireq->rmt_addr;
+	struct flowi fl = { .oif = sk->sk_bound_dev_if,
+			    .nl_u = { .ip4_u = { .daddr = daddr,
+						 .saddr = ireq->loc_addr,
+						 .tos = RT_CONN_FLAGS(sk) } },
+			    .proto = sk->sk_protocol,
+			    .uli_u = { .ports = { .sport = inet_sk(sk)->sport,
+						  .dport = ireq->rmt_port } } };
+	struct rtable *rt;
+	/* Returns the route's dst, or NULL (counting OUTNOROUTES) on failure. */
+	if (ip_route_output_flow(&rt, &fl, sk, 0))
+		goto no_route;
+	/* Strict source routing: destination must equal the gateway. */
+	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+		goto route_rejected;
+	return &rt->u.dst;
+
+route_rejected:
+	ip_rt_put(rt);
+no_route:
+	IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
+				 const u32 rnd, const u16 synq_hsize)
+{	/* NOTE(review): the mask presumably relies on synq_hsize being a power of two - confirm */
+	return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)	/* IPv6 possible: test the family */
+#else
+#define AF_INET_FAMILY(fam) 1	/* no IPv6 configured: the test is always true */
+#endif
+
+struct request_sock *inet_csk_search_req(const struct sock *sk,
+					 struct request_sock ***prevp,
+					 const __u16 rport, const __u32 raddr,
+					 const __u32 laddr)
+{
+	/* Scan the SYN-table chain selected by (raddr, rport) for an IPv4
+	 * request matching rport/raddr/laddr.  On a hit *prevp is set to the
+	 * link that points at the request; on a miss NULL is returned. */
+	struct listen_sock *lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+	const u32 slot = inet_synq_hash(raddr, rport, lopt->hash_rnd,
+					lopt->nr_table_entries);
+	struct request_sock **link = &lopt->syn_table[slot];
+	struct request_sock *req;
+
+	while ((req = *link) != NULL) {
+		const struct inet_request_sock *ireq = inet_rsk(req);
+		if (ireq->rmt_port == rport && ireq->rmt_addr == raddr &&
+		    ireq->loc_addr == laddr &&
+		    AF_INET_FAMILY(req->rsk_ops->family)) {
+			BUG_TRAP(!req->sk);
+			*prevp = link;
+			break;
+		}
+		link = &req->dl_next;
+	}
+	return req;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+				   const unsigned timeout)
+{
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+	const struct listen_sock *lopt = queue->listen_opt;
+	const struct inet_request_sock *ireq = inet_rsk(req);	/* peer addr/port pick the bucket */
+	const u32 bucket = inet_synq_hash(ireq->rmt_addr, ireq->rmt_port,
+					  lopt->hash_rnd, lopt->nr_table_entries);
+	reqsk_queue_hash_req(queue, bucket, req, timeout);
+	inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+void inet_csk_reqsk_queue_prune(struct sock *parent,
+				const unsigned long interval,
+				const unsigned long timeout,
+				const unsigned long max_rto)
+{
+	struct inet_connection_sock *icsk = inet_csk(parent);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+	struct listen_sock *lopt = queue->listen_opt;
+	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+	int thresh = max_retries;	/* may be lowered below, never under 2 */
+	unsigned long now = jiffies;
+	struct request_sock **reqp, *req;
+	int i, budget;
+
+	if (lopt == NULL || lopt->qlen == 0)
+		return;
+
+	/* Normally all the open requests are young and mature (i.e.
+	 * are converted to an established socket) within the first
+	 * timeout.  If the synack was not acknowledged in that time,
+	 * one of the following happened: the synack was lost, the ack
+	 * was lost, the rtt is high, or nobody planned to ack at all
+	 * (i.e. a synflood).
+	 * When the server is a bit loaded, the queue fills up with
+	 * old open requests, reducing its effective size.  When the
+	 * server is well loaded, the effective size drops to zero
+	 * after several minutes of work.  That is not a synflood, it
+	 * is normal operation.  The solution is to prune entries that
+	 * are too old, overriding the normal timeout, when the
+	 * situation becomes dangerous.
+	 *
+	 * Essentially, we reserve half of the room for young
+	 * embryos, and abort old ones without pity if they are
+	 * about to clog our table.
+	 */
+	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+		int young = (lopt->qlen_young<<1);
+
+		while (thresh > 2) {
+			if (lopt->qlen < young)
+				break;
+			thresh--;
+			young <<= 1;
+		}
+	}
+
+	if (queue->rskq_defer_accept)
+		max_retries = queue->rskq_defer_accept;
+	/* NOTE(review): divides by (timeout / interval); presumably callers pass timeout >= interval - confirm */
+	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
+	i = lopt->clock_hand;	/* resume where the previous run stopped */
+
+	do {
+		reqp=&lopt->syn_table[i];
+		while ((req = *reqp) != NULL) {
+			if (time_after_eq(now, req->expires)) {
+				if ((req->retrans < thresh ||
+				     (inet_rsk(req)->acked && req->retrans < max_retries))
+				    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+					unsigned long timeo;
+					/* First retransmit: the request stops counting as young. */
+					if (req->retrans++ == 0)
+						lopt->qlen_young--;
+					timeo = min((timeout << req->retrans), max_rto);
+					req->expires = now + timeo;
+					reqp = &req->dl_next;
+					continue;
+				}
+
+				/* Drop this request */
+				inet_csk_reqsk_queue_unlink(parent, req, reqp);
+				reqsk_queue_removed(queue, req);
+				reqsk_free(req);
+				continue;	/* *reqp is re-read by the loop condition */
+			}
+			reqp = &req->dl_next;
+		}
+
+		i = (i + 1) & (lopt->nr_table_entries - 1);
+
+	} while (--budget > 0);
+
+	lopt->clock_hand = i;
+	/* Requests remain: run again after another interval. */
+	if (lopt->qlen)
+		inet_csk_reset_keepalive_timer(parent, interval);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+			    const unsigned int __nocast priority)
+{
+	struct sock *newsk = sk_clone(sk, priority);
+	struct inet_connection_sock *newicsk;
+
+	if (newsk == NULL)
+		return NULL;
+
+	newicsk = inet_csk(newsk);
+	newsk->sk_state	      = TCP_SYN_RECV;
+	newsk->sk_write_space = sk_stream_write_space;
+	inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+
+	/* No bind bucket yet, and zeroed retransmit/backoff/probe counters. */
+	newicsk->icsk_bind_hash	  = NULL;
+	newicsk->icsk_retransmits = 0;
+	newicsk->icsk_backoff	  = 0;
+	newicsk->icsk_probes_out  = 0;
+	/* Deinitialize accept_queue to trap illegal accesses. */
+	memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+	return newsk;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_clone);
+
+/*
+ * Final teardown of a closed, dead and unhashed socket.  At this
+ * point there should be no process reference to the socket, and thus
+ * no user references at all, so we can assume that the socket
+ * waitqueue is inactive and nobody will try to jump onto it.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+	BUG_TRAP(sk->sk_state == TCP_CLOSE);
+	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+
+	/* It cannot be in hash table! */
+	BUG_TRAP(sk_unhashed(sk));
+
+	/* A non-zero inet_sk(sk)->num implies the socket is bound */
+	BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+	/* Protocol-specific cleanup comes first. */
+	sk->sk_prot->destroy(sk);
+
+	sk_stream_kill_queues(sk);
+
+	xfrm_sk_free_policy(sk);
+
+	sk_refcnt_debug_release(sk);
+	/* The socket no longer counts as an orphan; drop a reference. */
+	atomic_dec(sk->sk_prot->orphan_count);
+	sock_put(sk);
+}
+
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	int rc;
+
+	rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+	if (rc != 0)
+		return rc;
+
+	sk->sk_ack_backlog = 0;
+	sk->sk_max_ack_backlog = 0;
+	inet_csk_delack_init(sk);
+
+	/* There is a race window here: we announce ourselves as
+	 * listening before get_port() has validated the transition.
+	 * That is fine, because the socket only enters the hash
+	 * table once validation has succeeded.
+	 */
+	sk->sk_state = TCP_LISTEN;
+	if (sk->sk_prot->get_port(sk, inet->num) != 0) {
+		/* Port refused: undo the state change and the queue. */
+		sk->sk_state = TCP_CLOSE;
+		__reqsk_queue_destroy(&icsk->icsk_accept_queue);
+		return -EADDRINUSE;
+	}
+
+	inet->sport = htons(inet->num);
+	sk_dst_reset(sk);
+	sk->sk_prot->hash(sk);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ *	This routine closes sockets which have been at least partially
+ *	opened, but not yet accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *acc_req;
+	struct request_sock *req;
+
+	inet_csk_delete_keepalive_timer(sk);
+
+	/* make all the listen_opt local to us */
+	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+
+	/* Following the specs, it would be better either to send a FIN
+	 * (and enter FIN-WAIT-1, which is a normal close)
+	 * or to send an active reset (abort).
+	 * Certainly, that is pretty dangerous during a synflood, but it
+	 * is a bad justification for our negligence 8)
+	 * To be honest, we are not able to do either
+	 * of the variants now.			--ANK
+	 */
+	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	/* Walk the detached chain of children that were never accepted. */
+	while ((req = acc_req) != NULL) {
+		struct sock *child = req->sk;
+
+		acc_req = req->dl_next;
+
+		local_bh_disable();
+		bh_lock_sock(child);
+		BUG_TRAP(!sock_owned_by_user(child));
+		sock_hold(child);	/* released by the sock_put() below */
+
+		sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+		sock_orphan(child);
+		/* inet_csk_destroy_sock() decrements this counter again. */
+		atomic_inc(sk->sk_prot->orphan_count);
+
+		inet_csk_destroy_sock(child);
+
+		bh_unlock_sock(child);
+		local_bh_enable();
+		sock_put(child);
+
+		sk_acceptq_removed(sk);
+		__reqsk_free(req);
+	}
+	BUG_TRAP(!sk->sk_ack_backlog);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 00000000000..71f3c7350c6
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,868 @@
+/*
+ * inet_diag.c	Module for monitoring INET transport protocols sockets.
+ *
+ * Version:	$Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+
+static const struct inet_diag_handler **inet_diag_table;
+
+struct inet_diag_entry {
+	u32 *saddr;	/* local address words (one for IPv4, s6_addr32 for IPv6) */
+	u32 *daddr;	/* remote address words, same layout as saddr */
+	u16 sport;	/* local port; filled from inet->num by the dump code */
+	u16 dport;	/* remote port, converted with ntohs() by the dump code */
+	u16 family;	/* sk->sk_family of the socket being tested */
+	u16 userlocks;	/* sk->sk_userlocks; tested by INET_DIAG_BC_AUTO */
+};
+
+static struct sock *idiagnl;
+
+#define INET_DIAG_PUT(skb, attrtype, attrlen) \
+	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))	/* data area of the attribute added by __RTA_PUT */
+
+static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
+			int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+			const struct nlmsghdr *unlh)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_diag_msg *r;
+	struct nlmsghdr  *nlh;
+	void *info = NULL;
+	struct inet_diag_meminfo  *minfo = NULL;
+	unsigned char	 *b = skb->tail;	/* start of our message, for trimming on failure */
+	const struct inet_diag_handler *handler;
+	/* Append one inet_diag_msg for sk to skb; returns skb->len, or -1 via the failure labels. */
+	handler = inet_diag_table[unlh->nlmsg_type];
+	BUG_ON(handler == NULL);
+
+	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh->nlmsg_flags = nlmsg_flags;
+
+	r = NLMSG_DATA(nlh);
+	if (sk->sk_state != TCP_TIME_WAIT) {
+		if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
+			minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO,
+					      sizeof(*minfo));
+		if (ext & (1 << (INET_DIAG_INFO - 1)))
+			info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
+					   handler->idiag_info_size);
+		/* Congestion control name, copied with its terminating NUL. */
+		if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
+			size_t len = strlen(icsk->icsk_ca_ops->name);
+			strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
+			       icsk->icsk_ca_ops->name);
+		}
+	}
+	r->idiag_family = sk->sk_family;
+	r->idiag_state = sk->sk_state;
+	r->idiag_timer = 0;
+	r->idiag_retrans = 0;
+
+	r->id.idiag_if = sk->sk_bound_dev_if;
+	r->id.idiag_cookie[0] = (u32)(unsigned long)sk;	/* cookie = socket address, low word */
+	r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);	/* high word; 0 on 32-bit */
+
+	if (r->idiag_state == TCP_TIME_WAIT) {
+		const struct inet_timewait_sock *tw = inet_twsk(sk);
+		long tmo = tw->tw_ttd - jiffies;
+		if (tmo < 0)
+			tmo = 0;
+
+		r->id.idiag_sport = tw->tw_sport;
+		r->id.idiag_dport = tw->tw_dport;
+		r->id.idiag_src[0] = tw->tw_rcv_saddr;
+		r->id.idiag_dst[0] = tw->tw_daddr;
+		r->idiag_state = tw->tw_substate;
+		r->idiag_timer = 3;	/* timer code 3: time-wait timer */
+		r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;	/* jiffies to ms, rounded up */
+		r->idiag_rqueue = 0;
+		r->idiag_wqueue = 0;
+		r->idiag_uid = 0;
+		r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		if (r->idiag_family == AF_INET6) {
+			const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+
+			ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+				       &tcp6tw->tw_v6_rcv_saddr);
+			ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+				       &tcp6tw->tw_v6_daddr);
+		}
+#endif
+		nlh->nlmsg_len = skb->tail - b;
+		return skb->len;
+	}
+
+	r->id.idiag_sport = inet->sport;
+	r->id.idiag_dport = inet->dport;
+	r->id.idiag_src[0] = inet->rcv_saddr;
+	r->id.idiag_dst[0] = inet->daddr;
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (r->idiag_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+			       &np->rcv_saddr);
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+			       &np->daddr);
+	}
+#endif
+
+#define EXPIRES_IN_MS(tmo)  ((tmo - jiffies) * 1000 + HZ - 1) / HZ
+	/* Timer codes reported: 1 retransmit, 4 zero-window probe, 2 sk_timer, 0 none. */
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		r->idiag_timer = 4;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (timer_pending(&sk->sk_timer)) {
+		r->idiag_timer = 2;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+	} else {
+		r->idiag_timer = 0;
+		r->idiag_expires = 0;
+	}
+#undef EXPIRES_IN_MS
+
+	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_inode = sock_i_ino(sk);
+
+	if (minfo) {
+		minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
+		minfo->idiag_wmem = sk->sk_wmem_queued;
+		minfo->idiag_fmem = sk->sk_forward_alloc;
+		minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+	}
+	/* info is NULL unless INET_DIAG_INFO was requested. */
+	handler->idiag_get_info(sk, r, info);
+
+	if (sk->sk_state < TCP_TIME_WAIT &&
+	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
+		icsk->icsk_ca_ops->get_info(sk, ext, skb);
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+rtattr_failure:
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);	/* discard the partially built message */
+	return -1;
+}
+
+static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+	int err;
+	struct sock *sk;
+	struct inet_diag_req *req = NLMSG_DATA(nlh);
+	struct sk_buff *rep;
+	struct inet_hashinfo *hashinfo;
+	const struct inet_diag_handler *handler;
+	/* Look up the one socket named by the request and unicast its description back. */
+	handler = inet_diag_table[nlh->nlmsg_type];
+	BUG_ON(handler == NULL);
+	hashinfo = handler->idiag_hashinfo;
+
+	if (req->idiag_family == AF_INET) {
+		sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
+				 req->id.idiag_dport, req->id.idiag_src[0],
+				 req->id.idiag_sport, req->id.idiag_if);
+	}
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	else if (req->idiag_family == AF_INET6) {
+		sk = inet6_lookup(hashinfo,
+				  (struct in6_addr *)req->id.idiag_dst,
+				  req->id.idiag_dport,
+				  (struct in6_addr *)req->id.idiag_src,
+				  req->id.idiag_sport,
+				  req->id.idiag_if);
+	}
+#endif
+	else {
+		return -EINVAL;
+	}
+
+	if (sk == NULL)
+		return -ENOENT;
+	/* A supplied cookie must match the socket's address, else -ESTALE. */
+	err = -ESTALE;
+	if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
+	     req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
+	    ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
+	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+		goto out;
+
+	err = -ENOMEM;
+	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
+				     sizeof(struct inet_diag_meminfo) +
+				     handler->idiag_info_size + 64)),
+			GFP_KERNEL);
+	if (!rep)
+		goto out;
+	/* The reply skb was sized above, so a failed fill is treated as a bug. */
+	if (inet_diag_fill(rep, sk, req->idiag_ext,
+			 NETLINK_CB(in_skb).pid,
+			 nlh->nlmsg_seq, 0, nlh) <= 0)
+		BUG();
+
+	err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+
+out:
+	if (sk) {
+		if (sk->sk_state == TCP_TIME_WAIT)
+			inet_twsk_put((struct inet_timewait_sock *)sk);
+		else
+			sock_put(sk);
+	}
+	return err;
+}
+
+static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
+{
+	/* Prefix comparison of two addresses stored as 32-bit words:
+	 * returns 1 when their leading 'bits' bits are equal and 0
+	 * otherwise.  The partial-word mask is built with htonl(), so
+	 * the leading bits are those of the first bytes in memory.
+	 */
+	const int full_words = bits >> 5;
+	const int tail_bits = bits & 0x1f;
+
+	/* Whole words first ... */
+	if (full_words && memcmp(a1, a2, full_words << 2))
+		return 0;
+
+	/* ... then the remaining bits of the following word. */
+	if (tail_bits) {
+		const __u32 mask = htonl((0xffffffff) << (32 - tail_bits));
+		const __u32 diff = a1[full_words] ^ a2[full_words];
+
+		if (diff & mask)
+			return 0;
+	}
+
+	return 1;
+}
+
+
+static int inet_diag_bc_run(const void *bc, int len,
+			  const struct inet_diag_entry *entry)
+{
+	/* Run the filter bytecode (validated beforehand by
+	 * inet_diag_bc_audit()) against one socket description.  Every op
+	 * produces a verdict and execution skips forward by op->yes or
+	 * op->no bytes.  Returns 1 if the program runs exactly to its end.
+	 */
+	while (len > 0) {
+		int yes = 1;
+		const struct inet_diag_bc_op *op = bc;
+
+		switch (op->code) {
+		case INET_DIAG_BC_NOP:
+			break;
+		case INET_DIAG_BC_JMP:
+			yes = 0;
+			break;
+		case INET_DIAG_BC_S_GE:
+			yes = entry->sport >= op[1].no;
+			break;
+		case INET_DIAG_BC_S_LE:	/* source port: compare sport, not dport */
+			yes = entry->sport <= op[1].no;
+			break;
+		case INET_DIAG_BC_D_GE:
+			yes = entry->dport >= op[1].no;
+			break;
+		case INET_DIAG_BC_D_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case INET_DIAG_BC_AUTO:
+			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+			break;
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND: {
+			struct inet_diag_hostcond *cond;
+			u32 *addr;
+
+			cond = (struct inet_diag_hostcond *)(op + 1);
+			if (cond->port != -1 &&
+			    cond->port != (op->code == INET_DIAG_BC_S_COND ?
+					     entry->sport : entry->dport)) {
+				yes = 0;
+				break;
+			}
+			if (cond->prefix_len == 0)	/* port-only condition */
+				break;
+
+			if (op->code == INET_DIAG_BC_S_COND)
+				addr = entry->saddr;
+			else
+				addr = entry->daddr;
+
+			if (bitstring_match(addr, cond->addr, cond->prefix_len))
+				break;
+			if (entry->family == AF_INET6 &&
+			    cond->family == AF_INET) {	/* try as ::ffff:a.b.c.d */
+				if (addr[0] == 0 && addr[1] == 0 &&
+				    addr[2] == htonl(0xffff) &&
+				    bitstring_match(addr + 3, cond->addr,
+					    	    cond->prefix_len))
+					break;
+			}
+			yes = 0;
+			break;
+		}
+		}
+
+		/* Relative jump: op->yes on a match, op->no otherwise. */
+		len -= yes ? op->yes : op->no;
+		bc  += yes ? op->yes : op->no;
+	}
+	return (len == 0);
+}
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+	/* Follow the 'yes' offsets from the start of the program and
+	 * tell whether an instruction begins with cc bytes remaining.
+	 */
+	while (len >= 0 && cc <= len) {
+		const struct inet_diag_bc_op *op = bc;
+		if (cc == len)
+			return 1;
+		if (op->yes < 4)
+			break;
+		bc  += op->yes;
+		len -= op->yes;
+	}
+	return 0;
+}
+
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+	const unsigned char *bc = bytecode;
+	int  len = bytecode_len;
+	/* Validate a filter program; returns 0 if acceptable, -EINVAL otherwise. */
+	while (len > 0) {
+		struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+
+		/* Jump offsets must be at least 4 and at most len + 4. */
+		switch (op->code) {
+		case INET_DIAG_BC_AUTO:
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND:
+		case INET_DIAG_BC_S_GE:
+		case INET_DIAG_BC_S_LE:
+		case INET_DIAG_BC_D_GE:
+		case INET_DIAG_BC_D_LE:
+			if (op->yes < 4 || op->yes > len + 4)
+				return -EINVAL;	/* otherwise fall through to the 'no' checks */
+		case INET_DIAG_BC_JMP:
+			if (op->no < 4 || op->no > len + 4)
+				return -EINVAL;
+			if (op->no < len &&
+			    !valid_cc(bytecode, bytecode_len, len - op->no))
+				return -EINVAL;	/* 'no' target is not an instruction start */
+			break;
+		case INET_DIAG_BC_NOP:
+			if (op->yes < 4 || op->yes > len + 4)
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+		bc += op->yes;
+		len -= op->yes;
+	}
+	return len == 0 ? 0 : -EINVAL;
+}
+
+static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk,
+			     struct netlink_callback *cb)
+{
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+	/* Apply the request's bytecode filter, if it carries one, then emit sk. */
+	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+		struct inet_diag_entry entry;
+		struct rtattr *bc = (struct rtattr *)(r + 1);
+		struct inet_sock *inet = inet_sk(sk);	/* NOTE(review): also reached for time-wait sockets - verify this cast is valid for them */
+
+		entry.family = sk->sk_family;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		if (entry.family == AF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			entry.saddr = np->rcv_saddr.s6_addr32;
+			entry.daddr = np->daddr.s6_addr32;
+		} else
+#endif
+		{
+			entry.saddr = &inet->rcv_saddr;
+			entry.daddr = &inet->daddr;
+		}
+		entry.sport = inet->num;
+		entry.dport = ntohs(inet->dport);
+		entry.userlocks = sk->sk_userlocks;
+
+		if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+			return 0;	/* filtered out: nothing added to skb */
+	}
+
+	return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid,
+			    cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
+			    struct request_sock *req,
+			    u32 pid, u32 seq,
+			    const struct nlmsghdr *unlh)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct inet_sock *inet = inet_sk(sk);
+	unsigned char *b = skb->tail;	/* start of our message, for trimming on failure */
+	struct inet_diag_msg *r;
+	struct nlmsghdr *nlh;
+	long tmo;
+	/* Describe one pending request of listener sk; it is reported as SYN_RECV. */
+	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh->nlmsg_flags = NLM_F_MULTI;
+	r = NLMSG_DATA(nlh);
+
+	r->idiag_family = sk->sk_family;
+	r->idiag_state = TCP_SYN_RECV;
+	r->idiag_timer = 1;
+	r->idiag_retrans = req->retrans;
+
+	r->id.idiag_if = sk->sk_bound_dev_if;
+	r->id.idiag_cookie[0] = (u32)(unsigned long)req;	/* cookie = request address */
+	r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+	tmo = req->expires - jiffies;
+	if (tmo < 0)
+		tmo = 0;	/* already expired: report 0 ms */
+
+	r->id.idiag_sport = inet->sport;
+	r->id.idiag_dport = ireq->rmt_port;
+	r->id.idiag_src[0] = ireq->loc_addr;
+	r->id.idiag_dst[0] = ireq->rmt_addr;
+	r->idiag_expires = jiffies_to_msecs(tmo);
+	r->idiag_rqueue = 0;
+	r->idiag_wqueue = 0;
+	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (r->idiag_family == AF_INET6) {
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+			       &tcp6_rsk(req)->loc_addr);
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+			       &tcp6_rsk(req)->rmt_addr);
+	}
+#endif
+	nlh->nlmsg_len = skb->tail - b;
+
+	return skb->len;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);	/* discard the partially built message */
+	return -1;
+}
+
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+			     struct netlink_callback *cb)
+{
+	struct inet_diag_entry entry;
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt;
+	struct rtattr *bc = NULL;
+	struct inet_sock *inet = inet_sk(sk);
+	int j, s_j;
+	int reqnum, s_reqnum;
+	int err = 0;
+	/* Resume point: cb->args[3] is the bucket index + 1, cb->args[4] the position in it. */
+	s_j = cb->args[3];
+	s_reqnum = cb->args[4];
+
+	if (s_j > 0)
+		s_j--;
+
+	entry.family = sk->sk_family;
+
+	read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+	lopt = icsk->icsk_accept_queue.listen_opt;
+	if (!lopt || !lopt->qlen)
+		goto out;
+	/* A request longer than the fixed header carries filter bytecode. */
+	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+		bc = (struct rtattr *)(r + 1);
+		entry.sport = inet->num;
+		entry.userlocks = sk->sk_userlocks;
+	}
+
+	for (j = s_j; j < lopt->nr_table_entries; j++) {
+		struct request_sock *req, *head = lopt->syn_table[j];
+
+		reqnum = 0;
+		for (req = head; req; reqnum++, req = req->dl_next) {
+			struct inet_request_sock *ireq = inet_rsk(req);
+
+			if (reqnum < s_reqnum)
+				continue;	/* already dumped in an earlier pass */
+			if (r->id.idiag_dport != ireq->rmt_port &&
+			    r->id.idiag_dport)
+				continue;
+
+			if (bc) {
+				entry.saddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+					(entry.family == AF_INET6) ?
+					tcp6_rsk(req)->loc_addr.s6_addr32 :
+#endif
+					&ireq->loc_addr;
+				entry.daddr = 
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+					(entry.family == AF_INET6) ?
+					tcp6_rsk(req)->rmt_addr.s6_addr32 :
+#endif
+					&ireq->rmt_addr;
+				entry.dport = ntohs(ireq->rmt_port);
+
+				if (!inet_diag_bc_run(RTA_DATA(bc),
+						    RTA_PAYLOAD(bc), &entry))
+					continue;
+			}
+
+			err = inet_diag_fill_req(skb, sk, req,
+					       NETLINK_CB(cb->skb).pid,
+					       cb->nlh->nlmsg_seq, cb->nlh);
+			if (err < 0) {	/* fill failed: remember where to resume */
+				cb->args[3] = j + 1;
+				cb->args[4] = reqnum;
+				goto out;
+			}
+		}
+
+		s_reqnum = 0;
+	}
+
+out:
+	read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+	return err;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int i, num;
+	int s_i, s_num;
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+	const struct inet_diag_handler *handler;
+	struct inet_hashinfo *hashinfo;
+	/* Netlink dump callback: fills skb with as many sockets as fit. */
+	handler = inet_diag_table[cb->nlh->nlmsg_type];
+	BUG_ON(handler == NULL);
+	hashinfo = handler->idiag_hashinfo;
+	/* Resume state: args[0] phase, args[1] bucket, args[2] position in it. */
+	s_i = cb->args[1];
+	s_num = num = cb->args[2];
+
+	if (cb->args[0] == 0) {	/* phase 0: the listening hash */
+		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+			goto skip_listen_ht;
+
+		inet_listen_lock(hashinfo);
+		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+			struct sock *sk;
+			struct hlist_node *node;
+
+			num = 0;
+			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (num < s_num) {
+					num++;
+					continue;
+				}
+
+				if (r->id.idiag_sport != inet->sport &&
+				    r->id.idiag_sport)
+					goto next_listen;
+
+				if (!(r->idiag_states & TCPF_LISTEN) ||
+				    r->id.idiag_dport ||
+				    cb->args[3] > 0)
+					goto syn_recv;
+
+				if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+					inet_listen_unlock(hashinfo);
+					goto done;
+				}
+
+syn_recv:
+				if (!(r->idiag_states & TCPF_SYN_RECV))
+					goto next_listen;
+
+				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+					inet_listen_unlock(hashinfo);
+					goto done;
+				}
+
+next_listen:
+				cb->args[3] = 0;
+				cb->args[4] = 0;
+				++num;
+			}
+
+			s_num = 0;
+			cb->args[3] = 0;
+			cb->args[4] = 0;
+		}
+		inet_listen_unlock(hashinfo);
+skip_listen_ht:
+		cb->args[0] = 1;	/* phase 1: the established hash */
+		s_i = num = s_num = 0;
+	}
+
+	if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+		return skb->len;
+
+	for (i = s_i; i < hashinfo->ehash_size; i++) {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+		struct sock *sk;
+		struct hlist_node *node;
+
+		if (i > s_i)
+			s_num = 0;
+
+		read_lock_bh(&head->lock);
+
+		num = 0;
+		sk_for_each(sk, node, &head->chain) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (num < s_num)
+				goto next_normal;
+			if (!(r->idiag_states & (1 << sk->sk_state)))
+				goto next_normal;
+			if (r->id.idiag_sport != inet->sport &&
+			    r->id.idiag_sport)
+				goto next_normal;
+			if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
+				goto next_normal;
+			if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+				read_unlock_bh(&head->lock);
+				goto done;
+			}
+next_normal:
+			++num;
+		}
+
+		if (r->idiag_states & TCPF_TIME_WAIT) {
+			sk_for_each(sk, node,
+				    &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
+				const struct inet_timewait_sock *tw = inet_twsk(sk);
+				/* Time-wait chain: read the ports from the tw sock. */
+				if (num < s_num)
+					goto next_dying;
+				if (r->id.idiag_sport != tw->tw_sport &&
+				    r->id.idiag_sport)
+					goto next_dying;
+				if (r->id.idiag_dport != tw->tw_dport &&
+				    r->id.idiag_dport)
+					goto next_dying;
+				if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+					read_unlock_bh(&head->lock);
+					goto done;
+				}
+next_dying:
+				++num;
+			}
+		}
+		read_unlock_bh(&head->lock);
+	}
+
+done:
+	cb->args[1] = i;
+	cb->args[2] = num;
+	return skb->len;
+}
+
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+	return 0;	/* nothing to do here; passed to netlink_dump_start() as the done hook */
+}
+
+
+static __inline__ int
+inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+		return 0;	/* not a request: silently ignored */
+
+	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
+		goto err_inval;
+
+	if (inet_diag_table[nlh->nlmsg_type] == NULL)
+		return -ENOENT;	/* no handler registered for this type */
+
+	if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
+		goto err_inval;
+
+	if (nlh->nlmsg_flags&NLM_F_DUMP) {
+		if (nlh->nlmsg_len >
+		    (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
+			struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
+						 sizeof(struct inet_diag_req));
+			if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
+			    rta->rta_len < 8 ||
+			    rta->rta_len >
+			    (nlh->nlmsg_len -
+			     NLMSG_SPACE(sizeof(struct inet_diag_req))))
+				goto err_inval;
+			if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
+				goto err_inval;	/* bytecode is audited before any dump runs it */
+		}
+		return netlink_dump_start(idiagnl, skb, nlh,
+					  inet_diag_dump,
+					  inet_diag_dump_done);
+	} else {
+		return inet_diag_get_exact(skb, nlh);
+	}
+
+err_inval:
+	return -EINVAL;
+}
+
+
+static inline void inet_diag_rcv_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+	int err;
+
+	/* Drop buffers too short for a header or with an inconsistent length. */
+	if (skb->len < NLMSG_SPACE(0))
+		return;
+	if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+		return;
+	err = inet_diag_rcv_msg(skb, nlh);
+	if (err || nlh->nlmsg_flags & NLM_F_ACK)
+		netlink_ack(skb, nlh, err);
+}
+
+static void inet_diag_rcv(struct sock *sk, int len)
+{
+	struct sk_buff *skb;
+	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+	/* Handle at most the number of skbs that were queued on entry. */
+	while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
+		inet_diag_rcv_skb(skb);
+		kfree_skb(skb);
+	}
+}
+
+static DEFINE_SPINLOCK(inet_diag_register_lock);
+
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+	const __u16 type = h->idiag_type;
+	int err;
+
+	if (type >= INET_DIAG_GETSOCK_MAX)
+		return -EINVAL;		/* no table slot for this type */
+
+	spin_lock(&inet_diag_register_lock);
+	if (inet_diag_table[type] != NULL) {
+		err = -EEXIST;		/* slot already taken */
+	} else {
+		inet_diag_table[type] = h;
+		err = 0;
+	}
+	spin_unlock(&inet_diag_register_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+	const __u16 type = h->idiag_type;
+	/* Out-of-range types can never have been registered. */
+	if (type >= INET_DIAG_GETSOCK_MAX)
+		return;
+
+	spin_lock(&inet_diag_register_lock);
+	inet_diag_table[type] = NULL;	/* cleared without checking that it held h */
+	spin_unlock(&inet_diag_register_lock);
+	/* NOTE(review): no RCU read side is visible in this file - confirm what this waits for */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+	const int table_bytes = INET_DIAG_GETSOCK_MAX *
+				sizeof(struct inet_diag_handler *);
+
+	/* The handler table has one slot per message type below
+	 * INET_DIAG_GETSOCK_MAX; every slot starts out empty.
+	 */
+	inet_diag_table = kmalloc(table_bytes, GFP_KERNEL);
+	if (inet_diag_table == NULL)
+		return -ENOMEM;
+	memset(inet_diag_table, 0, table_bytes);
+
+	idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
+					THIS_MODULE);
+	if (idiagnl != NULL)
+		return 0;
+
+	/* No netlink socket: release the table and fail. */
+	kfree(inet_diag_table);
+	return -ENOMEM;
+}
+
+static void __exit inet_diag_exit(void)
+{
+	sock_release(idiagnl->sk_socket);	/* the netlink socket created in inet_diag_init() */
+	kfree(inet_diag_table);	/* the handler table allocated in inet_diag_init() */
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 00000000000..e8d29fe736d
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,165 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic INET transport hashtables
+ *
+ * Authors:	Lotsa people, from code originally in tcp
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+
+/*
+ * Allocate and initialize a new local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+						 struct inet_bind_hashbucket *head,
+						 const unsigned short snum)
+{
+	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+
+	if (tb == NULL)
+		return NULL;
+	tb->port = snum;
+	tb->fastreuse = 0;
+	INIT_HLIST_HEAD(&tb->owners);
+	hlist_add_head(&tb->node, &head->chain);
+	return tb;
+}
+
+EXPORT_SYMBOL(inet_bind_bucket_create);
+
+/*
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
+{
+	if (!hlist_empty(&tb->owners))
+		return;
+	__hlist_del(&tb->node);
+	kmem_cache_free(cachep, tb);
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+		    const unsigned short snum)
+{
+	inet_sk(sk)->num = snum;		/* record the bound local port */
+	sk_add_bind_node(sk, &tb->owners);	/* put sk on the bucket's owner list */
+	inet_csk(sk)->icsk_bind_hash = tb;	/* read back in __inet_put_port() */
+}
+
+EXPORT_SYMBOL(inet_bind_hash);
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ */
+static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+	struct inet_bind_bucket *tb;
+
+	spin_lock(&head->lock);
+	tb = inet_csk(sk)->icsk_bind_hash;	/* bucket stored by inet_bind_hash() */
+	__sk_del_bind_node(sk);
+	inet_csk(sk)->icsk_bind_hash = NULL;
+	inet_sk(sk)->num = 0;			/* sock no longer holds a local port */
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+	local_bh_disable();	/* inet_bind_bucket_destroy() wants BHs off */
+	__inet_put_port(hashinfo, sk);
+	local_bh_enable();
+}
+
+EXPORT_SYMBOL(inet_put_port);
+
+/*
+ * Returns with lhash_lock write-held once lhash_users is zero.
+ * Without WQ_FLAG_EXCLUSIVE this is fine on UP but can be very bad on SMP:
+ * when several writers sleep and a reader wakes them up, all but one
+ * immediately hit the write lock and grab all the cpus. Exclusive sleep fixes
+ * that, _but_ costs UP a useless wakeup per lock release; should be ifdefed.
+ */
+void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+{
+	write_lock(&hashinfo->lhash_lock);
+
+	if (atomic_read(&hashinfo->lhash_users)) {
+		DEFINE_WAIT(wait);
+
+		for (;;) {
+			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
+						  &wait, TASK_UNINTERRUPTIBLE);
+			if (!atomic_read(&hashinfo->lhash_users))
+				break;	/* leave with the write lock held */
+			write_unlock_bh(&hashinfo->lhash_lock);
+			schedule();	/* NOTE(review): _bh pair - BHs off on entry? */
+			write_lock_bh(&hashinfo->lhash_lock);
+		}
+
+		finish_wait(&hashinfo->lhash_wait, &wait);
+	}
+}
+
+EXPORT_SYMBOL(inet_listen_wlock);
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
+				    const unsigned short hnum, const int dif)
+{
+	struct sock *result = NULL, *sk;
+	const struct hlist_node *node;
+	int hiscore = -1;	/* lower than any score computed below */
+
+	sk_for_each(sk, node, head) {
+		const struct inet_sock *inet = inet_sk(sk);
+
+		if (inet->num == hnum && !ipv6_only_sock(sk)) {
+			const __u32 rcv_saddr = inet->rcv_saddr;
+			int score = sk->sk_family == PF_INET ? 1 : 0;
+
+			if (rcv_saddr) {	/* bound to a specific address */
+				if (rcv_saddr != daddr)
+					continue;
+				score += 2;
+			}
+			if (sk->sk_bound_dev_if) {	/* bound to a device */
+				if (sk->sk_bound_dev_if != dif)
+					continue;
+				score += 2;
+			}
+			if (score == 5)	/* 1 + 2 + 2: nothing can score higher */
+				return sk;
+			if (score > hiscore) {
+				hiscore	= score;
+				result	= sk;
+			}
+		}
+	}
+	return result;	/* best partial match, or NULL if none */
+}
+
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 00000000000..4d1502a4985
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,384 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic TIME_WAIT sockets functions
+ *
+ *		From code originally in TCP
+ */
+
+#include <linux/config.h>
+
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+/* Must be called with locally disabled BHs. */
+void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
+{
+	struct inet_bind_hashbucket *bhead;
+	struct inet_bind_bucket *tb;
+	/* Unlink from established hashes. */
+	struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent];
+
+	write_lock(&ehead->lock);
+	if (hlist_unhashed(&tw->tw_node)) {	/* already unlinked: nothing to do */
+		write_unlock(&ehead->lock);
+		return;
+	}
+	__hlist_del(&tw->tw_node);
+	sk_node_init(&tw->tw_node);
+	write_unlock(&ehead->lock);
+
+	/* Disassociate from the bind bucket. */
+	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
+	spin_lock(&bhead->lock);
+	tb = tw->tw_tb;
+	__hlist_del(&tw->tw_bind_node);
+	tw->tw_tb = NULL;
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	spin_unlock(&bhead->lock);
+#ifdef SOCK_REFCNT_DEBUG
+	if (atomic_read(&tw->tw_refcnt) != 1) {
+		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
+		       tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+	}
+#endif
+	inet_twsk_put(tw);	/* presumably the ref taken in __inet_twsk_hashdance() */
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_kill);
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+			   struct inet_hashinfo *hashinfo)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent];
+	struct inet_bind_hashbucket *bhead;
+	/* Step 1: Put TW into bind hash. Original socket stays there too.
+	   Note that any socket with inet->num != 0 MUST be bound in the
+	   binding cache, even if it is closed.
+	 */
+	bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
+	spin_lock(&bhead->lock);
+	tw->tw_tb = icsk->icsk_bind_hash;	/* share the socket's bind bucket */
+	BUG_TRAP(icsk->icsk_bind_hash);
+	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+	spin_unlock(&bhead->lock);
+
+	write_lock(&ehead->lock);
+
+	/* Step 2: Remove SK from established hash. */
+	if (__sk_del_node_init(sk))
+		sock_prot_dec_use(sk->sk_prot);
+
+	/* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+	inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
+	atomic_inc(&tw->tw_refcnt);	/* presumably the hash table's reference */
+
+	write_unlock(&ehead->lock);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct inet_timewait_sock *tw;
+
+	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC);
+	if (tw == NULL)
+		return NULL;
+
+	/* Copy the socket's identity into the timewait bucket. */
+	tw->tw_daddr = inet->daddr;
+	tw->tw_rcv_saddr = inet->rcv_saddr;
+	tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+	tw->tw_num = inet->num;
+	tw->tw_state = TCP_TIME_WAIT;
+	tw->tw_substate = state;
+	tw->tw_sport = inet->sport;
+	tw->tw_dport = inet->dport;
+	tw->tw_family = sk->sk_family;
+	tw->tw_reuse = sk->sk_reuse;
+	tw->tw_hashent = sk->sk_hashent;
+	tw->tw_ipv6only = 0;
+	tw->tw_prot = sk->sk_prot_creator;
+	atomic_set(&tw->tw_refcnt, 1);
+	inet_twsk_dead_node_init(tw);
+	return tw;
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* Returns non-zero if quota exceeded.  */
+static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
+				    const int slot)
+{
+	struct inet_timewait_sock *tw;
+	struct hlist_node *node;
+	unsigned int killed;
+	int ret;
+
+	/* NOTE: compare this to previous version where lock
+	 * was released after detaching chain. It was racy,
+	 * because tw buckets are scheduled in not serialized context
+	 * in 2.3 (with netfilter), and with softnet it is common, because
+	 * soft irqs are not sequenced.
+	 */
+	killed = 0;
+	ret = 0;
+rescan:
+	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+		__inet_twsk_del_dead_node(tw);
+		spin_unlock(&twdr->death_lock);	/* not held across the kill */
+		__inet_twsk_kill(tw, twdr->hashinfo);
+		inet_twsk_put(tw);	/* presumably the ref taken in inet_twsk_schedule() */
+		killed++;
+		spin_lock(&twdr->death_lock);
+		if (killed > INET_TWDR_TWKILL_QUOTA) {
+			ret = 1;	/* tell the caller the slot is not empty yet */
+			break;
+		}
+
+		/* While we dropped twdr->death_lock, another cpu may have
+		 * killed off the next TW bucket in the list, therefore
+		 * do a fresh re-read of the hlist head node with the
+		 * lock reacquired.  We still use the hlist traversal
+		 * macro in order to get the prefetches.
+		 */
+		goto rescan;
+	}
+
+	twdr->tw_count -= killed;
+	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
+
+	return ret;
+}
+
+void inet_twdr_hangman(unsigned long data)
+{
+	struct inet_timewait_death_row *twdr =
+		(struct inet_timewait_death_row *)data;
+	unsigned int rearm;
+
+	spin_lock(&twdr->death_lock);
+
+	if (twdr->tw_count == 0)
+		goto out;
+
+	if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
+		/* Quota exceeded: hand the rest of this slot to the workqueue. */
+		twdr->thread_slots |= (1 << twdr->slot);
+		mb();
+		schedule_work(&twdr->twkill_work);
+		rearm = 1;
+	} else {
+		/* We purged the entire slot, anything left?  */
+		rearm = (twdr->tw_count != 0);
+	}
+
+	twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
+	if (rearm)
+		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+out:
+	spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_hangman);
+
+extern void twkill_slots_invalid(void);
+
+void inet_twdr_twkill_work(void *data)
+{
+	struct inet_timewait_death_row *twdr = data;
+	int i;
+
+	if (INET_TWDR_TWKILL_SLOTS > (sizeof(twdr->thread_slots) * 8))
+		twkill_slots_invalid();	/* every slot needs its own bit */
+
+	while (twdr->thread_slots) {
+		spin_lock_bh(&twdr->death_lock);
+		for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
+			if (!(twdr->thread_slots & (1 << i)))
+				continue;	/* slot was not handed to us */
+
+			while (inet_twdr_do_twkill_work(twdr, i) != 0) {
+				if (need_resched()) {
+					spin_unlock_bh(&twdr->death_lock);
+					schedule();
+					spin_lock_bh(&twdr->death_lock);
+				}
+			}
+
+			twdr->thread_slots &= ~(1 << i);	/* slot fully purged */
+		}
+		spin_unlock_bh(&twdr->death_lock);
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
+
+/* These are always called from BH context.  See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+			  struct inet_timewait_death_row *twdr)
+{
+	spin_lock(&twdr->death_lock);
+	if (inet_twsk_del_dead_node(tw)) {	/* non-zero: tw was still scheduled */
+		inet_twsk_put(tw);
+		if (--twdr->tw_count == 0)
+			del_timer(&twdr->tw_timer);	/* nothing left to reap */
+	}
+	spin_unlock(&twdr->death_lock);
+	__inet_twsk_kill(tw, twdr->hashinfo);	/* called without death_lock held */
+}
+
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw,
+		       struct inet_timewait_death_row *twdr,
+		       const int timeo, const int timewait_len)
+{
+	struct hlist_head *list;
+	int slot;
+
+	/* timeout := RTO * 3.5
+	 *
+	 * 3.5 = 1+2+0.5 to wait for two retransmits.
+	 *
+	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
+	 * FINs (or previous segments) are lost (probability of such event
+	 * is p^(N+1), where p is probability to lose single packet and
+	 * time to detect the loss is about RTO*(2^N - 1) with exponential
+	 * backoff). Normal timewait length is calculated so, that we
+	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+	 * [ BTW Linux, following BSD, violates this requirement waiting
+	 *   only for 60sec, we should wait at least for 240 secs.
+	 *   Well, 240 consumes too much of resources 8)
+	 * ]
+	 * This interval is not reduced to catch old duplicate and
+	 * responses to our wandering segments living for two MSLs.
+	 * However, if we use PAWS to detect
+	 * old duplicates, we can reduce the interval to bounds required
+	 * by RTO, rather than MSL. So, if peer understands PAWS, we
+	 * kill tw bucket after 3.5*RTO (it is important that this number
+	 * is greater than TS tick!) and detect old duplicates with help
+	 * of PAWS.
+	 */
+	slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
+
+	spin_lock(&twdr->death_lock);
+
+	/* Unlink it, if it was scheduled */
+	if (inet_twsk_del_dead_node(tw))
+		twdr->tw_count--;
+	else
+		atomic_inc(&tw->tw_refcnt);
+
+	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
+		/* Schedule to slow timer */
+		if (timeo >= timewait_len) {
+			slot = INET_TWDR_TWKILL_SLOTS - 1;
+		} else {
+			slot = (timeo + twdr->period - 1) / twdr->period;
+			if (slot >= INET_TWDR_TWKILL_SLOTS)
+				slot = INET_TWDR_TWKILL_SLOTS - 1;
+		}
+		tw->tw_ttd = jiffies + timeo;
+		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
+		list = &twdr->cells[slot];
+	} else {
+		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+
+		if (twdr->twcal_hand < 0) {
+			twdr->twcal_hand = 0;
+			twdr->twcal_jiffie = jiffies;
+			twdr->twcal_timer.expires = twdr->twcal_jiffie +
+					      (slot << INET_TWDR_RECYCLE_TICK);
+			add_timer(&twdr->twcal_timer);
+		} else {
+			if (time_after(twdr->twcal_timer.expires,
+				       jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
+				mod_timer(&twdr->twcal_timer,
+					  jiffies + (slot << INET_TWDR_RECYCLE_TICK));
+			slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
+		}
+		list = &twdr->twcal_row[slot];
+	}
+
+	hlist_add_head(&tw->tw_death_node, list);
+
+	if (twdr->tw_count++ == 0)
+		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+	spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twdr_twcal_tick(unsigned long data)
+{
+	struct inet_timewait_death_row *twdr;
+	int n, slot;
+	unsigned long j;
+	unsigned long now = jiffies;
+	int killed = 0;
+	int adv = 0;
+
+	twdr = (struct inet_timewait_death_row *)data;
+
+	spin_lock(&twdr->death_lock);
+	if (twdr->twcal_hand < 0)	/* presumably: recycle wheel is idle */
+		goto out;
+
+	slot = twdr->twcal_hand;
+	j = twdr->twcal_jiffie;
+
+	for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
+		if (time_before_eq(j, now)) {	/* slot is due: kill all of it */
+			struct hlist_node *node, *safe;
+			struct inet_timewait_sock *tw;
+
+			inet_twsk_for_each_inmate_safe(tw, node, safe,
+						       &twdr->twcal_row[slot]) {
+				__inet_twsk_del_dead_node(tw);
+				__inet_twsk_kill(tw, twdr->hashinfo);
+				inet_twsk_put(tw);
+				killed++;
+			}
+		} else {
+			if (!adv) {	/* first slot not yet due becomes the new hand */
+				adv = 1;
+				twdr->twcal_jiffie = j;
+				twdr->twcal_hand = slot;
+			}
+
+			if (!hlist_empty(&twdr->twcal_row[slot])) {
+				mod_timer(&twdr->twcal_timer, j);
+				goto out;
+			}
+		}
+		j += 1 << INET_TWDR_RECYCLE_TICK;
+		slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
+	}
+	twdr->twcal_hand = -1;	/* all slots scanned, timer not re-armed */
+
+out:
+	if ((twdr->tw_count -= killed) == 0)
+		del_timer(&twdr->tw_timer);
+	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+	spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index ab18a853d7c..f84ba9c9655 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/net.h>
+#include <net/ip.h>
 #include <net/inetpeer.h>
 
 /*
@@ -72,7 +73,7 @@
 /* Exported for inet_getid inline function.  */
 DEFINE_SPINLOCK(inet_peer_idlock);
 
-static kmem_cache_t *peer_cachep;
+static kmem_cache_t *peer_cachep __read_mostly;
 
 #define node_height(x) x->avl_height
 static struct inet_peer peer_fake_node = {
@@ -459,5 +460,3 @@ static void peer_check_expire(unsigned long dummy)
 				peer_total / inet_peer_threshold * HZ;
 	add_timer(&peer_periodic_timer);
 }
-
-EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 77094aac6c2..0923add122b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb)
 	 *	that reaches zero, we must reply an ICMP control message telling
 	 *	that the packet's lifetime expired.
 	 */
-
-	iph = skb->nh.iph;
-
-	if (iph->ttl <= 1)
+	if (skb->nh.iph->ttl <= 1)
                 goto too_many_hops;
 
 	if (!xfrm4_route_forward(skb))
 		goto drop;
 
-	iph = skb->nh.iph;
 	rt = (struct rtable*)skb->dst;
 
 	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index eb377ae1530..9e6e683cc34 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 	return ip_frag_intern(hash, qp);
 
 out_nomem:
-	LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
 	return NULL;
 }
 
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
  	if (skb->dev)
  		qp->iif = skb->dev->ifindex;
 	skb->dev = NULL;
-	qp->stamp = skb->stamp;
+	skb_get_timestamp(skb, &qp->stamp);
 	qp->meat += skb->len;
 	atomic_add(skb->truesize, &ip_frag_mem);
 	if (offset == 0)
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
 
 	head->next = NULL;
 	head->dev = dev;
-	head->stamp = qp->stamp;
+	skb_set_timestamp(head, &qp->stamp);
 
 	iph = head->nh.iph;
 	iph->frag_off = 0;
@@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
 	return head;
 
 out_nomem:
- 	LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing "
-			      "queue %p\n", qp));
+ 	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
+			      "queue %p\n", qp);
 	goto out_fail;
 out_oversize:
 	if (net_ratelimit())
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c703528e0bc..473d0f2b2e0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -150,7 +150,7 @@
  *	SNMP management statistics
  */
 
-DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
 
 /*
  *	Process Router Attention IP option
@@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
 		/* If there maybe a raw socket we must check - if not we
 		 * don't care less
 		 */
-		if (raw_sk)
-			raw_v4_input(skb, skb->nh.iph, hash);
+		if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
+			raw_sk = NULL;
 
 		if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
 			int ret;
@@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb)
 		       ip_local_deliver_finish);
 }
 
-static inline int ip_rcv_finish(struct sk_buff *skb)
+static inline int ip_rcv_options(struct sk_buff *skb)
 {
+	struct ip_options *opt;
+	struct iphdr *iph;
 	struct net_device *dev = skb->dev;
+
+	/* It looks as overkill, because not all
+	   IP options require packet mangling.
+	   But it is the easiest for now, especially taking
+	   into account that combination of IP options
+	   and running sniffer is extremely rare condition.
+					      --ANK (980813)
+	*/
+	if (skb_cow(skb, skb_headroom(skb))) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	iph = skb->nh.iph;
+
+	if (ip_options_compile(NULL, skb)) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+		goto drop;
+	}
+
+	opt = &(IPCB(skb)->opt);
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = in_dev_get(dev);
+		if (in_dev) {
+			if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+				if (IN_DEV_LOG_MARTIANS(in_dev) &&
+				    net_ratelimit())
+					printk(KERN_INFO "source route option "
+					       "%u.%u.%u.%u -> %u.%u.%u.%u\n",
+					       NIPQUAD(iph->saddr),
+					       NIPQUAD(iph->daddr));
+				in_dev_put(in_dev);
+				goto drop;
+			}
+
+			in_dev_put(in_dev);
+		}
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+drop:
+	return -1;
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
 	struct iphdr *iph = skb->nh.iph;
-	int err;
 
 	/*
 	 *	Initialise the virtual path cache for the packet. It describes
 	 *	how the packet travels inside Linux networking.
 	 */ 
-	if (skb->dst == NULL) {
-		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+	if (likely(skb->dst == NULL)) {
+		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
+					 skb->dev);
+		if (unlikely(err)) {
 			if (err == -EHOSTUNREACH)
 				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 			goto drop; 
@@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
 	}
 
 #ifdef CONFIG_NET_CLS_ROUTE
-	if (skb->dst->tclassid) {
+	if (unlikely(skb->dst->tclassid)) {
 		struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
 		u32 idx = skb->dst->tclassid;
 		st[idx&0xFF].o_packets++;
@@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
 	}
 #endif
 
-	if (iph->ihl > 5) {
-		struct ip_options *opt;
-
-		/* It looks as overkill, because not all
-		   IP options require packet mangling.
-		   But it is the easiest for now, especially taking
-		   into account that combination of IP options
-		   and running sniffer is extremely rare condition.
-		                                      --ANK (980813)
-		*/
-
-		if (skb_cow(skb, skb_headroom(skb))) {
-			IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
-			goto drop;
-		}
-		iph = skb->nh.iph;
-
-		if (ip_options_compile(NULL, skb))
-			goto inhdr_error;
-
-		opt = &(IPCB(skb)->opt);
-		if (opt->srr) {
-			struct in_device *in_dev = in_dev_get(dev);
-			if (in_dev) {
-				if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
-					if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
-						printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
-						       NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-					in_dev_put(in_dev);
-					goto drop;
-				}
-				in_dev_put(in_dev);
-			}
-			if (ip_options_rcv_srr(skb))
-				goto drop;
-		}
-	}
+	if (iph->ihl > 5 && ip_rcv_options(skb))
+		goto drop;
 
 	return dst_input(skb);
 
-inhdr_error:
-	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 drop:
         kfree_skb(skb);
         return NET_RX_DROP;
@@ -358,9 +373,10 @@ drop:
 /*
  * 	Main IP Receive routine.
  */ 
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct iphdr *iph;
+	u32 len;
 
 	/* When the interface is in promisc. mode, drop all the crap
 	 * that it receives, do not try to analyse it.
@@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
 	 */
 
 	if (iph->ihl < 5 || iph->version != 4)
-		goto inhdr_error; 
+		goto inhdr_error;
 
 	if (!pskb_may_pull(skb, iph->ihl*4))
 		goto inhdr_error;
 
 	iph = skb->nh.iph;
 
-	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
-		goto inhdr_error; 
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
 
-	{
-		__u32 len = ntohs(iph->tot_len); 
-		if (skb->len < len || len < (iph->ihl<<2))
-			goto inhdr_error;
+	len = ntohs(iph->tot_len);
+	if (skb->len < len || len < (iph->ihl*4))
+		goto inhdr_error;
 
-		/* Our transport medium may have padded the buffer out. Now we know it
-		 * is IP we can trim to the true length of the frame.
-		 * Note this now means skb->len holds ntohs(iph->tot_len).
-		 */
-		if (pskb_trim_rcsum(skb, len)) {
-			IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
-			goto drop;
-		}
+	/* Our transport medium may have padded the buffer out. Now we know it
+	 * is IP we can trim to the true length of the frame.
+	 * Note this now means skb->len holds ntohs(iph->tot_len).
+	 */
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+		goto drop;
 	}
 
 	return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
@@ -428,5 +442,4 @@ out:
         return NET_RX_DROP;
 }
 
-EXPORT_SYMBOL(ip_rcv);
 EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 6d89f3f3e70..bce4e875193 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt)
 	}
 }
 
-int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+static struct ip_options *ip_options_get_alloc(const int optlen)
 {
-	struct ip_options *opt;
+	struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
+					 GFP_KERNEL);
+	if (opt)
+		memset(opt, 0, sizeof(*opt));
+	return opt;
+}
 
-	opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
-	if (!opt)
-		return -ENOMEM;
-	memset(opt, 0, sizeof(struct ip_options));
-	if (optlen) {
-		if (user) {
-			if (copy_from_user(opt->__data, data, optlen)) {
-				kfree(opt);
-				return -EFAULT;
-			}
-		} else
-			memcpy(opt->__data, data, optlen);
-	}
+static int ip_options_get_finish(struct ip_options **optp,
+				 struct ip_options *opt, int optlen)
+{
 	while (optlen & 3)
 		opt->__data[optlen++] = IPOPT_END;
 	opt->optlen = optlen;
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
 	return 0;
 }
 
+int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
+{
+	struct ip_options *opt = ip_options_get_alloc(optlen);
+
+	if (!opt)
+		return -ENOMEM;
+	if (optlen && copy_from_user(opt->__data, data, optlen)) {
+		kfree(opt);
+		return -EFAULT;
+	}
+	return ip_options_get_finish(optp, opt, optlen);
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
+{
+	struct ip_options *opt = ip_options_get_alloc(optlen);
+
+	if (!opt)
+		return -ENOMEM;
+	if (optlen)
+		memcpy(opt->__data, data, optlen);
+	return ip_options_get_finish(optp, opt, optlen);
+}
+
 void ip_forward_options(struct sk_buff *skb)
 {
 	struct   ip_options * opt	= &(IPCB(skb)->opt);
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	}
 	return 0;
 }
-
-EXPORT_SYMBOL(ip_options_compile);
-EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 80d13103b2b..3f1a263e124 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -69,13 +69,10 @@
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/arp.h>
 #include <net/icmp.h>
-#include <net/raw.h>
 #include <net/checksum.h>
 #include <net/inetpeer.h>
 #include <net/checksum.h>
@@ -84,12 +81,8 @@
 #include <linux/netfilter_bridge.h>
 #include <linux/mroute.h>
 #include <linux/netlink.h>
+#include <linux/tcp.h>
 
-/*
- *      Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr;
 int sysctl_ip_default_ttl = IPDEFTTL;
 
 /* Generate a checksum for an outgoing IP datagram. */
@@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 		       dst_output);
 }
 
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
 static inline int ip_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
@@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	return -EINVAL;
 }
 
-int ip_finish_output(struct sk_buff *skb)
+static inline int ip_finish_output(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dst->dev;
 
@@ -329,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 			if (ip_route_output_flow(&rt, &fl, sk, 0))
 				goto no_route;
 		}
-		__sk_dst_set(sk, &rt->u.dst);
-		tcp_v4_setup_caps(sk, &rt->u.dst);
+		sk_setup_caps(sk, &rt->u.dst);
 	}
 	skb->dst = dst_clone(&rt->u.dst);
 
@@ -392,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 #endif
 #ifdef CONFIG_NETFILTER
 	to->nfmark = from->nfmark;
-	to->nfcache = from->nfcache;
 	/* Connection association is same as pre-frag packet */
 	nf_conntrack_put(to->nfct);
 	to->nfct = from->nfct;
@@ -580,7 +573,7 @@ slow_path:
 		 */
 
 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
-			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 			err = -ENOMEM;
 			goto fail;
 		}
@@ -1329,12 +1322,7 @@ void __init ip_init(void)
 #endif
 }
 
-EXPORT_SYMBOL(ip_finish_output);
 EXPORT_SYMBOL(ip_fragment);
 EXPORT_SYMBOL(ip_generic_getfrag);
 EXPORT_SYMBOL(ip_queue_xmit);
 EXPORT_SYMBOL(ip_send_check);
-
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ff4bd067b39..2f0b47da5b3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
 		switch (cmsg->cmsg_type) {
 		case IP_RETOPTS:
 			err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
-			err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+			err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
 			if (err)
 				return err;
 			break;
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			struct ip_options * opt = NULL;
 			if (optlen > 40 || optlen < 0)
 				goto e_inval;
-			err = ip_options_get(&opt, optval, optlen, 1);
+			err = ip_options_get_from_user(&opt, optval, optlen);
 			if (err)
 				break;
 			if (sk->sk_type == SOCK_STREAM) {
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		}
 		case IP_MSFILTER:
 		{
-			extern int sysctl_optmem_max;
 			extern int sysctl_igmp_max_msf;
 			struct ip_msfilter *msf;
 
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		}
 		case MCAST_MSFILTER:
 		{
-			extern int sysctl_optmem_max;
 			extern int sysctl_igmp_max_msf;
 			struct sockaddr_in *psin;
 			struct ip_msfilter *msf = NULL;
@@ -1090,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 EXPORT_SYMBOL(ip_cmsg_recv);
 
-#ifdef CONFIG_IP_SCTP_MODULE
 EXPORT_SYMBOL(ip_getsockopt);
 EXPORT_SYMBOL(ip_setsockopt);
-#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 7ded6e60f43..dcb7ee6c485 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	                      spi, IPPROTO_COMP, AF_INET);
 	if (!x)
 		return;
-	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
-	       spi, NIPQUAD(iph->daddr)));
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+		 spi, NIPQUAD(iph->daddr));
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d2bf8e1930a..63e106605f2 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -393,7 +393,7 @@ static int __init ic_defaults(void)
 
 #ifdef IPCONFIG_RARP
 
-static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
 
 static struct packet_type rarp_packet_type __initdata = {
 	.type =	__constant_htons(ETH_P_RARP),
@@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void)
  *  Process received RARP packet.
  */
 static int __init
-ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct arphdr *rarp;
 	unsigned char *rarp_ptr;
@@ -555,7 +555,7 @@ struct bootp_pkt {		/* BOOTP packet format */
 #define DHCPRELEASE	7
 #define DHCPINFORM	8
 
-static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
 
 static struct packet_type bootp_packet_type __initdata = {
 	.type =	__constant_htons(ETH_P_IP),
@@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
 /*
  *  Receive BOOTP reply.
  */
-static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct bootp_pkt *b;
 	struct iphdr *h;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index dc806b57842..9dbf5909f3a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
    In this case data path is free of exclusive locks at all.
  */
 
-static kmem_cache_t *mrt_cachep;
+static kmem_cache_t *mrt_cachep __read_mostly;
 
 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d9212addd19..6e092dadb38 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -26,6 +26,7 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <net/protocol.h>
+#include <net/tcp.h>
 #include <asm/system.h>
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index d0145a8b155..e11952ea17a 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -40,7 +40,7 @@
 static struct list_head *ip_vs_conn_tab;
 
 /*  SLAB cache for IPVS connections */
-static kmem_cache_t *ip_vs_conn_cachep;
+static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
 
 /*  counter for current IPVS connections */
 static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5fb257dd07c..3ac7eeca04a 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -22,6 +22,7 @@
  *
  * Changes:
  *	Paul `Rusty' Russell		properly handle non-linear skbs
+ *	Harald Welte			don't use nfcache
  *
  */
 
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
 				       const struct net_device *out,
 				       int (*okfn)(struct sk_buff *))
 {
-	if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
+	if (!((*pskb)->ipvs_property))
 		return NF_ACCEPT;
 
 	/* The packet was sent from IPVS, exit this chain */
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
 
-	skb->nfcache |= NFC_IPVS_PROPERTY;
+	skb->ipvs_property = 1;
 	verdict = NF_ACCEPT;
 
   out:
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
 
 	EnterFunction(11);
 
-	if (skb->nfcache & NFC_IPVS_PROPERTY)
+	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
 	iph = skb->nh.iph;
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
 	ip_vs_conn_put(cp);
 
-	skb->nfcache |= NFC_IPVS_PROPERTY;
+	skb->ipvs_property = 1;
 
 	LeaveFunction(11);
 	return NF_ACCEPT;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 7d99ede2ef7..2d66848e7aa 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
 	{
 		.ctl_name	= NET_IPV4,
 		.procname	= "ipv4",
@@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = {
 		.ctl_name	= CTL_NET,
 		.procname	= "net",
 		.mode		= 0555,
-		.child		= ipv4_table,
+		.child		= ipvs_ipv4_table,
 	},
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c035838b780..561cda326fa 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
 	{
 		.ctl_name	= NET_IPV4,
 		.procname	= "ipv4", 
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = {
 		.ctl_name	= CTL_NET,
 		.procname	= "net", 
 		.mode		= 0555, 
-		.child		= ipv4_table
+		.child		= ipvs_ipv4_table
 	},
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 22b5dd55d27..ce456dbf09a 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
 	{
 		.ctl_name	= NET_IPV4,
 		.procname	= "ipv4", 
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = {
 		.ctl_name	= CTL_NET,
 		.procname	= "net", 
 		.mode		= 0555, 
-		.child		= ipv4_table
+		.child		= ipvs_ipv4_table
 	},
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index e65de675da7..c19408973c0 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
 }
 
 
-static void tcp_init(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
 {
 	IP_VS_INIT_HASH_TABLE(tcp_apps);
 	pp->timeout_table = tcp_timeouts;
 }
 
 
-static void tcp_exit(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
 {
 }
 
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.protocol =		IPPROTO_TCP,
 	.dont_defrag =		0,
 	.appcnt =		ATOMIC_INIT(0),
-	.init =			tcp_init,
-	.exit =			tcp_exit,
+	.init =			ip_vs_tcp_init,
+	.exit =			ip_vs_tcp_exit,
 	.register_app =		tcp_register_app,
 	.unregister_app =	tcp_unregister_app,
 	.conn_schedule =	tcp_conn_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index a8512a3fd08..3b87482049c 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 
 #define IP_VS_XMIT(skb, rt)				\
 do {							\
-	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
+	(skb)->ipvs_property = 1;			\
 	(skb)->ip_summed = CHECKSUM_NONE;		\
 	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
 		(rt)->u.dst.dev, dst_output);		\
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index c9cf8726051..db67373f9b3 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this,
 	return NOTIFY_DONE;
 }
 
-struct notifier_block drr_dev_notifier = {
+static struct notifier_block drr_dev_notifier = {
 	.notifier_call	= drr_dev_event,
 };
 
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 00000000000..ae0779d82c5
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,139 @@
+/* IPv4 specific functions of netfilter core */
+
+#include <linux/config.h>
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/route.h>
+#include <linux/ip.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct sk_buff **pskb)
+{
+	struct iphdr *iph = (*pskb)->nh.iph;
+	struct rtable *rt;
+	struct flowi fl = {};
+	struct dst_entry *odst;
+	unsigned int hh_len;
+
+	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+	 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
+	 */
+	if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
+		fl.nl_u.ip4_u.daddr = iph->daddr;
+		fl.nl_u.ip4_u.saddr = iph->saddr;
+		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+		fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
+#endif
+		fl.proto = iph->protocol;
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return -1;
+
+		/* Drop old route. */
+		dst_release((*pskb)->dst);
+		(*pskb)->dst = &rt->u.dst;
+	} else {
+		/* non-local src, find valid iif to satisfy
+		 * rp-filter when calling ip_route_input. */
+		fl.nl_u.ip4_u.daddr = iph->saddr;
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return -1;
+
+		odst = (*pskb)->dst;
+		if (ip_route_input(*pskb, iph->daddr, iph->saddr,
+				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+			dst_release(&rt->u.dst);
+			return -1;
+		}
+		dst_release(&rt->u.dst);
+		dst_release(odst);
+	}
+	
+	if ((*pskb)->dst->error)
+		return -1;
+
+	/* Change in oif may mean change in hh_len. */
+	hh_len = (*pskb)->dst->dev->hard_header_len;
+	if (skb_headroom(*pskb) < hh_len) {
+		struct sk_buff *nskb;
+
+		nskb = skb_realloc_headroom(*pskb, hh_len);
+		if (!nskb) 
+			return -1;
+		if ((*pskb)->sk)
+			skb_set_owner_w(nskb, (*pskb)->sk);
+		kfree_skb(*pskb);
+		*pskb = nskb;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/*
+ * Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip_rt_info {
+	u_int32_t daddr;
+	u_int32_t saddr;
+	u_int8_t tos;
+};
+
+static void queue_save(const struct sk_buff *skb, struct nf_info *info)
+{
+	struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+	if (info->hook == NF_IP_LOCAL_OUT) {
+		const struct iphdr *iph = skb->nh.iph;
+
+		rt_info->tos = iph->tos;
+		rt_info->daddr = iph->daddr;
+		rt_info->saddr = iph->saddr;
+	}
+}
+
+static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+	const struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+	if (info->hook == NF_IP_LOCAL_OUT) {
+		struct iphdr *iph = (*pskb)->nh.iph;
+
+		if (!(iph->tos == rt_info->tos
+		      && iph->daddr == rt_info->daddr
+		      && iph->saddr == rt_info->saddr))
+			return ip_route_me_harder(pskb);
+	}
+	return 0;
+}
+
+static struct nf_queue_rerouter ip_reroute = {
+	.rer_size	= sizeof(struct ip_rt_info),
+	.save		= queue_save,
+	.reroute	= queue_reroute,
+};
+
+static int init(void)
+{
+	return nf_register_queue_rerouter(PF_INET, &ip_reroute);
+}
+
+static void fini(void)
+{
+	nf_unregister_queue_rerouter(PF_INET);
+}
+
+module_init(init);
+module_exit(fini);
+
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1c06f..e046f552181 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 	
+config IP_NF_CONNTRACK_EVENTS
+	bool "Connection tracking events"
+	depends on IP_NF_CONNTRACK
+	help
+	  If this option is enabled, the connection tracking code will
+	  provide a notifier chain that can be used by other kernel code
+	  to get notified about changes in the connection tracking state.
+	  
+	  If unsure, say `N'.
+
 config IP_NF_CT_PROTO_SCTP
 	tristate  'SCTP protocol connection tracking support (EXPERIMENTAL)'
 	depends on IP_NF_CONNTRACK && EXPERIMENTAL
@@ -100,11 +110,15 @@ config IP_NF_AMANDA
 	  To compile it as a module, choose M here.  If unsure, say Y.
 
 config IP_NF_QUEUE
-	tristate "Userspace queueing via NETLINK"
+	tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
 	help
 	  Netfilter has the ability to queue packets to user space: the
 	  netlink device can be used to access them using this driver.
 
+	  This option enables the old IPv4-only "ip_queue" implementation
+	  which has been obsoleted by the new "nfnetlink_queue" code (see
+	  CONFIG_NETFILTER_NETLINK_QUEUE).
+
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config IP_NF_IPTABLES
@@ -340,6 +354,17 @@ config IP_NF_MATCH_SCTP
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/modules.txt>.  If unsure, say `N'.
 
+config IP_NF_MATCH_DCCP
+	tristate  'DCCP protocol match support'
+	depends on IP_NF_IPTABLES
+	help
+	  With this option enabled, you will be able to use the iptables
+	  `dccp' match in order to match on DCCP source/destination ports
+	  and DCCP flags.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
 config IP_NF_MATCH_COMMENT
 	tristate  'comment match support'
 	depends on IP_NF_IPTABLES
@@ -361,6 +386,16 @@ config IP_NF_MATCH_CONNMARK
 	  <file:Documentation/modules.txt>.  The module will be called
 	  ipt_connmark.o.  If unsure, say `N'.
 
+config IP_NF_MATCH_CONNBYTES
+	tristate  'Connection byte/packet counter match support'
+	depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
+	help
+	  This option adds a `connbytes' match, which allows you to match the
+	  number of bytes and/or packets for each direction within a connection.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
 config IP_NF_MATCH_HASHLIMIT
 	tristate  'hashlimit match support'
 	depends on IP_NF_IPTABLES
@@ -375,6 +410,19 @@ config IP_NF_MATCH_HASHLIMIT
 	  destination IP' or `500pps from any given source IP'  with a single
 	  IPtables rule.
 
+config IP_NF_MATCH_STRING
+	tristate  'string match support'
+	depends on IP_NF_IPTABLES 
+	select TEXTSEARCH
+	select TEXTSEARCH_KMP
+	select TEXTSEARCH_BM
+	select TEXTSEARCH_FSM
+	help
+	  This option adds a `string' match, which allows you to look for
+	  pattern matches in packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # `filter', generic and specific targets
 config IP_NF_FILTER
 	tristate "Packet filtering"
@@ -616,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_TARGET_TTL
+	tristate  'TTL target support'
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `TTL' target, which enables the user to modify
+	  the TTL value of the IP header.
+
+	  While it is safe to decrement/lower the TTL, this target also enables
+	  functionality to increment and set the TTL value of the IP header to
+	  arbitrary values.  This is EXTREMELY DANGEROUS since you can easily
+	  create immortal packets that loop forever on the network.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config IP_NF_TARGET_CONNMARK
 	tristate  'CONNMARK target support'
 	depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
@@ -692,5 +754,11 @@ config IP_NF_ARP_MANGLE
 	  Allows altering the ARP packet payload: source and destination
 	  hardware and network addresses.
 
+config IP_NF_CONNTRACK_NETLINK
+        tristate 'Connection tracking netlink interface'
+        depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+        help
+          This option enables support for a netlink-based userspace interface.
+
 endmenu
 
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 45796d5924d..a7bd38f5052 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -9,6 +9,10 @@ iptable_nat-objs	:= ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe
 # connection tracking
 obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
 
+# conntrack netlink interface
+obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
+
+
 # SCTP protocol connection tracking
 obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
 
@@ -38,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
 obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
 obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
 obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
 obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
 obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
 obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
@@ -54,11 +59,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
 obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
 obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
 obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
 obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
 obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
 obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
 
 # targets
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
@@ -78,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
 obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
@@ -87,3 +95,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
 obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
 
 obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 01e1b58322a..be4c9eb3243 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
 static char *conns[] = { "DATA ", "MESG ", "INDEX " };
 
 /* This is slow, but it's simple. --RR */
-static char amanda_buffer[65536];
+static char *amanda_buffer;
 static DEFINE_SPINLOCK(amanda_buffer_lock);
 
 unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
@@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = {
 static void __exit fini(void)
 {
 	ip_conntrack_helper_unregister(&amanda_helper);
+	kfree(amanda_buffer);
 }
 
 static int __init init(void)
 {
-	return ip_conntrack_helper_register(&amanda_helper);
+	int ret;
+
+	amanda_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!amanda_buffer)
+		return -ENOMEM;
+
+	ret = ip_conntrack_helper_register(&amanda_helper);
+	if (ret < 0) {
+		kfree(amanda_buffer);
+		return ret;
+	}
+	return 0;
+
+
 }
 
 module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a7f0c821a9b..a0648600190 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,6 +37,7 @@
 #include <linux/err.h>
 #include <linux/percpu.h>
 #include <linux/moduleparam.h>
+#include <linux/notifier.h>
 
 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers*/
@@ -49,7 +50,7 @@
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
-#define IP_CONNTRACK_VERSION	"2.1"
+#define IP_CONNTRACK_VERSION	"2.3"
 
 #if 0
 #define DEBUGP printk
@@ -69,22 +70,81 @@ static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 int ip_conntrack_max;
 struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
 struct ip_conntrack ip_conntrack_untracked;
 unsigned int ip_ct_log_invalid;
 static LIST_HEAD(unconfirmed);
 static int ip_conntrack_vmalloc;
 
-DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+static unsigned int ip_conntrack_next_id = 1;
+static unsigned int ip_conntrack_expect_next_id = 1;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
-void 
-ip_conntrack_put(struct ip_conntrack *ct)
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
 {
-	IP_NF_ASSERT(ct);
-	nf_conntrack_put(&ct->ct_general);
+	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+		notifier_call_chain(&ip_conntrack_chain, ecache->events,
+				    ecache->ct);
+	ecache->events = 0;
+	ip_conntrack_put(ecache->ct);
+	ecache->ct = NULL;
 }
 
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
+{
+	struct ip_conntrack_ecache *ecache;
+	
+	local_bh_disable();
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	if (ecache->ct == ct)
+		__ip_ct_deliver_cached_events(ecache);
+	local_bh_enable();
+}
+
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
+{
+	struct ip_conntrack_ecache *ecache;
+
+	/* take care of delivering potentially old events */
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	BUG_ON(ecache->ct == ct);
+	if (ecache->ct)
+		__ip_ct_deliver_cached_events(ecache);
+	/* initialize for this conntrack/packet */
+	ecache->ct = ct;
+	nf_conntrack_get(&ct->ct_general);
+}
+
+/* flush the event cache - touches other CPU's data and must not be called while
+ * packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
+{
+	struct ip_conntrack_ecache *ecache;
+	int cpu;
+
+	for_each_cpu(cpu) {
+		ecache = &per_cpu(ip_conntrack_ecache, cpu);
+		if (ecache->ct)
+			ip_conntrack_put(ecache->ct);
+	}
+}
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
 static int ip_conntrack_hash_rnd_initted;
 static unsigned int ip_conntrack_hash_rnd;
 
@@ -144,6 +204,13 @@ static void unlink_expect(struct ip_conntrack_expect *exp)
 	list_del(&exp->list);
 	CONNTRACK_STAT_INC(expect_delete);
 	exp->master->expecting--;
+	ip_conntrack_expect_put(exp);
+}
+
+void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
+{
+	unlink_expect(exp);
+	ip_conntrack_expect_put(exp);
 }
 
 static void expectation_timed_out(unsigned long ul_expect)
@@ -156,6 +223,33 @@ static void expectation_timed_out(unsigned long ul_expect)
 	ip_conntrack_expect_put(exp);
 }
 
+struct ip_conntrack_expect *
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *i;
+	
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+			atomic_inc(&i->use);
+			return i;
+		}
+	}
+	return NULL;
+}
+
+/* Just find an expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *i;
+	
+	read_lock_bh(&ip_conntrack_lock);
+	i = __ip_conntrack_expect_find(tuple);
+	read_unlock_bh(&ip_conntrack_lock);
+
+	return i;
+}
+
 /* If an expectation for this connection is found, it gets delete from
  * global list then returned. */
 static struct ip_conntrack_expect *
@@ -180,7 +274,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
 }
 
 /* delete all expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct)
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
 {
 	struct ip_conntrack_expect *i, *tmp;
 
@@ -210,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct)
 	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
 
 	/* Destroy all pending expectations */
-	remove_expectations(ct);
+	ip_ct_remove_expectations(ct);
 }
 
 static void
@@ -223,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
 	IP_NF_ASSERT(!timer_pending(&ct->timeout));
 
+	ip_conntrack_event(IPCT_DESTROY, ct);
+	set_bit(IPS_DYING_BIT, &ct->status);
+
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to ip_conntrack_lock!!! -HW */
-	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
 	if (proto && proto->destroy)
 		proto->destroy(ct);
 
@@ -238,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	 * except TFTP can create an expectation on the first packet,
 	 * before connection is in the list, so we need to clean here,
 	 * too. */
-	remove_expectations(ct);
+	ip_ct_remove_expectations(ct);
 
 	/* We overload first tuple to link into unconfirmed list. */
 	if (!is_confirmed(ct)) {
@@ -253,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 		ip_conntrack_put(ct->master);
 
 	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
-	kmem_cache_free(ip_conntrack_cachep, ct);
-	atomic_dec(&ip_conntrack_count);
+	ip_conntrack_free(ct);
 }
 
 static void death_by_timeout(unsigned long ul_conntrack)
@@ -280,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
 		&& ip_ct_tuple_equal(tuple, &i->tuple);
 }
 
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 		    const struct ip_conntrack *ignored_conntrack)
 {
@@ -315,6 +411,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
 	return h;
 }
 
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+					unsigned int hash,
+					unsigned int repl_hash) 
+{
+	ct->id = ++ip_conntrack_next_id;
+	list_prepend(&ip_conntrack_hash[hash],
+		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_prepend(&ip_conntrack_hash[repl_hash],
+		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
+}
+
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
+{
+	unsigned int hash, repl_hash;
+
+	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	write_lock_bh(&ip_conntrack_lock);
+	__ip_conntrack_hash_insert(ct, hash, repl_hash);
+	write_unlock_bh(&ip_conntrack_lock);
+}
+
 /* Confirm a connection given skb; places it in hash table */
 int
 __ip_conntrack_confirm(struct sk_buff **pskb)
@@ -361,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		/* Remove from unconfirmed list */
 		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
 
-		list_prepend(&ip_conntrack_hash[hash],
-			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-		list_prepend(&ip_conntrack_hash[repl_hash],
-			     &ct->tuplehash[IP_CT_DIR_REPLY]);
+		__ip_conntrack_hash_insert(ct, hash, repl_hash);
 		/* Timer relative to confirmation time, not original
 		   setting time, otherwise we'd get timer wrap in
 		   weird delay cases. */
@@ -374,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
 		CONNTRACK_STAT_INC(insert);
 		write_unlock_bh(&ip_conntrack_lock);
+		if (ct->helper)
+			ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+			ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+		ip_conntrack_event_cache(master_ct(ct) ?
+					 IPCT_RELATED : IPCT_NEW, *pskb);
+
 		return NF_ACCEPT;
 	}
 
@@ -438,34 +564,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
 	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
 }
 
-static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
 {
 	return LIST_FIND(&helpers, helper_cmp,
 			 struct ip_conntrack_helper *,
 			 tuple);
 }
 
-/* Allocate a new conntrack: we return -ENOMEM if classification
-   failed due to stress.  Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
-	       struct ip_conntrack_protocol *protocol,
-	       struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_helper *helper;
+
+	/* need ip_conntrack_lock to assure that helper exists until
+	 * try_module_get() is called */
+	read_lock_bh(&ip_conntrack_lock);
+
+	helper = __ip_conntrack_helper_find(tuple);
+	if (helper) {
+		/* need to increase module usage count to assure helper will
+		 * not go away while the caller is e.g. busy putting a
+		 * conntrack in the hash that uses the helper */
+		if (!try_module_get(helper->me))
+			helper = NULL;
+	}
+
+	read_unlock_bh(&ip_conntrack_lock);
+
+	return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+	module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+	return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+	struct ip_conntrack_protocol *p;
+
+	preempt_disable();
+	p = __ip_conntrack_proto_find(protocol);
+	if (p) {
+		if (!try_module_get(p->me))
+			p = &ip_conntrack_generic_protocol;
+	}
+	preempt_enable();
+	
+	return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+	module_put(p->me);
+}
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+					struct ip_conntrack_tuple *repl)
 {
 	struct ip_conntrack *conntrack;
-	struct ip_conntrack_tuple repl_tuple;
-	size_t hash;
-	struct ip_conntrack_expect *exp;
 
 	if (!ip_conntrack_hash_rnd_initted) {
 		get_random_bytes(&ip_conntrack_hash_rnd, 4);
 		ip_conntrack_hash_rnd_initted = 1;
 	}
 
-	hash = hash_conntrack(tuple);
-
 	if (ip_conntrack_max
 	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+		unsigned int hash = hash_conntrack(orig);
 		/* Try dropping from this hash chain. */
 		if (!early_drop(&ip_conntrack_hash[hash])) {
 			if (net_ratelimit())
@@ -476,11 +652,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		}
 	}
 
-	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
-		DEBUGP("Can't invert tuple.\n");
-		return NULL;
-	}
-
 	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
 	if (!conntrack) {
 		DEBUGP("Can't allocate conntrack.\n");
@@ -490,17 +661,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	memset(conntrack, 0, sizeof(*conntrack));
 	atomic_set(&conntrack->ct_general.use, 1);
 	conntrack->ct_general.destroy = destroy_conntrack;
-	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
-	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
-	if (!protocol->new(conntrack, skb)) {
-		kmem_cache_free(ip_conntrack_cachep, conntrack);
-		return NULL;
-	}
+	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
 	/* Don't set timer yet: wait for confirmation */
 	init_timer(&conntrack->timeout);
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;
 
+	atomic_inc(&ip_conntrack_count);
+
+	return conntrack;
+}
+
+/* Undo ip_conntrack_alloc(): drop the global count and free the object. */
+void ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+	atomic_dec(&ip_conntrack_count);
+	kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress.   Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+	       struct ip_conntrack_protocol *protocol,
+	       struct sk_buff *skb)
+{
+	struct ip_conntrack *conntrack;
+	struct ip_conntrack_tuple repl_tuple;
+	struct ip_conntrack_expect *exp;
+
+	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+		DEBUGP("Can't invert tuple.\n");
+		return NULL;
+	}
+
+	conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+	if (conntrack == NULL || IS_ERR(conntrack))
+		return (struct ip_conntrack_tuple_hash *)conntrack;
+
+	if (!protocol->new(conntrack, skb)) {
+		ip_conntrack_free(conntrack);
+		return NULL;
+	}
+
 	write_lock_bh(&ip_conntrack_lock);
 	exp = find_expectation(tuple);
 
@@ -521,7 +725,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
 	} else {
-		conntrack->helper = ip_ct_find_helper(&repl_tuple);
+		conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
 
 		CONNTRACK_STAT_INC(new);
 	}
@@ -529,7 +733,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	/* Overload tuple linked list to put us in unconfirmed list. */
 	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
 
-	atomic_inc(&ip_conntrack_count);
 	write_unlock_bh(&ip_conntrack_lock);
 
 	if (exp) {
@@ -607,7 +810,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
 	struct ip_conntrack_protocol *proto;
-	int set_reply;
+	int set_reply = 0;
 	int ret;
 
 	/* Previously seen (loopback or untracked)?  Ignore. */
@@ -625,9 +828,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 		return NF_DROP;
 	}
 
-	/* FIXME: Do this right please. --RR */
-	(*pskb)->nfcache |= NFC_UNKNOWN;
-
 /* Doesn't cover locally-generated broadcast, so not worth it. */
 #if 0
 	/* Ignore broadcast: no `connection'. */
@@ -643,7 +843,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	}
 #endif
 
-	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+	proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
 
 	/* It may be an special packet, error, unclean...
 	 * inverse of the return code tells to the netfilter
@@ -679,8 +879,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 		return -ret;
 	}
 
-	if (set_reply)
-		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+		ip_conntrack_event_cache(IPCT_STATUS, *pskb);
 
 	return ret;
 }
@@ -689,7 +889,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
 		   const struct ip_conntrack_tuple *orig)
 {
 	return ip_ct_invert_tuple(inverse, orig, 
-				  ip_ct_find_proto(orig->dst.protonum));
+				  __ip_conntrack_proto_find(orig->dst.protonum));
 }
 
 /* Would two expected things clash? */
@@ -769,6 +969,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
 	add_timer(&exp->timeout);
 
+	exp->id = ++ip_conntrack_expect_next_id;
+	atomic_inc(&exp->use);
 	CONNTRACK_STAT_INC(expect_create);
 }
 
@@ -827,6 +1029,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 		evict_oldest_expect(expect->master);
 
 	ip_conntrack_expect_insert(expect);
+	ip_conntrack_expect_event(IPEXP_NEW, expect);
 	ret = 0;
 out:
 	write_unlock_bh(&ip_conntrack_lock);
@@ -847,7 +1050,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 	if (!conntrack->master && conntrack->expecting == 0)
-		conntrack->helper = ip_ct_find_helper(newreply);
+		conntrack->helper = __ip_conntrack_helper_find(newreply);
 	write_unlock_bh(&ip_conntrack_lock);
 }
 
@@ -861,11 +1064,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 	return 0;
 }
 
+/* Return the registered helper called @name, or NULL if there is none. */
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+	struct ip_conntrack_helper *helper;
+
+	list_for_each_entry(helper, &helpers, list)
+		if (strcmp(helper->name, name) == 0)
+			return helper;
+
+	return NULL;
+}
+
 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
 			 const struct ip_conntrack_helper *me)
 {
-	if (tuplehash_to_ctrack(i)->helper == me)
+	if (tuplehash_to_ctrack(i)->helper == me) {
+ 		ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
 		tuplehash_to_ctrack(i)->helper = NULL;
+	}
 	return 0;
 }
 
@@ -927,12 +1145,46 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
 		if (del_timer(&ct->timeout)) {
 			ct->timeout.expires = jiffies + extra_jiffies;
 			add_timer(&ct->timeout);
+			ip_conntrack_event_cache(IPCT_REFRESH, skb);
 		}
 		ct_add_counters(ct, ctinfo, skb);
 		write_unlock_bh(&ip_conntrack_lock);
 	}
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+			       const struct ip_conntrack_tuple *tuple)
+{
+	/* both ports are read through the tcp member of the protocol union */
+	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+		&tuple->src.u.tcp.port);
+	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+		&tuple->dst.u.tcp.port);
+	return 0;
+nfattr_failure:	/* presumably reached from NFA_PUT on failure */
+	return -1;
+}
+
+/* Copy the source and destination port attributes into @t; both must exist. */
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+			       struct ip_conntrack_tuple *t)
+{
+	struct nfattr *sport = tb[CTA_PROTO_SRC_PORT-1];
+	struct nfattr *dport = tb[CTA_PROTO_DST_PORT-1];
+
+	if (!sport || !dport)
+		return -EINVAL;
+	t->src.u.tcp.port = *(u_int16_t *)NFA_DATA(sport);
+	t->dst.u.tcp.port = *(u_int16_t *)NFA_DATA(dport);
+	return 0;
+}
+#endif
+
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -943,10 +1195,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 	skb = ip_defrag(skb, user);
 	local_bh_enable();
 
-	if (skb) {
+	if (skb)
 		ip_send_check(skb->nh.iph);
-		skb->nfcache |= NFC_ALTERED;
-	}
 	return skb;
 }
 
@@ -1096,16 +1346,14 @@ static void free_conntrack_hash(void)
 				     * ip_conntrack_htable_size));
 }
 
-/* Mishearing the voices in his head, our hero wonders how he's
-   supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
+void ip_conntrack_flush()
 {
-	ip_ct_attach = NULL;
 	/* This makes sure all current packets have passed through
            netfilter framework.  Roll on, two-stage module
            delete... */
 	synchronize_net();
- 
+
+	ip_ct_event_cache_flush();
  i_see_dead_people:
 	ip_ct_iterate_cleanup(kill_all, NULL);
 	if (atomic_read(&ip_conntrack_count) != 0) {
@@ -1115,7 +1363,14 @@ void ip_conntrack_cleanup(void)
 	/* wait until all references to ip_conntrack_untracked are dropped */
 	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
 		schedule();
+}
 
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+	ip_ct_attach = NULL;
+	ip_conntrack_flush();
 	kmem_cache_destroy(ip_conntrack_cachep);
 	kmem_cache_destroy(ip_conntrack_expect_cachep);
 	free_conntrack_hash();
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 7a3b773be3f..3a2627db172 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
 MODULE_DESCRIPTION("ftp connection tracking helper");
 
 /* This is slow, but it's simple. --RR */
-static char ftp_buffer[65536];
-
+static char *ftp_buffer;
 static DEFINE_SPINLOCK(ip_ftp_lock);
 
 #define MAX_PORTS 8
@@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
 }
 
 /* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+			  struct sk_buff *skb)
 {
 	unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
 
@@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
 			oldest = i;
 	}
 
-	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
 		info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
-	else if (oldest != NUM_SEQ_TO_REMEMBER)
+		ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+	} else if (oldest != NUM_SEQ_TO_REMEMBER) {
 		info->seq_aft_nl[dir][oldest] = nl_seq;
+		ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+	}
 }
 
 static int help(struct sk_buff **pskb,
@@ -439,7 +442,7 @@ out_update_nl:
 	/* Now if this ends in \n, update ftp info.  Seq may have been
 	 * adjusted by NAT code. */
 	if (ends_in_nl)
-		update_nl_seq(seq, ct_ftp_info,dir);
+		update_nl_seq(seq, ct_ftp_info,dir, *pskb);
  out:
 	spin_unlock_bh(&ip_ftp_lock);
 	return ret;
@@ -457,6 +460,8 @@ static void fini(void)
 				ports[i]);
 		ip_conntrack_helper_unregister(&ftp[i]);
 	}
+
+	kfree(ftp_buffer);
 }
 
 static int __init init(void)
@@ -464,6 +469,10 @@ static int __init init(void)
 	int i, ret;
 	char *tmpname;
 
+	ftp_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!ftp_buffer)
+		return -ENOMEM;
+
 	if (ports_c == 0)
 		ports[ports_c++] = FTP_PORT;
 
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 4a28f297d50..25438eec21a 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -39,7 +39,7 @@ static int ports_c;
 static int max_dcc_channels = 8;
 static unsigned int dcc_timeout = 300;
 /* This is slow, but it's simple. --RR */
-static char irc_buffer[65536];
+static char *irc_buffer;
 static DEFINE_SPINLOCK(irc_buffer_lock);
 
 unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
@@ -257,6 +257,10 @@ static int __init init(void)
 		printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
 		return -EBUSY;
 	}
+
+	irc_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!irc_buffer)
+		return -ENOMEM;
 	
 	/* If no port given, default to standard irc port */
 	if (ports_c == 0)
@@ -304,6 +308,7 @@ static void fini(void)
 		       ports[i]);
 		ip_conntrack_helper_unregister(&irc_helpers[i]);
 	}
+	kfree(irc_buffer);
 }
 
 module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
new file mode 100644
index 00000000000..a4e9278db4e
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -0,0 +1,1579 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * I've reworked this stuff to use attributes instead of conntrack 
+ * structures. 5.44 am. I need more tea. --pablo 05/07/11.
+ *
+ * Initial connection tracking via netlink development funded and 
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.90";
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+/* Dump CTA_PROTO_NUM plus the protocol's own tuple attributes; 0 or < 0. */
+static inline int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+				const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_protocol *proto;
+	int ret = 0;
+
+	NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+	if (proto && proto->tuple_to_nfattr)
+		ret = proto->tuple_to_nfattr(skb, tuple);
+	if (proto)	/* release the module ref taken by _find_get */
+		ip_conntrack_proto_put(proto);
+	return ret;
+nfattr_failure:
+	return -1;
+}
+
+/* Dump one tuple as nested CTA_TUPLE_IP and CTA_TUPLE_PROTO attributes. */
+static inline int ctnetlink_dump_tuples(struct sk_buff *skb,
+				const struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *nest_parms;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
+	NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip);
+	NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip);
+	NFA_NEST_END(skb, nest_parms);
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
+	if (ctnetlink_dump_tuples_proto(skb, tuple) < 0)
+		goto nfattr_failure;	/* don't report a partial tuple as ok */
+	NFA_NEST_END(skb, nest_parms);
+
+	return 0;
+nfattr_failure:
+	return -1;
+}
+
+/* Dump the status bits as a 32-bit network-order CTA_STATUS attribute. */
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t be_status = htonl((u_int32_t) ct->status);
+	NFA_PUT(skb, CTA_STATUS, sizeof(be_status), &be_status);
+	return 0;
+nfattr_failure:
+	return -1;
+}
+
+/* Dump the time left before expiry, in seconds, as CTA_TIMEOUT; a timer
+ * that is already overdue is reported as 0. */
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	long remaining = ct->timeout.expires - jiffies;
+	u_int32_t timeout = 0;
+
+	if (remaining >= 0)
+		timeout = htonl(remaining / HZ);
+
+	NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+/* Dump protocol-private state nested under CTA_PROTOINFO, if supported. */
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct ip_conntrack_protocol *proto;
+	struct nfattr *nest_proto;
+	int ret = 0;
+
+	proto = ip_conntrack_proto_find_get(
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+	if (!proto)
+		return 0;
+	if (!proto->to_nfattr)
+		goto out;	/* nothing to dump, but still drop the ref */
+	nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
+	ret = proto->to_nfattr(skb, nest_proto, ct);
+	NFA_NEST_END(skb, nest_proto);
+out:
+	ip_conntrack_proto_put(proto);
+	return ret;
+nfattr_failure:
+	ip_conntrack_proto_put(proto);	/* the failure path holds the ref too */
+	return -1;
+}
+
+/* Dump the helper name (and helper-private data) nested under CTA_HELP. */
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct nfattr *nest_helper;
+
+	if (!ct->helper)
+		return 0;
+
+	nest_helper = NFA_NEST(skb, CTA_HELP);
+	/* copy the name string itself, not the bytes found at &helper->name */
+	NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name);
+	if (ct->helper->to_nfattr)
+		ct->helper->to_nfattr(skb, ct);
+	NFA_NEST_END(skb, nest_helper);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+/* Dump the packet and byte counters of direction @dir as big-endian 64-bit
+ * values, nested under CTA_COUNTERS_ORIG or CTA_COUNTERS_REPLY. */
+static inline int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
+			enum ip_conntrack_dir dir)
+{
+	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+	struct nfattr *nest_count = NFA_NEST(skb, type);
+	u_int64_t packets = cpu_to_be64(ct->counters[dir].packets);
+	u_int64_t bytes = cpu_to_be64(ct->counters[dir].bytes);
+
+	NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &packets);
+	NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &bytes);
+
+	NFA_NEST_END(skb, nest_count);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_counters(a, b, c) (0)
+#endif
+
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+/* Dump the connection mark as a network-order CTA_MARK attribute. */
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t be_mark = htonl(ct->mark);
+
+	NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &be_mark);
+	return 0;
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+/* Dump the conntrack id as a network-order CTA_ID attribute. */
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t be_id = htonl(ct->id);
+	NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &be_id);
+	return 0;
+nfattr_failure:
+	return -1;
+}
+
+/* Dump the current reference count as a network-order CTA_USE attribute. */
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	unsigned int refcnt = htonl(atomic_read(&ct->ct_general.use));
+
+	NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &refcnt);
+	return 0;
+nfattr_failure:
+	return -1;
+}
+
+#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
+
+/* Append one conntrack message describing @ct to @skb.  Returns skb->len
+ * on success; on failure trims @skb back to where it started, returns -1. */
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, int nowait, 
+		    const struct ip_conntrack *ct)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nfattr *nest_parms;
+	unsigned char *b;
+
+	b = skb->tail;	/* start of this message, used to trim on failure */
+	event |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh    = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+	nfmsg  = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = (nowait && pid) ? NLM_F_MULTI : 0;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version      = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+
+	if (ctnetlink_dump_status(skb, ct) < 0 ||
+	    ctnetlink_dump_timeout(skb, ct) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_mark(skb, ct) < 0 ||
+	    ctnetlink_dump_id(skb, ct) < 0 ||
+	    ctnetlink_dump_use(skb, ct) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+nlmsg_failure:
+nfattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+/* Notifier callback: turn a conntrack event mask into one netlink message
+ * and send it to the matching nfnetlink group.  Always returns NOTIFY_DONE;
+ * the event is dropped if the skb cannot be allocated or filled. */
+static int ctnetlink_conntrack_event(struct notifier_block *this,
+                                     unsigned long events, void *ptr)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nfattr *nest_parms;
+	struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned char *b;
+	unsigned int flags = 0, group;
+
+	/* ignore our fake conntrack entry */
+	if (ct == &ip_conntrack_untracked)
+		return NOTIFY_DONE;
+
+	if (events & IPCT_DESTROY) {
+		type = IPCTNL_MSG_CT_DELETE;
+		group = NFNLGRP_CONNTRACK_DESTROY;
+		goto alloc_skb;
+	}
+	if (events & (IPCT_NEW | IPCT_RELATED)) {
+		type = IPCTNL_MSG_CT_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+		/* dump everything */
+		events = ~0UL;
+		group = NFNLGRP_CONNTRACK_NEW;
+		goto alloc_skb;
+	}
+	if (events & (IPCT_STATUS |
+		      IPCT_PROTOINFO |
+		      IPCT_HELPER |
+		      IPCT_HELPINFO |
+		      IPCT_NATINFO)) {
+		type = IPCTNL_MSG_CT_NEW;
+		group = NFNLGRP_CONNTRACK_UPDATE;
+		goto alloc_skb;
+	} 
+	return NOTIFY_DONE;	/* no bit of interest set in the mask */
+
+alloc_skb:
+  /* FIXME: Check if there are any listeners before, don't hurt performance */
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		return NOTIFY_DONE;
+
+	b = skb->tail;
+
+	type |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh   = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = flags;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version	= NFNETLINK_V0;
+	nfmsg->res_id	= 0;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	/* NAT stuff is now a status flag */
+	if ((events & IPCT_STATUS || events & IPCT_NATINFO)
+	    && ctnetlink_dump_status(skb, ct) < 0)
+		goto nfattr_failure;
+	/* NOTE(review): IPCT_REFRESH alone never gets here, see the masks above */
+	if (events & IPCT_REFRESH
+	    && ctnetlink_dump_timeout(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_PROTOINFO
+	    && ctnetlink_dump_protoinfo(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_HELPINFO
+	    && ctnetlink_dump_helpinfo(skb, ct) < 0)
+		goto nfattr_failure;
+
+	if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+		goto nfattr_failure;
+	nlh->nlmsg_len = skb->tail - b;
+	nfnetlink_send(skb, 0, group, 0);
+	return NOTIFY_DONE;
+nlmsg_failure:
+nfattr_failure:
+	kfree_skb(skb);
+	return NOTIFY_DONE;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+	DEBUGP("entered %s\n", __FUNCTION__);
+	return 0;	/* no cleanup is performed when a dump finishes */
+}
+
+/* Netlink dump callback: resume at hash bucket cb->args[0], skipping entries
+ * whose id is not above the last id dumped (kept in cb->args[1]). */
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ip_conntrack *ct = NULL;
+	struct ip_conntrack_tuple_hash *h;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+	DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, 
+			cb->args[0], *id);
+
+	read_lock_bh(&ip_conntrack_lock);
+	for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+		list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+			h = (struct ip_conntrack_tuple_hash *) i;
+			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = tuplehash_to_ctrack(h);
+			if (ct->id <= *id)
+				continue;
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+		                        	cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW,
+						1, ct) < 0)
+				goto out;	/* stop here; position is kept in cb->args */
+			*id = ct->id;
+		}
+	}
+out:	
+	read_unlock_bh(&ip_conntrack_lock);
+	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+	return skb->len;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+/* Like ctnetlink_dump_table(), but takes the write lock and zeroes the
+ * counters of every conntrack right after it has been dumped. */
+static int
+ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ip_conntrack *ct = NULL;
+	struct ip_conntrack_tuple_hash *h;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+	DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, 
+			cb->args[0], *id);
+
+	write_lock_bh(&ip_conntrack_lock);
+	for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+		list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+			h = (struct ip_conntrack_tuple_hash *) i;
+			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = tuplehash_to_ctrack(h);
+			if (ct->id <= *id)
+				continue;
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+		                        	cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW,
+						1, ct) < 0)
+				goto out;	/* counters of this entry stay untouched */
+			*id = ct->id;
+
+			memset(&ct->counters, 0, sizeof(ct->counters));
+		}
+	}
+out:	
+	write_unlock_bh(&ip_conntrack_lock);
+	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+	return skb->len;
+}
+#endif
+
+static const int cta_min_ip[CTA_IP_MAX] = {
+	[CTA_IP_V4_SRC-1]	= sizeof(u_int32_t),
+	[CTA_IP_V4_DST-1]	= sizeof(u_int32_t),
+};
+
+/* Parse the nested CTA_TUPLE_IP attribute: both IPv4 addresses are
+ * mandatory and are copied into @tuple as found in the message. */
+static inline int
+ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *tb[CTA_IP_MAX];
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
+		goto nfattr_failure;
+	if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
+		return -EINVAL;
+
+	if (!tb[CTA_IP_V4_SRC-1])
+		return -EINVAL;
+	tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
+
+	if (!tb[CTA_IP_V4_DST-1])
+		return -EINVAL;
+	tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
+
+	DEBUGP("leaving\n");
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+/* Minimum payload sizes; CTA_PROTO_NUM is dumped as a single byte. */
+static const int cta_min_proto[CTA_PROTO_MAX] = {
+	[CTA_PROTO_NUM-1]	= sizeof(u_int8_t),
+	[CTA_PROTO_SRC_PORT-1]	= sizeof(u_int16_t),
+	[CTA_PROTO_DST_PORT-1]	= sizeof(u_int16_t),
+	[CTA_PROTO_ICMP_TYPE-1]	= sizeof(u_int8_t),
+	[CTA_PROTO_ICMP_CODE-1]	= sizeof(u_int8_t),
+	[CTA_PROTO_ICMP_ID-1]	= sizeof(u_int16_t),
+};
+
+/* Fill in the protocol number and the protocol-specific part of @tuple. */
+static inline int
+ctnetlink_parse_tuple_proto(struct nfattr *attr, 
+			    struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *tb[CTA_PROTO_MAX];
+	struct ip_conntrack_protocol *proto;
+	int ret = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+		return -EINVAL;
+	if (!tb[CTA_PROTO_NUM-1])
+		return -EINVAL;
+	tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+
+	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+	if (likely(proto && proto->nfattr_to_tuple))
+		ret = proto->nfattr_to_tuple(tb, tuple);
+	if (proto)	/* drop the ref even when there is no parser */
+		ip_conntrack_proto_put(proto);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+/* Build @tuple from the nested attribute cda[type-1]: zero it, require both
+ * the IP and the protocol part, then set the direction from @type. */
+static inline int
+ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
+		      enum ctattr_tuple type)
+{
+	struct nfattr *tb[CTA_TUPLE_MAX];
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	memset(tuple, 0, sizeof(*tuple));
+
+	if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
+		goto nfattr_failure;
+
+	if (!tb[CTA_TUPLE_IP-1])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_TUPLE_PROTO-1])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
+	if (err < 0)
+		return err;
+
+	/* orig and expect tuples get DIR_ORIGINAL */
+	if (type == CTA_TUPLE_REPLY)
+		tuple->dst.dir = IP_CT_DIR_REPLY;
+	else
+		tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+	DUMP_TUPLE(tuple);
+	DEBUGP("leaving\n");
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
+	[CTA_PROTONAT_PORT_MIN-1]	= sizeof(u_int16_t),
+	[CTA_PROTONAT_PORT_MAX-1]	= sizeof(u_int16_t),
+};
+
+/* Let the NAT protocol module of @ct's original tuple parse the nested
+ * per-protocol range; a protocol without such a parser is not an error. */
+static int ctnetlink_parse_nat_proto(struct nfattr *attr,
+				     const struct ip_conntrack *ct,
+				     struct ip_nat_range *range)
+{
+	struct nfattr *tb[CTA_PROTONAT_MAX];
+	struct ip_nat_protocol *npt;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
+		goto nfattr_failure;
+	if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
+		goto nfattr_failure;
+
+	npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+	if (!npt)
+		return 0;
+
+	if (!npt->nfattr_to_range) {
+		ip_nat_proto_put(npt);
+		return 0;
+	}
+
+	/* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
+	if (npt->nfattr_to_range(tb, range) > 0)	/* a negative result is ignored */
+		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+
+	ip_nat_proto_put(npt);
+	DEBUGP("leaving\n");
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+/* Fill @range from the nested CTA_NAT attribute: optional min/max IP (max
+ * defaults to min) and an optional per-protocol part.
+ * NOTE(review): the IP payload sizes are not validated before being read. */
+static inline int
+ctnetlink_parse_nat(struct nfattr *cda[],
+		    const struct ip_conntrack *ct, struct ip_nat_range *range)
+{
+	struct nfattr *tb[CTA_NAT_MAX];
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	memset(range, 0, sizeof(*range));
+	if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
+		goto nfattr_failure;
+
+	if (tb[CTA_NAT_MINIP-1])
+		range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
+
+	if (!tb[CTA_NAT_MAXIP-1])
+		range->max_ip = range->min_ip;
+	else
+		range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
+
+	if (range->min_ip)
+		range->flags |= IP_NAT_RANGE_MAP_IPS;
+	if (!tb[CTA_NAT_PROTO-1])
+		return 0;
+
+	err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
+	if (err < 0)
+		return err;
+	DEBUGP("leaving\n");
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#endif
+
+/* Point *helper_name at the CTA_HELP_NAME payload inside the message; the
+ * name is not copied.  NOTE(review): NUL termination is not checked here. */
+static inline int
+ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
+{
+	struct nfattr *tb[CTA_HELP_MAX];
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	if (!tb[CTA_HELP_NAME-1])
+		return -EINVAL;
+
+	*helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
+	return 0;
+nfattr_failure:
+	return -1;
+}
+
+/* CT_DELETE handler: expire the conntrack matching the given tuple (and
+ * CTA_ID, if supplied); without any tuple, flush the whole table. */
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack *ct;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+	else {
+		/* Flush the whole table */
+		ip_conntrack_flush();
+		return 0;
+	}
+
+	if (err < 0)
+		return err;
+
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash\n");
+		return -ENOENT;
+	}
+
+	ct = tuplehash_to_ctrack(h);
+
+	if (cda[CTA_ID-1]) {
+		u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
+		if (ct->id != id) {
+			ip_conntrack_put(ct);
+			return -ENOENT;
+		}
+	}
+	/* run the expiry handler now, while our own reference still pins ct */
+	if (del_timer(&ct->timeout))
+		ct->timeout.function((unsigned long)ct);
+	ip_conntrack_put(ct);
+	DEBUGP("leaving\n");
+
+	return 0;
+}
+
+/* Handle CT_GET: either start a table dump or send back one conntrack. */
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack *ct;
+	struct sk_buff *skb2;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct nfgenmsg *msg = NLMSG_DATA(nlh);
+		u32 rlen;
+
+		if (msg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
+					IPCTNL_MSG_CT_GET_CTRZERO) {
+#ifdef CONFIG_IP_NF_CT_ACCT
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+						ctnetlink_dump_table_w,
+						ctnetlink_done)) != 0)
+				return -EINVAL;
+#else
+			return -ENOTSUPP;
+#endif
+		} else {
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+						ctnetlink_dump_table,
+						ctnetlink_done)) != 0)
+				return -EINVAL;
+		}
+
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);	/* consume this request from the skb */
+		return 0;
+	}
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash");
+		return -ENOENT;
+	}
+	DEBUGP("tuple found\n");
+	ct = tuplehash_to_ctrack(h);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb2) {
+		ip_conntrack_put(ct);
+		return -ENOMEM;
+	}
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 
+				  IPCTNL_MSG_CT_NEW, 1, ct);
+	ip_conntrack_put(ct);
+	if (err <= 0)
+		goto free;
+
+	/* netlink_unicast() consumes skb2, on failure too: never free it here */
+	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		goto out;
+	DEBUGP("leaving\n");
+	return 0;
+
+free:
+	kfree_skb(skb2);
+out:
+	return -1;
+}
+
+/*
+ * Apply the status bits carried in CTA_STATUS to @ct.
+ *
+ * Bits that differ from the current status are validated first:
+ * EXPECTED/CONFIRMED/DYING may not differ at all, and SEEN_REPLY/ASSURED
+ * may only be turned on.  If CTA_NAT is present, NAT is set up through
+ * ip_nat_setup_info() (only when CONFIG_IP_NF_NAT_NEEDED is defined).
+ * Finally the requested bits, minus all NAT bits, are OR-ed into
+ * ct->status.
+ *
+ * Returns 0 on success, -EINVAL on an invalid request, -EEXIST if NAT was
+ * already initialized for the selected hook.
+ *
+ * NOTE(review): cda[CTA_STATUS-1] is dereferenced unconditionally, so the
+ * caller has to make sure the attribute is present.
+ * NOTE(review): the attribute is read without ntohl(), unlike CTA_TIMEOUT
+ * elsewhere in this file - confirm the intended byte order.
+ */
+static inline int
+ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]);
+	/* d = set of bits the request would flip */
+	d = ct->status ^ status;
+
+	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+		/* unchangeable */
+		return -EINVAL;
+	
+	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+		/* SEEN_REPLY bit can only be set */
+		return -EINVAL;
+
+	
+	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+		/* ASSURED bit can only be set */
+		return -EINVAL;
+
+	if (cda[CTA_NAT-1]) {
+#ifndef CONFIG_IP_NF_NAT_NEEDED
+		return -EINVAL;
+#else
+		unsigned int hooknum;
+		struct ip_nat_range range;
+
+		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+			return -EINVAL;
+
+		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
+		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+		       htons(range.min.all), htons(range.max.all));
+		
+		/* This is tricky but it works. ip_nat_setup_info needs the
+		 * hook number as parameter, so let's do the correct 
+		 * conversion and run away */
+		if (status & IPS_SRC_NAT_DONE)
+			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+		else if (status & IPS_DST_NAT_DONE)
+			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
+		else 
+			return -EINVAL; /* Missing NAT flags */
+
+		DEBUGP("NAT status: %lu\n", 
+		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		
+		/* refuse to configure NAT twice for the same hook */
+		if (ip_nat_initialized(ct, hooknum))
+			return -EEXIST;
+		ip_nat_setup_info(ct, &range, hooknum);
+
+                DEBUGP("NAT status after setup_info: %lu\n",
+                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+#endif
+	}
+
+	/* Be careful here, modifying NAT bits can screw up things,
+	 * so don't let users modify them directly if they don't pass
+	 * ip_nat_range. */
+	ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+	return 0;
+}
+
+
+/*
+ * Replace the helper attached to @ct with the one named in CTA_HELP.
+ * An empty name detaches the current helper; an unknown non-empty name,
+ * or a conntrack that has a master, is rejected with -EINVAL.
+ */
+static inline int
+ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	struct ip_conntrack_helper *new_helper;
+	char *name;
+	int ret;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	/* sibling connections keep the helper they already have */
+	if (ct->master)
+		return -EINVAL;
+
+	ret = ctnetlink_parse_help(cda[CTA_HELP-1], &name);
+	if (ret < 0)
+		return ret;
+
+	new_helper = __ip_conntrack_helper_find_byname(name);
+	/* only the empty name may resolve to "no helper" */
+	if (!new_helper && strcmp(name, "") != 0)
+		return -EINVAL;
+
+	if (ct->helper) {
+		if (new_helper) {
+			/* wipe the private data left by the old helper */
+			memset(&ct->help, 0, sizeof(ct->help));
+		} else {
+			/* helper goes away: drop its expectations first */
+			ip_ct_remove_expectations(ct);
+			ct->helper = NULL;
+		}
+	}
+
+	ct->helper = new_helper;
+	return 0;
+}
+
+/*
+ * Re-arm the timeout of @ct to the number of seconds carried (in network
+ * byte order) by CTA_TIMEOUT.  Returns -ETIME when del_timer() reports
+ * that there was no timer to delete.
+ */
+static inline int
+ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	u_int32_t *attr = (u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]);
+	u_int32_t secs = ntohl(*attr);
+
+	if (del_timer(&ct->timeout)) {
+		ct->timeout.expires = jiffies + secs * HZ;
+		add_timer(&ct->timeout);
+		return 0;
+	}
+
+	return -ETIME;
+}
+
+/*
+ * Update an existing conntrack from the attributes present in @cda.
+ * Helper, timeout and status are applied in that order; processing stops
+ * at the first step that reports an error, and that error is returned.
+ */
+static int
+ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	int ret = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_HELP-1])
+		ret = ctnetlink_change_helper(ct, cda);
+
+	if (ret >= 0 && cda[CTA_TIMEOUT-1])
+		ret = ctnetlink_change_timeout(ct, cda);
+
+	if (ret >= 0 && cda[CTA_STATUS-1])
+		ret = ctnetlink_change_status(ct, cda);
+
+	if (ret < 0)
+		return ret;
+
+	DEBUGP("all done\n");
+	return 0;
+}
+
+/*
+ * Create and insert a new conntrack for @otuple/@rtuple.
+ *
+ * CTA_TIMEOUT is mandatory (seconds, network byte order).  CTA_STATUS is
+ * optional and, when present, is applied via ctnetlink_change_status().
+ * The new entry is always marked IPS_CONFIRMED.
+ *
+ * Returns 0 on success, -ENOMEM if allocation fails, -EINVAL if the
+ * timeout is missing, or the error from ctnetlink_change_status().
+ */
+static int
+ctnetlink_create_conntrack(struct nfattr *cda[],
+			   struct ip_conntrack_tuple *otuple,
+			   struct ip_conntrack_tuple *rtuple)
+{
+	struct ip_conntrack *ct;
+	int err = -EINVAL;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	ct = ip_conntrack_alloc(otuple, rtuple);
+	if (ct == NULL || IS_ERR(ct))
+		return -ENOMEM;
+
+	if (!cda[CTA_TIMEOUT-1])
+		goto err;
+	ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+	ct->status |= IPS_CONFIRMED;
+
+	/* ctnetlink_change_status() dereferences the status attribute
+	 * unconditionally, so only call it when the attribute was sent */
+	if (cda[CTA_STATUS-1]) {
+		err = ctnetlink_change_status(ct, cda);
+		if (err < 0)
+			goto err;
+	}
+
+	ct->helper = ip_conntrack_helper_find_get(rtuple);
+
+	add_timer(&ct->timeout);
+	ip_conntrack_hash_insert(ct);
+
+	/* drop the reference obtained by ip_conntrack_helper_find_get() */
+	if (ct->helper)
+		ip_conntrack_helper_put(ct->helper);
+
+	DEBUGP("conntrack with id %u inserted\n", ct->id);
+	return 0;
+
+err:
+	ip_conntrack_free(ct);
+	return err;
+}
+
+/*
+ * Handle IPCTNL_MSG_CT_NEW: create a conntrack or update an existing one.
+ *
+ * The entry is looked up by CTA_TUPLE_ORIG if present, else by
+ * CTA_TUPLE_REPLY.  If nothing is found the entry is created when
+ * NLM_F_CREATE is set (both tuples are then required), otherwise -ENOENT
+ * is returned.  If an entry is found it is updated unless NLM_F_EXCL is
+ * set (-EEXIST); CTA_NAT is rejected for existing entries.
+ */
+static int
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple otuple, rtuple;
+	struct ip_conntrack_tuple_hash *h = NULL;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_TUPLE_ORIG-1]) {
+		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_TUPLE_REPLY-1]) {
+		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
+		if (err < 0)
+			return err;
+	}
+
+	write_lock_bh(&ip_conntrack_lock);
+	if (cda[CTA_TUPLE_ORIG-1])
+		h = __ip_conntrack_find(&otuple, NULL);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		h = __ip_conntrack_find(&rtuple, NULL);
+
+	if (h == NULL) {
+		write_unlock_bh(&ip_conntrack_lock);
+		DEBUGP("no such conntrack, create new\n");
+		if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+			return -ENOENT;
+		/* otuple/rtuple are only initialised when their attribute
+		 * was supplied; creation needs both of them */
+		if (!cda[CTA_TUPLE_ORIG-1] || !cda[CTA_TUPLE_REPLY-1])
+			return -EINVAL;
+		return ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+	}
+	/* implicit 'else' */
+
+	/* we only allow nat config for new conntracks */
+	if (cda[CTA_NAT-1]) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* We manipulate the conntrack inside the global conntrack table lock,
+	 * so there's no need to increase the refcount */
+	DEBUGP("conntrack found\n");
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
+
+out_unlock:
+	write_unlock_bh(&ip_conntrack_lock);
+	return err;
+}
+
+/*********************************************************************** 
+ * EXPECT 
+ ***********************************************************************/ 
+
+/*
+ * Emit @tuple into @skb as a nested attribute of expectation attribute
+ * @type.  Returns 0 on success, -1 on failure.
+ *
+ * NOTE(review): NFA_NEST presumably jumps to nfattr_failure when @skb has
+ * no room left - confirm against the macro definition.
+ */
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+			 const struct ip_conntrack_tuple *tuple,
+			 enum ctattr_expect type)
+{
+	struct nfattr *nest_parms = NFA_NEST(skb, type);
+	
+	if (ctnetlink_dump_tuples(skb, tuple) < 0)
+		goto nfattr_failure;
+
+	NFA_NEST_END(skb, nest_parms);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}			
+
+/*
+ * Dump one expectation into @skb: its tuple, its mask, the original
+ * direction tuple of its master conntrack, the remaining timeout in
+ * seconds and its id (both as 32-bit values in network byte order).
+ * Returns 0 on success, -1 on failure.
+ */
+static inline int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+                          const struct ip_conntrack_expect *exp)
+{
+	struct ip_conntrack *master = exp->master;
+	/* time left until the expectation expires, converted to seconds */
+	u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
+	u_int32_t id = htonl(exp->id);
+
+	if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+		goto nfattr_failure;
+	if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
+		goto nfattr_failure;
+	if (ctnetlink_exp_dump_tuple(skb,
+				 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				 CTA_EXPECT_MASTER) < 0)
+		goto nfattr_failure;
+	
+	NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
+	NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
+
+	return 0;
+	
+nfattr_failure:
+	return -1;
+}
+
+/*
+ * Build one complete expectation message in @skb.
+ *
+ * @event is tagged with the NFNL_SUBSYS_CTNETLINK_EXP subsystem id, and
+ * NLM_F_MULTI is set when both @nowait and @pid are non-zero.
+ *
+ * Returns skb->len on success.  On failure the skb is trimmed back to
+ * where it was on entry and -1 is returned.
+ */
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, 
+		    int nowait, 
+		    const struct ip_conntrack_expect *exp)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned char *b;
+
+	/* remember the start of this message for length/rollback */
+	b = skb->tail;
+
+	event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+	nlh    = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+	nfmsg  = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = (nowait && pid) ? NLM_F_MULTI : 0;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version	    = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+	/* drop whatever was partially written */
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+/*
+ * Notifier callback for expectation events.
+ *
+ * Only IPEXP_NEW is handled: a message describing the new expectation is
+ * built and sent to the NFNLGRP_CONNTRACK_EXP_NEW group.  The message
+ * type carries the NFNL_SUBSYS_CTNETLINK_EXP subsystem id, matching what
+ * ctnetlink_exp_fill_info() emits for the same kind of message.
+ * Always returns NOTIFY_DONE.
+ */
+static int ctnetlink_expect_event(struct notifier_block *this,
+				  unsigned long events, void *ptr)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned char *b;
+	int flags = 0;
+
+	if (events & IPEXP_NEW) {
+		type = IPCTNL_MSG_EXP_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+	} else
+		return NOTIFY_DONE;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		return NOTIFY_DONE;
+
+	b = skb->tail;
+
+	/* expectation messages belong to the expectation subsystem */
+	type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+	nlh   = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = flags;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version	    = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
+	return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+	kfree_skb(skb);
+	return NOTIFY_DONE;
+}
+#endif
+
+/*
+ * Netlink dump callback for the expectation list.
+ *
+ * The id of the last expectation dumped is kept in cb->args[0] so that a
+ * later call resumes after it: entries whose id is not greater than the
+ * stored one are skipped.  The walk stops as soon as an entry cannot be
+ * added to @skb.  Returns skb->len.
+ */
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ip_conntrack_expect *exp = NULL;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[0];
+
+	/* *id is 32 bits wide, so it must be printed with %u */
+	DEBUGP("entered %s, last id=%u\n", __FUNCTION__, *id);
+
+	read_lock_bh(&ip_conntrack_lock);
+	list_for_each_prev(i, &ip_conntrack_expect_list) {
+		exp = (struct ip_conntrack_expect *) i;
+		if (exp->id <= *id)
+			continue;
+		if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq,
+					    IPCTNL_MSG_EXP_NEW,
+					    1, exp) < 0)
+			goto out;
+		*id = exp->id;
+	}
+out:
+	read_unlock_bh(&ip_conntrack_lock);
+
+	DEBUGP("leaving, last id=%u\n", *id);
+
+	return skb->len;
+}
+
+/*
+ * Handle IPCTNL_MSG_EXP_GET.
+ *
+ * With NLM_F_DUMP set, a dump of the expectation list is started.
+ * Otherwise the expectation matching the tuple in CTA_EXPECT_MASTER is
+ * looked up and unicast back to the requesting pid.
+ *
+ * Returns the result of netlink_unicast() for a single lookup, 0 for a
+ * started dump, or a negative value on failure.
+ */
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_expect *exp;
+	struct sk_buff *skb2;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct nfgenmsg *msg = NLMSG_DATA(nlh);
+		u32 rlen;
+
+		if (msg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+						ctnetlink_exp_dump_table,
+						ctnetlink_done)) != 0)
+			return -EINVAL;
+		/* consume the request, but never more than the skb holds */
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return 0;
+	}
+
+	if (cda[CTA_EXPECT_MASTER-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	exp = ip_conntrack_expect_find_get(&tuple);
+	if (!exp)
+		return -ENOENT;
+
+	err = -ENOMEM;
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		goto out;
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
+				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+				      1, exp);
+	if (err <= 0)
+		goto free;
+
+	ip_conntrack_expect_put(exp);
+
+	/* netlink_unicast() consumes skb2 on success and on failure alike,
+	 * so skb2 must not be freed here once it has been handed over */
+	return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+
+free:
+	/* reply was never handed to netlink: we still own skb2 */
+	kfree_skb(skb2);
+out:
+	ip_conntrack_expect_put(exp);
+	return err;
+}
+
+/*
+ * Handle IPCTNL_MSG_EXP_DELETE.
+ *
+ * Three modes, selected by the attributes present:
+ *  - CTA_EXPECT_TUPLE: delete the single matching expectation (and, if
+ *    CTA_EXPECT_ID is given, only when the id matches too);
+ *  - CTA_EXPECT_HELP_NAME: delete every expectation whose master uses the
+ *    named helper;
+ *  - neither: flush all expectations.
+ *
+ * Returns 0 on success, -ENOENT if the single expectation is not found,
+ * -EINVAL if the named helper does not exist, or a tuple parsing error.
+ */
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_expect *exp, *tmp;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_helper *h;
+	int err;
+
+	if (cda[CTA_EXPECT_TUPLE-1]) {
+		/* delete a single expect by tuple */
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+		if (err < 0)
+			return err;
+
+		/* bump usage count to 2 */
+		exp = ip_conntrack_expect_find_get(&tuple);
+		if (!exp)
+			return -ENOENT;
+
+		if (cda[CTA_EXPECT_ID-1]) {
+			u_int32_t id =
+				*(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+			if (exp->id != ntohl(id)) {
+				ip_conntrack_expect_put(exp);
+				return -ENOENT;
+			}
+		}
+
+		/* after list removal, usage count == 1 */
+		ip_conntrack_unexpect_related(exp);
+		/* have to put what we 'get' above.
+		 * after this line usage count == 0 */
+		ip_conntrack_expect_put(exp);
+	} else if (cda[CTA_EXPECT_HELP_NAME-1]) {
+		char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
+
+		/* delete all expectations for this helper */
+		write_lock_bh(&ip_conntrack_lock);
+		h = __ip_conntrack_helper_find_byname(name);
+		if (!h) {
+			write_unlock_bh(&ip_conntrack_lock);
+			return -EINVAL;
+		}
+		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+					 list) {
+			if (exp->master->helper == h
+			    && del_timer(&exp->timeout))
+				__ip_ct_expect_unlink_destroy(exp);
+		}
+		/* must pair with write_lock_bh() above so that bottom
+		 * halves are re-enabled */
+		write_unlock_bh(&ip_conntrack_lock);
+	} else {
+		/* This basically means we have to flush everything*/
+		write_lock_bh(&ip_conntrack_lock);
+		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+					 list) {
+			if (del_timer(&exp->timeout))
+				__ip_ct_expect_unlink_destroy(exp);
+		}
+		write_unlock_bh(&ip_conntrack_lock);
+	}
+
+	return 0;
+}
+/* Changing an existing expectation is not implemented: always fails
+ * with -EOPNOTSUPP. */
+static int
+ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Create a new expectation from CTA_EXPECT_TUPLE, CTA_EXPECT_MASK and
+ * CTA_EXPECT_MASTER (the caller guarantees all three are present).
+ *
+ * The master conntrack is looked up by its tuple and must have a helper.
+ * Returns -ENOENT if the master is not found, -EINVAL if it has no
+ * helper, -ENOMEM if the expectation cannot be allocated, otherwise the
+ * result of ip_conntrack_expect_related().
+ */
+static int
+ctnetlink_create_expect(struct nfattr *cda[])
+{
+	struct ip_conntrack_tuple exp_tuple, exp_mask, master_tuple;
+	struct ip_conntrack_tuple_hash *master_hash;
+	struct ip_conntrack_expect *new_exp;
+	struct ip_conntrack *master_ct;
+	int ret;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	/* parse the three tuples in order, stopping at the first failure */
+	ret = ctnetlink_parse_tuple(cda, &exp_tuple, CTA_EXPECT_TUPLE);
+	if (ret >= 0)
+		ret = ctnetlink_parse_tuple(cda, &exp_mask, CTA_EXPECT_MASK);
+	if (ret >= 0)
+		ret = ctnetlink_parse_tuple(cda, &master_tuple,
+					    CTA_EXPECT_MASTER);
+	if (ret < 0)
+		return ret;
+
+	/* find the conntrack this expectation will hang off */
+	master_hash = ip_conntrack_find_get(&master_tuple, NULL);
+	if (!master_hash)
+		return -ENOENT;
+	master_ct = tuplehash_to_ctrack(master_hash);
+
+	/* a master without helper cannot own expectations */
+	ret = -EINVAL;
+	if (!master_ct->helper)
+		goto put_master;
+
+	ret = -ENOMEM;
+	new_exp = ip_conntrack_expect_alloc(master_ct);
+	if (!new_exp)
+		goto put_master;
+
+	new_exp->expectfn = NULL;
+	new_exp->master = master_ct;
+	memcpy(&new_exp->tuple, &exp_tuple, sizeof(struct ip_conntrack_tuple));
+	memcpy(&new_exp->mask, &exp_mask, sizeof(struct ip_conntrack_tuple));
+
+	ret = ip_conntrack_expect_related(new_exp);
+	ip_conntrack_expect_put(new_exp);
+
+put_master:
+	ip_conntrack_put(master_ct);
+	return ret;
+}
+
+/*
+ * Handle IPCTNL_MSG_EXP_NEW.
+ *
+ * Requires CTA_EXPECT_TUPLE, CTA_EXPECT_MASK and CTA_EXPECT_MASTER.  If
+ * no expectation matches the tuple, one is created when NLM_F_CREATE is
+ * set (-ENOENT otherwise).  If one exists, it is changed unless
+ * NLM_F_EXCL is set (-EEXIST).
+ */
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple exp_tuple;
+	struct ip_conntrack_expect *found;
+	int ret;
+
+	DEBUGP("entered %s\n", __FUNCTION__);	
+
+	if (!(cda[CTA_EXPECT_TUPLE-1] && cda[CTA_EXPECT_MASK-1]
+	      && cda[CTA_EXPECT_MASTER-1]))
+		return -EINVAL;
+
+	ret = ctnetlink_parse_tuple(cda, &exp_tuple, CTA_EXPECT_TUPLE);
+	if (ret < 0)
+		return ret;
+
+	write_lock_bh(&ip_conntrack_lock);
+	found = __ip_conntrack_expect_find(&exp_tuple);
+
+	if (found) {
+		/* existing entry: change it unless the caller insisted on
+		 * exclusive creation */
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			ret = -EEXIST;
+		else
+			ret = ctnetlink_change_expect(found, cda);
+		write_unlock_bh(&ip_conntrack_lock);
+
+		DEBUGP("leaving\n");
+
+		return ret;
+	}
+
+	/* nothing found: creation happens outside the table lock */
+	write_unlock_bh(&ip_conntrack_lock);
+	if (nlh->nlmsg_flags & NLM_F_CREATE)
+		return ctnetlink_create_expect(cda);
+	return -ENOENT;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+/* notifier for conntrack events, registered in ctnetlink_init() */
+static struct notifier_block ctnl_notifier = {
+	.notifier_call	= ctnetlink_conntrack_event,
+};
+
+/* notifier for expectation events, registered in ctnetlink_init() */
+static struct notifier_block ctnl_notifier_exp = {
+	.notifier_call	= ctnetlink_expect_event,
+};
+#endif
+
+/* Message handlers of the conntrack subsystem, indexed by message type.
+ * Every handler requires CAP_NET_ADMIN; GET and GET_CTRZERO share one
+ * handler. */
+static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+	[IPCTNL_MSG_CT_NEW]		= { .call = ctnetlink_new_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_CT_GET] 		= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_CT_DELETE]  	= { .call = ctnetlink_del_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_CT_GET_CTRZERO] 	= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+};
+
+/* Message handlers of the expectation subsystem, indexed by message
+ * type.  Every handler requires CAP_NET_ADMIN. */
+static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+	[IPCTNL_MSG_EXP_GET]		= { .call = ctnetlink_get_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_EXP_NEW]		= { .call = ctnetlink_new_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+};
+
+/* nfnetlink subsystem descriptor for conntrack messages (ctnl_cb) */
+static struct nfnetlink_subsystem ctnl_subsys = {
+	.name				= "conntrack",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK,
+	.cb_count			= IPCTNL_MSG_MAX,
+	.cb				= ctnl_cb,
+};
+
+/* nfnetlink subsystem descriptor for expectation messages (ctnl_exp_cb) */
+static struct nfnetlink_subsystem ctnl_exp_subsys = {
+	.name				= "conntrack_expect",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK_EXP,
+	.cb_count			= IPCTNL_MSG_EXP_MAX,
+	.cb				= ctnl_exp_cb,
+};
+
+/*
+ * Module init: register both nfnetlink subsystems and, when
+ * CONFIG_IP_NF_CONNTRACK_EVENTS is defined, the conntrack and
+ * expectation notifiers.  On failure everything registered so far is
+ * unregistered in reverse order and the error is returned.
+ */
+static int __init ctnetlink_init(void)
+{
+	int ret;
+
+	printk("ctnetlink v%s: registering with nfnetlink.\n", version);
+	ret = nfnetlink_subsys_register(&ctnl_subsys);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register with nfnetlink.\n");
+		goto err_out;
+	}
+
+	ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
+		goto err_unreg_subsys;
+	}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+	ret = ip_conntrack_register_notifier(&ctnl_notifier);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register notifier.\n");
+		goto err_unreg_exp_subsys;
+	}
+
+	ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot expect register notifier.\n");
+		goto err_unreg_notifier;
+	}
+#endif
+
+	return 0;
+
+	/* unwind ladder: each label undoes the step before the failure */
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+	ip_conntrack_unregister_notifier(&ctnl_notifier);
+err_unreg_exp_subsys:
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+#endif
+err_unreg_subsys:
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+	return ret;
+}
+
+/*
+ * Module exit: undo ctnetlink_init() in reverse order - notifiers first
+ * (when CONFIG_IP_NF_CONNTRACK_EVENTS is defined), then both nfnetlink
+ * subsystems.
+ */
+static void __exit ctnetlink_exit(void)
+{
+	printk("ctnetlink: unregistering from nfnetlink.\n");
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+	/* ctnl_notifier_exp was registered on the expectation chain, so it
+	 * has to be removed from that chain, not the conntrack one */
+	ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp);
+	ip_conntrack_unregister_notifier(&ctnl_notifier);
+#endif
+
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+	return;
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db325..838d1d69b36 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct,
 			ct->timeout.function((unsigned long)ct);
 	} else {
 		atomic_inc(&ct->proto.icmp.count);
+		ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
 		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
 	}
 
 	return NF_ACCEPT;
 }
 
+/* Lookup table indexed by ICMP type: a non-zero entry marks a type that
+ * is accepted by the valid_new[] checks in this file (creating a new
+ * ICMP conntrack, dumping a tuple to netlink). */
+static u_int8_t valid_new[] = { 
+	[ICMP_ECHO] = 1,
+	[ICMP_TIMESTAMP] = 1,
+	[ICMP_INFO_REQUEST] = 1,
+	[ICMP_ADDRESS] = 1 
+};
+
 /* Called when a new connection for this protocol found. */
 static int icmp_new(struct ip_conntrack *conntrack,
 		    const struct sk_buff *skb)
 {
-	static u_int8_t valid_new[]
-		= { [ICMP_ECHO] = 1,
-		    [ICMP_TIMESTAMP] = 1,
-		    [ICMP_INFO_REQUEST] = 1,
-		    [ICMP_ADDRESS] = 1 };
-
 	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
 	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
 		/* Can't create a new ICMP `conn' with this. */
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb,
 		return NF_ACCEPT;
 	}
 
-	innerproto = ip_ct_find_proto(inside->ip.protocol);
+	innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
 	dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
 	/* Are they talking about one of our connections? */
 	if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
 		DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+		ip_conntrack_proto_put(innerproto);
 		return NF_ACCEPT;
 	}
 
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb,
 	   been preserved inside the ICMP. */
 	if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
 		DEBUGP("icmp_error_track: Can't invert tuple\n");
+		ip_conntrack_proto_put(innerproto);
 		return NF_ACCEPT;
 	}
+	ip_conntrack_proto_put(innerproto);
 
 	*ctinfo = IP_CT_RELATED;
 
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
 	if (icmph == NULL) {
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 		if (!(u16)csum_fold(skb->csum)) 
 			break;
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: bad HW ICMP checksum ");
 		return -NF_ACCEPT;
 	case CHECKSUM_NONE:
 		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
 			if (LOG_INVALID(IPPROTO_ICMP))
-				nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 					      "ip_ct_icmp: bad ICMP checksum ");
 			return -NF_ACCEPT;
 		}
@@ -249,7 +254,7 @@ checksum_skipped:
 	 */
 	if (icmph->type > NR_ICMP_TYPES) {
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: invalid ICMP type ");
 		return -NF_ACCEPT;
 	}
@@ -265,6 +270,47 @@ checksum_skipped:
 	return icmp_error_message(skb, ctinfo, hooknum);
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/*
+ * Add the ICMP part of tuple @t (id, type, code) to @skb as netlink
+ * attributes.  Returns 0 on success, -EINVAL if the ICMP type is not
+ * marked in valid_new[], -1 on attribute failure.
+ *
+ * NOTE(review): the type check runs after the attributes have already
+ * been added to @skb - confirm this ordering is intended.
+ */
+static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+				const struct ip_conntrack_tuple *t)
+{
+	NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+		&t->src.u.icmp.id);
+	NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+		&t->dst.u.icmp.type);
+	NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+		&t->dst.u.icmp.code);
+
+	if (t->dst.u.icmp.type >= sizeof(valid_new) 
+	    || !valid_new[t->dst.u.icmp.type])
+		return -EINVAL;
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+/*
+ * Fill the ICMP part of @tuple (type, code, id) from netlink attributes.
+ * All three attributes are required; returns -1 if any is missing,
+ * 0 otherwise.
+ */
+static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+				struct ip_conntrack_tuple *tuple)
+{
+	if (!tb[CTA_PROTO_ICMP_TYPE-1]
+	    || !tb[CTA_PROTO_ICMP_CODE-1]
+	    || !tb[CTA_PROTO_ICMP_ID-1])
+		return -1;
+
+	tuple->dst.u.icmp.type =
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+	tuple->dst.u.icmp.code =
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+	/* the id is emitted as 16 bits by icmp_tuple_to_nfattr(), so it
+	 * has to be read back as 16 bits as well */
+	tuple->src.u.icmp.id =
+			*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+
+	return 0;
+}
+#endif
+
 struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
 {
 	.proto 			= IPPROTO_ICMP,
@@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
 	.packet			= icmp_packet,
 	.new			= icmp_new,
 	.error			= icmp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr	= icmp_tuple_to_nfattr,
+	.nfattr_to_tuple	= icmp_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 31d75390bf1..a875f35e576 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 		}
 
 		conntrack->proto.sctp.state = newconntrack;
+		if (oldsctpstate != newconntrack)
+			ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
 		write_unlock_bh(&sctp_lock);
 	}
 
@@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
 	.packet 	 = sctp_packet, 
 	.new 		 = sctp_new, 
 	.destroy 	 = NULL, 
-	.me 		 = THIS_MODULE 
+	.me 		 = THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
 };
 
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 809dfed766d..f23ef1f88c4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s,
 	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/*
+ * Add the TCP conntrack state of @ct to @skb as a
+ * CTA_PROTOINFO_TCP_STATE attribute, reading it under tcp_lock.
+ * Returns 0 on success, -1 if the attribute could not be added.
+ */
+static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
+			 const struct ip_conntrack *ct)
+{
+	read_lock_bh(&tcp_lock);
+	NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
+		&ct->proto.tcp.state);
+	read_unlock_bh(&tcp_lock);
+
+	return 0;
+
+nfattr_failure:
+	/* NFA_PUT jumped here with tcp_lock still held: release it */
+	read_unlock_bh(&tcp_lock);
+	return -1;
+}
+#endif
+
 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
 {
 	if (tcph->rst) return TCP_RST_SET;
@@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 		res = 1;
 	} else {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 			"ip_ct_tcp: %s ",
 			before(seq, sender->td_maxend + 1) ?
 			after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb,
 				sizeof(_tcph), &_tcph);
 	if (th == NULL) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: short packet ");
 		return -NF_ACCEPT;
   	}
@@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb,
 	/* Not whole TCP header or malformed packet */
 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb,
 			         skb->ip_summed == CHECKSUM_HW ? skb->csum
 			      	 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: bad TCP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb,
 	tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
 	if (!tcp_valid_flags[tcpflags]) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: invalid TCP flag combination ");
 		return -NF_ACCEPT;
 	}
@@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			 */
 		    	write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
-				nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
-					  "ip_ct_tcp: killing out of sync session ");
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+					      NULL, "ip_ct_tcp: "
+					      "killing out of sync session ");
 		    	if (del_timer(&conntrack->timeout))
 		    		conntrack->timeout.function((unsigned long)
 		    					    conntrack);
@@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		
 		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: invalid packet ignored ");
 		return NF_ACCEPT;
 	case TCP_CONNTRACK_MAX:
@@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		       old_state);
 		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: invalid state ");
 		return -NF_ACCEPT;
 	case TCP_CONNTRACK_SYN_SENT:
@@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
-				              "ip_ct_tcp: invalid SYN");
+					      NULL, "ip_ct_tcp: invalid SYN");
 			return -NF_ACCEPT;
 		}
 	case TCP_CONNTRACK_CLOSE:
@@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		  ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
 	write_unlock_bh(&tcp_lock);
 
+	ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+	if (new_state != old_state)
+		ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
+
 	if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
 		/* If only reply is a RST, we can consider ourselves not to
 		   have an established connection: this is a fairly common
@@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
 	.packet 		= tcp_packet,
 	.new 			= tcp_new,
 	.error			= tcp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.to_nfattr		= tcp_to_nfattr,
+	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 8c1eaba098d..f2dcac7c766 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack,
 		ip_ct_refresh_acct(conntrack, ctinfo, skb, 
 				   ip_ct_udp_timeout_stream);
 		/* Also, more likely to be important, and not a probe */
-		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+			ip_conntrack_event_cache(IPCT_STATUS, skb);
 	} else
 		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
 
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_udp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	/* Truncated/malformed packets */
 	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_udp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 			         skb->ip_summed == CHECKSUM_HW ? skb->csum
 			      	 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_udp: bad UDP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp =
 	.packet			= udp_packet,
 	.new			= udp_new,
 	.error			= udp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 61798c46e91..ee5895afd0c 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -5,7 +5,7 @@
 */
 
 /* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (DIRECTION(hash))
 		return 0;
 
-	proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
-			       .tuple.dst.protonum);
+	proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
 	IP_NF_ASSERT(proto);
 
 	if (seq_printf(s, "%-8s %u %ld ",
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 			return -ENOSPC;
 
 #if defined(CONFIG_IP_NF_CONNTRACK_MARK)
-	if (seq_printf(s, "mark=%lu ", conntrack->mark))
+	if (seq_printf(s, "mark=%u ", conntrack->mark))
 		return -ENOSPC;
 #endif
 
@@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
 	seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
 
 	print_tuple(s, &expect->tuple,
-		    ip_ct_find_proto(expect->tuple.dst.protonum));
+		    __ip_conntrack_proto_find(expect->tuple.dst.protonum));
 	return seq_putc(s, '\n');
 }
 
@@ -889,6 +888,7 @@ static int init_or_cleanup(int init)
 	return ret;
 
  cleanup:
+	synchronize_net();
 #ifdef CONFIG_SYSCTL
  	unregister_sysctl_table(ip_ct_sysctl_header);
  cleanup_localinops:
@@ -971,6 +971,14 @@ void need_ip_conntrack(void)
 {
 }
 
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(ip_conntrack_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
+EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
+EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+#endif
 EXPORT_SYMBOL(ip_conntrack_protocol_register);
 EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
 EXPORT_SYMBOL(ip_ct_get_tuple);
@@ -982,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register);
 EXPORT_SYMBOL(ip_conntrack_helper_unregister);
 EXPORT_SYMBOL(ip_ct_iterate_cleanup);
 EXPORT_SYMBOL(ip_ct_refresh_acct);
-EXPORT_SYMBOL(ip_ct_protos);
-EXPORT_SYMBOL(ip_ct_find_proto);
+
 EXPORT_SYMBOL(ip_conntrack_expect_alloc);
 EXPORT_SYMBOL(ip_conntrack_expect_put);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get);
 EXPORT_SYMBOL(ip_conntrack_expect_related);
 EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
+EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
+EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy);
+
 EXPORT_SYMBOL(ip_conntrack_tuple_taken);
 EXPORT_SYMBOL(ip_ct_gather_frags);
 EXPORT_SYMBOL(ip_conntrack_htable_size);
@@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock);
 EXPORT_SYMBOL(ip_conntrack_hash);
 EXPORT_SYMBOL(ip_conntrack_untracked);
 EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_put);
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 EXPORT_SYMBOL(ip_conntrack_tcp_update);
 #endif
+
+EXPORT_SYMBOL_GPL(ip_conntrack_flush);
+EXPORT_SYMBOL_GPL(__ip_conntrack_find);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
+EXPORT_SYMBOL_GPL(ip_conntrack_free);
+EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
+
+EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
+EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
+#endif
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 739b6dde1c8..1adedb743f6 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock);
 static unsigned int ip_nat_htable_size;
 
 static struct list_head *bysource;
+
+#define MAX_IP_NAT_PROTO 256
 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
 
+static inline struct ip_nat_protocol *
+__ip_nat_proto_find(u_int8_t protonum)	/* plain table read; takes no module reference */
+{
+	return ip_nat_protos[protonum];	/* 8-bit index always fits the MAX_IP_NAT_PROTO (256) slots */
+}
+
+struct ip_nat_protocol *
+ip_nat_proto_find_get(u_int8_t protonum)
+{
+	struct ip_nat_protocol *p;
+
+	/* Look up the handler and pin its module.  Preemption is disabled so
+	 * 'p' doesn't get removed until we've grabbed the reference. */
+	preempt_disable();
+	p = __ip_nat_proto_find(protonum);
+	if (p) {
+		if (!try_module_get(p->me))
+			p = &ip_nat_unknown_protocol;	/* NOTE(review): no reference is taken on this fallback -- confirm the later put stays balanced */
+	}
+	preempt_enable();
+	/* NULL only when the table slot is empty; NOTE(review): callers here dereference unchecked -- confirm slots are always populated */
+	return p;
+}
+
+void
+ip_nat_proto_put(struct ip_nat_protocol *p)	/* counterpart of ip_nat_proto_find_get() */
+{
+	module_put(p->me);	/* drops the reference on p's owning module; p must not be NULL */
+}
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
@@ -103,7 +134,8 @@ static int
 in_range(const struct ip_conntrack_tuple *tuple,
 	 const struct ip_nat_range *range)
 {
-	struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+	struct ip_nat_protocol *proto = 
+				__ip_nat_proto_find(tuple->dst.protonum);
 
 	/* If we are supposed to map IPs, then we must be in the
 	   range specified, otherwise let this drag us onto a new src IP. */
@@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
 		 struct ip_conntrack *conntrack,
 		 enum ip_nat_manip_type maniptype)
 {
-	struct ip_nat_protocol *proto
-		= ip_nat_find_proto(orig_tuple->dst.protonum);
+	struct ip_nat_protocol *proto;
 
 	/* 1) If this srcip/proto/src-proto-part is currently mapped,
 	   and that same mapping gives a unique tuple within the given
@@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
 	/* 3) The per-protocol part of the manip is made to map into
 	   the range to make a unique tuple. */
 
+	proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
+
 	/* Only bother mapping if it's not already in range and unique */
 	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
 	     || proto->in_range(tuple, maniptype, &range->min, &range->max))
-	    && !ip_nat_used_tuple(tuple, conntrack))
+	    && !ip_nat_used_tuple(tuple, conntrack)) {
+		ip_nat_proto_put(proto);
 		return;
+	}
 
 	/* Last change: get protocol to try to obtain unique tuple. */
 	proto->unique_tuple(tuple, range, maniptype, conntrack);
+
+	ip_nat_proto_put(proto);
 }
 
 unsigned int
@@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto,
 	  enum ip_nat_manip_type maniptype)
 {
 	struct iphdr *iph;
+	struct ip_nat_protocol *p;
 
-	(*pskb)->nfcache |= NFC_ALTERED;
-	if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+	if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
 		return 0;
 
 	iph = (void *)(*pskb)->data + iphdroff;
 
 	/* Manipulate protcol part. */
-	if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
-	                                         target, maniptype))
+	p = ip_nat_proto_find_get(proto);
+	if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
+		ip_nat_proto_put(p);
 		return 0;
+	}
+	ip_nat_proto_put(p);
 
 	iph = (void *)(*pskb)->data + iphdroff;
 
@@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
 	struct ip_conntrack_tuple inner, target;
 	int hdrlen = (*pskb)->nh.iph->ihl * 4;
 
-	if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+	if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
 		return 0;
 
 	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb,
 
 	if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
 	                     sizeof(struct icmphdr) + inside->ip.ihl*4,
-	                     &inner, ip_ct_find_proto(inside->ip.protocol)))
+	                     &inner,
+			     __ip_conntrack_proto_find(inside->ip.protocol)))
 		return 0;
 
 	/* Change inner back to look like incoming packet.  We do the
@@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
 	synchronize_net();
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+int	/* 0 on success, -1 via the nfattr_failure label */
+ip_nat_port_range_to_nfattr(struct sk_buff *skb, 
+			    const struct ip_nat_range *range)
+{
+	NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
+		&range->min.tcp.port);
+	NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
+		&range->max.tcp.port);
+	/* both bounds are emitted as 16-bit attributes read from the tcp member */
+	return 0;
+	/* presumably NFA_PUT() jumps to the label below when an attribute cannot be added -- confirm against the macro */
+nfattr_failure:
+	return -1;
+}
+
+int	/* 1 if at least one port attribute was present, 0 if none */
+ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
+{
+	int ret = 0;
+	/* tb[] is indexed by attribute type minus one */
+	/* we have to return whether we actually parsed something or not */
+
+	if (tb[CTA_PROTONAT_PORT_MIN-1]) {
+		ret = 1;
+		range->min.tcp.port = 
+			*(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
+	}
+	/* a lone MIN yields a single-port range; a lone MAX leaves min untouched */
+	if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
+		if (ret) 
+			range->max.tcp.port = range->min.tcp.port;
+	} else {
+		ret = 1;
+		range->max.tcp.port = 
+			*(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
+	}
+	/* NOTE(review): payload length is not checked here -- presumably validated by the caller; confirm */
+	return ret;
+}
+#endif
+
 int __init ip_nat_init(void)
 {
 	size_t i;
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 158f34f32c0..d2dd5d31355 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
 	struct tcphdr *tcph;
 	int datalen;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return 0;
 
 	if (rep_len > match_len
@@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
 	                       match_offset + match_len)
 		return 0;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return 0;
 
 	if (rep_len > match_len
@@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
 	optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
 	optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
 
-	if (!skb_ip_make_writable(pskb, optend))
+	if (!skb_make_writable(pskb, optend))
 		return 0;
 
 	dir = CTINFO2DIR(ctinfo);
@@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
 	this_way = &ct->nat.info.seq[dir];
 	other_way = &ct->nat.info.seq[!dir];
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+	if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
 		return 0;
 
 	tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 6596c9ee165..93871904399 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb,
 	struct icmphdr *hdr;
 	unsigned int hdroff = iphdroff + iph->ihl*4;
 
-	if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+	if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
 		return 0;
 
 	hdr = (struct icmphdr *)((*pskb)->data + hdroff);
@@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range)
 	else return 0;
 }
 
-struct ip_nat_protocol ip_nat_protocol_icmp
-= { "ICMP", IPPROTO_ICMP,
-    icmp_manip_pkt,
-    icmp_in_range,
-    icmp_unique_tuple,
-    icmp_print,
-    icmp_print_range
+struct ip_nat_protocol ip_nat_protocol_icmp = {
+	.name			= "ICMP",
+	.protonum		= IPPROTO_ICMP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= icmp_manip_pkt,
+	.in_range		= icmp_in_range,
+	.unique_tuple		= icmp_unique_tuple,
+	.print			= icmp_print,
+	.print_range		= icmp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
+	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a98e36d2b3c..1d381bf6857 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -12,6 +12,7 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/if.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
 #include <linux/netfilter_ipv4/ip_nat.h>
 #include <linux/netfilter_ipv4/ip_nat_rule.h>
 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb,
 	if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
 		hdrsize = sizeof(struct tcphdr);
 
-	if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+	if (!skb_make_writable(pskb, hdroff + hdrsize))
 		return 0;
 
 	iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range)
 	else return 0;
 }
 
-struct ip_nat_protocol ip_nat_protocol_tcp
-= { "TCP", IPPROTO_TCP,
-    tcp_manip_pkt,
-    tcp_in_range,
-    tcp_unique_tuple,
-    tcp_print,
-    tcp_print_range
+struct ip_nat_protocol ip_nat_protocol_tcp = {
+	.name			= "TCP",
+	.protonum		= IPPROTO_TCP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= tcp_manip_pkt,
+	.in_range		= tcp_in_range,
+	.unique_tuple		= tcp_unique_tuple,
+	.print			= tcp_print,
+	.print_range		= tcp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
+	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index 9f66e562566..c4906e1aa24 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb,
 	u32 oldip, newip;
 	u16 *portptr, newport;
 
-	if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+	if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
 		return 0;
 
 	iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range)
 	else return 0;
 }
 
-struct ip_nat_protocol ip_nat_protocol_udp
-= { "UDP", IPPROTO_UDP,
-    udp_manip_pkt,
-    udp_in_range,
-    udp_unique_tuple,
-    udp_print,
-    udp_print_range
+struct ip_nat_protocol ip_nat_protocol_udp = {
+	.name			= "UDP",
+	.protonum		= IPPROTO_UDP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= udp_manip_pkt,
+	.in_range		= udp_in_range,
+	.unique_tuple		= udp_unique_tuple,
+	.print			= udp_print,
+	.print_range		= udp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
+	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f5525bd58d1..99bbef56f84 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
 }
 
 struct ip_nat_protocol ip_nat_unknown_protocol = {
-	"unknown", 0,
-	unknown_manip_pkt,
-	unknown_in_range,
-	unknown_unique_tuple,
-	unknown_print,
-	unknown_print_range
+	.name			= "unknown",
+	.me			= THIS_MODULE,
+	.manip_pkt		= unknown_manip_pkt,
+	.in_range		= unknown_in_range,
+	.unique_tuple		= unknown_unique_tuple,
+	.print			= unknown_print,
+	.print_range		= unknown_print_range
 };
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 2a48b6e635a..93b2c5111bb 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb,
 		 return NF_DROP;
 	}
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return NF_DROP;
 
 	spin_lock_bh(&snmp_lock);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 91d5ea1dbbc..89db052add8 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum,
 	IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
 		       & htons(IP_MF|IP_OFFSET)));
 
-	(*pskb)->nfcache |= NFC_UNKNOWN;
-
 	/* If we had a hardware checksum before, it's now invalid */
 	if ((*pskb)->ip_summed == CHECKSUM_HW)
 		if (skb_checksum_help(*pskb, (out == NULL)))
@@ -396,6 +394,8 @@ module_exit(fini);
 EXPORT_SYMBOL(ip_nat_setup_info);
 EXPORT_SYMBOL(ip_nat_protocol_register);
 EXPORT_SYMBOL(ip_nat_protocol_unregister);
+EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_nat_proto_put);
 EXPORT_SYMBOL(ip_nat_cheat_check);
 EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
 EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c6baa817438..d54f14d926f 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -43,17 +43,10 @@
 #define NET_IPQ_QMAX 2088
 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
 
-struct ipq_rt_info {
-	__u8 tos;
-	__u32 daddr;
-	__u32 saddr;
-};
-
 struct ipq_queue_entry {
 	struct list_head list;
 	struct nf_info *info;
 	struct sk_buff *skb;
-	struct ipq_rt_info rt_info;
 };
 
 typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -247,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
 
 	pmsg->packet_id       = (unsigned long )entry;
 	pmsg->data_len        = data_len;
-	pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
-	pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
+	pmsg->timestamp_sec   = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
+	pmsg->timestamp_usec  = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
 	pmsg->mark            = entry->skb->nfmark;
 	pmsg->hook            = entry->info->hook;
 	pmsg->hw_protocol     = entry->skb->protocol;
@@ -287,7 +280,8 @@ nlmsg_failure:
 }
 
 static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+		   unsigned int queuenum, void *data)
 {
 	int status = -EINVAL;
 	struct sk_buff *nskb;
@@ -305,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
 	entry->info = info;
 	entry->skb = skb;
 
-	if (entry->info->hook == NF_IP_LOCAL_OUT) {
-		struct iphdr *iph = skb->nh.iph;
-
-		entry->rt_info.tos = iph->tos;
-		entry->rt_info.daddr = iph->daddr;
-		entry->rt_info.saddr = iph->saddr;
-	}
-
 	nskb = ipq_build_packet_message(entry, &status);
 	if (nskb == NULL)
 		goto err_out_free;
@@ -388,24 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
 		}
 		skb_put(e->skb, diff);
 	}
-	if (!skb_ip_make_writable(&e->skb, v->data_len))
+	if (!skb_make_writable(&e->skb, v->data_len))
 		return -ENOMEM;
 	memcpy(e->skb->data, v->payload, v->data_len);
 	e->skb->ip_summed = CHECKSUM_NONE;
-	e->skb->nfcache |= NFC_ALTERED;
-
-	/*
-	 * Extra routing may needed on local out, as the QUEUE target never
-	 * returns control to the table.
-	 */
-	if (e->info->hook == NF_IP_LOCAL_OUT) {
-		struct iphdr *iph = e->skb->nh.iph;
-
-		if (!(iph->tos == e->rt_info.tos
-		      && iph->daddr == e->rt_info.daddr
-		      && iph->saddr == e->rt_info.saddr))
-			return ip_route_me_harder(&e->skb);
-	}
+
 	return 0;
 }
 
@@ -683,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
 }
 #endif /* CONFIG_PROC_FS */
 
+static struct nf_queue_handler nfqh = {	/* handed to nf_register_queue_handler(PF_INET, ...) in init_or_cleanup() */
+	.name	= "ip_queue",
+	.outfn	= &ipq_enqueue_packet,	/* callback that builds and sends the queued-packet message */
+};
+
 static int
 init_or_cleanup(int init)
 {
@@ -693,7 +671,8 @@ init_or_cleanup(int init)
 		goto cleanup;
 
 	netlink_register_notifier(&ipq_nl_notifier);
-	ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+	ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
+				      THIS_MODULE);
 	if (ipqnl == NULL) {
 		printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
 		goto cleanup_netlink_notifier;
@@ -710,7 +689,7 @@ init_or_cleanup(int init)
 	register_netdevice_notifier(&ipq_dev_notifier);
 	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
 	
-	status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+	status = nf_register_queue_handler(PF_INET, &nfqh);
 	if (status < 0) {
 		printk(KERN_ERR "ip_queue: failed to register queue handler\n");
 		goto cleanup_sysctl;
@@ -718,7 +697,7 @@ init_or_cleanup(int init)
 	return status;
 
 cleanup:
-	nf_unregister_queue_handler(PF_INET);
+	nf_unregister_queue_handlers(&nfqh);
 	synchronize_net();
 	ipq_flush(NF_DROP);
 	
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c88dfcd38c5..eef99a1b5de 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb,
 	do {
 		IP_NF_ASSERT(e);
 		IP_NF_ASSERT(back);
-		(*pskb)->nfcache |= e->nfcache;
 		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
 			struct ipt_entry_target *t;
 
@@ -341,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb,
 							 back->comefrom);
 					continue;
 				}
-				if (table_base + v
-				    != (void *)e + e->next_offset) {
+				if (table_base + v != (void *)e + e->next_offset
+				    && !(e->ip.flags & IPT_F_GOTO)) {
 					/* Save old back ptr in next entry */
 					struct ipt_entry *next
 						= (void *)e + e->next_offset;
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
index 9842e6e2318..dab78d8bd49 100644
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb,
 {
 	const struct ipt_classify_target_info *clinfo = targinfo;
 
-	if((*pskb)->priority != clinfo->priority) {
+	if((*pskb)->priority != clinfo->priority) 
 		(*pskb)->priority = clinfo->priority;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
 
 	return IPT_CONTINUE;
 }
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6706d3a1bc4..2d05cafec22 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -367,7 +367,7 @@ target(struct sk_buff **pskb,
 #ifdef DEBUG_CLUSTERP
 	DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 #endif
-	DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+	DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
 	if (!clusterip_responsible(cipinfo->config, hash)) {
 		DEBUGP("not responsible\n");
 		return NF_DROP;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 30ddd3e18eb..13463802133 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb,
        void *userinfo)
 {
 	const struct ipt_connmark_target_info *markinfo = targinfo;
-	unsigned long diff;
-	unsigned long nfmark;
-	unsigned long newmark;
+	u_int32_t diff;
+	u_int32_t nfmark;
+	u_int32_t newmark;
 
 	enum ip_conntrack_info ctinfo;
 	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb,
 	    case IPT_CONNMARK_RESTORE:
 		nfmark = (*pskb)->nfmark;
 		diff = (ct->mark ^ nfmark) & markinfo->mask;
-		if (diff != 0) {
+		if (diff != 0)
 		    (*pskb)->nfmark = nfmark ^ diff;
-		    (*pskb)->nfcache |= NFC_ALTERED;
-		}
 		break;
 	    }
 	}
@@ -94,6 +92,11 @@ checkentry(const char *tablename,
 	    }
 	}
 
+	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 3ea4509099f..6e319570a28 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb,
 	if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
 		u_int16_t diffs[2];
 
-		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
 			return NF_DROP;
 
 		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb,
 						 sizeof(diffs),
 						 (*pskb)->nh.iph->check
 						 ^ 0xFFFF));
-		(*pskb)->nfcache |= NFC_ALTERED;
 	}
 	return IPT_CONTINUE;
 }
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 94a0ce1c1c9..a1319693f64 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 	    != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
 		u_int16_t diffs[2];
 
-		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
 			return 0;
 
 		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 						 sizeof(diffs),
 						 (*pskb)->nh.iph->check
 						 ^0xFFFF));
-		(*pskb)->nfcache |= NFC_ALTERED;
 	} 
 	return 1;
 }
@@ -67,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
 	     tcph->cwr == einfo->proto.tcp.cwr)))
 		return 1;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+	if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
 		return 0;
 	tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
 
@@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
 		tcph->check = csum_fold(csum_partial((char *)diffs,
 						     sizeof(diffs),
 						     tcph->check^0xFFFF));
-	(*pskb)->nfcache |= NFC_ALTERED;
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ef08733d26d..92ed050fac6 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("iptables syslog logging module");
 
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
- 
 #if 0
 #define DEBUGP printk
 #else
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
 static DEFINE_SPINLOCK(log_lock);
 
 /* One level of recursion won't kill us */
-static void dump_packet(const struct ipt_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
 			const struct sk_buff *skb,
 			unsigned int iphoff)
 {
 	struct iphdr _iph, *ih;
+	unsigned int logflags;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_MASK;
 
 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
 	if (ih == NULL) {
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info,
 	if (ntohs(ih->frag_off) & IP_OFFSET)
 		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
-	if ((info->logflags & IPT_LOG_IPOPT)
+	if ((logflags & IPT_LOG_IPOPT)
 	    && ih->ihl * 4 > sizeof(struct iphdr)) {
 		unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
 		unsigned int i, optsize;
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info,
 		printk("SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
-		if (info->logflags & IPT_LOG_TCPSEQ)
+		if (logflags & IPT_LOG_TCPSEQ)
 			printk("SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info,
 		/* Max length: 11 "URGP=65535 " */
 		printk("URGP=%u ", ntohs(th->urg_ptr));
 
-		if ((info->logflags & IPT_LOG_TCPOPT)
+		if ((logflags & IPT_LOG_TCPOPT)
 		    && th->doff * 4 > sizeof(struct tcphdr)) {
 			unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
 			unsigned char *op;
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
- 	if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+ 	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
  			printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info,
 	/* maxlen = 230+   91  + 230 + 252 = 803 */
 }
 
+static struct nf_loginfo default_loginfo = {	/* used by ipt_log_packet() when the caller passes loginfo == NULL */
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level    = 4,	/* "<4>" == KERN_WARNING, the level the removed ipt_logfn() used; 0 would log as emergency */
+			.logflags = NF_LOG_MASK,	/* same flag set dump_packet() falls back to for non-LOG types */
+		},
+	},
+};
+
 static void
-ipt_log_packet(unsigned int hooknum,
+ipt_log_packet(unsigned int pf,
+	       unsigned int hooknum,
 	       const struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
-	       const struct ipt_log_info *loginfo,
-	       const char *level_string,
+	       const struct nf_loginfo *loginfo,
 	       const char *prefix)
 {
+	if (!loginfo)
+		loginfo = &default_loginfo;
+
 	spin_lock_bh(&log_lock);
-	printk(level_string);
-	printk("%sIN=%s OUT=%s ",
-	       prefix == NULL ? loginfo->prefix : prefix,
+	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	       prefix,
 	       in ? in->name : "",
 	       out ? out->name : "");
 #ifdef CONFIG_BRIDGE_NETFILTER
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb,
 	       void *userinfo)
 {
 	const struct ipt_log_info *loginfo = targinfo;
-	char level_string[4] = "< >";
+	struct nf_loginfo li;
 
-	level_string[1] = '0' + (loginfo->level % 8);
-	ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+	li.type = NF_LOG_TYPE_LOG;
+	li.u.log.level = loginfo->level;
+	li.u.log.logflags = loginfo->logflags;
 
-	return IPT_CONTINUE;
-}
+	nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix);
 
-static void
-ipt_logfn(unsigned int hooknum,
-	  const struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  const char *prefix)
-{
-	struct ipt_log_info loginfo = { 
-		.level = 0, 
-		.logflags = IPT_LOG_MASK, 
-		.prefix = "" 
-	};
-
-	ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
+	return IPT_CONTINUE;
 }
 
 static int ipt_log_checkentry(const char *tablename,
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = {
 	.me		= THIS_MODULE,
 };
 
+static struct nf_logger ipt_log_logger ={	/* registered for PF_INET in init(), unregistered in fini() */
+	.name		= "ipt_LOG",
+	.logfn		= &ipt_log_packet,	/* also serves callers that supply no loginfo */
+	.me		= THIS_MODULE,
+};
+
 static int __init init(void)
 {
 	if (ipt_register_target(&ipt_log_reg))
 		return -EINVAL;
-	if (nflog)
-		nf_log_register(PF_INET, &ipt_logfn);
+	if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
+		printk(KERN_WARNING "ipt_LOG: not logging via system console "
+		       "since somebody else already registered for PF_INET\n");
+		/* we cannot make module load fail here, since otherwise
+		 * iptables userspace would abort */
+	}
 	
 	return 0;
 }
 
 static void __exit fini(void)
 {
-	if (nflog)
-		nf_log_unregister(PF_INET, &ipt_logfn);
+	nf_log_unregister_logger(&ipt_log_logger);
 	ipt_unregister_target(&ipt_log_reg);
 }
 
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
index 33c6f9b63b8..52b4f2c296b 100644
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb,
 {
 	const struct ipt_mark_target_info *markinfo = targinfo;
 
-	if((*pskb)->nfmark != markinfo->mark) {
+	if((*pskb)->nfmark != markinfo->mark)
 		(*pskb)->nfmark = markinfo->mark;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
+
 	return IPT_CONTINUE;
 }
 
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb,
 		break;
 	}
 
-	if((*pskb)->nfmark != mark) {
+	if((*pskb)->nfmark != mark)
 		(*pskb)->nfmark = mark;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
+
 	return IPT_CONTINUE;
 }
 
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename,
 	      unsigned int targinfosize,
 	      unsigned int hook_mask)
 {
+	struct ipt_mark_target_info *markinfo = targinfo;
+
 	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
 		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
 		       targinfosize,
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename,
 		return 0;
 	}
 
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename,
 		return 0;
 	}
 
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 91e74502c3d..2f3e181c8e9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb,
 
 	IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
 
-	/* FIXME: For the moment, don't do local packets, breaks
-	   testsuite for 2.3.49 --RR */
-	if ((*pskb)->sk)
-		return NF_ACCEPT;
-
 	ct = ip_conntrack_get(*pskb, &ctinfo);
 	IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
 	                    || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 06254b29d03..e6e7b609536 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -46,7 +46,8 @@ check(const char *tablename,
 		DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
 		return 0;
 	}
-	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
+	                  (1 << NF_IP_LOCAL_OUT))) {
 		DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
 		return 0;
 	}
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb,
 	struct ip_nat_range newrange;
 
 	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
-		     || hooknum == NF_IP_POST_ROUTING);
+		     || hooknum == NF_IP_POST_ROUTING
+		     || hooknum == NF_IP_LOCAL_OUT);
 	ct = ip_conntrack_get(*pskb, &ctinfo);
 
 	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
 
-	if (hooknum == NF_IP_PRE_ROUTING)
+	if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
 		new_ip = (*pskb)->nh.iph->daddr & ~netmask;
 	else
 		new_ip = (*pskb)->nh.iph->saddr & ~netmask;
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
new file mode 100644
index 00000000000..3cedc9be880
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as 
+ * published by the Free Software Foundation.
+ * 
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_NFQ_info *tinfo = targinfo;
+
+	return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
+		printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_NFQ_reg = {
+	.name		= "NFQUEUE",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_NFQ_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 91569644602..f115a84a4ac 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 
 	/* This packet will not be the same as the other: clear nf fields */
 	nf_reset(nskb);
-	nskb->nfcache = 0;
 	nskb->nfmark = 0;
 #ifdef CONFIG_BRIDGE_NETFILTER
 	nf_bridge_put(nskb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 7b84a254440..8db70d6908c 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 	unsigned int i;
 	u_int8_t *opt;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return NF_DROP;
 
 	if ((*pskb)->ip_summed == CHECKSUM_HW &&
@@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 	       newmss);
 
  retmodified:
-	(*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
 	return IPT_CONTINUE;
 }
 
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 85c70d240f8..deadb36d442 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb,
 	if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
 		u_int16_t diffs[2];
 
-		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
 			return NF_DROP;
 
 		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb,
 						 sizeof(diffs),
 						 (*pskb)->nh.iph->check
 						 ^0xFFFF));
-		(*pskb)->nfcache |= NFC_ALTERED;
 	}
 	return IPT_CONTINUE;
 }
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
new file mode 100644
index 00000000000..b9ae6a9382f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -0,0 +1,119 @@
+/* TTL modification target for IP tables
+ * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TTL.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IP tables TTL modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int 
+ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, 
+		const struct net_device *out, unsigned int hooknum, 
+		const void *targinfo, void *userinfo)
+{
+	struct iphdr *iph;
+	const struct ipt_TTL_info *info = targinfo;
+	u_int16_t diffs[2];
+	int new_ttl;
+
+	if (!skb_make_writable(pskb, (*pskb)->len))
+		return NF_DROP;
+
+	iph = (*pskb)->nh.iph;
+
+	switch (info->mode) {
+		case IPT_TTL_SET:
+			new_ttl = info->ttl;
+			break;
+		case IPT_TTL_INC:
+			new_ttl = iph->ttl + info->ttl;
+			if (new_ttl > 255)
+				new_ttl = 255;
+			break;
+		case IPT_TTL_DEC:
+			new_ttl = iph->ttl - info->ttl;
+			if (new_ttl < 0)
+				new_ttl = 0;
+			break;
+		default:
+			new_ttl = iph->ttl;
+			break;
+	}
+
+	if (new_ttl != iph->ttl) {
+		diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF;
+		iph->ttl = new_ttl;
+		diffs[1] = htons(((unsigned)iph->ttl) << 8);
+		iph->check = csum_fold(csum_partial((char *)diffs,
+						    sizeof(diffs),
+						    iph->check^0xFFFF));
+	}
+
+	return IPT_CONTINUE;
+}
+
+static int ipt_ttl_checkentry(const char *tablename,
+		const struct ipt_entry *e,
+		void *targinfo,
+		unsigned int targinfosize,
+		unsigned int hook_mask)
+{
+	struct ipt_TTL_info *info = targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) {
+		printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n",
+				targinfosize,
+				IPT_ALIGN(sizeof(struct ipt_TTL_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle")) {
+		printk(KERN_WARNING "ipt_TTL: can only be called from "
+			"\"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (info->mode > IPT_TTL_MAXMODE) {
+		printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", 
+			info->mode);
+		return 0;
+	}
+
+	if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_target ipt_TTL = { 
+	.name 		= "TTL",
+	.target 	= ipt_ttl_target, 
+	.checkentry 	= ipt_ttl_checkentry, 
+	.me 		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_TTL);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_TTL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 52a0076302a..e2c14f3cb2f 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -62,6 +62,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("iptables userspace logging module");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
 
 #define ULOG_NL_EVENT		111		/* Harald's favorite number */
 #define ULOG_MAXNLGROUPS	32		/* numer of nlgroups */
@@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum)
 	if (ub->qlen > 1)
 		ub->lastnlh->nlmsg_type = NLMSG_DONE;
 
-	NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
-	DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
-		ub->qlen, nlgroupnum);
-	netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+	NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
+	DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
+		ub->qlen, nlgroupnum + 1);
+	netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
 
 	ub->qlen = 0;
 	ub->skb = NULL;
@@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
 	pm = NLMSG_DATA(nlh);
 
 	/* We might not have a timestamp, get one */
-	if (skb->stamp.tv_sec == 0)
-		do_gettimeofday((struct timeval *)&skb->stamp);
+	if (skb->tstamp.off_sec == 0)
+		__net_timestamp((struct sk_buff *)skb);
 
 	/* copy hook, prefix, timestamp, payload, etc. */
 	pm->data_len = copy_len;
-	pm->timestamp_sec = skb->stamp.tv_sec;
-	pm->timestamp_usec = skb->stamp.tv_usec;
+	pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
+	pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
 	pm->mark = skb->nfmark;
 	pm->hook = hooknum;
 	if (prefix != NULL)
@@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
  	return IPT_CONTINUE;
 }
  
-static void ipt_logfn(unsigned int hooknum,
+static void ipt_logfn(unsigned int pf,
+		      unsigned int hooknum,
 		      const struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
+		      const struct nf_loginfo *li,
 		      const char *prefix)
 {
-	struct ipt_ulog_info loginfo = { 
-		.nl_group = ULOG_DEFAULT_NLGROUP,
-		.copy_range = 0,
-		.qthreshold = ULOG_DEFAULT_QTHRESHOLD,
-		.prefix = ""
-	};
+	struct ipt_ulog_info loginfo;
+
+	if (!li || li->type != NF_LOG_TYPE_ULOG) {
+		loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
+		loginfo.copy_range = 0;
+		loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
+		loginfo.prefix[0] = '\0';
+	} else {
+		loginfo.nl_group = li->u.ulog.group;
+		loginfo.copy_range = li->u.ulog.copy_len;
+		loginfo.qthreshold = li->u.ulog.qthreshold;
+		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+	}
 
 	ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
 }
@@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = {
 	.me		= THIS_MODULE,
 };
 
+static struct nf_logger ipt_ulog_logger = {
+	.name		= "ipt_ULOG",
+	.logfn		= &ipt_logfn,
+	.me		= THIS_MODULE,
+};
+
 static int __init init(void)
 {
 	int i;
@@ -372,7 +388,8 @@ static int __init init(void)
 		ulog_buffers[i].timer.data = i;
 	}
 
-	nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+	nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
+	                                THIS_MODULE);
 	if (!nflognl)
 		return -ENOMEM;
 
@@ -381,7 +398,7 @@ static int __init init(void)
 		return -EINVAL;
 	}
 	if (nflog)
-		nf_log_register(PF_INET, &ipt_logfn);
+		nf_log_register(PF_INET, &ipt_ulog_logger);
 	
 	return 0;
 }
@@ -394,7 +411,7 @@ static void __exit fini(void)
 	DEBUGP("ipt_ULOG: cleanup_module\n");
 
 	if (nflog)
-		nf_log_unregister(PF_INET, &ipt_logfn);
+		nf_log_unregister_logger(&ipt_ulog_logger);
 	ipt_unregister_target(&ipt_ulog_reg);
 	sock_release(nflognl->sk_socket);
 
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
new file mode 100644
index 00000000000..df4a42c6da2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -0,0 +1,162 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ *
+ * 2004-07-20 Harald Welte <laforge@netfilter.org>
+ * 	- reimplemented to use per-connection accounting counters
+ * 	- add functionality to match number of packets
+ * 	- add functionality to match average packet size
+ * 	- add support to match directions separately
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connbytes.h>
+
+#include <asm/div64.h>
+#include <asm/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
+
+/* 64bit dividend, divisor and result, dynamic precision; x / 0 yields 0 */
+static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+	u_int32_t d = divisor;
+
+	if (divisor == 0)	/* e.g. a direction with no packets counted */
+		return 0;
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+	do_div(dividend, d);
+	return dividend;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_connbytes_info *sinfo = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct;
+	u_int64_t what = 0;	/* initialize to make gcc happy */
+
+	if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
+		return 0; /* no match */
+
+	switch (sinfo->what) {
+	case IPT_CONNBYTES_PKTS:
+		switch (sinfo->direction) {
+		case IPT_CONNBYTES_DIR_ORIGINAL:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+			break;
+		case IPT_CONNBYTES_DIR_REPLY:
+			what = ct->counters[IP_CT_DIR_REPLY].packets;
+			break;
+		case IPT_CONNBYTES_DIR_BOTH:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+			what += ct->counters[IP_CT_DIR_REPLY].packets;
+			break;
+		}
+		break;
+	case IPT_CONNBYTES_BYTES:
+		switch (sinfo->direction) {
+		case IPT_CONNBYTES_DIR_ORIGINAL:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+			break;
+		case IPT_CONNBYTES_DIR_REPLY:
+			what = ct->counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		case IPT_CONNBYTES_DIR_BOTH:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+			what += ct->counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		}
+		break;
+	case IPT_CONNBYTES_AVGPKT:
+		switch (sinfo->direction) {
+		case IPT_CONNBYTES_DIR_ORIGINAL:
+			what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
+					ct->counters[IP_CT_DIR_ORIGINAL].packets);
+			break;
+		case IPT_CONNBYTES_DIR_REPLY:
+			what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
+					ct->counters[IP_CT_DIR_REPLY].packets);
+			break;
+		case IPT_CONNBYTES_DIR_BOTH:
+			{
+				u_int64_t bytes;
+				u_int64_t pkts;
+				bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
+					ct->counters[IP_CT_DIR_REPLY].bytes;
+				pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
+					ct->counters[IP_CT_DIR_REPLY].packets;
+
+				/* FIXME_THEORETICAL: what to do if sum
+				 * overflows ? */
+
+				what = div64_64(bytes, pkts);
+			}
+			break;
+		}
+		break;
+	}
+
+	if (sinfo->count.to)
+		return (what <= sinfo->count.to && what >= sinfo->count.from);
+	else
+		return (what >= sinfo->count.from);
+}
+
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	const struct ipt_connbytes_info *sinfo = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
+		return 0;
+
+	if (sinfo->what != IPT_CONNBYTES_PKTS &&
+	    sinfo->what != IPT_CONNBYTES_BYTES &&
+	    sinfo->what != IPT_CONNBYTES_AVGPKT)
+		return 0;
+
+	if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
+	    sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
+	    sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match state_match = {
+	.name		= "connbytes",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index 2706f96cea5..bf8de47ce00 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -54,9 +54,16 @@ checkentry(const char *tablename,
 	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
+	struct ipt_connmark_info *cm = 
+				(struct ipt_connmark_info *)matchinfo;
 	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
 		return 0;
 
+	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+		printk(KERN_WARNING "connmark: only support 32bit mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
new file mode 100644
index 00000000000..ad3278bba6c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dccp.c
@@ -0,0 +1,176 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_dccp.h>
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+		                  || (!!((invflag) & (option)) ^ (cond)))
+
+static unsigned char *dccp_optbuf;
+static DEFINE_SPINLOCK(dccp_buflock);
+
+/* Scan the DCCP option area of skb for @option.  Returns 1 if present, else
+ * 0; sets *hotdrop if the header length is bogus or options are unreadable. */
+static inline int
+dccp_find_option(u_int8_t option, const struct sk_buff *skb,
+		 const struct dccp_hdr *dh, int *hotdrop)
+{
+	unsigned char *op;
+	unsigned int optoff = __dccp_hdr_len(dh);
+	unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+	unsigned int i;
+
+	/* doff * 4 must at least cover the header, else optlen wrapped */
+	if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
+		*hotdrop = 1;
+		return 0;
+	}
+
+	if (!optlen)
+		return 0;
+
+	/* dccp_optbuf is a single shared buffer, so serialize its users */
+	spin_lock_bh(&dccp_buflock);
+	op = skb_header_pointer(skb, skb->nh.iph->ihl*4 + optoff,
+				optlen, dccp_optbuf);
+	if (op == NULL) {
+		/* If we don't have the whole header, drop packet. */
+		spin_unlock_bh(&dccp_buflock);
+		*hotdrop = 1;
+		return 0;
+	}
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == option) {
+			spin_unlock_bh(&dccp_buflock);
+			return 1;
+		}
+		/* types 0/1 are one byte; never read a length byte past optlen */
+		if (op[i] < 2 || i + 1 >= optlen)
+			i++;
+		else
+			i += op[i+1]?:1;
+	}
+
+	spin_unlock_bh(&dccp_buflock);
+	return 0;
+}
+
+
+static inline int
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+	return (typemask & (1 << dh->dccph_type));
+}
+
+static inline int
+match_option(u_int8_t option, const struct sk_buff *skb,
+	     const struct dccp_hdr *dh, int *hotdrop)
+{
+	return dccp_find_option(option, skb, dh, hotdrop);
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_dccp_info *info = 
+				(const struct ipt_dccp_info *)matchinfo;
+	struct dccp_hdr _dh, *dh;
+
+	if (offset)
+		return 0;
+	
+	dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
+	if (dh == NULL) {
+		*hotdrop = 1;
+		return 0;
+       	}
+
+	return  DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) 
+			&& (ntohs(dh->dccph_sport) <= info->spts[1])), 
+		   	IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
+		&& DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) 
+			&& (ntohs(dh->dccph_dport) <= info->dpts[1])), 
+			IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
+		&& DCCHECK(match_types(dh, info->typemask),
+			   IPT_DCCP_TYPE, info->flags, info->invflags)
+		&& DCCHECK(match_option(info->option, skb, dh, hotdrop),
+			   IPT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_dccp_info *info;
+
+	info = (const struct ipt_dccp_info *)matchinfo;
+
+	return ip->proto == IPPROTO_DCCP
+		&& !(ip->invflags & IPT_INV_PROTO)
+		&& matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
+		&& !(info->flags & ~IPT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~info->flags);
+}
+
+static struct ipt_match dccp_match = 
+{ 
+	.name 		= "dccp",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me 		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	/* doff is 8 bits, so the maximum option size is (4*256).  Don't put
+	 * this in BSS since DaveM is worried about locked TLB's for kernel
+	 * BSS. */
+	dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+	if (!dccp_optbuf)
+		return -ENOMEM;
+	ret = ipt_register_match(&dccp_match);
+	if (ret)
+		kfree(dccp_optbuf);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&dccp_match);
+	kfree(dccp_optbuf);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Match for DCCP protocol packets");
+
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 564b49bfebc..2dd1cccbdab 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -94,7 +94,7 @@ struct ipt_hashlimit_htable {
 static DEFINE_SPINLOCK(hashlimit_lock);	/* protects htables list */
 static DECLARE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
 static HLIST_HEAD(hashlimit_htables);
-static kmem_cache_t *hashlimit_cachep;
+static kmem_cache_t *hashlimit_cachep __read_mostly;
 
 static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
 {
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
index 8955728127b..00bef6cdd3f 100644
--- a/net/ipv4/netfilter/ipt_mark.c
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -37,9 +37,16 @@ checkentry(const char *tablename,
            unsigned int matchsize,
            unsigned int hook_mask)
 {
+	struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
+
 	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
 		return 0;
 
+	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "mark: only supports 32bit mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 3b9065e0638..c1889f88262 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
 MODULE_DESCRIPTION("iptables owner match");
 
 static int
-match_comm(const struct sk_buff *skb, const char *comm)
-{
-	struct task_struct *g, *p;
-	struct files_struct *files;
-	int i;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		if(strncmp(p->comm, comm, sizeof(p->comm)))
-			continue;
-
-		task_lock(p);
-		files = p->files;
-		if(files) {
-			spin_lock(&files->file_lock);
-			for (i=0; i < files->max_fds; i++) {
-				if (fcheck_files(files, i) ==
-				    skb->sk->sk_socket->file) {
-					spin_unlock(&files->file_lock);
-					task_unlock(p);
-					read_unlock(&tasklist_lock);
-					return 1;
-				}
-			}
-			spin_unlock(&files->file_lock);
-		}
-		task_unlock(p);
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
-	return 0;
-}
-
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
-	struct task_struct *p;
-	struct files_struct *files;
-	int i;
-
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	if (!p)
-		goto out;
-	task_lock(p);
-	files = p->files;
-	if(files) {
-		spin_lock(&files->file_lock);
-		for (i=0; i < files->max_fds; i++) {
-			if (fcheck_files(files, i) ==
-			    skb->sk->sk_socket->file) {
-				spin_unlock(&files->file_lock);
-				task_unlock(p);
-				read_unlock(&tasklist_lock);
-				return 1;
-			}
-		}
-		spin_unlock(&files->file_lock);
-	}
-	task_unlock(p);
-out:
-	read_unlock(&tasklist_lock);
-	return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
-	struct task_struct *g, *p;
-	struct file *file = skb->sk->sk_socket->file;
-	int i, found=0;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		struct files_struct *files;
-		if (p->signal->session != sid)
-			continue;
-
-		task_lock(p);
-		files = p->files;
-		if (files) {
-			spin_lock(&files->file_lock);
-			for (i=0; i < files->max_fds; i++) {
-				if (fcheck_files(files, i) == file) {
-					found = 1;
-					break;
-				}
-			}
-			spin_unlock(&files->file_lock);
-		}
-		task_unlock(p);
-		if (found)
-			goto out;
-	} while_each_thread(g, p);
-out:
-	read_unlock(&tasklist_lock);
-
-	return found;
-}
-
-static int
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -145,24 +45,6 @@ match(const struct sk_buff *skb,
 			return 0;
 	}
 
-	if(info->match & IPT_OWNER_PID) {
-		if (!match_pid(skb, info->pid) ^
-		    !!(info->invert & IPT_OWNER_PID))
-			return 0;
-	}
-
-	if(info->match & IPT_OWNER_SID) {
-		if (!match_sid(skb, info->sid) ^
-		    !!(info->invert & IPT_OWNER_SID))
-			return 0;
-	}
-
-	if(info->match & IPT_OWNER_COMM) {
-		if (!match_comm(skb, info->comm) ^
-		    !!(info->invert & IPT_OWNER_COMM))
-			return 0;
-	}
-
 	return 1;
 }
 
@@ -173,6 +55,8 @@ checkentry(const char *tablename,
            unsigned int matchsize,
            unsigned int hook_mask)
 {
+	const struct ipt_owner_info *info = matchinfo;
+
         if (hook_mask
             & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
                 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -184,15 +68,13 @@ checkentry(const char *tablename,
 		       IPT_ALIGN(sizeof(struct ipt_owner_info)));
 		return 0;
 	}
-#ifdef CONFIG_SMP
-	/* files->file_lock can not be used in a BH */
-	if (((struct ipt_owner_info *)matchinfo)->match
-	    & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
-		printk("ipt_owner: pid, sid and command matching is broken "
-		       "on SMP.\n");
+
+	if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+		printk("ipt_owner: pid, sid and command matching "
+		       "not supported anymore\n");
 		return 0;
 	}
-#endif
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
new file mode 100644
index 00000000000..b5def204d79
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_string.c
@@ -0,0 +1,91 @@
+/* String matching match for iptables
+ * 
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("IP tables string match module");
+MODULE_LICENSE("GPL");
+
+/* Match when the configured pattern occurs between from_offset and
+ * to_offset of the packet; the outcome is flipped if conf->invert is set. */
+static int match(const struct sk_buff *skb,
+		 const struct net_device *in, const struct net_device *out,
+		 const void *matchinfo, int offset, int *hotdrop)
+{
+	struct ts_state state;
+	struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
+	int found;
+
+	memset(&state, 0, sizeof(struct ts_state));
+
+	found = skb_find_text((struct sk_buff *)skb, conf->from_offset,
+			      conf->to_offset, conf->config, &state) != UINT_MAX;
+	return found ^ !!conf->invert;	/* "&& !invert" could never match */
+}
+
+#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m)
+
+/* Validate userspace match data and set up the textsearch config */
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	struct ipt_string_info *conf = matchinfo;
+	struct ts_config *ts_conf;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
+		return 0;
+
+	/* Damn, can't handle this case properly with iptables... */
+	if (conf->from_offset > conf->to_offset)
+		return 0;
+	/* userspace supplied: algo must be terminated, patlen must fit */
+	if (conf->algo[sizeof(conf->algo) - 1] != '\0' ||
+	    conf->patlen > sizeof(conf->pattern))
+		return 0;
+	ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+				     GFP_KERNEL, TS_AUTOLOAD);
+	if (IS_ERR(ts_conf))
+		return 0;
+	conf->config = ts_conf;
+	return 1;
+}
+
+static void destroy(void *matchinfo, unsigned int matchsize)
+{
+	textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
+}
+
+static struct ipt_match string_match = {
+	.name 		= "string",
+	.match 		= match,
+	.checkentry	= checkentry,
+	.destroy 	= destroy,
+	.me 		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&string_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&string_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc7f41..f7943ba1f43 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto)
  */
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
-	/* From net/socket.c */
-	extern void socket_seq_show(struct seq_file *seq);
-
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
 		   fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
-		   tcp_tw_count, atomic_read(&tcp_sockets_allocated),
+		   tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
 		   atomic_read(&tcp_memory_allocated));
 	seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
 	seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0db405a869f..291831e792a 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -40,7 +40,6 @@
 #include <linux/timer.h>
 #include <net/ip.h>
 #include <net/protocol.h>
-#include <net/tcp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/icmp.h>
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d1835b1bc8c..304bb0a1d4f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,7 +59,6 @@
 #include <linux/netdevice.h>
 #include <linux/in_route.h>
 #include <linux/route.h>
-#include <linux/tcp.h>
 #include <linux/skbuff.h>
 #include <net/dst.h>
 #include <net/sock.h>
@@ -71,6 +70,7 @@
 #include <net/udp.h>
 #include <net/raw.h>
 #include <net/snmp.h>
+#include <net/tcp_states.h>
 #include <net/inet_common.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
  * RFC 1122: SHOULD pass TOS value up to the transport layer.
  * -> It does. And not only TOS, but all IP header.
  */
-void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
 {
 	struct sock *sk;
 	struct hlist_head *head;
+	int delivered = 0;
 
 	read_lock(&raw_v4_lock);
 	head = &raw_v4_htable[hash];
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
 			     skb->dev->ifindex);
 
 	while (sk) {
+		delivered = 1;
 		if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
 			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
 	}
 out:
 	read_unlock(&raw_v4_lock);
+	return delivered;
 }
 
 void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d675ff80b04..8c0b14e3bee 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,7 +240,9 @@ static unsigned			rt_hash_mask;
 static int			rt_hash_log;
 static unsigned int		rt_hash_rnd;
 
-struct rt_cache_stat *rt_cache_stat;
+static struct rt_cache_stat *rt_cache_stat;
+#define RT_CACHE_STAT_INC(field)					  \
+		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) /* post-increments `field` in the rt_cache_stat entry selected by raw_smp_processor_id() */
 
 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 				struct rtable **res);
@@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 	return ip_route_output_slow(rp, flp);
 }
 
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
 {
 	int err;
@@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
 {
 	return ip_route_output_flow(rp, flp, NULL, 0);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 72d01444218..a34e60ea48a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
 	return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
 }
 
-extern struct request_sock_ops tcp_request_sock_ops;
-
 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 					   struct request_sock *req,
 					   struct dst_entry *dst)
@@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 
 	child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
 	if (child)
-		tcp_acceptq_queue(sk, req, child);
+		inet_csk_reqsk_queue_add(sk, req, child);
 	else
 		reqsk_free(req);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e3289453241..65268562351 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -11,7 +11,9 @@
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/config.h>
+#include <linux/igmp.h>
 #include <net/snmp.h>
+#include <net/icmp.h>
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
@@ -19,36 +21,6 @@
 /* From af_inet.c */
 extern int sysctl_ip_nonlocal_bind;
 
-/* From icmp.c */
-extern int sysctl_icmp_echo_ignore_all;
-extern int sysctl_icmp_echo_ignore_broadcasts;
-extern int sysctl_icmp_ignore_bogus_error_responses;
-extern int sysctl_icmp_errors_use_inbound_ifaddr;
-
-/* From ip_fragment.c */
-extern int sysctl_ipfrag_low_thresh;
-extern int sysctl_ipfrag_high_thresh; 
-extern int sysctl_ipfrag_time;
-extern int sysctl_ipfrag_secret_interval;
-
-/* From ip_output.c */
-extern int sysctl_ip_dynaddr;
-
-/* From icmp.c */
-extern int sysctl_icmp_ratelimit;
-extern int sysctl_icmp_ratemask;
-
-/* From igmp.c */
-extern int sysctl_igmp_max_memberships;
-extern int sysctl_igmp_max_msf;
-
-/* From inetpeer.c */
-extern int inet_peer_threshold;
-extern int inet_peer_minttl;
-extern int inet_peer_maxttl;
-extern int inet_peer_gc_mintime;
-extern int inet_peer_gc_maxtime;
-
 #ifdef CONFIG_SYSCTL
 static int tcp_retr1_max = 255; 
 static int ip_local_port_range_min[] = { 1, 1 };
@@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 };
 
 struct ipv4_config ipv4_config;
 
-extern ctl_table ipv4_route_table[];
-
 #ifdef CONFIG_SYSCTL
 
 static
@@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 	return ret;
 }
 
-int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
-				  void __user *oldval, size_t __user *oldlenp,
-				  void __user *newval, size_t newlen,
-				  void **context)
+static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
+					 int nlen, void __user *oldval,
+					 size_t __user *oldlenp,
+					 void __user *newval, size_t newlen,
+					 void **context)
 {
 	char val[TCP_CA_NAME_MAX];
 	ctl_table tbl = {
@@ -259,7 +230,7 @@ ctl_table ipv4_table[] = {
 	{
 		.ctl_name	= NET_TCP_MAX_TW_BUCKETS,
 		.procname	= "tcp_max_tw_buckets",
-		.data		= &sysctl_tcp_max_tw_buckets,
+		.data		= &tcp_death_row.sysctl_max_tw_buckets,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
@@ -363,7 +334,7 @@ ctl_table ipv4_table[] = {
 	{
 		.ctl_name	= NET_TCP_TW_RECYCLE,
 		.procname	= "tcp_tw_recycle",
-		.data		= &sysctl_tcp_tw_recycle,
+		.data		= &tcp_death_row.sysctl_tw_recycle,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 69b1fcf7007..02fdda68718 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,13 +269,12 @@
 
 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 
-DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
-
-kmem_cache_t *tcp_bucket_cachep;
-kmem_cache_t *tcp_timewait_cachep;
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
 
 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
 int sysctl_tcp_mem[3];
 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void)
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 
 /*
- * LISTEN is a special case for poll..
- */
-static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
-					       poll_table *wait)
-{
-	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
-}
-
-/*
  *	Wait for a TCP event.
  *
  *	Note that we don't need to lock the socket, as the upper poll layers
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	poll_wait(file, sk->sk_sleep, wait);
 	if (sk->sk_state == TCP_LISTEN)
-		return tcp_listen_poll(sk, wait);
+		return inet_csk_listen_poll(sk);
 
 	/* Socket is not locked. We are protected from async events
 	   by poll logic and correct handling of state changes
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 	return put_user(answ, (int __user *)arg);
 }
 
-
-int tcp_listen_start(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
-
-	if (rc != 0)
-		return rc;
-
-	sk->sk_max_ack_backlog = 0;
-	sk->sk_ack_backlog = 0;
-	tcp_delack_init(tp);
-
-	/* There is race window here: we announce ourselves listening,
-	 * but this transition is still not validated by get_port().
-	 * It is OK, because this socket enters to hash table only
-	 * after validation is complete.
-	 */
-	sk->sk_state = TCP_LISTEN;
-	if (!sk->sk_prot->get_port(sk, inet->num)) {
-		inet->sport = htons(inet->num);
-
-		sk_dst_reset(sk);
-		sk->sk_prot->hash(sk);
-
-		return 0;
-	}
-
-	sk->sk_state = TCP_CLOSE;
-	reqsk_queue_destroy(&tp->accept_queue);
-	return -EADDRINUSE;
-}
-
-/*
- *	This routine closes sockets which have been at least partially
- *	opened, but not yet accepted.
- */
-
-static void tcp_listen_stop (struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt;
-	struct request_sock *acc_req;
-	struct request_sock *req;
-	int i;
-
-	tcp_delete_keepalive_timer(sk);
-
-	/* make all the listen_opt local to us */
-	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
-	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
-
-	if (lopt->qlen) {
-		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
-			while ((req = lopt->syn_table[i]) != NULL) {
-				lopt->syn_table[i] = req->dl_next;
-				lopt->qlen--;
-				reqsk_free(req);
-
-		/* Following specs, it would be better either to send FIN
-		 * (and enter FIN-WAIT-1, it is normal close)
-		 * or to send active reset (abort).
-		 * Certainly, it is pretty dangerous while synflood, but it is
-		 * bad justification for our negligence 8)
-		 * To be honest, we are not able to make either
-		 * of the variants now.			--ANK
-		 */
-			}
-		}
-	}
-	BUG_TRAP(!lopt->qlen);
-
-	kfree(lopt);
-
-	while ((req = acc_req) != NULL) {
-		struct sock *child = req->sk;
-
-		acc_req = req->dl_next;
-
-		local_bh_disable();
-		bh_lock_sock(child);
-		BUG_TRAP(!sock_owned_by_user(child));
-		sock_hold(child);
-
-		tcp_disconnect(child, O_NONBLOCK);
-
-		sock_orphan(child);
-
-		atomic_inc(&tcp_orphan_count);
-
-		tcp_destroy_sock(child);
-
-		bh_unlock_sock(child);
-		local_bh_enable();
-		sock_put(child);
-
-		sk_acceptq_removed(sk);
-		__reqsk_free(req);
-	}
-	BUG_TRAP(!sk->sk_ack_backlog);
-}
-
 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
@@ -975,7 +862,7 @@ do_fault:
 	if (!skb->len) {
 		if (sk->sk_send_head == skb)
 			sk->sk_send_head = NULL;
-		__skb_unlink(skb, skb->list);
+		__skb_unlink(skb, &sk->sk_write_queue);
 		sk_stream_free_skb(sk, skb);
 	}
 
@@ -1057,20 +944,21 @@ static void cleanup_rbuf(struct sock *sk, int copied)
 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
 #endif
 
-	if (tcp_ack_scheduled(tp)) {
+	if (inet_csk_ack_scheduled(sk)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		   /* Delayed ACKs frequently hit locked sockets during bulk
 		    * receive. */
-		if (tp->ack.blocked ||
+		if (icsk->icsk_ack.blocked ||
 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
-		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
+		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
 		    /*
 		     * If this read emptied read buffer, we send ACK, if
 		     * connection is not bidirectional, user drained
 		     * receive buffer and there was a small segment
 		     * in queue.
 		     */
-		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
-		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
 			time_to_ack = 1;
 	}
 
@@ -1572,40 +1460,6 @@ void tcp_shutdown(struct sock *sk, int how)
 	}
 }
 
-/*
- * At this point, there should be no process reference to this
- * socket, and thus no user references at all.  Therefore we
- * can assume the socket waitqueue is inactive and nobody will
- * try to jump onto it.
- */
-void tcp_destroy_sock(struct sock *sk)
-{
-	BUG_TRAP(sk->sk_state == TCP_CLOSE);
-	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
-
-	/* It cannot be in hash table! */
-	BUG_TRAP(sk_unhashed(sk));
-
-	/* If it has not 0 inet_sk(sk)->num, it must be bound */
-	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
-
-	sk->sk_prot->destroy(sk);
-
-	sk_stream_kill_queues(sk);
-
-	xfrm_sk_free_policy(sk);
-
-#ifdef INET_REFCNT_DEBUG
-	if (atomic_read(&sk->sk_refcnt) != 1) {
-		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
-		       sk, atomic_read(&sk->sk_refcnt));
-	}
-#endif
-
-	atomic_dec(&tcp_orphan_count);
-	sock_put(sk);
-}
-
 void tcp_close(struct sock *sk, long timeout)
 {
 	struct sk_buff *skb;
@@ -1618,7 +1472,7 @@ void tcp_close(struct sock *sk, long timeout)
 		tcp_set_state(sk, TCP_CLOSE);
 
 		/* Special case. */
-		tcp_listen_stop(sk);
+		inet_csk_listen_stop(sk);
 
 		goto adjudge_to_death;
 	}
@@ -1721,12 +1575,12 @@ adjudge_to_death:
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
 		} else {
-			int tmo = tcp_fin_time(tp);
+			const int tmo = tcp_fin_time(sk);
 
 			if (tmo > TCP_TIMEWAIT_LEN) {
-				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
 			} else {
-				atomic_inc(&tcp_orphan_count);
+				atomic_inc(sk->sk_prot->orphan_count);
 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
 				goto out;
 			}
@@ -1734,7 +1588,7 @@ adjudge_to_death:
 	}
 	if (sk->sk_state != TCP_CLOSE) {
 		sk_stream_mem_reclaim(sk);
-		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
+		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
 			if (net_ratelimit())
@@ -1745,10 +1599,10 @@ adjudge_to_death:
 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
 		}
 	}
-	atomic_inc(&tcp_orphan_count);
+	atomic_inc(sk->sk_prot->orphan_count);
 
 	if (sk->sk_state == TCP_CLOSE)
-		tcp_destroy_sock(sk);
+		inet_csk_destroy_sock(sk);
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -1769,6 +1623,7 @@ static inline int tcp_need_reset(int state)
 int tcp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int err = 0;
 	int old_state = sk->sk_state;
@@ -1778,7 +1633,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	/* ABORT function of RFC793 */
 	if (old_state == TCP_LISTEN) {
-		tcp_listen_stop(sk);
+		inet_csk_listen_stop(sk);
 	} else if (tcp_need_reset(old_state) ||
 		   (tp->snd_nxt != tp->write_seq &&
 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -1805,125 +1660,34 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->srtt = 0;
 	if ((tp->write_seq += tp->max_window + 2) == 0)
 		tp->write_seq = 1;
-	tp->backoff = 0;
+	icsk->icsk_backoff = 0;
 	tp->snd_cwnd = 2;
-	tp->probes_out = 0;
+	icsk->icsk_probes_out = 0;
 	tp->packets_out = 0;
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_cnt = 0;
-	tcp_set_ca_state(tp, TCP_CA_Open);
+	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
-	tcp_delack_init(tp);
+	inet_csk_delack_init(sk);
 	sk->sk_send_head = NULL;
 	tp->rx_opt.saw_tstamp = 0;
 	tcp_sack_reset(&tp->rx_opt);
 	__sk_dst_reset(sk);
 
-	BUG_TRAP(!inet->num || tp->bind_hash);
+	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
 
 	sk->sk_error_report(sk);
 	return err;
 }
 
 /*
- *	Wait for an incoming connection, avoid race
- *	conditions. This must be called with the socket locked.
- */
-static int wait_for_connect(struct sock *sk, long timeo)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	DEFINE_WAIT(wait);
-	int err;
-
-	/*
-	 * True wake-one mechanism for incoming connections: only
-	 * one process gets woken up, not the 'whole herd'.
-	 * Since we do not 'race & poll' for established sockets
-	 * anymore, the common case will execute the loop only once.
-	 *
-	 * Subtle issue: "add_wait_queue_exclusive()" will be added
-	 * after any current non-exclusive waiters, and we know that
-	 * it will always _stay_ after any new non-exclusive waiters
-	 * because all non-exclusive waiters are added at the
-	 * beginning of the wait-queue. As such, it's ok to "drop"
-	 * our exclusiveness temporarily when we get woken up without
-	 * having to remove and re-insert us on the wait queue.
-	 */
-	for (;;) {
-		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
-					  TASK_INTERRUPTIBLE);
-		release_sock(sk);
-		if (reqsk_queue_empty(&tp->accept_queue))
-			timeo = schedule_timeout(timeo);
-		lock_sock(sk);
-		err = 0;
-		if (!reqsk_queue_empty(&tp->accept_queue))
-			break;
-		err = -EINVAL;
-		if (sk->sk_state != TCP_LISTEN)
-			break;
-		err = sock_intr_errno(timeo);
-		if (signal_pending(current))
-			break;
-		err = -EAGAIN;
-		if (!timeo)
-			break;
-	}
-	finish_wait(sk->sk_sleep, &wait);
-	return err;
-}
-
-/*
- *	This will accept the next outstanding connection.
- */
-
-struct sock *tcp_accept(struct sock *sk, int flags, int *err)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sock *newsk;
-	int error;
-
-	lock_sock(sk);
-
-	/* We need to make sure that this socket is listening,
-	 * and that it has something pending.
-	 */
-	error = -EINVAL;
-	if (sk->sk_state != TCP_LISTEN)
-		goto out_err;
-
-	/* Find already established connection */
-	if (reqsk_queue_empty(&tp->accept_queue)) {
-		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
-		/* If this is a non blocking socket don't sleep */
-		error = -EAGAIN;
-		if (!timeo)
-			goto out_err;
-
-		error = wait_for_connect(sk, timeo);
-		if (error)
-			goto out_err;
-	}
-
-	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
-	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
-out:
-	release_sock(sk);
-	return newsk;
-out_err:
-	newsk = NULL;
-	*err = error;
-	goto out;
-}
-
-/*
  *	Socket option code for TCP.
  */
 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int optlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int val;
 	int err = 0;
 
@@ -1945,7 +1709,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		name[val] = 0;
 
 		lock_sock(sk);
-		err = tcp_set_congestion_control(tp, name);
+		err = tcp_set_congestion_control(sk, name);
 		release_sock(sk);
 		return err;
 	}
@@ -2022,7 +1786,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 					elapsed = tp->keepalive_time - elapsed;
 				else
 					elapsed = 0;
-				tcp_reset_keepalive_timer(sk, elapsed);
+				inet_csk_reset_keepalive_timer(sk, elapsed);
 			}
 		}
 		break;
@@ -2042,7 +1806,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		if (val < 1 || val > MAX_TCP_SYNCNT)
 			err = -EINVAL;
 		else
-			tp->syn_retries = val;
+			icsk->icsk_syn_retries = val;
 		break;
 
 	case TCP_LINGER2:
@@ -2055,15 +1819,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		break;
 
 	case TCP_DEFER_ACCEPT:
-		tp->defer_accept = 0;
+		icsk->icsk_accept_queue.rskq_defer_accept = 0;
 		if (val > 0) {
 			/* Translate value in seconds to number of
 			 * retransmits */
-			while (tp->defer_accept < 32 &&
+			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
-				       tp->defer_accept))
-				tp->defer_accept++;
-			tp->defer_accept++;
+				       icsk->icsk_accept_queue.rskq_defer_accept))
+				icsk->icsk_accept_queue.rskq_defer_accept++;
+			icsk->icsk_accept_queue.rskq_defer_accept++;
 		}
 		break;
 
@@ -2081,16 +1845,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 	case TCP_QUICKACK:
 		if (!val) {
-			tp->ack.pingpong = 1;
+			icsk->icsk_ack.pingpong = 1;
 		} else {
-			tp->ack.pingpong = 0;
+			icsk->icsk_ack.pingpong = 0;
 			if ((1 << sk->sk_state) &
 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
-			    tcp_ack_scheduled(tp)) {
-				tp->ack.pending |= TCP_ACK_PUSHED;
+			    inet_csk_ack_scheduled(sk)) {
+				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 				cleanup_rbuf(sk, 1);
 				if (!(val & 1))
-					tp->ack.pingpong = 1;
+					icsk->icsk_ack.pingpong = 1;
 			}
 		}
 		break;
@@ -2107,15 +1871,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
 
 	memset(info, 0, sizeof(*info));
 
 	info->tcpi_state = sk->sk_state;
-	info->tcpi_ca_state = tp->ca_state;
-	info->tcpi_retransmits = tp->retransmits;
-	info->tcpi_probes = tp->probes_out;
-	info->tcpi_backoff = tp->backoff;
+	info->tcpi_ca_state = icsk->icsk_ca_state;
+	info->tcpi_retransmits = icsk->icsk_retransmits;
+	info->tcpi_probes = icsk->icsk_probes_out;
+	info->tcpi_backoff = icsk->icsk_backoff;
 
 	if (tp->rx_opt.tstamp_ok)
 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
@@ -2130,10 +1895,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	if (tp->ecn_flags&TCP_ECN_OK)
 		info->tcpi_options |= TCPI_OPT_ECN;
 
-	info->tcpi_rto = jiffies_to_usecs(tp->rto);
-	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
+	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
 	info->tcpi_snd_mss = tp->mss_cache;
-	info->tcpi_rcv_mss = tp->ack.rcv_mss;
+	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
 
 	info->tcpi_unacked = tp->packets_out;
 	info->tcpi_sacked = tp->sacked_out;
@@ -2142,7 +1907,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_fackets = tp->fackets_out;
 
 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
-	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
+	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
 
 	info->tcpi_pmtu = tp->pmtu_cookie;
@@ -2165,6 +1930,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int __user *optlen)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int val, len;
 
@@ -2202,7 +1968,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
 		break;
 	case TCP_SYNCNT:
-		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 		break;
 	case TCP_LINGER2:
 		val = tp->linger2;
@@ -2210,8 +1976,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
-		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
-					       (tp->defer_accept - 1));
+		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
 		break;
 	case TCP_WINDOW_CLAMP:
 		val = tp->window_clamp;
@@ -2232,7 +1998,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		return 0;
 	}
 	case TCP_QUICKACK:
-		val = !tp->ack.pingpong;
+		val = !icsk->icsk_ack.pingpong;
 		break;
 
 	case TCP_CONGESTION:
@@ -2241,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
 		if (put_user(len, optlen))
 			return -EFAULT;
-		if (copy_to_user(optval, tp->ca_ops->name, len))
+		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
 			return -EFAULT;
 		return 0;
 	default:
@@ -2278,79 +2044,72 @@ void __init tcp_init(void)
 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
 					   sizeof(skb->cb));
 
-	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
-					      sizeof(struct tcp_bind_bucket),
-					      0, SLAB_HWCACHE_ALIGN,
-					      NULL, NULL);
-	if (!tcp_bucket_cachep)
+	tcp_hashinfo.bind_bucket_cachep =
+		kmem_cache_create("tcp_bind_bucket",
+				  sizeof(struct inet_bind_bucket), 0,
+				  SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!tcp_hashinfo.bind_bucket_cachep)
 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
 
-	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
-						sizeof(struct tcp_tw_bucket),
-						0, SLAB_HWCACHE_ALIGN,
-						NULL, NULL);
-	if (!tcp_timewait_cachep)
-		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
-
 	/* Size and allocate the main established and bind bucket
 	 * hash tables.
 	 *
 	 * The methodology is similar to that of the buffer cache.
 	 */
-	tcp_ehash = (struct tcp_ehash_bucket *)
+	tcp_hashinfo.ehash =
 		alloc_large_system_hash("TCP established",
-					sizeof(struct tcp_ehash_bucket),
+					sizeof(struct inet_ehash_bucket),
 					thash_entries,
 					(num_physpages >= 128 * 1024) ?
 						(25 - PAGE_SHIFT) :
 						(27 - PAGE_SHIFT),
 					HASH_HIGHMEM,
-					&tcp_ehash_size,
+					&tcp_hashinfo.ehash_size,
 					NULL,
 					0);
-	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
-	for (i = 0; i < (tcp_ehash_size << 1); i++) {
-		rwlock_init(&tcp_ehash[i].lock);
-		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
+	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
+	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
+		rwlock_init(&tcp_hashinfo.ehash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
 	}
 
-	tcp_bhash = (struct tcp_bind_hashbucket *)
+	tcp_hashinfo.bhash =
 		alloc_large_system_hash("TCP bind",
-					sizeof(struct tcp_bind_hashbucket),
-					tcp_ehash_size,
+					sizeof(struct inet_bind_hashbucket),
+					tcp_hashinfo.ehash_size,
 					(num_physpages >= 128 * 1024) ?
 						(25 - PAGE_SHIFT) :
 						(27 - PAGE_SHIFT),
 					HASH_HIGHMEM,
-					&tcp_bhash_size,
+					&tcp_hashinfo.bhash_size,
 					NULL,
 					64 * 1024);
-	tcp_bhash_size = 1 << tcp_bhash_size;
-	for (i = 0; i < tcp_bhash_size; i++) {
-		spin_lock_init(&tcp_bhash[i].lock);
-		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
+	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
 
 	/* Try to be a bit smarter and adjust defaults depending
 	 * on available memory.
 	 */
 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
-			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
+			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
 			order++)
 		;
 	if (order >= 4) {
 		sysctl_local_port_range[0] = 32768;
 		sysctl_local_port_range[1] = 61000;
-		sysctl_tcp_max_tw_buckets = 180000;
+		tcp_death_row.sysctl_max_tw_buckets = 180000;
 		sysctl_tcp_max_orphans = 4096 << (order - 4);
 		sysctl_max_syn_backlog = 1024;
 	} else if (order < 3) {
 		sysctl_local_port_range[0] = 1024 * (3 - order);
-		sysctl_tcp_max_tw_buckets >>= (3 - order);
+		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
 		sysctl_tcp_max_orphans >>= (3 - order);
 		sysctl_max_syn_backlog = 128;
 	}
-	tcp_port_rover = sysctl_local_port_range[0] - 1;
+	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
 
 	sysctl_tcp_mem[0] =  768 << order;
 	sysctl_tcp_mem[1] = 1024 << order;
@@ -2365,14 +2124,12 @@ void __init tcp_init(void)
 
 	printk(KERN_INFO "TCP: Hash tables configured "
 	       "(established %d bind %d)\n",
-	       tcp_ehash_size << 1, tcp_bhash_size);
+	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
 
 	tcp_register_congestion_control(&tcp_reno);
 }
 
-EXPORT_SYMBOL(tcp_accept);
 EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_destroy_sock);
 EXPORT_SYMBOL(tcp_disconnect);
 EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
@@ -2384,4 +2141,3 @@ EXPORT_SYMBOL(tcp_sendpage);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
 EXPORT_SYMBOL(tcp_statistics);
-EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ec38d45d664..b940346de4e 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca)
 	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
 }
 
-static void bictcp_init(struct tcp_sock *tp)
+static void bictcp_init(struct sock *sk)
 {
-	bictcp_reset(tcp_ca(tp));
+	bictcp_reset(inet_csk_ca(sk));
 	if (initial_ssthresh)
-		tp->snd_ssthresh = initial_ssthresh;
+		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
 /*
@@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 
 
 /* Detect low utilization in congestion avoidance */
-static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
+static inline void bictcp_low_utilization(struct sock *sk, int flag)
 {
-	struct bictcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
 	u32 dist, delay;
 
 	/* No time stamp */
@@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
 
 }
 
-static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
 			      u32 seq_rtt, u32 in_flight, int data_acked)
 {
-	struct bictcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
 
-	bictcp_low_utilization(tp, data_acked);
+	bictcp_low_utilization(sk, data_acked);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
@@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
  *	behave like Reno until low_window is reached,
  *	then increase congestion window slowly
  */
-static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
 {
-	struct bictcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
 
 	ca->epoch_start = 0;	/* end of epoch */
 
@@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
 		return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
 }
 
-static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
+static u32 bictcp_undo_cwnd(struct sock *sk)
 {
-	struct bictcp *ca = tcp_ca(tp);
-
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct bictcp *ca = inet_csk_ca(sk);
 	return max(tp->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct tcp_sock *tp)
+static u32 bictcp_min_cwnd(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh;
 }
 
-static void bictcp_state(struct tcp_sock *tp, u8 new_state)
+static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
-		bictcp_reset(tcp_ca(tp));
+		bictcp_reset(inet_csk_ca(sk));
 }
 
 /* Track delayed acknowledgement ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
+static void bictcp_acked(struct sock *sk, u32 cnt)
 {
-	if (cnt > 0 && 	tp->ca_state == TCP_CA_Open) {
-		struct bictcp *ca = tcp_ca(tp);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (cnt > 0 && 	icsk->icsk_ca_state == TCP_CA_Open) {
+		struct bictcp *ca = inet_csk_ca(sk);
 		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
 		ca->delayed_ack += cnt;
 	}
@@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = {
 
 static int __init bictcp_register(void)
 {
-	BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&bictcp);
 }
 
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4970d10a778..bbf2d6624e8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
 /* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct tcp_sock *tp)
+void tcp_init_congestion_control(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_congestion_ops *ca;
 
-	if (tp->ca_ops != &tcp_init_congestion_ops)
+	if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
 		return;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
 		if (try_module_get(ca->owner)) {
-			tp->ca_ops = ca;
+			icsk->icsk_ca_ops = ca;
 			break;
 		}
 
 	}
 	rcu_read_unlock();
 
-	if (tp->ca_ops->init)
-		tp->ca_ops->init(tp);
+	if (icsk->icsk_ca_ops->init)
+		icsk->icsk_ca_ops->init(sk);
 }
 
 /* Manage refcounts on socket close. */
-void tcp_cleanup_congestion_control(struct tcp_sock *tp)
+void tcp_cleanup_congestion_control(struct sock *sk)
 {
-	if (tp->ca_ops->release)
-		tp->ca_ops->release(tp);
-	module_put(tp->ca_ops->owner);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->release)
+		icsk->icsk_ca_ops->release(sk);
+	module_put(icsk->icsk_ca_ops->owner);
 }
 
 /* Used by sysctl to change default congestion control */
@@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name)
 }
 
 /* Change congestion control for socket */
-int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+int tcp_set_congestion_control(struct sock *sk, const char *name)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_congestion_ops *ca;
 	int err = 0;
 
 	rcu_read_lock();
 	ca = tcp_ca_find(name);
-	if (ca == tp->ca_ops)
+	if (ca == icsk->icsk_ca_ops)
 		goto out;
 
 	if (!ca)
@@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
 		err = -EBUSY;
 
 	else {
-		tcp_cleanup_congestion_control(tp);
-		tp->ca_ops = ca;
-		if (tp->ca_ops->init)
-			tp->ca_ops->init(tp);
+		tcp_cleanup_congestion_control(sk);
+		icsk->icsk_ca_ops = ca;
+		if (icsk->icsk_ca_ops->init)
+			icsk->icsk_ca_ops->init(sk);
 	}
  out:
 	rcu_read_unlock();
@@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
 /* This is Jacobson's slow start and congestion avoidance.
  * SIGCOMM '88, p. 328.
  */
-void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
 			 int flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (in_flight < tp->snd_cwnd)
 		return;
 
@@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
 /* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct tcp_sock *tp)
+u32 tcp_reno_ssthresh(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return max(tp->snd_cwnd >> 1U, 2U);
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
 /* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
+u32 tcp_reno_min_cwnd(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh/2;
 }
 EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f66945cb158..c148c108188 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,5 +1,5 @@
 /*
- * tcp_diag.c	Module for monitoring TCP sockets.
+ * tcp_diag.c	Module for monitoring TCP transport protocol sockets.
  *
  * Version:	$Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
  *
@@ -12,779 +12,43 @@
  */
 
 #include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/random.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/time.h>
-
-#include <net/icmp.h>
-#include <net/tcp.h>
-#include <net/ipv6.h>
-#include <net/inet_common.h>
-
-#include <linux/inet.h>
-#include <linux/stddef.h>
-
-#include <linux/tcp_diag.h>
 
-struct tcpdiag_entry
-{
-	u32 *saddr;
-	u32 *daddr;
-	u16 sport;
-	u16 dport;
-	u16 family;
-	u16 userlocks;
-};
+#include <linux/module.h>
+#include <linux/inet_diag.h>
 
-static struct sock *tcpnl;
+#include <linux/tcp.h>
 
-#define TCPDIAG_PUT(skb, attrtype, attrlen) \
-	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+#include <net/tcp.h>
 
-static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
-			int ext, u32 pid, u32 seq, u16 nlmsg_flags)
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			      void *_info)
 {
-	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcpdiagmsg *r;
-	struct nlmsghdr  *nlh;
-	struct tcp_info  *info = NULL;
-	struct tcpdiag_meminfo  *minfo = NULL;
-	unsigned char	 *b = skb->tail;
-
-	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
-	nlh->nlmsg_flags = nlmsg_flags;
-	r = NLMSG_DATA(nlh);
-	if (sk->sk_state != TCP_TIME_WAIT) {
-		if (ext & (1<<(TCPDIAG_MEMINFO-1)))
-			minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
-		if (ext & (1<<(TCPDIAG_INFO-1)))
-			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
-		
-		if (ext & (1<<(TCPDIAG_CONG-1))) {
-			size_t len = strlen(tp->ca_ops->name);
-			strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
-			       tp->ca_ops->name);
-		}
-	}
-	r->tcpdiag_family = sk->sk_family;
-	r->tcpdiag_state = sk->sk_state;
-	r->tcpdiag_timer = 0;
-	r->tcpdiag_retrans = 0;
-
-	r->id.tcpdiag_if = sk->sk_bound_dev_if;
-	r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
-	r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
-
-	if (r->tcpdiag_state == TCP_TIME_WAIT) {
-		struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
-		long tmo = tw->tw_ttd - jiffies;
-		if (tmo < 0)
-			tmo = 0;
-
-		r->id.tcpdiag_sport = tw->tw_sport;
-		r->id.tcpdiag_dport = tw->tw_dport;
-		r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
-		r->id.tcpdiag_dst[0] = tw->tw_daddr;
-		r->tcpdiag_state = tw->tw_substate;
-		r->tcpdiag_timer = 3;
-		r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
-		r->tcpdiag_rqueue = 0;
-		r->tcpdiag_wqueue = 0;
-		r->tcpdiag_uid = 0;
-		r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-		if (r->tcpdiag_family == AF_INET6) {
-			ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-				       &tw->tw_v6_rcv_saddr);
-			ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-				       &tw->tw_v6_daddr);
-		}
-#endif
-		nlh->nlmsg_len = skb->tail - b;
-		return skb->len;
-	}
-
-	r->id.tcpdiag_sport = inet->sport;
-	r->id.tcpdiag_dport = inet->dport;
-	r->id.tcpdiag_src[0] = inet->rcv_saddr;
-	r->id.tcpdiag_dst[0] = inet->daddr;
-
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-	if (r->tcpdiag_family == AF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
-
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-			       &np->rcv_saddr);
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-			       &np->daddr);
-	}
-#endif
-
-#define EXPIRES_IN_MS(tmo)  ((tmo-jiffies)*1000+HZ-1)/HZ
-
-	if (tp->pending == TCP_TIME_RETRANS) {
-		r->tcpdiag_timer = 1;
-		r->tcpdiag_retrans = tp->retransmits;
-		r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
-	} else if (tp->pending == TCP_TIME_PROBE0) {
-		r->tcpdiag_timer = 4;
-		r->tcpdiag_retrans = tp->probes_out;
-		r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
-	} else if (timer_pending(&sk->sk_timer)) {
-		r->tcpdiag_timer = 2;
-		r->tcpdiag_retrans = tp->probes_out;
-		r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
-	} else {
-		r->tcpdiag_timer = 0;
-		r->tcpdiag_expires = 0;
-	}
-#undef EXPIRES_IN_MS
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_info *info = _info;
 
-	r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
-	r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
-	r->tcpdiag_uid = sock_i_uid(sk);
-	r->tcpdiag_inode = sock_i_ino(sk);
-
-	if (minfo) {
-		minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
-		minfo->tcpdiag_wmem = sk->sk_wmem_queued;
-		minfo->tcpdiag_fmem = sk->sk_forward_alloc;
-		minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
-	}
-
-	if (info) 
+	r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+	r->idiag_wqueue = tp->write_seq - tp->snd_una;
+	if (info != NULL)
 		tcp_get_info(sk, info);
-
-	if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
-		tp->ca_ops->get_info(tp, ext, skb);
-
-	nlh->nlmsg_len = skb->tail - b;
-	return skb->len;
-
-rtattr_failure:
-nlmsg_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
-}
-
-extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
-				  int dif);
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
-				  struct in6_addr *daddr, u16 dport,
-				  int dif);
-#else
-static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
-					 struct in6_addr *daddr, u16 dport,
-					 int dif)
-{
-	return NULL;
-}
-#endif
-
-static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
-{
-	int err;
-	struct sock *sk;
-	struct tcpdiagreq *req = NLMSG_DATA(nlh);
-	struct sk_buff *rep;
-
-	if (req->tcpdiag_family == AF_INET) {
-		sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
-				   req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
-				   req->id.tcpdiag_if);
-	}
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-	else if (req->tcpdiag_family == AF_INET6) {
-		sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
-				   (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
-				   req->id.tcpdiag_if);
-	}
-#endif
-	else {
-		return -EINVAL;
-	}
-
-	if (sk == NULL)
-		return -ENOENT;
-
-	err = -ESTALE;
-	if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
-	     req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
-	    ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
-	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
-		goto out;
-
-	err = -ENOMEM;
-	rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
-				    sizeof(struct tcpdiag_meminfo)+
-				    sizeof(struct tcp_info)+64), GFP_KERNEL);
-	if (!rep)
-		goto out;
-
-	if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
-			 NETLINK_CB(in_skb).pid,
-			 nlh->nlmsg_seq, 0) <= 0)
-		BUG();
-
-	err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
-
-out:
-	if (sk) {
-		if (sk->sk_state == TCP_TIME_WAIT)
-			tcp_tw_put((struct tcp_tw_bucket*)sk);
-		else
-			sock_put(sk);
-	}
-	return err;
-}
-
-static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
-{
-	int words = bits >> 5;
-
-	bits &= 0x1f;
-
-	if (words) {
-		if (memcmp(a1, a2, words << 2))
-			return 0;
-	}
-	if (bits) {
-		__u32 w1, w2;
-		__u32 mask;
-
-		w1 = a1[words];
-		w2 = a2[words];
-
-		mask = htonl((0xffffffff) << (32 - bits));
-
-		if ((w1 ^ w2) & mask)
-			return 0;
-	}
-
-	return 1;
-}
-
-
-static int tcpdiag_bc_run(const void *bc, int len,
-			  const struct tcpdiag_entry *entry)
-{
-	while (len > 0) {
-		int yes = 1;
-		const struct tcpdiag_bc_op *op = bc;
-
-		switch (op->code) {
-		case TCPDIAG_BC_NOP:
-			break;
-		case TCPDIAG_BC_JMP:
-			yes = 0;
-			break;
-		case TCPDIAG_BC_S_GE:
-			yes = entry->sport >= op[1].no;
-			break;
-		case TCPDIAG_BC_S_LE:
-			yes = entry->dport <= op[1].no;
-			break;
-		case TCPDIAG_BC_D_GE:
-			yes = entry->dport >= op[1].no;
-			break;
-		case TCPDIAG_BC_D_LE:
-			yes = entry->dport <= op[1].no;
-			break;
-		case TCPDIAG_BC_AUTO:
-			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
-			break;
-		case TCPDIAG_BC_S_COND:
-		case TCPDIAG_BC_D_COND:
-		{
-			struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
-			u32 *addr;
-
-			if (cond->port != -1 &&
-			    cond->port != (op->code == TCPDIAG_BC_S_COND ?
-					     entry->sport : entry->dport)) {
-				yes = 0;
-				break;
-			}
-			
-			if (cond->prefix_len == 0)
-				break;
-
-			if (op->code == TCPDIAG_BC_S_COND)
-				addr = entry->saddr;
-			else
-				addr = entry->daddr;
-
-			if (bitstring_match(addr, cond->addr, cond->prefix_len))
-				break;
-			if (entry->family == AF_INET6 &&
-			    cond->family == AF_INET) {
-				if (addr[0] == 0 && addr[1] == 0 &&
-				    addr[2] == htonl(0xffff) &&
-				    bitstring_match(addr+3, cond->addr, cond->prefix_len))
-					break;
-			}
-			yes = 0;
-			break;
-		}
-		}
-
-		if (yes) { 
-			len -= op->yes;
-			bc += op->yes;
-		} else {
-			len -= op->no;
-			bc += op->no;
-		}
-	}
-	return (len == 0);
-}
-
-static int valid_cc(const void *bc, int len, int cc)
-{
-	while (len >= 0) {
-		const struct tcpdiag_bc_op *op = bc;
-
-		if (cc > len)
-			return 0;
-		if (cc == len)
-			return 1;
-		if (op->yes < 4)
-			return 0;
-		len -= op->yes;
-		bc  += op->yes;
-	}
-	return 0;
-}
-
-static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
-{
-	const unsigned char *bc = bytecode;
-	int  len = bytecode_len;
-
-	while (len > 0) {
-		struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
-
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
-		switch (op->code) {
-		case TCPDIAG_BC_AUTO:
-		case TCPDIAG_BC_S_COND:
-		case TCPDIAG_BC_D_COND:
-		case TCPDIAG_BC_S_GE:
-		case TCPDIAG_BC_S_LE:
-		case TCPDIAG_BC_D_GE:
-		case TCPDIAG_BC_D_LE:
-			if (op->yes < 4 || op->yes > len+4)
-				return -EINVAL;
-		case TCPDIAG_BC_JMP:
-			if (op->no < 4 || op->no > len+4)
-				return -EINVAL;
-			if (op->no < len &&
-			    !valid_cc(bytecode, bytecode_len, len-op->no))
-				return -EINVAL;
-			break;
-		case TCPDIAG_BC_NOP:
-			if (op->yes < 4 || op->yes > len+4)
-				return -EINVAL;
-			break;
-		default:
-			return -EINVAL;
-		}
-		bc += op->yes;
-		len -= op->yes;
-	}
-	return len == 0 ? 0 : -EINVAL;
-}
-
-static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
-			     struct netlink_callback *cb)
-{
-	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
-	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
-		struct tcpdiag_entry entry;
-		struct rtattr *bc = (struct rtattr *)(r + 1);
-		struct inet_sock *inet = inet_sk(sk);
-
-		entry.family = sk->sk_family;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-		if (entry.family == AF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
-
-			entry.saddr = np->rcv_saddr.s6_addr32;
-			entry.daddr = np->daddr.s6_addr32;
-		} else
-#endif
-		{
-			entry.saddr = &inet->rcv_saddr;
-			entry.daddr = &inet->daddr;
-		}
-		entry.sport = inet->num;
-		entry.dport = ntohs(inet->dport);
-		entry.userlocks = sk->sk_userlocks;
-
-		if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
-			return 0;
-	}
-
-	return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
-			    cb->nlh->nlmsg_seq, NLM_F_MULTI);
 }
 
-static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
-			    struct request_sock *req,
-			    u32 pid, u32 seq)
-{
-	const struct inet_request_sock *ireq = inet_rsk(req);
-	struct inet_sock *inet = inet_sk(sk);
-	unsigned char *b = skb->tail;
-	struct tcpdiagmsg *r;
-	struct nlmsghdr *nlh;
-	long tmo;
-
-	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
-	nlh->nlmsg_flags = NLM_F_MULTI;
-	r = NLMSG_DATA(nlh);
-
-	r->tcpdiag_family = sk->sk_family;
-	r->tcpdiag_state = TCP_SYN_RECV;
-	r->tcpdiag_timer = 1;
-	r->tcpdiag_retrans = req->retrans;
-
-	r->id.tcpdiag_if = sk->sk_bound_dev_if;
-	r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
-	r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
-
-	tmo = req->expires - jiffies;
-	if (tmo < 0)
-		tmo = 0;
-
-	r->id.tcpdiag_sport = inet->sport;
-	r->id.tcpdiag_dport = ireq->rmt_port;
-	r->id.tcpdiag_src[0] = ireq->loc_addr;
-	r->id.tcpdiag_dst[0] = ireq->rmt_addr;
-	r->tcpdiag_expires = jiffies_to_msecs(tmo),
-	r->tcpdiag_rqueue = 0;
-	r->tcpdiag_wqueue = 0;
-	r->tcpdiag_uid = sock_i_uid(sk);
-	r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-	if (r->tcpdiag_family == AF_INET6) {
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-			       &tcp6_rsk(req)->loc_addr);
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-			       &tcp6_rsk(req)->rmt_addr);
-	}
-#endif
-	nlh->nlmsg_len = skb->tail - b;
-
-	return skb->len;
-
-nlmsg_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
-}
-
-static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
-			     struct netlink_callback *cb)
-{
-	struct tcpdiag_entry entry;
-	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt;
-	struct rtattr *bc = NULL;
-	struct inet_sock *inet = inet_sk(sk);
-	int j, s_j;
-	int reqnum, s_reqnum;
-	int err = 0;
-
-	s_j = cb->args[3];
-	s_reqnum = cb->args[4];
-
-	if (s_j > 0)
-		s_j--;
-
-	entry.family = sk->sk_family;
-
-	read_lock_bh(&tp->accept_queue.syn_wait_lock);
-
-	lopt = tp->accept_queue.listen_opt;
-	if (!lopt || !lopt->qlen)
-		goto out;
-
-	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
-		bc = (struct rtattr *)(r + 1);
-		entry.sport = inet->num;
-		entry.userlocks = sk->sk_userlocks;
-	}
-
-	for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
-		struct request_sock *req, *head = lopt->syn_table[j];
-
-		reqnum = 0;
-		for (req = head; req; reqnum++, req = req->dl_next) {
-			struct inet_request_sock *ireq = inet_rsk(req);
-
-			if (reqnum < s_reqnum)
-				continue;
-			if (r->id.tcpdiag_dport != ireq->rmt_port &&
-			    r->id.tcpdiag_dport)
-				continue;
-
-			if (bc) {
-				entry.saddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->loc_addr.s6_addr32 :
-#endif
-					&ireq->loc_addr;
-				entry.daddr = 
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
-					&ireq->rmt_addr;
-				entry.dport = ntohs(ireq->rmt_port);
-
-				if (!tcpdiag_bc_run(RTA_DATA(bc),
-						    RTA_PAYLOAD(bc), &entry))
-					continue;
-			}
-
-			err = tcpdiag_fill_req(skb, sk, req,
-					       NETLINK_CB(cb->skb).pid,
-					       cb->nlh->nlmsg_seq);
-			if (err < 0) {
-				cb->args[3] = j + 1;
-				cb->args[4] = reqnum;
-				goto out;
-			}
-		}
-
-		s_reqnum = 0;
-	}
-
-out:
-	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
-
-	return err;
-}
-
-static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	int i, num;
-	int s_i, s_num;
-	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
-	s_i = cb->args[1];
-	s_num = num = cb->args[2];
-
-	if (cb->args[0] == 0) {
-		if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
-			goto skip_listen_ht;
-		tcp_listen_lock();
-		for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
-			struct sock *sk;
-			struct hlist_node *node;
-
-			num = 0;
-			sk_for_each(sk, node, &tcp_listening_hash[i]) {
-				struct inet_sock *inet = inet_sk(sk);
-
-				if (num < s_num) {
-					num++;
-					continue;
-				}
-
-				if (r->id.tcpdiag_sport != inet->sport &&
-				    r->id.tcpdiag_sport)
-					goto next_listen;
-
-				if (!(r->tcpdiag_states&TCPF_LISTEN) ||
-				    r->id.tcpdiag_dport ||
-				    cb->args[3] > 0)
-					goto syn_recv;
-
-				if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-					tcp_listen_unlock();
-					goto done;
-				}
-
-syn_recv:
-				if (!(r->tcpdiag_states&TCPF_SYN_RECV))
-					goto next_listen;
-
-				if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
-					tcp_listen_unlock();
-					goto done;
-				}
-
-next_listen:
-				cb->args[3] = 0;
-				cb->args[4] = 0;
-				++num;
-			}
-
-			s_num = 0;
-			cb->args[3] = 0;
-			cb->args[4] = 0;
-		}
-		tcp_listen_unlock();
-skip_listen_ht:
-		cb->args[0] = 1;
-		s_i = num = s_num = 0;
-	}
-
-	if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
-		return skb->len;
-
-	for (i = s_i; i < tcp_ehash_size; i++) {
-		struct tcp_ehash_bucket *head = &tcp_ehash[i];
-		struct sock *sk;
-		struct hlist_node *node;
-
-		if (i > s_i)
-			s_num = 0;
-
-		read_lock_bh(&head->lock);
-
-		num = 0;
-		sk_for_each(sk, node, &head->chain) {
-			struct inet_sock *inet = inet_sk(sk);
-
-			if (num < s_num)
-				goto next_normal;
-			if (!(r->tcpdiag_states & (1 << sk->sk_state)))
-				goto next_normal;
-			if (r->id.tcpdiag_sport != inet->sport &&
-			    r->id.tcpdiag_sport)
-				goto next_normal;
-			if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
-				goto next_normal;
-			if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-				read_unlock_bh(&head->lock);
-				goto done;
-			}
-next_normal:
-			++num;
-		}
-
-		if (r->tcpdiag_states&TCPF_TIME_WAIT) {
-			sk_for_each(sk, node,
-				    &tcp_ehash[i + tcp_ehash_size].chain) {
-				struct inet_sock *inet = inet_sk(sk);
-
-				if (num < s_num)
-					goto next_dying;
-				if (r->id.tcpdiag_sport != inet->sport &&
-				    r->id.tcpdiag_sport)
-					goto next_dying;
-				if (r->id.tcpdiag_dport != inet->dport &&
-				    r->id.tcpdiag_dport)
-					goto next_dying;
-				if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-					read_unlock_bh(&head->lock);
-					goto done;
-				}
-next_dying:
-				++num;
-			}
-		}
-		read_unlock_bh(&head->lock);
-	}
-
-done:
-	cb->args[1] = i;
-	cb->args[2] = num;
-	return skb->len;
-}
-
-static int tcpdiag_dump_done(struct netlink_callback *cb)
-{
-	return 0;
-}
-
-
-static __inline__ int
-tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
-	if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
-		return 0;
-
-	if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
-		goto err_inval;
-
-	if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
-		goto err_inval;
-
-	if (nlh->nlmsg_flags&NLM_F_DUMP) {
-		if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
-			struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
-			if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
-			    rta->rta_len < 8 ||
-			    rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
-				goto err_inval;
-			if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
-				goto err_inval;
-		}
-		return netlink_dump_start(tcpnl, skb, nlh,
-					  tcpdiag_dump,
-					  tcpdiag_dump_done);
-	} else {
-		return tcpdiag_get_exact(skb, nlh);
-	}
-
-err_inval:
-	return -EINVAL;
-}
-
-
-static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
-{
-	int err;
-	struct nlmsghdr * nlh;
-
-	if (skb->len >= NLMSG_SPACE(0)) {
-		nlh = (struct nlmsghdr *)skb->data;
-		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
-			return;
-		err = tcpdiag_rcv_msg(skb, nlh);
-		if (err || nlh->nlmsg_flags & NLM_F_ACK) 
-			netlink_ack(skb, nlh, err);
-	}
-}
-
-static void tcpdiag_rcv(struct sock *sk, int len)
-{
-	struct sk_buff *skb;
-	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
-
-	while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
-		tcpdiag_rcv_skb(skb);
-		kfree_skb(skb);
-	}
-}
+static struct inet_diag_handler tcp_diag_handler = {
+	.idiag_hashinfo	 = &tcp_hashinfo,
+	.idiag_get_info	 = tcp_diag_get_info,
+	.idiag_type	 = TCPDIAG_GETSOCK,
+	.idiag_info_size = sizeof(struct tcp_info),
+};
 
-static int __init tcpdiag_init(void)
+static int __init tcp_diag_init(void)
 {
-	tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
-	if (tcpnl == NULL)
-		return -ENOMEM;
-	return 0;
+	return inet_diag_register(&tcp_diag_handler);
 }
 
-static void __exit tcpdiag_exit(void)
+static void __exit tcp_diag_exit(void)
 {
-	sock_release(tcpnl->sk_socket);
+	inet_diag_unregister(&tcp_diag_handler);
 }
 
-module_init(tcpdiag_init);
-module_exit(tcpdiag_exit);
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
 MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 36c51f8136b..6acc04bde08 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,9 +98,10 @@ struct hstcp {
 	u32	ai;
 };
 
-static void hstcp_init(struct tcp_sock *tp)
+static void hstcp_init(struct sock *sk)
 {
-	struct hstcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
 
 	ca->ai = 0;
 
@@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp)
 	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
 }
 
-static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
 			     u32 in_flight, int good)
 {
-	struct hstcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
@@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
 	}
 }
 
-static u32 hstcp_ssthresh(struct tcp_sock *tp)
+static u32 hstcp_ssthresh(struct sock *sk)
 {
-	struct hstcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct hstcp *ca = inet_csk_ca(sk);
 
 	/* Do multiplicative decrease */
 	return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
@@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = {
 
 static int __init hstcp_register(void)
 {
-	BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&tcp_highspeed);
 }
 
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 40168275acf..e47b37984e9 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca)
 	ca->snd_cwnd_cnt2 = 0;
 }
 
-static u32 htcp_cwnd_undo(struct tcp_sock *tp)
+static u32 htcp_cwnd_undo(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 	ca->ccount = ca->undo_ccount;
 	ca->maxRTT = ca->undo_maxRTT;
 	ca->old_maxB = ca->undo_old_maxB;
 	return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
 }
 
-static inline void measure_rtt(struct tcp_sock *tp)
+static inline void measure_rtt(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 	u32 srtt = tp->srtt>>3;
 
 	/* keep track of minimum RTT seen so far, minRTT is zero at first */
@@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp)
 		ca->minRTT = srtt;
 
 	/* max RTT */
-	if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+	if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
 		if (ca->maxRTT < ca->minRTT)
 			ca->maxRTT = ca->minRTT;
 		if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
@@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp)
 	}
 }
 
-static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
+static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
 {
-	struct htcp *ca = tcp_ca(tp);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 	u32 now = tcp_time_stamp;
 
 	/* achieved throughput calculations */
-	if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
+	if (icsk->icsk_ca_state != TCP_CA_Open &&
+	    icsk->icsk_ca_state != TCP_CA_Disorder) {
 		ca->packetcount = 0;
 		ca->lasttime = now;
 		return;
@@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca)
  * that point do we really have a real sense of maxRTT (the queues en route
  * were getting just too full now).
  */
-static void htcp_param_update(struct tcp_sock *tp)
+static void htcp_param_update(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	struct htcp *ca = inet_csk_ca(sk);
 	u32 minRTT = ca->minRTT;
 	u32 maxRTT = ca->maxRTT;
 
@@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp)
 		ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
 }
 
-static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 htcp_recalc_ssthresh(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
-	htcp_param_update(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct htcp *ca = inet_csk_ca(sk);
+	htcp_param_update(sk);
 	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
 }
 
-static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 			    u32 in_flight, int data_acked)
 {
-	struct htcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
@@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
 			tp->snd_cwnd++;
 	} else {
-		measure_rtt(tp);
+		measure_rtt(sk);
 
 		/* keep track of number of round-trip times since last backoff event */
 		if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
@@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 }
 
 /* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct tcp_sock *tp)
+static u32 htcp_min_cwnd(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh;
 }
 
 
-static void htcp_init(struct tcp_sock *tp)
+static void htcp_init(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	struct htcp *ca = inet_csk_ca(sk);
 
 	memset(ca, 0, sizeof(struct htcp));
 	ca->alpha = ALPHA_BASE;
 	ca->beta = BETA_MIN;
 }
 
-static void htcp_state(struct tcp_sock *tp, u8 new_state)
+static void htcp_state(struct sock *sk, u8 new_state)
 {
 	switch (new_state) {
 	case TCP_CA_CWR:
 	case TCP_CA_Recovery:
 	case TCP_CA_Loss:
-		htcp_reset(tcp_ca(tp));
+		htcp_reset(inet_csk_ca(sk));
 		break;
 	}
 }
@@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = {
 
 static int __init htcp_register(void)
 {
-	BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
 	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
 	if (!use_bandwidth_switch)
 		htcp.pkts_acked = NULL;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 13a66342c30..77add63623d 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
 
 
 /* This is called to refresh values for hybla parameters */
-static inline void hybla_recalc_param (struct tcp_sock *tp)
+static inline void hybla_recalc_param (struct sock *sk)
 {
-	struct hybla *ca = tcp_ca(tp);
+	struct hybla *ca = inet_csk_ca(sk);
 
-	ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
+	ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
 	ca->rho = ca->rho_3ls >> 3;
 	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
 	ca->rho2 = ca->rho2_7ls >>7;
 }
 
-static void hybla_init(struct tcp_sock *tp)
+static void hybla_init(struct sock *sk)
 {
-	struct hybla *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
 
 	ca->rho = 0;
 	ca->rho2 = 0;
@@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp)
 	tp->snd_cwnd_clamp = 65535;
 
 	/* 1st Rho measurement based on initial srtt */
-	hybla_recalc_param(tp);
+	hybla_recalc_param(sk);
 
 	/* set minimum rtt as this is the 1st ever seen */
 	ca->minrtt = tp->srtt;
 	tp->snd_cwnd = ca->rho;
 }
 
-static void hybla_state(struct tcp_sock *tp, u8 ca_state)
+static void hybla_state(struct sock *sk, u8 ca_state)
 {
-	struct hybla *ca = tcp_ca(tp);
-
+	struct hybla *ca = inet_csk_ca(sk);
 	ca->hybla_en = (ca_state == TCP_CA_Open);
 }
 
@@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds)
  *     o Give cwnd a new value based on the model proposed
  *     o remember increments <1
  */
-static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 			    u32 in_flight, int flag)
 {
-	struct hybla *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
 	u32 increment, odd, rho_fractions;
 	int is_slowstart = 0;
 
 	/*  Recalculate rho only if this srtt is the lowest */
 	if (tp->srtt < ca->minrtt){
-		hybla_recalc_param(tp);
+		hybla_recalc_param(sk);
 		ca->minrtt = tp->srtt;
 	}
 
 	if (!ca->hybla_en)
-		return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
+		return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
 
 	if (ca->rho == 0)
-		hybla_recalc_param(tp);
+		hybla_recalc_param(sk);
 
 	rho_fractions = ca->rho_3ls - (ca->rho << 3);
 
@@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = {
 
 static int __init hybla_register(void)
 {
-	BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&tcp_hybla);
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53a8a5399f1..1afb080bdf0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1;
 /* Adapt the MSS value used to make delayed ack decision to the 
  * real world.
  */ 
-static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
-				       struct sk_buff *skb)
+static inline void tcp_measure_rcv_mss(struct sock *sk,
+				       const struct sk_buff *skb)
 {
-	unsigned int len, lss;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const unsigned int lss = icsk->icsk_ack.last_seg_size; 
+	unsigned int len;
 
-	lss = tp->ack.last_seg_size; 
-	tp->ack.last_seg_size = 0; 
+	icsk->icsk_ack.last_seg_size = 0; 
 
 	/* skb->len may jitter because of SACKs, even if peer
 	 * sends good full-sized frames.
 	 */
 	len = skb->len;
-	if (len >= tp->ack.rcv_mss) {
-		tp->ack.rcv_mss = len;
+	if (len >= icsk->icsk_ack.rcv_mss) {
+		icsk->icsk_ack.rcv_mss = len;
 	} else {
 		/* Otherwise, we make more careful check taking into account,
 		 * that SACKs block is variable.
@@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
 			 * tcp header plus fixed timestamp option length.
 			 * Resulting "len" is MSS free of SACK jitter.
 			 */
-			len -= tp->tcp_header_len;
-			tp->ack.last_seg_size = len;
+			len -= tcp_sk(sk)->tcp_header_len;
+			icsk->icsk_ack.last_seg_size = len;
 			if (len == lss) {
-				tp->ack.rcv_mss = len;
+				icsk->icsk_ack.rcv_mss = len;
 				return;
 			}
 		}
-		tp->ack.pending |= TCP_ACK_PUSHED;
+		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 	}
 }
 
-static void tcp_incr_quickack(struct tcp_sock *tp)
+static void tcp_incr_quickack(struct sock *sk)
 {
-	unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
 
 	if (quickacks==0)
 		quickacks=2;
-	if (quickacks > tp->ack.quick)
-		tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+	if (quickacks > icsk->icsk_ack.quick)
+		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
 }
 
-void tcp_enter_quickack_mode(struct tcp_sock *tp)
+void tcp_enter_quickack_mode(struct sock *sk)
 {
-	tcp_incr_quickack(tp);
-	tp->ack.pingpong = 0;
-	tp->ack.ato = TCP_ATO_MIN;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	tcp_incr_quickack(sk);
+	icsk->icsk_ack.pingpong = 0;
+	icsk->icsk_ack.ato = TCP_ATO_MIN;
 }
 
 /* Send ACKs quickly, if "quick" count is not exhausted
  * and the session is not interactive.
  */
 
-static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
+static inline int tcp_in_quickack_mode(const struct sock *sk)
 {
-	return (tp->ack.quick && !tp->ack.pingpong);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
 }
 
 /* Buffer size and advertised window tuning.
@@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk)
  */
 
 /* Slow part of check#2. */
-static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
-			     struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
+			     const struct sk_buff *skb)
 {
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(skb->truesize)/2;
@@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
 
 	while (tp->rcv_ssthresh <= window) {
 		if (truesize <= skb->len)
-			return 2*tp->ack.rcv_mss;
+			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
 
 		truesize >>= 1;
 		window >>= 1;
@@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
 
 		if (incr) {
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
-			tp->ack.quick |= 1;
+			inet_csk(sk)->icsk_ack.quick |= 1;
 		}
 	}
 }
@@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk)
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 	unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
 	int ofo_win = 0;
 
-	tp->ack.quick = 0;
+	icsk->icsk_ack.quick = 0;
 
 	skb_queue_walk(&tp->out_of_order_queue, skb) {
 		ofo_win += skb->len;
@@ -346,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 		app_win += ofo_win;
 		if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
 			app_win >>= 1;
-		if (app_win > tp->ack.rcv_mss)
-			app_win -= tp->ack.rcv_mss;
+		if (app_win > icsk->icsk_ack.rcv_mss)
+			app_win -= icsk->icsk_ack.rcv_mss;
 		app_win = max(app_win, 2U*tp->advmss);
 
 		if (!ofo_win)
@@ -415,11 +420,12 @@ new_measure:
 	tp->rcv_rtt_est.time = tcp_time_stamp;
 }
 
-static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	if (tp->rx_opt.rcv_tsecr &&
 	    (TCP_SKB_CB(skb)->end_seq -
-	     TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
 		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
 }
 
@@ -492,41 +498,42 @@ new_measure:
  */
 static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now;
 
-	tcp_schedule_ack(tp);
+	inet_csk_schedule_ack(sk);
 
-	tcp_measure_rcv_mss(tp, skb);
+	tcp_measure_rcv_mss(sk, skb);
 
 	tcp_rcv_rtt_measure(tp);
 	
 	now = tcp_time_stamp;
 
-	if (!tp->ack.ato) {
+	if (!icsk->icsk_ack.ato) {
 		/* The _first_ data packet received, initialize
 		 * delayed ACK engine.
 		 */
-		tcp_incr_quickack(tp);
-		tp->ack.ato = TCP_ATO_MIN;
+		tcp_incr_quickack(sk);
+		icsk->icsk_ack.ato = TCP_ATO_MIN;
 	} else {
-		int m = now - tp->ack.lrcvtime;
+		int m = now - icsk->icsk_ack.lrcvtime;
 
 		if (m <= TCP_ATO_MIN/2) {
 			/* The fastest case is the first. */
-			tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
-		} else if (m < tp->ack.ato) {
-			tp->ack.ato = (tp->ack.ato>>1) + m;
-			if (tp->ack.ato > tp->rto)
-				tp->ack.ato = tp->rto;
-		} else if (m > tp->rto) {
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+		} else if (m < icsk->icsk_ack.ato) {
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+			if (icsk->icsk_ack.ato > icsk->icsk_rto)
+				icsk->icsk_ack.ato = icsk->icsk_rto;
+		} else if (m > icsk->icsk_rto) {
 			/* Too long gap. Apparently sender falled to
 			 * restart window, so that we send ACKs quickly.
 			 */
-			tcp_incr_quickack(tp);
+			tcp_incr_quickack(sk);
 			sk_stream_mem_reclaim(sk);
 		}
 	}
-	tp->ack.lrcvtime = now;
+	icsk->icsk_ack.lrcvtime = now;
 
 	TCP_ECN_check_ce(tp, skb);
 
@@ -543,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	long m = mrtt; /* RTT */
 
 	/*	The following amusing code comes from Jacobson's
@@ -604,15 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
 		tp->rtt_seq = tp->snd_nxt;
 	}
 
-	if (tp->ca_ops->rtt_sample)
-		tp->ca_ops->rtt_sample(tp, *usrtt);
+	if (icsk->icsk_ca_ops->rtt_sample)
+		icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-static inline void tcp_set_rto(struct tcp_sock *tp)
+static inline void tcp_set_rto(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Old crap is replaced with new one. 8)
 	 *
 	 * More seriously:
@@ -623,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
 	 *    is invisible. Actually, Linux-2.4 also generates erratic
 	 *    ACKs in some curcumstances.
 	 */
-	tp->rto = (tp->srtt >> 3) + tp->rttvar;
+	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
 
 	/* 2. Fixups made earlier cannot be right.
 	 *    If we do not estimate RTO correctly without them,
@@ -635,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
 /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
  * guarantees that rto is higher.
  */
-static inline void tcp_bound_rto(struct tcp_sock *tp)
+static inline void tcp_bound_rto(struct sock *sk)
 {
-	if (tp->rto > TCP_RTO_MAX)
-		tp->rto = TCP_RTO_MAX;
+	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
+		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
 }
 
 /* Save metrics learned by this TCP session.
@@ -656,9 +666,10 @@ void tcp_update_metrics(struct sock *sk)
 	dst_confirm(dst);
 
 	if (dst && (dst->flags&DST_HOST)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		int m;
 
-		if (tp->backoff || !tp->srtt) {
+		if (icsk->icsk_backoff || !tp->srtt) {
 			/* This session failed to estimate rtt. Why?
 			 * Probably, no packets returned in time.
 			 * Reset our results.
@@ -707,7 +718,7 @@ void tcp_update_metrics(struct sock *sk)
 			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
 				dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
 		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
-			   tp->ca_state == TCP_CA_Open) {
+			   icsk->icsk_ca_state == TCP_CA_Open) {
 			/* Cong. avoidance phase, cwnd is reliable. */
 			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
 				dst->metrics[RTAX_SSTHRESH-1] =
@@ -801,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->mdev = dst_metric(dst, RTAX_RTTVAR);
 		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
 	}
-	tcp_set_rto(tp);
-	tcp_bound_rto(tp);
-	if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
+	tcp_set_rto(sk);
+	tcp_bound_rto(sk);
+	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
 		goto reset;
 	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -817,12 +828,14 @@ reset:
 	if (!tp->rx_opt.saw_tstamp && tp->srtt) {
 		tp->srtt = 0;
 		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
-		tp->rto = TCP_TIMEOUT_INIT;
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
 	}
 }
 
-static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
+static void tcp_update_reordering(struct sock *sk, const int metric,
+				  const int ts)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	if (metric > tp->reordering) {
 		tp->reordering = min(TCP_MAX_REORDERING, metric);
 
@@ -837,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
 			NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
 #if FASTRETRANS_DEBUG > 1
 		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
-		       tp->rx_opt.sack_ok, tp->ca_state,
+		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
 		       tp->reordering,
 		       tp->fackets_out,
 		       tp->sacked_out,
@@ -899,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
 static int
 tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
 	struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
@@ -1064,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	 * we have to account for reordering! Ugly,
 	 * but should help.
 	 */
-	if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
+	if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
 		struct sk_buff *skb;
 
 		sk_stream_for_retrans_queue(skb, sk) {
@@ -1093,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 
 	tp->left_out = tp->sacked_out + tp->lost_out;
 
-	if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
-		tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
+	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
+		tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
 
 #if FASTRETRANS_DEBUG > 0
 	BUG_TRAP((int)tp->sacked_out >= 0);
@@ -1111,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
  */
 void tcp_enter_frto(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 
 	tp->frto_counter = 1;
 
-	if (tp->ca_state <= TCP_CA_Disorder ||
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
             tp->snd_una == tp->high_seq ||
-            (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
-		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
-		tcp_ca_event(tp, CA_EVENT_FRTO);
+            (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tcp_ca_event(sk, CA_EVENT_FRTO);
 	}
 
 	/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1138,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk)
 	}
 	tcp_sync_left_out(tp);
 
-	tcp_set_ca_state(tp, TCP_CA_Open);
+	tcp_set_ca_state(sk, TCP_CA_Open);
 	tp->frto_highmark = tp->snd_nxt;
 }
 
@@ -1184,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk)
 
 	tp->reordering = min_t(unsigned int, tp->reordering,
 					     sysctl_tcp_reordering);
-	tcp_set_ca_state(tp, TCP_CA_Loss);
+	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
 }
@@ -1208,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
  */
 void tcp_enter_loss(struct sock *sk, int how)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int cnt = 0;
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
-	if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
-	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
-		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
-		tcp_ca_event(tp, CA_EVENT_LOSS);
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tcp_ca_event(sk, CA_EVENT_LOSS);
 	}
 	tp->snd_cwnd	   = 1;
 	tp->snd_cwnd_cnt   = 0;
@@ -1248,12 +1264,12 @@ void tcp_enter_loss(struct sock *sk, int how)
 
 	tp->reordering = min_t(unsigned int, tp->reordering,
 					     sysctl_tcp_reordering);
-	tcp_set_ca_state(tp, TCP_CA_Loss);
+	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
 }
 
-static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
+static int tcp_check_sack_reneging(struct sock *sk)
 {
 	struct sk_buff *skb;
 
@@ -1265,12 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
 	 */
 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
 	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+		struct inet_connection_sock *icsk = inet_csk(sk);
 		NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
 
 		tcp_enter_loss(sk, 1);
-		tp->retransmits++;
+		icsk->icsk_retransmits++;
 		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  icsk->icsk_rto, TCP_RTO_MAX);
 		return 1;
 	}
 	return 0;
@@ -1281,15 +1299,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
 	return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
 }
 
-static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
 {
-	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
 }
 
 static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
 {
 	return tp->packets_out &&
-	       tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
+	       tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
 }
 
 /* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1423,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
  * in assumption of absent reordering, interpret this as reordering.
  * The only another reason could be bug in receiver TCP.
  */
-static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	u32 holes;
 
 	holes = max(tp->lost_out, 1U);
@@ -1432,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
 
 	if ((tp->sacked_out + holes) > tp->packets_out) {
 		tp->sacked_out = tp->packets_out - holes;
-		tcp_update_reordering(tp, tp->packets_out+addend, 0);
+		tcp_update_reordering(sk, tp->packets_out + addend, 0);
 	}
 }
 
 /* Emulate SACKs for SACKless connection: account for a new dupack. */
 
-static void tcp_add_reno_sack(struct tcp_sock *tp)
+static void tcp_add_reno_sack(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	tp->sacked_out++;
-	tcp_check_reno_reordering(tp, 0);
+	tcp_check_reno_reordering(sk, 0);
 	tcp_sync_left_out(tp);
 }
 
@@ -1456,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke
 		else
 			tp->sacked_out -= acked-1;
 	}
-	tcp_check_reno_reordering(tp, acked);
+	tcp_check_reno_reordering(sk, acked);
 	tcp_sync_left_out(tp);
 }
 
@@ -1509,7 +1529,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
 		struct sk_buff *skb;
 
 		sk_stream_for_retrans_queue(skb, sk) {
-			if (tcp_skb_timedout(tp, skb) &&
+			if (tcp_skb_timedout(sk, skb) &&
 			    !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
 				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 				tp->lost_out += tcp_skb_pcount(skb);
@@ -1530,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 }
 
 /* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct tcp_sock *tp)
+static void tcp_cwnd_down(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	int decr = tp->snd_cwnd_cnt + 1;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
+	if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1571,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
 #define DBGUNDO(x...) do { } while (0)
 #endif
 
-static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
+static void tcp_undo_cwr(struct sock *sk, const int undo)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (tp->prior_ssthresh) {
-		if (tp->ca_ops->undo_cwnd)
-			tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+
+		if (icsk->icsk_ca_ops->undo_cwnd)
+			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
 		else
 			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
 
@@ -1603,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
 		/* Happy end! We did not retransmit anything
 		 * or our original transmission succeeded.
 		 */
-		DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
-		tcp_undo_cwr(tp, 1);
-		if (tp->ca_state == TCP_CA_Loss)
+		DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+		tcp_undo_cwr(sk, 1);
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
 			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
 		else
 			NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
@@ -1618,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
 		tcp_moderate_cwnd(tp);
 		return 1;
 	}
-	tcp_set_ca_state(tp, TCP_CA_Open);
+	tcp_set_ca_state(sk, TCP_CA_Open);
 	return 0;
 }
 
@@ -1627,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
 {
 	if (tp->undo_marker && !tp->undo_retrans) {
 		DBGUNDO(sk, tp, "D-SACK");
-		tcp_undo_cwr(tp, 1);
+		tcp_undo_cwr(sk, 1);
 		tp->undo_marker = 0;
 		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
 	}
@@ -1648,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
 		if (tp->retrans_out == 0)
 			tp->retrans_stamp = 0;
 
-		tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
+		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
 
 		DBGUNDO(sk, tp, "Hoe");
-		tcp_undo_cwr(tp, 0);
+		tcp_undo_cwr(sk, 0);
 		NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
 
 		/* So... Do not make Hoe's retransmit yet.
@@ -1674,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
 		DBGUNDO(sk, tp, "partial loss");
 		tp->lost_out = 0;
 		tp->left_out = tp->sacked_out;
-		tcp_undo_cwr(tp, 1);
+		tcp_undo_cwr(sk, 1);
 		NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
-		tp->retransmits = 0;
+		inet_csk(sk)->icsk_retransmits = 0;
 		tp->undo_marker = 0;
 		if (!IsReno(tp))
-			tcp_set_ca_state(tp, TCP_CA_Open);
+			tcp_set_ca_state(sk, TCP_CA_Open);
 		return 1;
 	}
 	return 0;
 }
 
-static inline void tcp_complete_cwr(struct tcp_sock *tp)
+static inline void tcp_complete_cwr(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
-	tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
+	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
 static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1700,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
 		tp->retrans_stamp = 0;
 
 	if (flag&FLAG_ECE)
-		tcp_enter_cwr(tp);
+		tcp_enter_cwr(sk);
 
-	if (tp->ca_state != TCP_CA_CWR) {
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		int state = TCP_CA_Open;
 
 		if (tp->left_out || tp->retrans_out || tp->undo_marker)
 			state = TCP_CA_Disorder;
 
-		if (tp->ca_state != state) {
-			tcp_set_ca_state(tp, state);
+		if (inet_csk(sk)->icsk_ca_state != state) {
+			tcp_set_ca_state(sk, state);
 			tp->high_seq = tp->snd_nxt;
 		}
 		tcp_moderate_cwnd(tp);
 	} else {
-		tcp_cwnd_down(tp);
+		tcp_cwnd_down(sk);
 	}
 }
 
@@ -1733,6 +1760,7 @@ static void
 tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		      int prior_packets, int flag)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
 
@@ -1750,13 +1778,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		tp->prior_ssthresh = 0;
 
 	/* B. In all the states check for reneging SACKs. */
-	if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
+	if (tp->sacked_out && tcp_check_sack_reneging(sk))
 		return;
 
 	/* C. Process data loss notification, provided it is valid. */
 	if ((flag&FLAG_DATA_LOST) &&
 	    before(tp->snd_una, tp->high_seq) &&
-	    tp->ca_state != TCP_CA_Open &&
+	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
 		tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
 		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
@@ -1767,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 
 	/* E. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
-	if (tp->ca_state == TCP_CA_Open) {
+	if (icsk->icsk_ca_state == TCP_CA_Open) {
 		if (!sysctl_tcp_frto)
 			BUG_TRAP(tp->retrans_out == 0);
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
-		switch (tp->ca_state) {
+		switch (icsk->icsk_ca_state) {
 		case TCP_CA_Loss:
-			tp->retransmits = 0;
+			icsk->icsk_retransmits = 0;
 			if (tcp_try_undo_recovery(sk, tp))
 				return;
 			break;
@@ -1783,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			/* CWR is to be held something *above* high_seq
 			 * is ACKed for CWR bit to reach receiver. */
 			if (tp->snd_una != tp->high_seq) {
-				tcp_complete_cwr(tp);
-				tcp_set_ca_state(tp, TCP_CA_Open);
+				tcp_complete_cwr(sk);
+				tcp_set_ca_state(sk, TCP_CA_Open);
 			}
 			break;
 
@@ -1795,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			     * catching for all duplicate ACKs. */
 			    IsReno(tp) || tp->snd_una != tp->high_seq) {
 				tp->undo_marker = 0;
-				tcp_set_ca_state(tp, TCP_CA_Open);
+				tcp_set_ca_state(sk, TCP_CA_Open);
 			}
 			break;
 
@@ -1804,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 				tcp_reset_reno_sack(tp);
 			if (tcp_try_undo_recovery(sk, tp))
 				return;
-			tcp_complete_cwr(tp);
+			tcp_complete_cwr(sk);
 			break;
 		}
 	}
 
 	/* F. Process state. */
-	switch (tp->ca_state) {
+	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
 		if (prior_snd_una == tp->snd_una) {
 			if (IsReno(tp) && is_dupack)
-				tcp_add_reno_sack(tp);
+				tcp_add_reno_sack(sk);
 		} else {
 			int acked = prior_packets - tp->packets_out;
 			if (IsReno(tp))
@@ -1824,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		break;
 	case TCP_CA_Loss:
 		if (flag&FLAG_DATA_ACKED)
-			tp->retransmits = 0;
+			icsk->icsk_retransmits = 0;
 		if (!tcp_try_undo_loss(sk, tp)) {
 			tcp_moderate_cwnd(tp);
 			tcp_xmit_retransmit_queue(sk);
 			return;
 		}
-		if (tp->ca_state != TCP_CA_Open)
+		if (icsk->icsk_ca_state != TCP_CA_Open)
 			return;
 		/* Loss is undone; fall through to processing in Open state. */
 	default:
@@ -1838,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			if (tp->snd_una != prior_snd_una)
 				tcp_reset_reno_sack(tp);
 			if (is_dupack)
-				tcp_add_reno_sack(tp);
+				tcp_add_reno_sack(sk);
 		}
 
-		if (tp->ca_state == TCP_CA_Disorder)
+		if (icsk->icsk_ca_state == TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk, tp);
 
 		if (!tcp_time_to_recover(sk, tp)) {
@@ -1861,30 +1889,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		tp->undo_marker = tp->snd_una;
 		tp->undo_retrans = tp->retrans_out;
 
-		if (tp->ca_state < TCP_CA_CWR) {
+		if (icsk->icsk_ca_state < TCP_CA_CWR) {
 			if (!(flag&FLAG_ECE))
-				tp->prior_ssthresh = tcp_current_ssthresh(tp);
-			tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+				tp->prior_ssthresh = tcp_current_ssthresh(sk);
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 			TCP_ECN_queue_cwr(tp);
 		}
 
 		tp->snd_cwnd_cnt = 0;
-		tcp_set_ca_state(tp, TCP_CA_Recovery);
+		tcp_set_ca_state(sk, TCP_CA_Recovery);
 	}
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
 		tcp_update_scoreboard(sk, tp);
-	tcp_cwnd_down(tp);
+	tcp_cwnd_down(sk);
 	tcp_xmit_retransmit_queue(sk);
 }
 
 /* Read draft-ietf-tcplw-high-performance before mucking
  * with this code. (Superceeds RFC1323)
  */
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
 {
-	__u32 seq_rtt;
-
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
 	 * acknowledges some new data, i.e., only if it advances the
@@ -1900,14 +1926,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
 	 * answer arrives rto becomes 120 seconds! If at least one of segments
 	 * in window is lost... Voila.	 			--ANK (010210)
 	 */
-	seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(tp, seq_rtt, usrtt);
-	tcp_set_rto(tp);
-	tp->backoff = 0;
-	tcp_bound_rto(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	tcp_set_rto(sk);
+	inet_csk(sk)->icsk_backoff = 0;
+	tcp_bound_rto(sk);
 }
 
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
 {
 	/* We don't have a timestamp. Can only use
 	 * packets that are not retransmitted to determine
@@ -1921,27 +1948,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(tp, seq_rtt, usrtt);
-	tcp_set_rto(tp);
-	tp->backoff = 0;
-	tcp_bound_rto(tp);
+	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	tcp_set_rto(sk);
+	inet_csk(sk)->icsk_backoff = 0;
+	tcp_bound_rto(sk);
 }
 
-static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
-				      int flag, s32 seq_rtt, u32 *usrtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      const s32 seq_rtt, u32 *usrtt)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(tp, usrtt, flag);
+		tcp_ack_saw_tstamp(sk, usrtt, flag);
 	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
+		tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
 }
 
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 				  u32 in_flight, int good)
 {
-	tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
-	tp->snd_cwnd_stamp = tcp_time_stamp;
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
+	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
 }
 
 /* Restart timer after forward progress on connection.
@@ -1951,9 +1980,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
 {
 	if (!tp->packets_out) {
-		tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	}
 }
 
@@ -2068,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 				seq_rtt = -1;
 			} else if (seq_rtt < 0)
 				seq_rtt = now - scb->when;
-			if (seq_usrtt)
-				*seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
-					+ (usnow.tv_usec - skb->stamp.tv_usec);
+			if (seq_usrtt) {
+				struct timeval tv;
+			
+				skb_get_timestamp(skb, &tv);
+				*seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
+					+ (usnow.tv_usec - tv.tv_usec);
+			}
 
 			if (sacked & TCPCB_SACKED_ACKED)
 				tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2085,16 +2118,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 			seq_rtt = now - scb->when;
 		tcp_dec_pcount_approx(&tp->fackets_out, skb);
 		tcp_packets_out_dec(tp, skb);
-		__skb_unlink(skb, skb->list);
+		__skb_unlink(skb, &sk->sk_write_queue);
 		sk_stream_free_skb(sk, skb);
 	}
 
 	if (acked&FLAG_ACKED) {
-		tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
 		tcp_ack_packets_out(sk, tp);
 
-		if (tp->ca_ops->pkts_acked)
-			tp->ca_ops->pkts_acked(tp, pkts_acked);
+		if (icsk->icsk_ca_ops->pkts_acked)
+			icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
 	}
 
 #if FASTRETRANS_DEBUG > 0
@@ -2102,19 +2136,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 	BUG_TRAP((int)tp->lost_out >= 0);
 	BUG_TRAP((int)tp->retrans_out >= 0);
 	if (!tp->packets_out && tp->rx_opt.sack_ok) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		if (tp->lost_out) {
 			printk(KERN_DEBUG "Leak l=%u %d\n",
-			       tp->lost_out, tp->ca_state);
+			       tp->lost_out, icsk->icsk_ca_state);
 			tp->lost_out = 0;
 		}
 		if (tp->sacked_out) {
 			printk(KERN_DEBUG "Leak s=%u %d\n",
-			       tp->sacked_out, tp->ca_state);
+			       tp->sacked_out, icsk->icsk_ca_state);
 			tp->sacked_out = 0;
 		}
 		if (tp->retrans_out) {
 			printk(KERN_DEBUG "Leak r=%u %d\n",
-			       tp->retrans_out, tp->ca_state);
+			       tp->retrans_out, icsk->icsk_ca_state);
 			tp->retrans_out = 0;
 		}
 	}
@@ -2125,40 +2160,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 
 static void tcp_ack_probe(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	/* Was it a usable window open? */
 
 	if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
 		   tp->snd_una + tp->snd_wnd)) {
-		tp->backoff = 0;
-		tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+		icsk->icsk_backoff = 0;
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
 		/* Socket must be waked up by subsequent tcp_data_snd_check().
 		 * This function is not for random using!
 		 */
 	} else {
-		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
-				     min(tp->rto << tp->backoff, TCP_RTO_MAX));
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
 	}
 }
 
-static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
+static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
 {
 	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
-		tp->ca_state != TCP_CA_Open);
+		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
 }
 
-static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
+static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
 }
 
 /* Check that window update is acceptable.
  * The function assumes that snd_una<=ack<=snd_next.
  */
-static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
-					u32 ack_seq, u32 nwin)
+static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
+					const u32 ack_seq, const u32 nwin)
 {
 	return (after(ack, tp->snd_una) ||
 		after(ack_seq, tp->snd_wl1) ||
@@ -2241,6 +2279,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_snd_una = tp->snd_una;
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
@@ -2268,7 +2307,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_ca_event(tp, CA_EVENT_FAST_ACK);
+		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
 
 		NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
 	} else {
@@ -2285,7 +2324,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
 			flag |= FLAG_ECE;
 
-		tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
+		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
 	}
 
 	/* We passed data and got it acked, remove any soft error
@@ -2301,19 +2340,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
-				    tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
+				    icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
 
 	if (tp->frto_counter)
 		tcp_process_frto(sk, prior_snd_una);
 
-	if (tcp_ack_is_dubious(tp, flag)) {
+	if (tcp_ack_is_dubious(sk, flag)) {
 		/* Advanve CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp, ack,  seq_rtt, prior_in_flight, 0);
+		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
+			tcp_cong_avoid(sk, ack,  seq_rtt, prior_in_flight, 0);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
 		if ((flag & FLAG_DATA_ACKED))
-			tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
+			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -2322,7 +2361,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	return 1;
 
 no_queue:
-	tp->probes_out = 0;
+	icsk->icsk_probes_out = 0;
 
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
@@ -2500,8 +2539,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
  * up to bandwidth of 18Gigabit/sec. 8) ]
  */
 
-static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcphdr *th = skb->h.th;
 	u32 seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -2516,14 +2556,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
 		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
 
 		/* 4. ... and sits in replay window. */
-		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
+		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
 }
 
-static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
 		xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
-		!tcp_disordered_ack(tp, skb));
+		!tcp_disordered_ack(sk, skb));
 }
 
 /* Check segment sequence number for validity.
@@ -2586,7 +2627,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	tcp_schedule_ack(tp);
+	inet_csk_schedule_ack(sk);
 
 	sk->sk_shutdown |= RCV_SHUTDOWN;
 	sock_set_flag(sk, SOCK_DONE);
@@ -2596,7 +2637,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
 		case TCP_ESTABLISHED:
 			/* Move to CLOSE_WAIT */
 			tcp_set_state(sk, TCP_CLOSE_WAIT);
-			tp->ack.pingpong = 1;
+			inet_csk(sk)->icsk_ack.pingpong = 1;
 			break;
 
 		case TCP_CLOSE_WAIT:
@@ -2694,7 +2735,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
-		tcp_enter_quickack_mode(tp);
+		tcp_enter_quickack_mode(sk);
 
 		if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -2853,7 +2894,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received \n");
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
 			continue;
 		}
@@ -2861,7 +2902,7 @@ static void tcp_ofo_queue(struct sock *sk)
 			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
 			   TCP_SKB_CB(skb)->end_seq);
 
-		__skb_unlink(skb, skb->list);
+		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
@@ -2942,7 +2983,7 @@ queue_and_out:
 			 * gap in queue is filled.
 			 */
 			if (skb_queue_empty(&tp->out_of_order_queue))
-				tp->ack.pingpong = 0;
+				inet_csk(sk)->icsk_ack.pingpong = 0;
 		}
 
 		if (tp->rx_opt.num_sacks)
@@ -2963,8 +3004,8 @@ queue_and_out:
 		tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
 out_of_window:
-		tcp_enter_quickack_mode(tp);
-		tcp_schedule_ack(tp);
+		tcp_enter_quickack_mode(sk);
+		inet_csk_schedule_ack(sk);
 drop:
 		__kfree_skb(skb);
 		return;
@@ -2974,7 +3015,7 @@ drop:
 	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
 		goto out_of_window;
 
-	tcp_enter_quickack_mode(tp);
+	tcp_enter_quickack_mode(sk);
 
 	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		/* Partial packet, seq < rcv_next < end_seq */
@@ -3003,7 +3044,7 @@ drop:
 
 	/* Disable header prediction. */
 	tp->pred_flags = 0;
-	tcp_schedule_ack(tp);
+	inet_csk_schedule_ack(sk);
 
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
 		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -3027,7 +3068,7 @@ drop:
 		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
 		if (seq == TCP_SKB_CB(skb1)->end_seq) {
-			__skb_append(skb1, skb);
+			__skb_append(skb1, skb, &tp->out_of_order_queue);
 
 			if (!tp->rx_opt.num_sacks ||
 			    tp->selective_acks[0].end_seq != seq)
@@ -3071,7 +3112,7 @@ drop:
 			       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
 			       break;
 		       }
-		       __skb_unlink(skb1, skb1->list);
+		       __skb_unlink(skb1, &tp->out_of_order_queue);
 		       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
 		       __kfree_skb(skb1);
 		}
@@ -3088,8 +3129,9 @@ add_sack:
  * simplifies code)
  */
 static void
-tcp_collapse(struct sock *sk, struct sk_buff *head,
-	     struct sk_buff *tail, u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+	     struct sk_buff *head, struct sk_buff *tail,
+	     u32 start, u32 end)
 {
 	struct sk_buff *skb;
 
@@ -3099,7 +3141,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 			struct sk_buff *next = skb->next;
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, list);
 			__kfree_skb(skb);
 			NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
 			skb = next;
@@ -3145,7 +3187,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
 		nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-		__skb_insert(nskb, skb->prev, skb, skb->list);
+		__skb_insert(nskb, skb->prev, skb, list);
 		sk_stream_set_owner_r(nskb, sk);
 
 		/* Copy data, releasing collapsed skbs. */
@@ -3164,7 +3206,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
 			}
 			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 				struct sk_buff *next = skb->next;
-				__skb_unlink(skb, skb->list);
+				__skb_unlink(skb, list);
 				__kfree_skb(skb);
 				NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
 				skb = next;
@@ -3200,7 +3242,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 		if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, head, skb, start, end);
+			tcp_collapse(sk, &tp->out_of_order_queue,
+				     head, skb, start, end);
 			head = skb;
 			if (skb == (struct sk_buff *)&tp->out_of_order_queue)
 				break;
@@ -3237,7 +3280,8 @@ static int tcp_prune_queue(struct sock *sk)
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
-	tcp_collapse(sk, sk->sk_receive_queue.next,
+	tcp_collapse(sk, &sk->sk_receive_queue,
+		     sk->sk_receive_queue.next,
 		     (struct sk_buff*)&sk->sk_receive_queue,
 		     tp->copied_seq, tp->rcv_nxt);
 	sk_stream_mem_reclaim(sk);
@@ -3286,12 +3330,12 @@ void tcp_cwnd_application_limited(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->ca_state == TCP_CA_Open &&
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
 	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 		/* Limited by application or receiver window. */
 		u32 win_used = max(tp->snd_cwnd_used, 2U);
 		if (win_used < tp->snd_cwnd) {
-			tp->snd_ssthresh = tcp_current_ssthresh(tp);
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
 			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
 		}
 		tp->snd_cwnd_used = 0;
@@ -3370,13 +3414,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	    /* More than one full frame received... */
-	if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise). Or...
 	      */
 	     && __tcp_select_window(sk) >= tp->rcv_wnd) ||
 	    /* We ACK each frame or... */
-	    tcp_in_quickack_mode(tp) ||
+	    tcp_in_quickack_mode(sk) ||
 	    /* We have out of order data. */
 	    (ofo_possible &&
 	     skb_peek(&tp->out_of_order_queue))) {
@@ -3390,8 +3434,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 
 static __inline__ void tcp_ack_snd_check(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	if (!tcp_ack_scheduled(tp)) {
+	if (!inet_csk_ack_scheduled(sk)) {
 		/* We sent a data segment already. */
 		return;
 	}
@@ -3462,7 +3505,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
 		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 		tp->copied_seq++;
 		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, &sk->sk_receive_queue);
 			__kfree_skb(skb);
 		}
 	}
@@ -3645,7 +3688,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				    tp->rcv_nxt == tp->rcv_wup)
 					tcp_store_ts_recent(tp);
 
-				tcp_rcv_rtt_measure_ts(tp, skb);
+				tcp_rcv_rtt_measure_ts(sk, skb);
 
 				/* We know that such packets are checksummed
 				 * on entry.
@@ -3678,7 +3721,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 					    tp->rcv_nxt == tp->rcv_wup)
 						tcp_store_ts_recent(tp);
 
-					tcp_rcv_rtt_measure_ts(tp, skb);
+					tcp_rcv_rtt_measure_ts(sk, skb);
 
 					__skb_pull(skb, tcp_header_len);
 					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3699,7 +3742,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				    tp->rcv_nxt == tp->rcv_wup)
 					tcp_store_ts_recent(tp);
 
-				tcp_rcv_rtt_measure_ts(tp, skb);
+				tcp_rcv_rtt_measure_ts(sk, skb);
 
 				if ((int)skb->truesize > sk->sk_forward_alloc)
 					goto step5;
@@ -3719,7 +3762,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				/* Well, only one small jumplet in fast path... */
 				tcp_ack(sk, skb, FLAG_DATA);
 				tcp_data_snd_check(sk, tp);
-				if (!tcp_ack_scheduled(tp))
+				if (!inet_csk_ack_scheduled(sk))
 					goto no_ack;
 			}
 
@@ -3741,7 +3784,7 @@ slow_path:
 	 * RFC1323: H1. Apply PAWS check first.
 	 */
 	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-	    tcp_paws_discard(tp, skb)) {
+	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
 			tcp_send_dupack(sk, skb);
@@ -3788,7 +3831,7 @@ step5:
 	if(th->ack)
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
-	tcp_rcv_rtt_measure_ts(tp, skb);
+	tcp_rcv_rtt_measure_ts(sk, skb);
 
 	/* Process urgent data. */
 	tcp_urg(sk, skb, th);
@@ -3817,6 +3860,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	tcp_parse_options(skb, &tp->rx_opt, 0);
 
 	if (th->ack) {
+		struct inet_connection_sock *icsk;
 		/* rfc793:
 		 * "If the state is SYN-SENT then
 		 *    first check the ACK bit
@@ -3920,7 +3964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_init_metrics(sk);
 
-		tcp_init_congestion_control(tp);
+		tcp_init_congestion_control(sk);
 
 		/* Prevent spurious tcp_cwnd_restart() on first data
 		 * packet.
@@ -3930,7 +3974,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_init_buffer_space(sk);
 
 		if (sock_flag(sk, SOCK_KEEPOPEN))
-			tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+			inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
 
 		if (!tp->rx_opt.snd_wscale)
 			__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -3942,7 +3986,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			sk_wake_async(sk, 0, POLL_OUT);
 		}
 
-		if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+		icsk = inet_csk(sk);
+
+		if (sk->sk_write_pending ||
+		    icsk->icsk_accept_queue.rskq_defer_accept ||
+		    icsk->icsk_ack.pingpong) {
 			/* Save one ACK. Data will be ready after
 			 * several ticks, if write_pending is set.
 			 *
@@ -3950,12 +3998,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			 * look so _wonderfully_ clever, that I was not able
 			 * to stand against the temptation 8)     --ANK
 			 */
-			tcp_schedule_ack(tp);
-			tp->ack.lrcvtime = tcp_time_stamp;
-			tp->ack.ato	 = TCP_ATO_MIN;
-			tcp_incr_quickack(tp);
-			tcp_enter_quickack_mode(tp);
-			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+			inet_csk_schedule_ack(sk);
+			icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+			icsk->icsk_ack.ato	 = TCP_ATO_MIN;
+			tcp_incr_quickack(sk);
+			tcp_enter_quickack_mode(sk);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
 discard:
 			__kfree_skb(skb);
@@ -4111,7 +4160,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	}
 
 	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-	    tcp_paws_discard(tp, skb)) {
+	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
 			tcp_send_dupack(sk, skb);
@@ -4180,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 */
 				if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 				    !tp->srtt)
-					tcp_ack_saw_tstamp(tp, 0, 0);
+					tcp_ack_saw_tstamp(sk, NULL, 0);
 
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4192,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 				tcp_init_metrics(sk);
 
-				tcp_init_congestion_control(tp);
+				tcp_init_congestion_control(sk);
 
 				/* Prevent spurious tcp_cwnd_restart() on
 				 * first data packet.
@@ -4227,9 +4276,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 						return 1;
 					}
 
-					tmo = tcp_fin_time(tp);
+					tmo = tcp_fin_time(sk);
 					if (tmo > TCP_TIMEWAIT_LEN) {
-						tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+						inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
 					} else if (th->fin || sock_owned_by_user(sk)) {
 						/* Bad case. We could lose such FIN otherwise.
 						 * It is not a big problem, but it looks confusing
@@ -4237,7 +4286,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 						 * if it spins in bh_lock_sock(), but it is really
 						 * marginal case.
 						 */
-						tcp_reset_keepalive_timer(sk, tmo);
+						inet_csk_reset_keepalive_timer(sk, tmo);
 					} else {
 						tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
 						goto discard;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 67c670886c1..13dfb391cdf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -64,7 +64,9 @@
 #include <linux/times.h>
 
 #include <net/icmp.h>
+#include <net/inet_hashtables.h>
 #include <net/tcp.h>
+#include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
@@ -75,7 +77,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-extern int sysctl_ip_dynaddr;
 int sysctl_tcp_tw_reuse;
 int sysctl_tcp_low_latency;
 
@@ -88,463 +89,29 @@ static struct socket *tcp_socket;
 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
 		       struct sk_buff *skb);
 
-struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
-	.__tcp_lhash_lock	=	RW_LOCK_UNLOCKED,
-	.__tcp_lhash_users	=	ATOMIC_INIT(0),
-	.__tcp_lhash_wait
-	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
-	.__tcp_portalloc_lock	=	SPIN_LOCK_UNLOCKED
+struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
+	.lhash_lock	= RW_LOCK_UNLOCKED,
+	.lhash_users	= ATOMIC_INIT(0),
+	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
+	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
+	.port_rover	= 1024 - 1,
 };
 
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
- */
-int sysctl_local_port_range[2] = { 1024, 4999 };
-int tcp_port_rover = 1024 - 1;
-
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
-				 __u32 faddr, __u16 fport)
-{
-	int h = (laddr ^ lport) ^ (faddr ^ fport);
-	h ^= h >> 16;
-	h ^= h >> 8;
-	return h & (tcp_ehash_size - 1);
-}
-
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	__u32 laddr = inet->rcv_saddr;
-	__u16 lport = inet->num;
-	__u32 faddr = inet->daddr;
-	__u16 fport = inet->dport;
-
-	return tcp_hashfn(laddr, lport, faddr, fport);
-}
-
-/* Allocate and initialize a new TCP local port bind bucket.
- * The bindhash mutex for snum's hash chain must be held here.
- */
-struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
-					  unsigned short snum)
-{
-	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
-						      SLAB_ATOMIC);
-	if (tb) {
-		tb->port = snum;
-		tb->fastreuse = 0;
-		INIT_HLIST_HEAD(&tb->owners);
-		hlist_add_head(&tb->node, &head->chain);
-	}
-	return tb;
-}
-
-/* Caller must hold hashbucket lock for this tb with local BH disabled */
-void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
-{
-	if (hlist_empty(&tb->owners)) {
-		__hlist_del(&tb->node);
-		kmem_cache_free(tcp_bucket_cachep, tb);
-	}
-}
-
-/* Caller must disable local BH processing. */
-static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
-{
-	struct tcp_bind_hashbucket *head =
-				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
-	struct tcp_bind_bucket *tb;
-
-	spin_lock(&head->lock);
-	tb = tcp_sk(sk)->bind_hash;
-	sk_add_bind_node(child, &tb->owners);
-	tcp_sk(child)->bind_hash = tb;
-	spin_unlock(&head->lock);
-}
-
-inline void tcp_inherit_port(struct sock *sk, struct sock *child)
-{
-	local_bh_disable();
-	__tcp_inherit_port(sk, child);
-	local_bh_enable();
-}
-
-void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
-		   unsigned short snum)
-{
-	inet_sk(sk)->num = snum;
-	sk_add_bind_node(sk, &tb->owners);
-	tcp_sk(sk)->bind_hash = tb;
-}
-
-static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{
-	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
-	struct sock *sk2;
-	struct hlist_node *node;
-	int reuse = sk->sk_reuse;
-
-	sk_for_each_bound(sk2, node, &tb->owners) {
-		if (sk != sk2 &&
-		    !tcp_v6_ipv6only(sk2) &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-			if (!reuse || !sk2->sk_reuse ||
-			    sk2->sk_state == TCP_LISTEN) {
-				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
-				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
-				    sk2_rcv_saddr == sk_rcv_saddr)
-					break;
-			}
-		}
-	}
-	return node != NULL;
-}
-
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 {
-	struct tcp_bind_hashbucket *head;
-	struct hlist_node *node;
-	struct tcp_bind_bucket *tb;
-	int ret;
-
-	local_bh_disable();
-	if (!snum) {
-		int low = sysctl_local_port_range[0];
-		int high = sysctl_local_port_range[1];
-		int remaining = (high - low) + 1;
-		int rover;
-
-		spin_lock(&tcp_portalloc_lock);
-		if (tcp_port_rover < low)
-			rover = low;
-		else
-			rover = tcp_port_rover;
-		do {
-			rover++;
-			if (rover > high)
-				rover = low;
-			head = &tcp_bhash[tcp_bhashfn(rover)];
-			spin_lock(&head->lock);
-			tb_for_each(tb, node, &head->chain)
-				if (tb->port == rover)
-					goto next;
-			break;
-		next:
-			spin_unlock(&head->lock);
-		} while (--remaining > 0);
-		tcp_port_rover = rover;
-		spin_unlock(&tcp_portalloc_lock);
-
-		/* Exhausted local port range during search?  It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (unlikely(remaining <= 0))
-			goto fail;
-
-		/* OK, here is the one we will use.  HEAD is
-		 * non-NULL and we hold it's mutex.
-		 */
-		snum = rover;
-	} else {
-		head = &tcp_bhash[tcp_bhashfn(snum)];
-		spin_lock(&head->lock);
-		tb_for_each(tb, node, &head->chain)
-			if (tb->port == snum)
-				goto tb_found;
-	}
-	tb = NULL;
-	goto tb_not_found;
-tb_found:
-	if (!hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse > 1)
-			goto success;
-		if (tb->fastreuse > 0 &&
-		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
-			goto success;
-		} else {
-			ret = 1;
-			if (tcp_bind_conflict(sk, tb))
-				goto fail_unlock;
-		}
-	}
-tb_not_found:
-	ret = 1;
-	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
-		goto fail_unlock;
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
-			tb->fastreuse = 0;
-	} else if (tb->fastreuse &&
-		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-		tb->fastreuse = 0;
-success:
-	if (!tcp_sk(sk)->bind_hash)
-		tcp_bind_hash(sk, tb, snum);
-	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
- 	ret = 0;
-
-fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
-	return ret;
-}
-
-/* Get rid of any references to a local port held by the
- * given sock.
- */
-static void __tcp_put_port(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
-	struct tcp_bind_bucket *tb;
-
-	spin_lock(&head->lock);
-	tb = tcp_sk(sk)->bind_hash;
-	__sk_del_bind_node(sk);
-	tcp_sk(sk)->bind_hash = NULL;
-	inet->num = 0;
-	tcp_bucket_destroy(tb);
-	spin_unlock(&head->lock);
-}
-
-void tcp_put_port(struct sock *sk)
-{
-	local_bh_disable();
-	__tcp_put_port(sk);
-	local_bh_enable();
-}
-
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-
-void tcp_listen_wlock(void)
-{
-	write_lock(&tcp_lhash_lock);
-
-	if (atomic_read(&tcp_lhash_users)) {
-		DEFINE_WAIT(wait);
-
-		for (;;) {
-			prepare_to_wait_exclusive(&tcp_lhash_wait,
-						&wait, TASK_UNINTERRUPTIBLE);
-			if (!atomic_read(&tcp_lhash_users))
-				break;
-			write_unlock_bh(&tcp_lhash_lock);
-			schedule();
-			write_lock_bh(&tcp_lhash_lock);
-		}
-
-		finish_wait(&tcp_lhash_wait, &wait);
-	}
-}
-
-static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
-{
-	struct hlist_head *list;
-	rwlock_t *lock;
-
-	BUG_TRAP(sk_unhashed(sk));
-	if (listen_possible && sk->sk_state == TCP_LISTEN) {
-		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
-		lock = &tcp_lhash_lock;
-		tcp_listen_wlock();
-	} else {
-		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
-		lock = &tcp_ehash[sk->sk_hashent].lock;
-		write_lock(lock);
-	}
-	__sk_add_node(sk, list);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(lock);
-	if (listen_possible && sk->sk_state == TCP_LISTEN)
-		wake_up(&tcp_lhash_wait);
+	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
 }
 
 static void tcp_v4_hash(struct sock *sk)
 {
-	if (sk->sk_state != TCP_CLOSE) {
-		local_bh_disable();
-		__tcp_v4_hash(sk, 1);
-		local_bh_enable();
-	}
+	inet_hash(&tcp_hashinfo, sk);
 }
 
 void tcp_unhash(struct sock *sk)
 {
-	rwlock_t *lock;
-
-	if (sk_unhashed(sk))
-		goto ende;
-
-	if (sk->sk_state == TCP_LISTEN) {
-		local_bh_disable();
-		tcp_listen_wlock();
-		lock = &tcp_lhash_lock;
-	} else {
-		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
-		lock = &head->lock;
-		write_lock_bh(&head->lock);
-	}
-
-	if (__sk_del_node_init(sk))
-		sock_prot_dec_use(sk->sk_prot);
-	write_unlock_bh(lock);
-
- ende:
-	if (sk->sk_state == TCP_LISTEN)
-		wake_up(&tcp_lhash_wait);
-}
-
-/* Don't inline this cruft.  Here are some nice properties to
- * exploit here.  The BSD API does not allow a listening TCP
- * to specify the remote port nor the remote address for the
- * connection.  So always assume those are both wildcarded
- * during the search since they can never be otherwise.
- */
-static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
-					     unsigned short hnum, int dif)
-{
-	struct sock *result = NULL, *sk;
-	struct hlist_node *node;
-	int score, hiscore;
-
-	hiscore=-1;
-	sk_for_each(sk, node, head) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (inet->num == hnum && !ipv6_only_sock(sk)) {
-			__u32 rcv_saddr = inet->rcv_saddr;
-
-			score = (sk->sk_family == PF_INET ? 1 : 0);
-			if (rcv_saddr) {
-				if (rcv_saddr != daddr)
-					continue;
-				score+=2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score+=2;
-			}
-			if (score == 5)
-				return sk;
-			if (score > hiscore) {
-				hiscore = score;
-				result = sk;
-			}
-		}
-	}
-	return result;
-}
-
-/* Optimize the common listener case. */
-static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
-		unsigned short hnum, int dif)
-{
-	struct sock *sk = NULL;
-	struct hlist_head *head;
-
-	read_lock(&tcp_lhash_lock);
-	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
-	if (!hlist_empty(head)) {
-		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
-		if (inet->num == hnum && !sk->sk_node.next &&
-		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-		    !sk->sk_bound_dev_if)
-			goto sherry_cache;
-		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
-	}
-	if (sk) {
-sherry_cache:
-		sock_hold(sk);
-	}
-	read_unlock(&tcp_lhash_lock);
-	return sk;
+	inet_unhash(&tcp_hashinfo, sk);
 }
 
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * Local BH must be disabled here.
- */
-
-static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
-						       u32 daddr, u16 hnum,
-						       int dif)
-{
-	struct tcp_ehash_bucket *head;
-	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
-	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
-	struct sock *sk;
-	struct hlist_node *node;
-	/* Optimize here for direct hit, only listening connections can
-	 * have wildcards anyways.
-	 */
-	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
-	head = &tcp_ehash[hash];
-	read_lock(&head->lock);
-	sk_for_each(sk, node, &head->chain) {
-		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
-			goto hit; /* You sunk my battleship! */
-	}
-
-	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
-		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
-			goto hit;
-	}
-	sk = NULL;
-out:
-	read_unlock(&head->lock);
-	return sk;
-hit:
-	sock_hold(sk);
-	goto out;
-}
-
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
-					   u32 daddr, u16 hnum, int dif)
-{
-	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
-						      daddr, hnum, dif);
-
-	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
-				  u16 dport, int dif)
-{
-	struct sock *sk;
-
-	local_bh_disable();
-	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
-	local_bh_enable();
-
-	return sk;
-}
-
-EXPORT_SYMBOL_GPL(tcp_v4_lookup);
-
 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 {
 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
@@ -555,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 
 /* called with local bh disabled */
 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
-				      struct tcp_tw_bucket **twp)
+				      struct inet_timewait_sock **twp)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	u32 daddr = inet->rcv_saddr;
 	u32 saddr = inet->daddr;
 	int dif = sk->sk_bound_dev_if;
-	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
-	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
-	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
-	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
+	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
 	struct sock *sk2;
-	struct hlist_node *node;
-	struct tcp_tw_bucket *tw;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
 
 	write_lock(&head->lock);
 
 	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
-		tw = (struct tcp_tw_bucket *)sk2;
+	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+		tw = inet_twsk(sk2);
 
-		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 			struct tcp_sock *tp = tcp_sk(sk);
 
 			/* With PAWS, it is safe from the viewpoint
@@ -592,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 			   fall back to VJ's scheme and use initial
 			   timestamp retrieved from peer table.
 			 */
-			if (tw->tw_ts_recent_stamp &&
+			if (tcptw->tw_ts_recent_stamp &&
 			    (!twp || (sysctl_tcp_tw_reuse &&
 				      xtime.tv_sec -
-				      tw->tw_ts_recent_stamp > 1))) {
-				if ((tp->write_seq =
-						tw->tw_snd_nxt + 65535 + 2) == 0)
+				      tcptw->tw_ts_recent_stamp > 1))) {
+				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+				if (tp->write_seq == 0)
 					tp->write_seq = 1;
-				tp->rx_opt.ts_recent	   = tw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 				sock_hold(sk2);
 				goto unique;
 			} else
@@ -611,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 
 	/* And established part... */
 	sk_for_each(sk2, node, &head->chain) {
-		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 			goto not_unique;
 	}
 
@@ -631,10 +199,10 @@ unique:
 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
-		tcp_tw_deschedule(tw);
+		inet_twsk_deschedule(tw, &tcp_death_row);
 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 
-		tcp_tw_put(tw);
+		inet_twsk_put(tw);
 	}
 
 	return 0;
@@ -657,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk)
  */
 static inline int tcp_v4_hash_connect(struct sock *sk)
 {
-	unsigned short snum = inet_sk(sk)->num;
- 	struct tcp_bind_hashbucket *head;
- 	struct tcp_bind_bucket *tb;
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
 	int ret;
 
  	if (!snum) {
@@ -671,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
 		static u32 hint;
 		u32 offset = hint + connect_port_offset(sk);
 		struct hlist_node *node;
- 		struct tcp_tw_bucket *tw = NULL;
+ 		struct inet_timewait_sock *tw = NULL;
 
  		local_bh_disable();
 		for (i = 1; i <= range; i++) {
 			port = low + (i + offset) % range;
- 			head = &tcp_bhash[tcp_bhashfn(port)];
+ 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
  			spin_lock(&head->lock);
 
  			/* Does not bother with rcv_saddr checks,
  			 * because the established check is already
  			 * unique enough.
  			 */
-			tb_for_each(tb, node, &head->chain) {
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
  				if (tb->port == port) {
  					BUG_TRAP(!hlist_empty(&tb->owners));
  					if (tb->fastreuse >= 0)
@@ -696,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
  				}
  			}
 
- 			tb = tcp_bucket_create(head, port);
+ 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
  			if (!tb) {
  				spin_unlock(&head->lock);
  				break;
@@ -715,27 +283,27 @@ ok:
 		hint += i;
 
  		/* Head lock still held and bh's disabled */
- 		tcp_bind_hash(sk, tb, port);
+ 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
  			inet_sk(sk)->sport = htons(port);
- 			__tcp_v4_hash(sk, 0);
+ 			__inet_hash(&tcp_hashinfo, sk, 0);
  		}
  		spin_unlock(&head->lock);
 
  		if (tw) {
- 			tcp_tw_deschedule(tw);
- 			tcp_tw_put(tw);
+ 			inet_twsk_deschedule(tw, &tcp_death_row);;
+ 			inet_twsk_put(tw);
  		}
 
 		ret = 0;
 		goto out;
  	}
 
- 	head  = &tcp_bhash[tcp_bhashfn(snum)];
- 	tb  = tcp_sk(sk)->bind_hash;
+ 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ 	tb  = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__tcp_v4_hash(sk, 0);
+		__inet_hash(&tcp_hashinfo, sk, 0);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
@@ -798,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		tp->write_seq		   = 0;
 	}
 
-	if (sysctl_tcp_tw_recycle &&
+	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 		struct inet_peer *peer = rt_get_peer(rt);
 
@@ -837,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		goto failure;
 
 	/* OK, now commit destination to socket.  */
-	__sk_dst_set(sk, &rt->u.dst);
-	tcp_v4_setup_caps(sk, &rt->u.dst);
+	sk_setup_caps(sk, &rt->u.dst);
 
 	if (!tp->write_seq)
 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -864,53 +431,6 @@ failure:
 	return err;
 }
 
-static __inline__ int tcp_v4_iif(struct sk_buff *skb)
-{
-	return ((struct rtable *)skb->dst)->rt_iif;
-}
-
-static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
-{
-	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
-}
-
-static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
-					      struct request_sock ***prevp,
-					      __u16 rport,
-					      __u32 raddr, __u32 laddr)
-{
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	struct request_sock *req, **prev;
-
-	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
-	     (req = *prev) != NULL;
-	     prev = &req->dl_next) {
-		const struct inet_request_sock *ireq = inet_rsk(req);
-
-		if (ireq->rmt_port == rport &&
-		    ireq->rmt_addr == raddr &&
-		    ireq->loc_addr == laddr &&
-		    TCP_INET_FAMILY(req->rsk_ops->family)) {
-			BUG_TRAP(!req->sk);
-			*prevp = prev;
-			break;
-		}
-	}
-
-	return req;
-}
-
-static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
-	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
-	tcp_synq_added(sk);
-}
-
-
 /*
  * This routine does path mtu discovery as defined in RFC1191.
  */
@@ -993,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 		return;
 	}
 
-	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
-			   th->source, tcp_v4_iif(skb));
+	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
+			 th->source, inet_iif(skb));
 	if (!sk) {
 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 		return;
 	}
 	if (sk->sk_state == TCP_TIME_WAIT) {
-		tcp_tw_put((struct tcp_tw_bucket *)sk);
+		inet_twsk_put((struct inet_timewait_sock *)sk);
 		return;
 	}
 
@@ -1054,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 		if (sock_owned_by_user(sk))
 			goto out;
 
-		req = tcp_v4_search_req(tp, &prev, th->dest,
-					iph->daddr, iph->saddr);
+		req = inet_csk_search_req(sk, &prev, th->dest,
+					  iph->daddr, iph->saddr);
 		if (!req)
 			goto out;
 
@@ -1075,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 		 * created socket, and POSIX does not want network
 		 * errors returned from accept().
 		 */
-		tcp_synq_drop(sk, req, prev);
+		inet_csk_reqsk_queue_drop(sk, req, prev);
 		goto out;
 
 	case TCP_SYN_SENT:
@@ -1245,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 
 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 
-	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
-			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
 
-	tcp_tw_put(tw);
+	inet_twsk_put(tw);
 }
 
 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1259,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 			req->ts_recent);
 }
 
-static struct dst_entry* tcp_v4_route_req(struct sock *sk,
-					  struct request_sock *req)
-{
-	struct rtable *rt;
-	const struct inet_request_sock *ireq = inet_rsk(req);
-	struct ip_options *opt = inet_rsk(req)->opt;
-	struct flowi fl = { .oif = sk->sk_bound_dev_if,
-			    .nl_u = { .ip4_u =
-				      { .daddr = ((opt && opt->srr) ?
-						  opt->faddr :
-						  ireq->rmt_addr),
-					.saddr = ireq->loc_addr,
-					.tos = RT_CONN_FLAGS(sk) } },
-			    .proto = IPPROTO_TCP,
-			    .uli_u = { .ports =
-				       { .sport = inet_sk(sk)->sport,
-					 .dport = ireq->rmt_port } } };
-
-	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
-		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-		return NULL;
-	}
-	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
-		ip_rt_put(rt);
-		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-		return NULL;
-	}
-	return &rt->u.dst;
-}
-
 /*
  *	Send a SYN-ACK after having received an ACK.
  *	This still operates on a request_sock only, not on a big
@@ -1302,7 +793,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 	struct sk_buff * skb;
 
 	/* First, grab a route. */
-	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 		goto out;
 
 	skb = tcp_make_synack(sk, dst, req);
@@ -1404,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * limitations, they conserve resources and peer is
 	 * evidently real one.
 	 */
-	if (tcp_synq_is_full(sk) && !isn) {
+	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
 #ifdef CONFIG_SYN_COOKIES
 		if (sysctl_tcp_syncookies) {
 			want_cookie = 1;
@@ -1418,7 +909,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * clogging syn queue with openreqs with exponentially increasing
 	 * timeout.
 	 */
-	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
 	req = reqsk_alloc(&tcp_request_sock_ops);
@@ -1474,8 +965,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 * are made in the function processing timewait state.
 		 */
 		if (tmp_opt.saw_tstamp &&
-		    sysctl_tcp_tw_recycle &&
-		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
+		    tcp_death_row.sysctl_tw_recycle &&
+		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 		    peer->v4daddr == saddr) {
 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
@@ -1488,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		}
 		/* Kill the following clause, if you dislike this way. */
 		else if (!sysctl_tcp_syncookies &&
-			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 (!peer || !peer->tcp_ts_stamp) &&
 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
@@ -1499,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			 * to destinations, already remembered
 			 * to the moment of synflood.
 			 */
-			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
-					      "request from %u.%u."
-					      "%u.%u/%u\n",
-					      NIPQUAD(saddr),
-					      ntohs(skb->h.th->source)));
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
+				       "request from %u.%u.%u.%u/%u\n",
+				       NIPQUAD(saddr),
+				       ntohs(skb->h.th->source));
 			dst_release(dst);
 			goto drop_and_free;
 		}
@@ -1518,7 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (want_cookie) {
 	   	reqsk_free(req);
 	} else {
-		tcp_v4_synq_add(sk, req);
+		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
 	return 0;
 
@@ -1546,15 +1036,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	if (sk_acceptq_is_full(sk))
 		goto exit_overflow;
 
-	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 		goto exit;
 
 	newsk = tcp_create_openreq_child(sk, req, skb);
 	if (!newsk)
 		goto exit;
 
-	newsk->sk_dst_cache = dst;
-	tcp_v4_setup_caps(newsk, dst);
+	sk_setup_caps(newsk, dst);
 
 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
@@ -1564,7 +1053,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newinet->saddr	      = ireq->loc_addr;
 	newinet->opt	      = ireq->opt;
 	ireq->opt	      = NULL;
-	newinet->mc_index     = tcp_v4_iif(skb);
+	newinet->mc_index     = inet_iif(skb);
 	newinet->mc_ttl	      = skb->nh.iph->ttl;
 	newtp->ext_header_len = 0;
 	if (newinet->opt)
@@ -1575,8 +1064,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
 
-	__tcp_v4_hash(newsk, 0);
-	__tcp_inherit_port(sk, newsk);
+	__inet_hash(&tcp_hashinfo, newsk, 0);
+	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
 
 	return newsk;
 
@@ -1592,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcphdr *th = skb->h.th;
 	struct iphdr *iph = skb->nh.iph;
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct sock *nsk;
 	struct request_sock **prev;
 	/* Find possible connection requests. */
-	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
-						     iph->saddr, iph->daddr);
+	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
+						       iph->saddr, iph->daddr);
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
-	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
-					  th->source,
-					  skb->nh.iph->daddr,
-					  ntohs(th->dest),
-					  tcp_v4_iif(skb));
+	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
+					th->source, skb->nh.iph->daddr,
+					ntohs(th->dest), inet_iif(skb));
 
 	if (nsk) {
 		if (nsk->sk_state != TCP_TIME_WAIT) {
 			bh_lock_sock(nsk);
 			return nsk;
 		}
-		tcp_tw_put((struct tcp_tw_bucket *)nsk);
+		inet_twsk_put((struct inet_timewait_sock *)nsk);
 		return NULL;
 	}
 
@@ -1631,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
 				  skb->nh.iph->daddr, skb->csum))
 			return 0;
 
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
 		skb->ip_summed = CHECKSUM_NONE;
 	}
 	if (skb->len <= 76) {
@@ -1747,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
 	TCP_SKB_CB(skb)->sacked	 = 0;
 
-	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
-			     skb->nh.iph->daddr, ntohs(th->dest),
-			     tcp_v4_iif(skb));
+	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
+			   skb->nh.iph->daddr, ntohs(th->dest),
+			   inet_iif(skb));
 
 	if (!sk)
 		goto no_tcp_socket;
@@ -1801,24 +1287,26 @@ discard_and_relse:
 
 do_time_wait:
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		inet_twsk_put((struct inet_timewait_sock *) sk);
 		goto discard_it;
 	}
 
 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
-		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		inet_twsk_put((struct inet_timewait_sock *) sk);
 		goto discard_it;
 	}
-	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
-					   skb, th, skb->len)) {
+	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+					   skb, th)) {
 	case TCP_TW_SYN: {
-		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
-							  ntohs(th->dest),
-							  tcp_v4_iif(skb));
+		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
+							skb->nh.iph->daddr,
+							ntohs(th->dest),
+							inet_iif(skb));
 		if (sk2) {
-			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
-			tcp_tw_put((struct tcp_tw_bucket *)sk);
+			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
+					     &tcp_death_row);
+			inet_twsk_put((struct inet_timewait_sock *)sk);
 			sk = sk2;
 			goto process;
 		}
@@ -1834,112 +1322,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-/* With per-bucket locks this operation is not-atomic, so that
- * this version is not worse.
- */
-static void __tcp_v4_rehash(struct sock *sk)
-{
-	sk->sk_prot->unhash(sk);
-	sk->sk_prot->hash(sk);
-}
-
-static int tcp_v4_reselect_saddr(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	int err;
-	struct rtable *rt;
-	__u32 old_saddr = inet->saddr;
-	__u32 new_saddr;
-	__u32 daddr = inet->daddr;
-
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
-
-	/* Query new route. */
-	err = ip_route_connect(&rt, daddr, 0,
-			       RT_CONN_FLAGS(sk),
-			       sk->sk_bound_dev_if,
-			       IPPROTO_TCP,
-			       inet->sport, inet->dport, sk);
-	if (err)
-		return err;
-
-	__sk_dst_set(sk, &rt->u.dst);
-	tcp_v4_setup_caps(sk, &rt->u.dst);
-
-	new_saddr = rt->rt_src;
-
-	if (new_saddr == old_saddr)
-		return 0;
-
-	if (sysctl_ip_dynaddr > 1) {
-		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
-				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
-		       NIPQUAD(old_saddr),
-		       NIPQUAD(new_saddr));
-	}
-
-	inet->saddr = new_saddr;
-	inet->rcv_saddr = new_saddr;
-
-	/* XXX The only one ugly spot where we need to
-	 * XXX really change the sockets identity after
-	 * XXX it has entered the hashes. -DaveM
-	 *
-	 * Besides that, it does not check for connection
-	 * uniqueness. Wait for troubles.
-	 */
-	__tcp_v4_rehash(sk);
-	return 0;
-}
-
-int tcp_v4_rebuild_header(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
-	u32 daddr;
-	int err;
-
-	/* Route is OK, nothing to do. */
-	if (rt)
-		return 0;
-
-	/* Reroute. */
-	daddr = inet->daddr;
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
-
-	{
-		struct flowi fl = { .oif = sk->sk_bound_dev_if,
-				    .nl_u = { .ip4_u =
-					      { .daddr = daddr,
-						.saddr = inet->saddr,
-						.tos = RT_CONN_FLAGS(sk) } },
-				    .proto = IPPROTO_TCP,
-				    .uli_u = { .ports =
-					       { .sport = inet->sport,
-						 .dport = inet->dport } } };
-						
-		err = ip_route_output_flow(&rt, &fl, sk, 0);
-	}
-	if (!err) {
-		__sk_dst_set(sk, &rt->u.dst);
-		tcp_v4_setup_caps(sk, &rt->u.dst);
-		return 0;
-	}
-
-	/* Routing failed... */
-	sk->sk_route_caps = 0;
-
-	if (!sysctl_ip_dynaddr ||
-	    sk->sk_state != TCP_SYN_SENT ||
-	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
-	    (err = tcp_v4_reselect_saddr(sk)) != 0)
-		sk->sk_err_soft = -err;
-
-	return err;
-}
-
 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -1988,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk)
 	return 0;
 }
 
-int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
-	struct inet_peer *peer = NULL;
-
-	peer = inet_getpeer(tw->tw_daddr, 1);
+	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
 
 	if (peer) {
-		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
+		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
-		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
-			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
-			peer->tcp_ts = tw->tw_ts_recent;
+		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
+			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+			peer->tcp_ts	   = tcptw->tw_ts_recent;
 		}
 		inet_putpeer(peer);
 		return 1;
@@ -2011,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
 struct tcp_func ipv4_specific = {
 	.queue_xmit	=	ip_queue_xmit,
 	.send_check	=	tcp_v4_send_check,
-	.rebuild_header	=	tcp_v4_rebuild_header,
+	.rebuild_header	=	inet_sk_rebuild_header,
 	.conn_request	=	tcp_v4_conn_request,
 	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
 	.remember_stamp	=	tcp_v4_remember_stamp,
@@ -2027,13 +1409,14 @@ struct tcp_func ipv4_specific = {
  */
 static int tcp_v4_init_sock(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
 
-	tp->rto  = TCP_TIMEOUT_INIT;
+	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
 
 	/* So many TCP implementations out there (incorrectly) count the
@@ -2051,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
-	tp->ca_ops = &tcp_init_congestion_ops;
+	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 
 	sk->sk_state = TCP_CLOSE;
 
@@ -2074,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
 
 	tcp_clear_xmit_timers(sk);
 
-	tcp_cleanup_congestion_control(tp);
+	tcp_cleanup_congestion_control(sk);
 
 	/* Cleanup up the write buffer. */
   	sk_stream_writequeue_purge(sk);
@@ -2086,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
 	__skb_queue_purge(&tp->ucopy.prequeue);
 
 	/* Clean up a referenced TCP bind bucket. */
-	if (tp->bind_hash)
-		tcp_put_port(sk);
+	if (inet_csk(sk)->icsk_bind_hash)
+		inet_put_port(&tcp_hashinfo, sk);
 
 	/*
 	 * If sendmsg cached page exists, toss it.
@@ -2107,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
-static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
 {
 	return hlist_empty(head) ? NULL :
-		list_entry(head->first, struct tcp_tw_bucket, tw_node);
+		list_entry(head->first, struct inet_timewait_sock, tw_node);
 }
 
-static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
+static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 {
 	return tw->tw_node.next ?
 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
@@ -2121,14 +1504,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
 
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
-	struct tcp_sock *tp;
+	struct inet_connection_sock *icsk;
 	struct hlist_node *node;
 	struct sock *sk = cur;
 	struct tcp_iter_state* st = seq->private;
 
 	if (!sk) {
 		st->bucket = 0;
-		sk = sk_head(&tcp_listening_hash[0]);
+		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
 		goto get_sk;
 	}
 
@@ -2137,7 +1520,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
 		struct request_sock *req = cur;
 
-	       	tp = tcp_sk(st->syn_wait_sk);
+	       	icsk = inet_csk(st->syn_wait_sk);
 		req = req->dl_next;
 		while (1) {
 			while (req) {
@@ -2150,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
 				break;
 get_req:
-			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
+			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
 		}
 		sk	  = sk_next(st->syn_wait_sk);
 		st->state = TCP_SEQ_STATE_LISTENING;
-		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	} else {
-	       	tp = tcp_sk(sk);
-		read_lock_bh(&tp->accept_queue.syn_wait_lock);
-		if (reqsk_queue_len(&tp->accept_queue))
+	       	icsk = inet_csk(sk);
+		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&icsk->icsk_accept_queue))
 			goto start_req;
-		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		sk = sk_next(sk);
 	}
 get_sk:
@@ -2169,9 +1552,9 @@ get_sk:
 			cur = sk;
 			goto out;
 		}
-	       	tp = tcp_sk(sk);
-		read_lock_bh(&tp->accept_queue.syn_wait_lock);
-		if (reqsk_queue_len(&tp->accept_queue)) {
+	       	icsk = inet_csk(sk);
+		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
 start_req:
 			st->uid		= sock_i_uid(sk);
 			st->syn_wait_sk = sk;
@@ -2179,10 +1562,10 @@ start_req:
 			st->sbucket	= 0;
 			goto get_req;
 		}
-		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	}
-	if (++st->bucket < TCP_LHTABLE_SIZE) {
-		sk = sk_head(&tcp_listening_hash[st->bucket]);
+	if (++st->bucket < INET_LHTABLE_SIZE) {
+		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
 		goto get_sk;
 	}
 	cur = NULL;
@@ -2206,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq)
 	struct tcp_iter_state* st = seq->private;
 	void *rc = NULL;
 
-	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
+	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_node *node;
-		struct tcp_tw_bucket *tw;
+		struct inet_timewait_sock *tw;
 
 		/* We can reschedule _before_ having picked the target: */
 		cond_resched_softirq();
 
-		read_lock(&tcp_ehash[st->bucket].lock);
-		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
+		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
 				continue;
 			}
@@ -2223,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq)
 			goto out;
 		}
 		st->state = TCP_SEQ_STATE_TIME_WAIT;
-		tw_for_each(tw, node,
-			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
+		inet_twsk_for_each(tw, node,
+				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
 			if (tw->tw_family != st->family) {
 				continue;
 			}
 			rc = tw;
 			goto out;
 		}
-		read_unlock(&tcp_ehash[st->bucket].lock);
+		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
 out:
@@ -2241,7 +1624,7 @@ out:
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
 	struct sock *sk = cur;
-	struct tcp_tw_bucket *tw;
+	struct inet_timewait_sock *tw;
 	struct hlist_node *node;
 	struct tcp_iter_state* st = seq->private;
 
@@ -2258,15 +1641,15 @@ get_tw:
 			cur = tw;
 			goto out;
 		}
-		read_unlock(&tcp_ehash[st->bucket].lock);
+		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
 		/* We can reschedule between buckets: */
 		cond_resched_softirq();
 
-		if (++st->bucket < tcp_ehash_size) {
-			read_lock(&tcp_ehash[st->bucket].lock);
-			sk = sk_head(&tcp_ehash[st->bucket].chain);
+		if (++st->bucket < tcp_hashinfo.ehash_size) {
+			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
 		} else {
 			cur = NULL;
 			goto out;
@@ -2280,7 +1663,7 @@ get_tw:
 	}
 
 	st->state = TCP_SEQ_STATE_TIME_WAIT;
-	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
+	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
 	goto get_tw;
 found:
 	cur = sk;
@@ -2304,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 	void *rc;
 	struct tcp_iter_state* st = seq->private;
 
-	tcp_listen_lock();
+	inet_listen_lock(&tcp_hashinfo);
 	st->state = TCP_SEQ_STATE_LISTENING;
 	rc	  = listening_get_idx(seq, &pos);
 
 	if (!rc) {
-		tcp_listen_unlock();
+		inet_listen_unlock(&tcp_hashinfo);
 		local_bh_disable();
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		rc	  = established_get_idx(seq, pos);
@@ -2342,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	case TCP_SEQ_STATE_LISTENING:
 		rc = listening_get_next(seq, v);
 		if (!rc) {
-			tcp_listen_unlock();
+			inet_listen_unlock(&tcp_hashinfo);
 			local_bh_disable();
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			rc	  = established_get_first(seq);
@@ -2365,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	switch (st->state) {
 	case TCP_SEQ_STATE_OPENREQ:
 		if (v) {
-			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
-			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		}
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
-			tcp_listen_unlock();
+			inet_listen_unlock(&tcp_hashinfo);
 		break;
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
-			read_unlock(&tcp_ehash[st->bucket].lock);
+			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		local_bh_enable();
 		break;
 	}
@@ -2472,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
 	int timer_active;
 	unsigned long timer_expires;
 	struct tcp_sock *tp = tcp_sk(sp);
+	const struct inet_connection_sock *icsk = inet_csk(sp);
 	struct inet_sock *inet = inet_sk(sp);
 	unsigned int dest = inet->daddr;
 	unsigned int src = inet->rcv_saddr;
 	__u16 destp = ntohs(inet->dport);
 	__u16 srcp = ntohs(inet->sport);
 
-	if (tp->pending == TCP_TIME_RETRANS) {
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
 		timer_active	= 1;
-		timer_expires	= tp->timeout;
-	} else if (tp->pending == TCP_TIME_PROBE0) {
+		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
-		timer_expires	= tp->timeout;
+		timer_expires	= icsk->icsk_timeout;
 	} else if (timer_pending(&sp->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sp->sk_timer.expires;
@@ -2498,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
 		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
 		timer_active,
 		jiffies_to_clock_t(timer_expires - jiffies),
-		tp->retransmits,
+		icsk->icsk_retransmits,
 		sock_i_uid(sp),
-		tp->probes_out,
+		icsk->icsk_probes_out,
 		sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
-		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
+		icsk->icsk_rto,
+		icsk->icsk_ack.ato,
+		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
 		tp->snd_cwnd,
 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
 }
 
-static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
+static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
 {
 	unsigned int dest, src;
 	__u16 destp, srcp;
@@ -2588,7 +1974,7 @@ struct proto tcp_prot = {
 	.close			= tcp_close,
 	.connect		= tcp_v4_connect,
 	.disconnect		= tcp_disconnect,
-	.accept			= tcp_accept,
+	.accept			= inet_csk_accept,
 	.ioctl			= tcp_ioctl,
 	.init			= tcp_v4_init_sock,
 	.destroy		= tcp_v4_destroy_sock,
@@ -2603,6 +1989,7 @@ struct proto tcp_prot = {
 	.get_port		= tcp_v4_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.sockets_allocated	= &tcp_sockets_allocated,
+	.orphan_count		= &tcp_orphan_count,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
 	.sysctl_mem		= sysctl_tcp_mem,
@@ -2610,6 +1997,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
+	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
 	.rsk_prot		= &tcp_request_sock_ops,
 };
 
@@ -2631,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops)
 }
 
 EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_bind_hash);
-EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(inet_bind_bucket_create);
 EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_inherit_port);
-EXPORT_SYMBOL(tcp_listen_wlock);
-EXPORT_SYMBOL(tcp_port_rover);
 EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_put_port);
 EXPORT_SYMBOL(tcp_unhash);
 EXPORT_SYMBOL(tcp_v4_conn_request);
 EXPORT_SYMBOL(tcp_v4_connect);
 EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_rebuild_header);
 EXPORT_SYMBOL(tcp_v4_remember_stamp);
 EXPORT_SYMBOL(tcp_v4_send_check);
 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f42a284164b..a88db28b0af 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,27 @@
 #define SYNC_INIT 1
 #endif
 
-int sysctl_tcp_tw_recycle;
-int sysctl_tcp_max_tw_buckets = NR_FILE*2;
-
 int sysctl_tcp_syncookies = SYNC_INIT; 
 int sysctl_tcp_abort_on_overflow;
 
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
+struct inet_timewait_death_row tcp_death_row = {
+	.sysctl_max_tw_buckets = NR_FILE * 2,
+	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+	.death_lock	= SPIN_LOCK_UNLOCKED,
+	.hashinfo	= &tcp_hashinfo,
+	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
+					    (unsigned long)&tcp_death_row),
+	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
+					     inet_twdr_twkill_work,
+					     &tcp_death_row),
+/* Short-time timewait calendar */
+
+	.twcal_hand	= -1,
+	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+					    (unsigned long)&tcp_death_row),
+};
+
+EXPORT_SYMBOL_GPL(tcp_death_row);
 
 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 	return (seq == e_win && seq == end_seq);
 }
 
-/* New-style handling of TIME_WAIT sockets. */
-
-int tcp_tw_count;
-
-
-/* Must be called with locally disabled BHs. */
-static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
-{
-	struct tcp_ehash_bucket *ehead;
-	struct tcp_bind_hashbucket *bhead;
-	struct tcp_bind_bucket *tb;
-
-	/* Unlink from established hashes. */
-	ehead = &tcp_ehash[tw->tw_hashent];
-	write_lock(&ehead->lock);
-	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(&ehead->lock);
-		return;
-	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
-	write_unlock(&ehead->lock);
-
-	/* Disassociate with bind bucket. */
-	bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
-	spin_lock(&bhead->lock);
-	tb = tw->tw_tb;
-	__hlist_del(&tw->tw_bind_node);
-	tw->tw_tb = NULL;
-	tcp_bucket_destroy(tb);
-	spin_unlock(&bhead->lock);
-
-#ifdef INET_REFCNT_DEBUG
-	if (atomic_read(&tw->tw_refcnt) != 1) {
-		printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
-		       atomic_read(&tw->tw_refcnt));
-	}
-#endif
-	tcp_tw_put(tw);
-}
-
 /* 
  * * Main purpose of TIME-WAIT state is to close connection gracefully,
  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
  * to avoid misread sequence numbers, states etc.  --ANK
  */
 enum tcp_tw_status
-tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
-			   struct tcphdr *th, unsigned len)
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+			   const struct tcphdr *th)
 {
+	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 	struct tcp_options_received tmp_opt;
 	int paws_reject = 0;
 
 	tmp_opt.saw_tstamp = 0;
-	if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
+	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
 		tcp_parse_options(skb, &tmp_opt, 0);
 
 		if (tmp_opt.saw_tstamp) {
-			tmp_opt.ts_recent	   = tw->tw_ts_recent;
-			tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
+			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
 			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
 		}
 	}
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
 		/* Out of window, send ACK */
 		if (paws_reject ||
 		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-				   tw->tw_rcv_nxt,
-				   tw->tw_rcv_nxt + tw->tw_rcv_wnd))
+				   tcptw->tw_rcv_nxt,
+				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
 			return TCP_TW_ACK;
 
 		if (th->rst)
 			goto kill;
 
-		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
+		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
 			goto kill_with_rst;
 
 		/* Dup ACK? */
-		if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
+		if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
 		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
-			tcp_tw_put(tw);
+			inet_twsk_put(tw);
 			return TCP_TW_SUCCESS;
 		}
 
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
 		 * reset.
 		 */
 		if (!th->fin ||
-		    TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
+		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
 kill_with_rst:
-			tcp_tw_deschedule(tw);
-			tcp_tw_put(tw);
+			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_put(tw);
 			return TCP_TW_RST;
 		}
 
 		/* FIN arrived, enter true time-wait state. */
-		tw->tw_substate	= TCP_TIME_WAIT;
-		tw->tw_rcv_nxt	= TCP_SKB_CB(skb)->end_seq;
+		tw->tw_substate	  = TCP_TIME_WAIT;
+		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (tmp_opt.saw_tstamp) {
-			tw->tw_ts_recent_stamp	= xtime.tv_sec;
-			tw->tw_ts_recent	= tmp_opt.rcv_tsval;
+			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 		}
 
 		/* I am shamed, but failed to make it more elegant.
@@ -187,11 +161,13 @@ kill_with_rst:
 		 * do not undertsnad recycling in any case, it not
 		 * a big problem in practice. --ANK */
 		if (tw->tw_family == AF_INET &&
-		    sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
+		    tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
 		    tcp_v4_tw_remember_stamp(tw))
-			tcp_tw_schedule(tw, tw->tw_timeout);
+			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
+					   TCP_TIMEWAIT_LEN);
 		else
-			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+					   TCP_TIMEWAIT_LEN);
 		return TCP_TW_ACK;
 	}
 
@@ -213,7 +189,7 @@ kill_with_rst:
 	 */
 
 	if (!paws_reject &&
-	    (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
+	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
 	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
 		/* In window segment, it may be only reset or bare ack. */
 
@@ -224,19 +200,20 @@ kill_with_rst:
 			 */
 			if (sysctl_tcp_rfc1337 == 0) {
 kill:
-				tcp_tw_deschedule(tw);
-				tcp_tw_put(tw);
+				inet_twsk_deschedule(tw, &tcp_death_row);
+				inet_twsk_put(tw);
 				return TCP_TW_SUCCESS;
 			}
 		}
-		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+				   TCP_TIMEWAIT_LEN);
 
 		if (tmp_opt.saw_tstamp) {
-			tw->tw_ts_recent	= tmp_opt.rcv_tsval;
-			tw->tw_ts_recent_stamp	= xtime.tv_sec;
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
+			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
 		}
 
-		tcp_tw_put(tw);
+		inet_twsk_put(tw);
 		return TCP_TW_SUCCESS;
 	}
 
@@ -258,9 +235,10 @@ kill:
 	 */
 
 	if (th->syn && !th->rst && !th->ack && !paws_reject &&
-	    (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
-	     (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
-		u32 isn = tw->tw_snd_nxt + 65535 + 2;
+	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+	     (tmp_opt.saw_tstamp &&
+	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
 		if (isn == 0)
 			isn++;
 		TCP_SKB_CB(skb)->when = isn;
@@ -278,107 +256,57 @@ kill:
 		 * Do not reschedule in the last case.
 		 */
 		if (paws_reject || th->ack)
-			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+					   TCP_TIMEWAIT_LEN);
 
 		/* Send ACK. Note, we do not put the bucket,
 		 * it will be released by caller.
 		 */
 		return TCP_TW_ACK;
 	}
-	tcp_tw_put(tw);
+	inet_twsk_put(tw);
 	return TCP_TW_SUCCESS;
 }
 
-/* Enter the time wait state.  This is called with locally disabled BH.
- * Essentially we whip up a timewait bucket, copy the
- * relevant info into it from the SK, and mess with hash chains
- * and list linkage.
- */
-static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
-{
-	struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
-	struct tcp_bind_hashbucket *bhead;
-
-	/* Step 1: Put TW into bind hash. Original socket stays there too.
-	   Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
-	   binding cache, even if it is closed.
-	 */
-	bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
-	spin_lock(&bhead->lock);
-	tw->tw_tb = tcp_sk(sk)->bind_hash;
-	BUG_TRAP(tcp_sk(sk)->bind_hash);
-	tw_add_bind_node(tw, &tw->tw_tb->owners);
-	spin_unlock(&bhead->lock);
-
-	write_lock(&ehead->lock);
-
-	/* Step 2: Remove SK from established hash. */
-	if (__sk_del_node_init(sk))
-		sock_prot_dec_use(sk->sk_prot);
-
-	/* Step 3: Hash TW into TIMEWAIT half of established hash table. */
-	tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
-	atomic_inc(&tw->tw_refcnt);
-
-	write_unlock(&ehead->lock);
-}
-
 /* 
  * Move a socket to time-wait or dead fin-wait-2 state.
  */ 
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
-	struct tcp_tw_bucket *tw = NULL;
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_timewait_sock *tw = NULL;
+	const struct tcp_sock *tp = tcp_sk(sk);
 	int recycle_ok = 0;
 
-	if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
+	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tp->af_specific->remember_stamp(sk);
 
-	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
-		tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
-
-	if(tw != NULL) {
-		struct inet_sock *inet = inet_sk(sk);
-		int rto = (tp->rto<<2) - (tp->rto>>1);
-
-		/* Give us an identity. */
-		tw->tw_daddr		= inet->daddr;
-		tw->tw_rcv_saddr	= inet->rcv_saddr;
-		tw->tw_bound_dev_if	= sk->sk_bound_dev_if;
-		tw->tw_num		= inet->num;
-		tw->tw_state		= TCP_TIME_WAIT;
-		tw->tw_substate		= state;
-		tw->tw_sport		= inet->sport;
-		tw->tw_dport		= inet->dport;
-		tw->tw_family		= sk->sk_family;
-		tw->tw_reuse		= sk->sk_reuse;
-		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
-		atomic_set(&tw->tw_refcnt, 1);
+	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+		tw = inet_twsk_alloc(sk, state);
 
-		tw->tw_hashent		= sk->sk_hashent;
-		tw->tw_rcv_nxt		= tp->rcv_nxt;
-		tw->tw_snd_nxt		= tp->snd_nxt;
-		tw->tw_rcv_wnd		= tcp_receive_window(tp);
-		tw->tw_ts_recent	= tp->rx_opt.ts_recent;
-		tw->tw_ts_recent_stamp	= tp->rx_opt.ts_recent_stamp;
-		tw_dead_node_init(tw);
+	if (tw != NULL) {
+		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
+		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
+		tcptw->tw_snd_nxt	= tp->snd_nxt;
+		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
+		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
+		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
+			struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
 
-			ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
-			ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
-			tw->tw_v6_ipv6only = np->ipv6only;
-		} else {
-			memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
-			memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
-			tw->tw_v6_ipv6only = 0;
+			ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_ipv6only = np->ipv6only;
 		}
 #endif
 		/* Linkage updates. */
-		__tcp_tw_hashdance(sk, tw);
+		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
 
 		/* Get the TIME_WAIT timeout firing. */
 		if (timeo < rto)
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 				timeo = TCP_TIMEWAIT_LEN;
 		}
 
-		tcp_tw_schedule(tw, timeo);
-		tcp_tw_put(tw);
+		inet_twsk_schedule(tw, &tcp_death_row, timeo,
+				   TCP_TIMEWAIT_LEN);
+		inet_twsk_put(tw);
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
 		 * socket up.  We've got bigger problems than
@@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	tcp_done(sk);
 }
 
-/* Kill off TIME_WAIT sockets once their lifetime has expired. */
-static int tcp_tw_death_row_slot;
-
-static void tcp_twkill(unsigned long);
-
-/* TIME_WAIT reaping mechanism. */
-#define TCP_TWKILL_SLOTS	8	/* Please keep this a power of 2. */
-#define TCP_TWKILL_PERIOD	(TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
-
-#define TCP_TWKILL_QUOTA	100
-
-static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
-static DEFINE_SPINLOCK(tw_death_lock);
-static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
-static void twkill_work(void *);
-static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
-static u32 twkill_thread_slots;
-
-/* Returns non-zero if quota exceeded.  */
-static int tcp_do_twkill_work(int slot, unsigned int quota)
-{
-	struct tcp_tw_bucket *tw;
-	struct hlist_node *node;
-	unsigned int killed;
-	int ret;
-
-	/* NOTE: compare this to previous version where lock
-	 * was released after detaching chain. It was racy,
-	 * because tw buckets are scheduled in not serialized context
-	 * in 2.3 (with netfilter), and with softnet it is common, because
-	 * soft irqs are not sequenced.
-	 */
-	killed = 0;
-	ret = 0;
-rescan:
-	tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
-		__tw_del_dead_node(tw);
-		spin_unlock(&tw_death_lock);
-		tcp_timewait_kill(tw);
-		tcp_tw_put(tw);
-		killed++;
-		spin_lock(&tw_death_lock);
-		if (killed > quota) {
-			ret = 1;
-			break;
-		}
-
-		/* While we dropped tw_death_lock, another cpu may have
-		 * killed off the next TW bucket in the list, therefore
-		 * do a fresh re-read of the hlist head node with the
-		 * lock reacquired.  We still use the hlist traversal
-		 * macro in order to get the prefetches.
-		 */
-		goto rescan;
-	}
-
-	tcp_tw_count -= killed;
-	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
-
-	return ret;
-}
-
-static void tcp_twkill(unsigned long dummy)
-{
-	int need_timer, ret;
-
-	spin_lock(&tw_death_lock);
-
-	if (tcp_tw_count == 0)
-		goto out;
-
-	need_timer = 0;
-	ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
-	if (ret) {
-		twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
-		mb();
-		schedule_work(&tcp_twkill_work);
-		need_timer = 1;
-	} else {
-		/* We purged the entire slot, anything left?  */
-		if (tcp_tw_count)
-			need_timer = 1;
-	}
-	tcp_tw_death_row_slot =
-		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
-	if (need_timer)
-		mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
-out:
-	spin_unlock(&tw_death_lock);
-}
-
-extern void twkill_slots_invalid(void);
-
-static void twkill_work(void *dummy)
-{
-	int i;
-
-	if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
-		twkill_slots_invalid();
-
-	while (twkill_thread_slots) {
-		spin_lock_bh(&tw_death_lock);
-		for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
-			if (!(twkill_thread_slots & (1 << i)))
-				continue;
-
-			while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
-				if (need_resched()) {
-					spin_unlock_bh(&tw_death_lock);
-					schedule();
-					spin_lock_bh(&tw_death_lock);
-				}
-			}
-
-			twkill_thread_slots &= ~(1 << i);
-		}
-		spin_unlock_bh(&tw_death_lock);
-	}
-}
-
-/* These are always called from BH context.  See callers in
- * tcp_input.c to verify this.
- */
-
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
-{
-	spin_lock(&tw_death_lock);
-	if (tw_del_dead_node(tw)) {
-		tcp_tw_put(tw);
-		if (--tcp_tw_count == 0)
-			del_timer(&tcp_tw_timer);
-	}
-	spin_unlock(&tw_death_lock);
-	tcp_timewait_kill(tw);
-}
-
-/* Short-time timewait calendar */
-
-static int tcp_twcal_hand = -1;
-static int tcp_twcal_jiffie;
-static void tcp_twcal_tick(unsigned long);
-static struct timer_list tcp_twcal_timer =
-		TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
-static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
-
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
-{
-	struct hlist_head *list;
-	int slot;
-
-	/* timeout := RTO * 3.5
-	 *
-	 * 3.5 = 1+2+0.5 to wait for two retransmits.
-	 *
-	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
-	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
-	 * FINs (or previous seqments) are lost (probability of such event
-	 * is p^(N+1), where p is probability to lose single packet and
-	 * time to detect the loss is about RTO*(2^N - 1) with exponential
-	 * backoff). Normal timewait length is calculated so, that we
-	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
-	 * [ BTW Linux. following BSD, violates this requirement waiting
-	 *   only for 60sec, we should wait at least for 240 secs.
-	 *   Well, 240 consumes too much of resources 8)
-	 * ]
-	 * This interval is not reduced to catch old duplicate and
-	 * responces to our wandering segments living for two MSLs.
-	 * However, if we use PAWS to detect
-	 * old duplicates, we can reduce the interval to bounds required
-	 * by RTO, rather than MSL. So, if peer understands PAWS, we
-	 * kill tw bucket after 3.5*RTO (it is important that this number
-	 * is greater than TS tick!) and detect old duplicates with help
-	 * of PAWS.
-	 */
-	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
-
-	spin_lock(&tw_death_lock);
-
-	/* Unlink it, if it was scheduled */
-	if (tw_del_dead_node(tw))
-		tcp_tw_count--;
-	else
-		atomic_inc(&tw->tw_refcnt);
-
-	if (slot >= TCP_TW_RECYCLE_SLOTS) {
-		/* Schedule to slow timer */
-		if (timeo >= TCP_TIMEWAIT_LEN) {
-			slot = TCP_TWKILL_SLOTS-1;
-		} else {
-			slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
-			if (slot >= TCP_TWKILL_SLOTS)
-				slot = TCP_TWKILL_SLOTS-1;
-		}
-		tw->tw_ttd = jiffies + timeo;
-		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
-		list = &tcp_tw_death_row[slot];
-	} else {
-		tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
-
-		if (tcp_twcal_hand < 0) {
-			tcp_twcal_hand = 0;
-			tcp_twcal_jiffie = jiffies;
-			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
-			add_timer(&tcp_twcal_timer);
-		} else {
-			if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
-				mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
-			slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
-		}
-		list = &tcp_twcal_row[slot];
-	}
-
-	hlist_add_head(&tw->tw_death_node, list);
-
-	if (tcp_tw_count++ == 0)
-		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
-	spin_unlock(&tw_death_lock);
-}
-
-void tcp_twcal_tick(unsigned long dummy)
-{
-	int n, slot;
-	unsigned long j;
-	unsigned long now = jiffies;
-	int killed = 0;
-	int adv = 0;
-
-	spin_lock(&tw_death_lock);
-	if (tcp_twcal_hand < 0)
-		goto out;
-
-	slot = tcp_twcal_hand;
-	j = tcp_twcal_jiffie;
-
-	for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
-		if (time_before_eq(j, now)) {
-			struct hlist_node *node, *safe;
-			struct tcp_tw_bucket *tw;
-
-			tw_for_each_inmate_safe(tw, node, safe,
-					   &tcp_twcal_row[slot]) {
-				__tw_del_dead_node(tw);
-				tcp_timewait_kill(tw);
-				tcp_tw_put(tw);
-				killed++;
-			}
-		} else {
-			if (!adv) {
-				adv = 1;
-				tcp_twcal_jiffie = j;
-				tcp_twcal_hand = slot;
-			}
-
-			if (!hlist_empty(&tcp_twcal_row[slot])) {
-				mod_timer(&tcp_twcal_timer, j);
-				goto out;
-			}
-		}
-		j += (1<<TCP_TW_RECYCLE_TICK);
-		slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
-	}
-	tcp_twcal_hand = -1;
-
-out:
-	if ((tcp_tw_count -= killed) == 0)
-		del_timer(&tcp_tw_timer);
-	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
-	spin_unlock(&tw_death_lock);
-}
-
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -686,75 +344,27 @@ out:
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
-	/* allocate the newsk from the same slab of the master sock,
-	 * if not, at sk_free time we'll try to free it from the wrong
-	 * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
-	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
+	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
 
-	if(newsk != NULL) {
-		struct inet_request_sock *ireq = inet_rsk(req);
+	if (newsk != NULL) {
+		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct tcp_request_sock *treq = tcp_rsk(req);
+		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct tcp_sock *newtp;
-		struct sk_filter *filter;
-
-		memcpy(newsk, sk, sizeof(struct tcp_sock));
-		newsk->sk_state = TCP_SYN_RECV;
-
-		/* SANITY */
-		sk_node_init(&newsk->sk_node);
-		tcp_sk(newsk)->bind_hash = NULL;
-
-		/* Clone the TCP header template */
-		inet_sk(newsk)->dport = ireq->rmt_port;
-
-		sock_lock_init(newsk);
-		bh_lock_sock(newsk);
-
-		rwlock_init(&newsk->sk_dst_lock);
-		atomic_set(&newsk->sk_rmem_alloc, 0);
-		skb_queue_head_init(&newsk->sk_receive_queue);
-		atomic_set(&newsk->sk_wmem_alloc, 0);
-		skb_queue_head_init(&newsk->sk_write_queue);
-		atomic_set(&newsk->sk_omem_alloc, 0);
-		newsk->sk_wmem_queued = 0;
-		newsk->sk_forward_alloc = 0;
-
-		sock_reset_flag(newsk, SOCK_DONE);
-		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
-		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
-		newsk->sk_send_head = NULL;
-		rwlock_init(&newsk->sk_callback_lock);
-		skb_queue_head_init(&newsk->sk_error_queue);
-		newsk->sk_write_space = sk_stream_write_space;
-
-		if ((filter = newsk->sk_filter) != NULL)
-			sk_filter_charge(newsk, filter);
-
-		if (unlikely(xfrm_sk_clone_policy(newsk))) {
-			/* It is still raw copy of parent, so invalidate
-			 * destructor and make plain sk_free() */
-			newsk->sk_destruct = NULL;
-			sk_free(newsk);
-			return NULL;
-		}
 
 		/* Now setup tcp_sock */
 		newtp = tcp_sk(newsk);
 		newtp->pred_flags = 0;
 		newtp->rcv_nxt = treq->rcv_isn + 1;
-		newtp->snd_nxt = treq->snt_isn + 1;
-		newtp->snd_una = treq->snt_isn + 1;
-		newtp->snd_sml = treq->snt_isn + 1;
+		newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
 
 		tcp_prequeue_init(newtp);
 
 		tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
 
-		newtp->retransmits = 0;
-		newtp->backoff = 0;
 		newtp->srtt = 0;
 		newtp->mdev = TCP_TIMEOUT_INIT;
-		newtp->rto = TCP_TIMEOUT_INIT;
+		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
 
 		newtp->packets_out = 0;
 		newtp->left_out = 0;
@@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->frto_counter = 0;
 		newtp->frto_highmark = 0;
 
-		newtp->ca_ops = &tcp_reno;
+		newicsk->icsk_ca_ops = &tcp_reno;
 
-		tcp_set_ca_state(newtp, TCP_CA_Open);
+		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->rcv_wup = treq->rcv_isn + 1;
@@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->rx_opt.dsack = 0;
 		newtp->rx_opt.eff_sacks = 0;
 
-		newtp->probes_out = 0;
 		newtp->rx_opt.num_sacks = 0;
 		newtp->urg_data = 0;
-		/* Deinitialize accept_queue to trap illegal accesses. */
-		memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
-
-		/* Back to base struct sock members. */
-		newsk->sk_err = 0;
-		newsk->sk_priority = 0;
-		atomic_set(&newsk->sk_refcnt, 2);
-#ifdef INET_REFCNT_DEBUG
-		atomic_inc(&inet_sock_nr);
-#endif
-		atomic_inc(&tcp_sockets_allocated);
 
 		if (sock_flag(newsk, SOCK_KEEPOPEN))
-			tcp_reset_keepalive_timer(newsk,
-						  keepalive_time_when(newtp));
-		newsk->sk_socket = NULL;
-		newsk->sk_sleep = NULL;
+			inet_csk_reset_keepalive_timer(newsk,
+						       keepalive_time_when(newtp));
 
 		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
 		if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
@@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			newtp->tcp_header_len = sizeof(struct tcphdr);
 		}
 		if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
-			newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
+			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
 		if (newtp->ecn_flags&TCP_ECN_OK)
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 	   does sequence test, SYN is truncated, and thus we consider
 	   it a bare ACK.
 
-	   If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
-	   we create an established connection.  Both ends (listening sockets)
-	   accept the new incoming connection and try to talk to each other. 8-)
+	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+	   bare ACK.  Otherwise, we create an established connection.  Both
+	   ends (listening sockets) accept the new incoming connection and try
+	   to talk to each other. 8-)
 
 	   Note: This case is both harmless, and rare.  Possibility is about the
 	   same as us discovering intelligent life on another plant tomorrow.
@@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 			return NULL;
 
 		/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
-		if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+		if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+		    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
 			inet_rsk(req)->acked = 1;
 			return NULL;
 		}
@@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		if (child == NULL)
 			goto listen_overflow;
 
-		tcp_synq_unlink(tp, req, prev);
-		tcp_synq_removed(sk, req);
+		inet_csk_reqsk_queue_unlink(sk, req, prev);
+		inet_csk_reqsk_queue_removed(sk, req);
 
-		tcp_acceptq_queue(sk, req, child);
+		inet_csk_reqsk_queue_add(sk, req, child);
 		return child;
 
 	listen_overflow:
@@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		if (!(flg & TCP_FLAG_RST))
 			req->rsk_ops->send_reset(skb);
 
-		tcp_synq_drop(sk, req, prev);
+		inet_csk_reqsk_queue_drop(sk, req, prev);
 		return NULL;
 }
 
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req);
 EXPORT_SYMBOL(tcp_child_process);
 EXPORT_SYMBOL(tcp_create_openreq_child);
 EXPORT_SYMBOL(tcp_timewait_state_process);
-EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dd30dd137b7..75b68116682 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 
 /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
  * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	s32 delta = tcp_time_stamp - tp->lsndtime;
 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
 	u32 cwnd = tp->snd_cwnd;
 
-	tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
+	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
 
-	tp->snd_ssthresh = tcp_current_ssthresh(tp);
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
 	restart_cwnd = min(restart_cwnd, cwnd);
 
-	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
 		cwnd >>= 1;
 	tp->snd_cwnd = max(cwnd, restart_cwnd);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 static inline void tcp_event_data_sent(struct tcp_sock *tp,
 				       struct sk_buff *skb, struct sock *sk)
 {
-	u32 now = tcp_time_stamp;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 now = tcp_time_stamp;
 
-	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
-		tcp_cwnd_restart(tp, __sk_dst_get(sk));
+	if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+		tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
 	tp->lsndtime = now;
 
 	/* If it is a reply for ato after last received
 	 * packet, enter pingpong mode.
 	 */
-	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
-		tp->ack.pingpong = 1;
+	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+		icsk->icsk_ack.pingpong = 1;
 }
 
 static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_dec_quickack_mode(tp, pkts);
-	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+	tcp_dec_quickack_mode(sk, pkts);
+	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
 /* Determine a window scaling and initial window to offer.
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	if (skb != NULL) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		struct inet_sock *inet = inet_sk(sk);
 		struct tcp_sock *tp = tcp_sk(sk);
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 #define SYSCTL_FLAG_SACK	0x4
 
 		/* If congestion control is doing timestamping */
-		if (tp->ca_ops->rtt_sample)
-			do_gettimeofday(&skb->stamp);
+		if (icsk->icsk_ca_ops->rtt_sample)
+			__net_timestamp(skb);
 
 		sysctl_flags = 0;
 		if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		}
 		
 		if (tcp_packets_in_flight(tp) == 0)
-			tcp_ca_event(tp, CA_EVENT_TX_START);
+			tcp_ca_event(sk, CA_EVENT_TX_START);
 
 		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 		skb->h.th = th;
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (err <= 0)
 			return err;
 
-		tcp_enter_cwr(tp);
+		tcp_enter_cwr(sk);
 
 		/* NET_XMIT_CN is special. It does not guarantee,
 		 * that this packet is lost. It tells that device
@@ -482,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
 	 * skbs, which it never sent before. --ANK
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
-	buff->stamp = skb->stamp;
+	buff->tstamp = skb->tstamp;
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out -= tcp_skb_pcount(skb);
@@ -505,7 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
-	__skb_append(skb, buff);
+	__skb_append(skb, buff, &sk->sk_write_queue);
 
 	return 0;
 }
@@ -696,7 +697,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 		if (tp->packets_out > tp->snd_cwnd_used)
 			tp->snd_cwnd_used = tp->packets_out;
 
-		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
 			tcp_cwnd_application_limited(sk);
 	}
 }
@@ -893,7 +894,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
-	__skb_append(skb, buff);
+	__skb_append(skb, buff, &sk->sk_write_queue);
 
 	return 0;
 }
@@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  */
 static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 
 	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
 		return 0;
 
-	if (tp->ca_state != TCP_CA_Open)
+	if (icsk->icsk_ca_state != TCP_CA_Open)
 		return 0;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -1147,6 +1149,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
  */
 u32 __tcp_select_window(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	/* MSS for the peer's data.  Previous verions used mss_clamp
 	 * here.  I don't know if the value based on our guesses
@@ -1154,7 +1157,7 @@ u32 __tcp_select_window(struct sock *sk)
 	 * but may be worse for the performance because of rcv_mss
 	 * fluctuations.  --SAW  1998/11/1
 	 */
-	int mss = tp->ack.rcv_mss;
+	int mss = icsk->icsk_ack.rcv_mss;
 	int free_space = tcp_space(sk);
 	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
 	int window;
@@ -1163,7 +1166,7 @@ u32 __tcp_select_window(struct sock *sk)
 		mss = full_space; 
 
 	if (free_space < full_space/2) {
-		tp->ack.quick = 0;
+		icsk->icsk_ack.quick = 0;
 
 		if (tcp_memory_pressure)
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1238,7 +1241,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 		       tcp_skb_pcount(next_skb) != 1);
 
 		/* Ok.  We will be able to collapse the packet. */
-		__skb_unlink(next_skb, next_skb->list);
+		__skb_unlink(next_skb, &sk->sk_write_queue);
 
 		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 
@@ -1286,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
  */ 
 void tcp_simple_retransmit(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	unsigned int mss = tcp_current_mss(sk, 0);
@@ -1316,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk)
 	 * in network, but units changed and effective
 	 * cwnd/ssthresh really reduced now.
 	 */
-	if (tp->ca_state != TCP_CA_Loss) {
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
 		tp->high_seq = tp->snd_nxt;
-		tp->snd_ssthresh = tcp_current_ssthresh(tp);
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
 		tp->prior_ssthresh = 0;
 		tp->undo_marker = 0;
-		tcp_set_ca_state(tp, TCP_CA_Loss);
+		tcp_set_ca_state(sk, TCP_CA_Loss);
 	}
 	tcp_xmit_retransmit_queue(sk);
 }
@@ -1461,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int packet_cnt = tp->lost_out;
@@ -1484,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
 					if (tcp_retransmit_skb(sk, skb))
 						return;
-					if (tp->ca_state != TCP_CA_Loss)
+					if (icsk->icsk_ca_state != TCP_CA_Loss)
 						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
 					else
 						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
 
 					if (skb ==
 					    skb_peek(&sk->sk_write_queue))
-						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+						inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+									  inet_csk(sk)->icsk_rto,
+									  TCP_RTO_MAX);
 				}
 
 				packet_cnt -= tcp_skb_pcount(skb);
@@ -1504,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	/* OK, demanded retransmission is finished. */
 
 	/* Forward retransmissions are possible only during Recovery. */
-	if (tp->ca_state != TCP_CA_Recovery)
+	if (icsk->icsk_ca_state != TCP_CA_Recovery)
 		return;
 
 	/* No forward retransmissions in Reno are possible. */
@@ -1544,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 			break;
 
 		if (skb == skb_peek(&sk->sk_write_queue))
-			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+						  inet_csk(sk)->icsk_rto,
+						  TCP_RTO_MAX);
 
 		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
 	}
@@ -1573,7 +1582,7 @@ void tcp_send_fin(struct sock *sk)
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		for (;;) {
-			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+			skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
 			if (skb)
 				break;
 			yield();
@@ -1780,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk)
 	tp->rcv_wup = 0;
 	tp->copied_seq = 0;
 
-	tp->rto = TCP_TIMEOUT_INIT;
-	tp->retransmits = 0;
+	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+	inet_csk(sk)->icsk_retransmits = 0;
 	tcp_clear_retrans(tp);
 }
 
@@ -1795,7 +1804,7 @@ int tcp_connect(struct sock *sk)
 
 	tcp_connect_init(sk);
 
-	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
 	if (unlikely(buff == NULL))
 		return -ENOBUFS;
 
@@ -1824,7 +1833,8 @@ int tcp_connect(struct sock *sk)
 	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the SYN until an answer. */
-	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	return 0;
 }
 
@@ -1834,20 +1844,21 @@ int tcp_connect(struct sock *sk)
  */
 void tcp_send_delayed_ack(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	int ato = tp->ack.ato;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
 	if (ato > TCP_DELACK_MIN) {
+		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ/2;
 
-		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+		if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
 			max_ato = TCP_DELACK_MAX;
 
 		/* Slow path, intersegment interval is "high". */
 
 		/* If some rtt estimate is known, use it to bound delayed ack.
-		 * Do not use tp->rto here, use results of rtt measurements
+		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
 		 * directly.
 		 */
 		if (tp->srtt) {
@@ -1864,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk)
 	timeout = jiffies + ato;
 
 	/* Use new timeout only if there wasn't a older one earlier. */
-	if (tp->ack.pending&TCP_ACK_TIMER) {
+	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
 		/* If delack timer was blocked or is about to expire,
 		 * send ACK now.
 		 */
-		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+		if (icsk->icsk_ack.blocked ||
+		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
 			tcp_send_ack(sk);
 			return;
 		}
 
-		if (!time_before(timeout, tp->ack.timeout))
-			timeout = tp->ack.timeout;
+		if (!time_before(timeout, icsk->icsk_ack.timeout))
+			timeout = icsk->icsk_ack.timeout;
 	}
-	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
-	tp->ack.timeout = timeout;
-	sk_reset_timer(sk, &tp->delack_timer, timeout);
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
 }
 
 /* This routine sends an ack and also updates the window. */
@@ -1895,9 +1907,10 @@ void tcp_send_ack(struct sock *sk)
 		 */
 		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 		if (buff == NULL) {
-			tcp_schedule_ack(tp);
-			tp->ack.ato = TCP_ATO_MIN;
-			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+			inet_csk_schedule_ack(sk);
+			inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MAX, TCP_RTO_MAX);
 			return;
 		}
 
@@ -2011,6 +2024,7 @@ int tcp_write_wakeup(struct sock *sk)
  */
 void tcp_send_probe0(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int err;
 
@@ -2018,28 +2032,31 @@ void tcp_send_probe0(struct sock *sk)
 
 	if (tp->packets_out || !sk->sk_send_head) {
 		/* Cancel probe timer, if it is not required. */
-		tp->probes_out = 0;
-		tp->backoff = 0;
+		icsk->icsk_probes_out = 0;
+		icsk->icsk_backoff = 0;
 		return;
 	}
 
 	if (err <= 0) {
-		if (tp->backoff < sysctl_tcp_retries2)
-			tp->backoff++;
-		tp->probes_out++;
-		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
+		if (icsk->icsk_backoff < sysctl_tcp_retries2)
+			icsk->icsk_backoff++;
+		icsk->icsk_probes_out++;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 
+					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
 	} else {
 		/* If packet was not sent due to local congestion,
-		 * do not backoff and do not remember probes_out.
+		 * do not backoff and do not remember icsk_probes_out.
 		 * Let local senders to fight for local resources.
 		 *
 		 * Use accumulated backoff yet.
 		 */
-		if (!tp->probes_out)
-			tp->probes_out=1;
-		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+		if (!icsk->icsk_probes_out)
+			icsk->icsk_probes_out = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 
+					  min(icsk->icsk_rto << icsk->icsk_backoff,
+					      TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 	}
 }
 
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 70e108e15c7..327770bf552 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -16,9 +16,10 @@
 #define TCP_SCALABLE_AI_CNT	50U
 #define TCP_SCALABLE_MD_SCALE	3
 
-static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 				    u32 in_flight, int flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	if (in_flight < tp->snd_cwnd)
 		return;
 
@@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
+static u32 tcp_scalable_ssthresh(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
 }
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0084227438c..415ee47ac1c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
 
-#ifdef TCP_DEBUG
-const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
-EXPORT_SYMBOL(tcp_timer_bug_msg);
-#endif
-
-/*
- * Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies 
- * to optimize.
- */
-
 void tcp_init_xmit_timers(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	init_timer(&tp->retransmit_timer);
-	tp->retransmit_timer.function=&tcp_write_timer;
-	tp->retransmit_timer.data = (unsigned long) sk;
-	tp->pending = 0;
-
-	init_timer(&tp->delack_timer);
-	tp->delack_timer.function=&tcp_delack_timer;
-	tp->delack_timer.data = (unsigned long) sk;
-	tp->ack.pending = 0;
-
-	init_timer(&sk->sk_timer);
-	sk->sk_timer.function	= &tcp_keepalive_timer;
-	sk->sk_timer.data	= (unsigned long)sk;
+	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+				  &tcp_keepalive_timer);
 }
 
-void tcp_clear_xmit_timers(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->pending = 0;
-	sk_stop_timer(sk, &tp->retransmit_timer);
-
-	tp->ack.pending = 0;
-	tp->ack.blocked = 0;
-	sk_stop_timer(sk, &tp->delack_timer);
-
-	sk_stop_timer(sk, &sk->sk_timer);
-}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
 
 static void tcp_write_err(struct sock *sk)
 {
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
 /* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	int retry_until;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
-		if (tp->retransmits)
+		if (icsk->icsk_retransmits)
 			dst_negative_advice(&sk->sk_dst_cache);
-		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
-		if (tp->retransmits >= sysctl_tcp_retries1) {
+		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
 			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
 			   hole detection. :-(
 
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk)
 
 		retry_until = sysctl_tcp_retries2;
 		if (sock_flag(sk, SOCK_DEAD)) {
-			int alive = (tp->rto < TCP_RTO_MAX);
+			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
  
 			retry_until = tcp_orphan_retries(sk, alive);
 
-			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
 				return 1;
 		}
 	}
 
-	if (tp->retransmits >= retry_until) {
+	if (icsk->icsk_retransmits >= retry_until) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */
-		tp->ack.blocked = 1;
+		icsk->icsk_ack.blocked = 1;
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
-		sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
 		goto out_unlock;
 	}
 
 	sk_stream_mem_reclaim(sk);
 
-	if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
 		goto out;
 
-	if (time_after(tp->ack.timeout, jiffies)) {
-		sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
 		goto out;
 	}
-	tp->ack.pending &= ~TCP_ACK_TIMER;
+	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
 	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 		struct sk_buff *skb;
@@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data)
 		tp->ucopy.memory = 0;
 	}
 
-	if (tcp_ack_scheduled(tp)) {
-		if (!tp->ack.pingpong) {
+	if (inet_csk_ack_scheduled(sk)) {
+		if (!icsk->icsk_ack.pingpong) {
 			/* Delayed ACK missed: inflate ATO. */
-			tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 		} else {
 			/* Delayed ACK missed: leave pingpong mode and
 			 * deflate ATO.
 			 */
-			tp->ack.pingpong = 0;
-			tp->ack.ato = TCP_ATO_MIN;
+			icsk->icsk_ack.pingpong = 0;
+			icsk->icsk_ack.ato      = TCP_ATO_MIN;
 		}
 		tcp_send_ack(sk);
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
@@ -268,11 +233,12 @@ out_unlock:
 
 static void tcp_probe_timer(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
 
 	if (tp->packets_out || !sk->sk_send_head) {
-		tp->probes_out = 0;
+		icsk->icsk_probes_out = 0;
 		return;
 	}
 
@@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk)
 	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
 	 * this behaviour in Solaris down as a bug fix. [AC]
 	 *
-	 * Let me to explain. probes_out is zeroed by incoming ACKs
+	 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
 	 * even if they advertise zero window. Hence, connection is killed only
 	 * if we received no ACKs for normal connection timeout. It is not killed
 	 * only because window stays zero for some time, window may be zero
@@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk)
 	max_probes = sysctl_tcp_retries2;
 
 	if (sock_flag(sk, SOCK_DEAD)) {
-		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
  
 		max_probes = tcp_orphan_retries(sk, alive);
 
-		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
 			return;
 	}
 
-	if (tp->probes_out > max_probes) {
+	if (icsk->icsk_probes_out > max_probes) {
 		tcp_write_err(sk);
 	} else {
 		/* Only send another probe if we didn't close things up. */
@@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk)
 static void tcp_retransmit_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (!tp->packets_out)
 		goto out;
@@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk)
 	if (tcp_write_timeout(sk))
 		goto out;
 
-	if (tp->retransmits == 0) {
-		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+	if (icsk->icsk_retransmits == 0) {
+		if (icsk->icsk_ca_state == TCP_CA_Disorder ||
+		    icsk->icsk_ca_state == TCP_CA_Recovery) {
 			if (tp->rx_opt.sack_ok) {
-				if (tp->ca_state == TCP_CA_Recovery)
+				if (icsk->icsk_ca_state == TCP_CA_Recovery)
 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
 				else
 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
 			} else {
-				if (tp->ca_state == TCP_CA_Recovery)
+				if (icsk->icsk_ca_state == TCP_CA_Recovery)
 					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
 				else
 					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
 			}
-		} else if (tp->ca_state == TCP_CA_Loss) {
+		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
 			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
 		} else {
 			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
@@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk)
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */
-		if (!tp->retransmits)
-			tp->retransmits=1;
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
-				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+		if (!icsk->icsk_retransmits)
+			icsk->icsk_retransmits = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 		goto out;
 	}
 
@@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk)
 	 * implemented ftp to mars will work nicely. We will have to fix
 	 * the 120 second clamps though!
 	 */
-	tp->backoff++;
-	tp->retransmits++;
+	icsk->icsk_backoff++;
+	icsk->icsk_retransmits++;
 
 out_reset_timer:
-	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
-	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
-	if (tp->retransmits > sysctl_tcp_retries1)
+	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
 		__sk_dst_reset(sk);
 
 out:;
@@ -418,32 +387,32 @@ out:;
 static void tcp_write_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int event;
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later */
-		sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
 		goto out_unlock;
 	}
 
-	if (sk->sk_state == TCP_CLOSE || !tp->pending)
+	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
 		goto out;
 
-	if (time_after(tp->timeout, jiffies)) {
-		sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+	if (time_after(icsk->icsk_timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
 		goto out;
 	}
 
-	event = tp->pending;
-	tp->pending = 0;
+	event = icsk->icsk_pending;
+	icsk->icsk_pending = 0;
 
 	switch (event) {
-	case TCP_TIME_RETRANS:
+	case ICSK_TIME_RETRANS:
 		tcp_retransmit_timer(sk);
 		break;
-	case TCP_TIME_PROBE0:
+	case ICSK_TIME_PROBE0:
 		tcp_probe_timer(sk);
 		break;
 	}
@@ -462,96 +431,8 @@ out_unlock:
 
 static void tcp_synack_timer(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
-	int thresh = max_retries;
-	unsigned long now = jiffies;
-	struct request_sock **reqp, *req;
-	int i, budget;
-
-	if (lopt == NULL || lopt->qlen == 0)
-		return;
-
-	/* Normally all the openreqs are young and become mature
-	 * (i.e. converted to established socket) for first timeout.
-	 * If synack was not acknowledged for 3 seconds, it means
-	 * one of the following things: synack was lost, ack was lost,
-	 * rtt is high or nobody planned to ack (i.e. synflood).
-	 * When server is a bit loaded, queue is populated with old
-	 * open requests, reducing effective size of queue.
-	 * When server is well loaded, queue size reduces to zero
-	 * after several minutes of work. It is not synflood,
-	 * it is normal operation. The solution is pruning
-	 * too old entries overriding normal timeout, when
-	 * situation becomes dangerous.
-	 *
-	 * Essentially, we reserve half of room for young
-	 * embrions; and abort old ones without pity, if old
-	 * ones are about to clog our table.
-	 */
-	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
-		int young = (lopt->qlen_young<<1);
-
-		while (thresh > 2) {
-			if (lopt->qlen < young)
-				break;
-			thresh--;
-			young <<= 1;
-		}
-	}
-
-	if (tp->defer_accept)
-		max_retries = tp->defer_accept;
-
-	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
-	i = lopt->clock_hand;
-
-	do {
-		reqp=&lopt->syn_table[i];
-		while ((req = *reqp) != NULL) {
-			if (time_after_eq(now, req->expires)) {
-				if ((req->retrans < thresh ||
-				     (inet_rsk(req)->acked && req->retrans < max_retries))
-				    && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
-					unsigned long timeo;
-
-					if (req->retrans++ == 0)
-						lopt->qlen_young--;
-					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
-						    TCP_RTO_MAX);
-					req->expires = now + timeo;
-					reqp = &req->dl_next;
-					continue;
-				}
-
-				/* Drop this request */
-				tcp_synq_unlink(tp, req, reqp);
-				reqsk_queue_removed(&tp->accept_queue, req);
-				reqsk_free(req);
-				continue;
-			}
-			reqp = &req->dl_next;
-		}
-
-		i = (i+1)&(TCP_SYNQ_HSIZE-1);
-
-	} while (--budget > 0);
-
-	lopt->clock_hand = i;
-
-	if (lopt->qlen)
-		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
-}
-
-void tcp_delete_keepalive_timer (struct sock *sk)
-{
-	sk_stop_timer(sk, &sk->sk_timer);
-}
-
-void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
-{
-	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 }
 
 void tcp_set_keepalive(struct sock *sk, int val)
@@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val)
 		return;
 
 	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
-		tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
 	else if (!val)
-		tcp_delete_keepalive_timer(sk);
+		inet_csk_delete_keepalive_timer(sk);
 }
 
 
 static void tcp_keepalive_timer (unsigned long data)
 {
 	struct sock *sk = (struct sock *) data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 elapsed;
 
@@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */ 
-		tcp_reset_keepalive_timer (sk, HZ/20);
+		inet_csk_reset_keepalive_timer (sk, HZ/20);
 		goto out;
 	}
 
@@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data)
 
 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 		if (tp->linger2 >= 0) {
-			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
 
 			if (tmo > 0) {
 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = tcp_time_stamp - tp->rcv_tstamp;
 
 	if (elapsed >= keepalive_time_when(tp)) {
-		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
-		     (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+		if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
+		     (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			tcp_write_err(sk);
 			goto out;
 		}
 		if (tcp_write_wakeup(sk) <= 0) {
-			tp->probes_out++;
+			icsk->icsk_probes_out++;
 			elapsed = keepalive_intvl_when(tp);
 		} else {
 			/* If keepalive was lost due to local congestion,
@@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	sk_stream_mem_reclaim(sk);
 
 resched:
-	tcp_reset_keepalive_timer (sk, elapsed);
+	inet_csk_reset_keepalive_timer (sk, elapsed);
 	goto out;
 
 death:	
@@ -644,8 +526,3 @@ out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
-
-EXPORT_SYMBOL(tcp_clear_xmit_timers);
-EXPORT_SYMBOL(tcp_delete_keepalive_timer);
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9bd443db519..93c5f92070f 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -35,7 +35,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
 
 #include <net/tcp.h>
 
@@ -82,9 +82,10 @@ struct vegas {
  * Instead we must wait until the completion of an RTT during
  * which we actually receive ACKs.
  */
-static inline void vegas_enable(struct tcp_sock *tp)
+static inline void vegas_enable(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	/* Begin taking Vegas samples next time we send something. */
 	vegas->doing_vegas_now = 1;
@@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp)
 }
 
 /* Stop taking Vegas samples for now. */
-static inline void vegas_disable(struct tcp_sock *tp)
+static inline void vegas_disable(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	vegas->doing_vegas_now = 0;
 }
 
-static void tcp_vegas_init(struct tcp_sock *tp)
+static void tcp_vegas_init(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	vegas->baseRTT = 0x7fffffff;
-	vegas_enable(tp);
+	vegas_enable(sk);
 }
 
 /* Do RTT sampling needed for Vegas.
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp)
  *   o min-filter RTT samples from a much longer window (forever for now)
  *     to find the propagation delay (baseRTT)
  */
-static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
 
 	/* Filter to find propagation delay: */
@@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
 	vegas->cntRTT++;
 }
 
-static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+static void tcp_vegas_state(struct sock *sk, u8 ca_state)
 {
 
 	if (ca_state == TCP_CA_Open)
-		vegas_enable(tp);
+		vegas_enable(sk);
 	else
-		vegas_disable(tp);
+		vegas_disable(sk);
 }
 
 /*
@@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
  * packets, _then_ we can make Vegas calculations
  * again.
  */
-static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 {
 	if (event == CA_EVENT_CWND_RESTART ||
 	    event == CA_EVENT_TX_START)
-		tcp_vegas_init(tp);
+		tcp_vegas_init(sk);
 }
 
-static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
 				 u32 seq_rtt, u32 in_flight, int flag)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	if (!vegas->doing_vegas_now)
-		return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
+		return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
 
 	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
 	 *
@@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
 		 * but that's not too awful, since we're taking the min,
 		 * rather than averaging.
 		 */
-		tcp_vegas_rtt_calc(tp, seq_rtt*1000);
+		tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
 
 		/* We do the Vegas calculations only if we got enough RTT
 		 * samples that we can be reasonably sure that we got
@@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+static void tcp_vegas_get_info(struct sock *sk, u32 ext,
 			       struct sk_buff *skb)
 {
-	const struct vegas *ca = tcp_ca(tp);
-	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+	const struct vegas *ca = inet_csk_ca(sk);
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
 		struct tcpvegas_info *info;
 
-		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+		info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
 					  sizeof(*info)));
 
 		info->tcpv_enabled = ca->doing_vegas_now;
@@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = {
 
 static int __init tcp_vegas_register(void)
 {
-	BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
 	tcp_register_congestion_control(&tcp_vegas);
 	return 0;
 }
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index ef827242c94..0c340c3756c 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -8,7 +8,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
 #include <net/tcp.h>
 
 /* TCP Westwood structure */
@@ -40,9 +40,9 @@ struct westwood {
  * way as soon as possible. It will reasonably happen within the first
  * RTT period of the connection lifetime.
  */
-static void tcp_westwood_init(struct tcp_sock *tp)
+static void tcp_westwood_init(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct westwood *w = inet_csk_ca(sk);
 
 	w->bk = 0;
         w->bw_ns_est = 0;
@@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp)
         w->cumul_ack = 0;
 	w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
 	w->rtt_win_sx = tcp_time_stamp;
-	w->snd_una = tp->snd_una;
+	w->snd_una = tcp_sk(sk)->snd_una;
 }
 
 /*
@@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta)
  * Called after processing group of packets.
  * but all westwood needs is the last sample of srtt.
  */
-static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct westwood *w = inet_csk_ca(sk);
 	if (cnt > 0)
-		w->rtt = tp->srtt >> 3;
+		w->rtt = tcp_sk(sk)->srtt >> 3;
 }
 
 /*
@@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
  * It updates RTT evaluation window if it is the right moment to do
  * it. If so it calls filter for evaluating bandwidth.
  */
-static void westwood_update_window(struct tcp_sock *tp)
+static void westwood_update_window(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct westwood *w = inet_csk_ca(sk);
 	s32 delta = tcp_time_stamp - w->rtt_win_sx;
 
 	/*
@@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp)
  * header prediction is successful. In such case in fact update is
  * straight forward and doesn't need any particular care.
  */
-static inline void westwood_fast_bw(struct tcp_sock *tp)
+static inline void westwood_fast_bw(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
 
-	westwood_update_window(tp);
+	westwood_update_window(sk);
 
 	w->bk += tp->snd_una - w->snd_una;
 	w->snd_una = tp->snd_una;
@@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp)
  * This function evaluates cumul_ack for evaluating bk in case of
  * delayed or partial acks.
  */
-static inline u32 westwood_acked_count(struct tcp_sock *tp)
+static inline u32 westwood_acked_count(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
 
 	w->cumul_ack = tp->snd_una - w->snd_una;
 
@@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp)
 	return w->cumul_ack;
 }
 
-static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
+static inline u32 westwood_bw_rttmin(const struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct westwood *w = inet_csk_ca(sk);
 	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 
@@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
  * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
  * so avoids ever returning 0.
  */
-static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
+static u32 tcp_westwood_cwnd_min(struct sock *sk)
 {
-	return westwood_bw_rttmin(tp);
+	return westwood_bw_rttmin(sk);
 }
 
-static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
 
 	switch(event) {
 	case CA_EVENT_FAST_ACK:
-		westwood_fast_bw(tp);
+		westwood_fast_bw(sk);
 		break;
 
 	case CA_EVENT_COMPLETE_CWR:
-		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
+		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_FRTO:
-		tp->snd_ssthresh = westwood_bw_rttmin(tp);
+		tp->snd_ssthresh = westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_SLOW_ACK:
-		westwood_update_window(tp);
-		w->bk += westwood_acked_count(tp);
+		westwood_update_window(sk);
+		w->bk += westwood_acked_count(sk);
 		w->rtt_min = min(w->rtt, w->rtt_min);
 		break;
 
@@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
 
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
+static void tcp_westwood_info(struct sock *sk, u32 ext,
 			      struct sk_buff *skb)
 {
-	const struct westwood *ca = tcp_ca(tp);
-	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+	const struct westwood *ca = inet_csk_ca(sk);
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
 		struct rtattr *rta;
 		struct tcpvegas_info *info;
 
-		rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
+		rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
 		info = RTA_DATA(rta);
 		info->tcpv_enabled = 1;
 		info->tcpv_rttcnt = 0;
@@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = {
 
 static int __init tcp_westwood_register(void)
 {
-	BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&tcp_westwood);
 }
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dc4d07357e3..e5beca7de86 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,7 +95,8 @@
 #include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <net/snmp.h>
-#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/tcp_states.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
@@ -112,7 +113,7 @@
  *	Snmp MIB for the UDP layer
  */
 
-DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
+DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
 
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);
@@ -628,7 +629,7 @@ back_from_confirm:
 		/* ... which is an evident application bug. --ANK */
 		release_sock(sk);
 
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
 		err = -EINVAL;
 		goto out;
 	}
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
 	if (unlikely(!up->pending)) {
 		release_sock(sk);
 
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
 		return -EINVAL;
 	}
 
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
 			return 0;
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
 		skb->ip_summed = CHECKSUM_NONE;
 	}
 	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb)
 	return(0);
 
 short_packet:
-	LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
-			      NIPQUAD(saddr),
-			      ntohs(uh->source),
-			      ulen,
-			      len,
-			      NIPQUAD(daddr),
-			      ntohs(uh->dest)));
+	LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+		       NIPQUAD(saddr),
+		       ntohs(uh->source),
+		       ulen,
+		       len,
+		       NIPQUAD(daddr),
+		       ntohs(uh->dest));
 no_header:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 	kfree_skb(skb);
@@ -1198,12 +1199,12 @@ csum_error:
 	 * RFC1122: OK.  Discards the bad packet silently (as far as 
 	 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 
 	 */
-	LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
-			      NIPQUAD(saddr),
-			      ntohs(uh->source),
-			      NIPQUAD(daddr),
-			      ntohs(uh->dest),
-			      ulen));
+	LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+		       NIPQUAD(saddr),
+		       ntohs(uh->source),
+		       NIPQUAD(daddr),
+		       ntohs(uh->dest),
+		       ulen);
 drop:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 	kfree_skb(skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 050611d7a96..d23e07fc81f 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -128,8 +128,10 @@ void __init xfrm4_state_init(void)
 	xfrm_state_register_afinfo(&xfrm4_state_afinfo);
 }
 
+#if 0
 void __exit xfrm4_state_fini(void)
 {
 	xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
 }
+#endif  /*  0  */
author	Jeff Garzik <jgarzik@pobox.com>	2005-08-30 13:32:29 -0400
committer	Jeff Garzik <jgarzik@pobox.com>	2005-08-30 13:32:29 -0400
commit	ed735ccbefaf7e5e3ef61418f7e209b8c59308a7 (patch)
tree	b8cc69814d2368b08d0a84c8da0c12028bd04867 /net/ipv4
parent	39fbe47377062200acc26ea0ccef223b4399a82c (diff)
parent	d8971fcb702e24d1e22c77fd1772f182ffee87e3 (diff)