From 670c02c2bfd2c8a305a90f5285409a7b0a8fd630 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Thu, 13 Oct 2005 09:30:31 -0700
Subject: [NET]: Wider use of for_each_*cpu()

In 'net' change the explicit use of for-loops and NR_CPUS into the
general for_each_cpu() or for_each_online_cpu() constructs, as
appropriate.  This widens the scope of potential future optimizations
of the general constructs, as well as takes advantage of the existing
optimizations of first_cpu() and next_cpu(), which is advantageous
when the true CPU count is much smaller than NR_CPUS.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/ipv4/icmp.c | 5 +----
 net/ipv4/proc.c | 4 +---
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 90dca711ac9..175e093ec56 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1108,12 +1108,9 @@ void __init icmp_init(struct net_proto_family *ops)
 	struct inet_sock *inet;
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		int err;
 
-		if (!cpu_possible(i))
-			continue;
-
 		err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
 				       &per_cpu(__icmp_socket, i));
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f7943ba1f43..a65e508fbd4 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -90,9 +90,7 @@ fold_field(void *mib[], int offt)
 	unsigned long res = 0;
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
+	for_each_cpu(i) {
 		res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
 		res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
 	}
-- 
cgit v1.2.3-70-g09d2


From eed75f191d8318a2b144da8aae9774e1cfcae492 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Sun, 16 Oct 2005 14:22:59 +0200
Subject: [NETFILTER] ip_conntrack: Make "hashsize" conntrack parameter
 writable

It's fairly simple to resize the hash table, but currently you need to
remove and reinsert the module.  That's bad (we lose connection
state).  Harald has even offered to write a daemon which sets this
based on load.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/ipv4/netfilter/ip_conntrack_core.c | 132 ++++++++++++++++++++++++---------
 1 file changed, 95 insertions(+), 37 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 07a80b56e8d..422ab68ee7f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -50,7 +50,7 @@
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
-#define IP_CONNTRACK_VERSION	"2.3"
+#define IP_CONNTRACK_VERSION	"2.4"
 
 #if 0
 #define DEBUGP printk
@@ -148,16 +148,20 @@ DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
 static int ip_conntrack_hash_rnd_initted;
 static unsigned int ip_conntrack_hash_rnd;
 
-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
+static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
+			    unsigned int size, unsigned int rnd)
 {
-#if 0
-	dump_tuple(tuple);
-#endif
 	return (jhash_3words(tuple->src.ip,
 	                     (tuple->dst.ip ^ tuple->dst.protonum),
 	                     (tuple->src.u.all | (tuple->dst.u.all << 16)),
-	                     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+	                     rnd) % size);
+}
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+	return __hash_conntrack(tuple, ip_conntrack_htable_size,
+				ip_conntrack_hash_rnd);
 }
 
 int
@@ -1341,14 +1345,13 @@ static int kill_all(struct ip_conntrack *i, void *data)
 	return 1;
 }
 
-static void free_conntrack_hash(void)
+static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
 {
-	if (ip_conntrack_vmalloc)
-		vfree(ip_conntrack_hash);
+	if (vmalloced)
+		vfree(hash);
 	else
-		free_pages((unsigned long)ip_conntrack_hash, 
-			   get_order(sizeof(struct list_head)
-				     * ip_conntrack_htable_size));
+		free_pages((unsigned long)hash, 
+			   get_order(sizeof(struct list_head) * size));
 }
 
 void ip_conntrack_flush()
@@ -1378,12 +1381,83 @@ void ip_conntrack_cleanup(void)
 	ip_conntrack_flush();
 	kmem_cache_destroy(ip_conntrack_cachep);
 	kmem_cache_destroy(ip_conntrack_expect_cachep);
-	free_conntrack_hash();
+	free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+			    ip_conntrack_htable_size);
 	nf_unregister_sockopt(&so_getorigdst);
 }
 
-static int hashsize;
-module_param(hashsize, int, 0400);
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+	struct list_head *hash;
+	unsigned int i;
+
+	*vmalloced = 0; 
+	hash = (void*)__get_free_pages(GFP_KERNEL, 
+				       get_order(sizeof(struct list_head)
+						 * size));
+	if (!hash) { 
+		*vmalloced = 1;
+		printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
+		hash = vmalloc(sizeof(struct list_head) * size);
+	}
+
+	if (hash)
+		for (i = 0; i < size; i++)
+			INIT_LIST_HEAD(&hash[i]);
+
+	return hash;
+}
+
+int set_hashsize(const char *val, struct kernel_param *kp)
+{
+	int i, bucket, hashsize, vmalloced;
+	int old_vmalloced, old_size;
+	int rnd;
+	struct list_head *hash, *old_hash;
+	struct ip_conntrack_tuple_hash *h;
+
+	/* On boot, we can set this without any fancy locking. */
+	if (!ip_conntrack_htable_size)
+		return param_set_int(val, kp);
+
+	hashsize = simple_strtol(val, NULL, 0);
+	if (!hashsize)
+		return -EINVAL;
+
+	hash = alloc_hashtable(hashsize, &vmalloced);
+	if (!hash)
+		return -ENOMEM;
+
+	/* We have to rehash for the new table anyway, so we also can 
+	 * use a new random seed */
+	get_random_bytes(&rnd, 4);
+
+	write_lock_bh(&ip_conntrack_lock);
+	for (i = 0; i < ip_conntrack_htable_size; i++) {
+		while (!list_empty(&ip_conntrack_hash[i])) {
+			h = list_entry(ip_conntrack_hash[i].next,
+				       struct ip_conntrack_tuple_hash, list);
+			list_del(&h->list);
+			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+			list_add_tail(&h->list, &hash[bucket]);
+		}
+	}
+	old_size = ip_conntrack_htable_size;
+	old_vmalloced = ip_conntrack_vmalloc;
+	old_hash = ip_conntrack_hash;
+
+	ip_conntrack_htable_size = hashsize;
+	ip_conntrack_vmalloc = vmalloced;
+	ip_conntrack_hash = hash;
+	ip_conntrack_hash_rnd = rnd;
+	write_unlock_bh(&ip_conntrack_lock);
+
+	free_conntrack_hash(old_hash, old_vmalloced, old_size);
+	return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+		  &ip_conntrack_htable_size, 0600);
 
 int __init ip_conntrack_init(void)
 {
@@ -1392,9 +1466,7 @@ int __init ip_conntrack_init(void)
 
 	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
 	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
- 	if (hashsize) {
- 		ip_conntrack_htable_size = hashsize;
- 	} else {
+ 	if (!ip_conntrack_htable_size) {
 		ip_conntrack_htable_size
 			= (((num_physpages << PAGE_SHIFT) / 16384)
 			   / sizeof(struct list_head));
@@ -1416,20 +1488,8 @@ int __init ip_conntrack_init(void)
 		return ret;
 	}
 
-	/* AK: the hash table is twice as big than needed because it
-	   uses list_head.  it would be much nicer to caches to use a
-	   single pointer list head here. */
-	ip_conntrack_vmalloc = 0; 
-	ip_conntrack_hash 
-		=(void*)__get_free_pages(GFP_KERNEL, 
-					 get_order(sizeof(struct list_head)
-						   *ip_conntrack_htable_size));
-	if (!ip_conntrack_hash) { 
-		ip_conntrack_vmalloc = 1;
-		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
-		ip_conntrack_hash = vmalloc(sizeof(struct list_head)
-					    * ip_conntrack_htable_size);
-	}
+	ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+					    &ip_conntrack_vmalloc);
 	if (!ip_conntrack_hash) {
 		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
 		goto err_unreg_sockopt;
@@ -1461,9 +1521,6 @@ int __init ip_conntrack_init(void)
 	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
 	write_unlock_bh(&ip_conntrack_lock);
 
-	for (i = 0; i < ip_conntrack_htable_size; i++)
-		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
-
 	/* For use by ipt_REJECT */
 	ip_ct_attach = ip_conntrack_attach;
 
@@ -1478,7 +1535,8 @@ int __init ip_conntrack_init(void)
 err_free_conntrack_slab:
 	kmem_cache_destroy(ip_conntrack_cachep);
 err_free_hash:
-	free_conntrack_hash();
+	free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+			    ip_conntrack_htable_size);
 err_unreg_sockopt:
 	nf_unregister_sockopt(&so_getorigdst);
 
-- 
cgit v1.2.3-70-g09d2


From 1371e37da299d4df6267ad0ddf010435782c28e9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 15 Oct 2005 09:42:39 +1000
Subject: [IPV4]: Kill redundant rcu_dereference on fa_info

This patch kills a redundant rcu_dereference on fa->fa_info in fib_trie.c.
As this dereference directly follows a list_for_each_entry_rcu line, we
have already taken a read barrier with respect to getting an entry from
the list.

This read barrier guarantees that all values read out of fa are valid.
In particular, the contents of structure pointed to by fa->fa_info is
initialised before fa->fa_info is actually set (see fn_trie_insert);
the setting of fa->fa_info itself is further separated with a write
barrier from the insertion of fa into the list.

Therefore by taking a read barrier after obtaining fa from the list
(which is given by list_for_each_entry_rcu), we can be sure that
fa->fa_info contains a valid pointer, as well as the fact that the
data pointed to by fa->fa_info is itself valid.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/ipv4/fib_trie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0093ea08c7f..66247f38b37 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2404,7 +2404,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 		prefix = htonl(l->key);
 
 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
-			const struct fib_info *fi = rcu_dereference(fa->fa_info);
+			const struct fib_info *fi = fa->fa_info;
 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
 
 			if (fa->fa_type == RTN_BROADCAST
-- 
cgit v1.2.3-70-g09d2


From 0d0d2bba97cb091218ea0bcb3d8debcc7bf48397 Mon Sep 17 00:00:00 2001
From: Jayachandran C <jchandra@digeo.com>
Date: Thu, 13 Oct 2005 11:43:02 -0700
Subject: [IPV4]: Remove dead code from ip_output.c

skb_prev is assigned from skb, which cannot be NULL. This patch removes the
unnecessary NULL check.

Signed-off-by: Jayachandran C. <c.jayachandran at gmail.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/ipv4/ip_output.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1ad5202e556..87e350069ab 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1023,10 +1023,7 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 			int alloclen;
 
 			skb_prev = skb;
-			if (skb_prev)
-				fraggap = skb_prev->len - maxfraglen;
-			else
-				fraggap = 0;
+			fraggap = skb_prev->len - maxfraglen;
 
 			alloclen = fragheaderlen + hh_len + fraggap + 15;
 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
-- 
cgit v1.2.3-70-g09d2


From dcab5e1eeccf5e226c771ecc013631cde157435f Mon Sep 17 00:00:00 2001
From: David Engel <gigem@comcast.net>
Date: Fri, 21 Oct 2005 22:09:16 -0500
Subject: [IPV4]: Fix setting broadcast for SIOCSIFNETMASK

Fix setting of the broadcast address when the netmask is set via
SIOCSIFNETMASK in Linux 2.6.  The code wanted the old value of
ifa->ifa_mask but used it after it had already been overwritten with
the new value.

Signed-off-by: David Engel <gigem@comcast.net>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/ipv4/devinet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 74f2207e131..4ec4b2ca6ab 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -715,6 +715,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
 			break;
 		ret = 0;
 		if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+			u32 old_mask = ifa->ifa_mask;
 			inet_del_ifa(in_dev, ifap, 0);
 			ifa->ifa_mask = sin->sin_addr.s_addr;
 			ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
@@ -728,7 +729,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
 			if ((dev->flags & IFF_BROADCAST) &&
 			    (ifa->ifa_prefixlen < 31) &&
 			    (ifa->ifa_broadcast ==
-			     (ifa->ifa_local|~ifa->ifa_mask))) {
+			     (ifa->ifa_local|~old_mask))) {
 				ifa->ifa_broadcast = (ifa->ifa_local |
 						      ~sin->sin_addr.s_addr);
 			}
-- 
cgit v1.2.3-70-g09d2