From 670c02c2bfd2c8a305a90f5285409a7b0a8fd630 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Thu, 13 Oct 2005 09:30:31 -0700 Subject: [NET]: Wider use of for_each_*cpu() In 'net' change the explicit use of for-loops and NR_CPUS into the general for_each_cpu() or for_each_online_cpu() constructs, as appropriate. This widens the scope of potential future optimizations of the general constructs, as well as takes advantage of the existing optimizations of first_cpu() and next_cpu(), which is advantageous when the true CPU count is much smaller than NR_CPUS. Signed-off-by: John Hawkes Signed-off-by: David S. Miller Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/icmp.c | 5 +---- net/ipv4/proc.c | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 90dca711ac9..175e093ec56 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1108,12 +1108,9 @@ void __init icmp_init(struct net_proto_family *ops) struct inet_sock *inet; int i; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { int err; - if (!cpu_possible(i)) - continue; - err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, &per_cpu(__icmp_socket, i)); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f7943ba1f43..a65e508fbd4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -90,9 +90,7 @@ fold_field(void *mib[], int offt) unsigned long res = 0; int i; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_cpu(i) { res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); } -- cgit v1.2.3-70-g09d2 From eed75f191d8318a2b144da8aae9774e1cfcae492 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sun, 16 Oct 2005 14:22:59 +0200 Subject: [NETFILTER] ip_conntrack: Make "hashsize" conntrack parameter writable It's fairly simple to resize the hash table, but currently you need to remove and reinsert the module. That's bad (we lose connection state). Harald has even offered to write a daemon which sets this based on load. Signed-off-by: Rusty Russell Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/netfilter/ip_conntrack_core.c | 132 ++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 07a80b56e8d..422ab68ee7f 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -50,7 +50,7 @@ #include #include -#define IP_CONNTRACK_VERSION "2.3" +#define IP_CONNTRACK_VERSION "2.4" #if 0 #define DEBUGP printk @@ -148,16 +148,20 @@ DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; -static u_int32_t -hash_conntrack(const struct ip_conntrack_tuple *tuple) +static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple, + unsigned int size, unsigned int rnd) { -#if 0 - dump_tuple(tuple); -#endif return (jhash_3words(tuple->src.ip, (tuple->dst.ip ^ tuple->dst.protonum), (tuple->src.u.all | (tuple->dst.u.all << 16)), - ip_conntrack_hash_rnd) % ip_conntrack_htable_size); + rnd) % size); +} + +static u_int32_t +hash_conntrack(const struct ip_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, ip_conntrack_htable_size, + ip_conntrack_hash_rnd); } int @@ -1341,14 +1345,13 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } -static void free_conntrack_hash(void) +static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { - if (ip_conntrack_vmalloc) - vfree(ip_conntrack_hash); + if (vmalloced) + vfree(hash); else - free_pages((unsigned long)ip_conntrack_hash, - get_order(sizeof(struct list_head) - * ip_conntrack_htable_size)); + free_pages((unsigned long)hash, + get_order(sizeof(struct list_head) * size)); } void ip_conntrack_flush() @@ -1378,12 +1381,83 @@ void ip_conntrack_cleanup(void) ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); - free_conntrack_hash(); + free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, + ip_conntrack_htable_size); nf_unregister_sockopt(&so_getorigdst); } -static int hashsize; -module_param(hashsize, int, 0400); +static struct list_head *alloc_hashtable(int size, int *vmalloced) +{ + struct list_head *hash; + unsigned int i; + + *vmalloced = 0; + hash = (void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); + hash = vmalloc(sizeof(struct list_head) * size); + } + + if (hash) + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&hash[i]); + + return hash; +} + +int set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket, hashsize, vmalloced; + int old_vmalloced, old_size; + int rnd; + struct list_head *hash, *old_hash; + struct ip_conntrack_tuple_hash *h; + + /* On boot, we can set this without any fancy locking. */ + if (!ip_conntrack_htable_size) + return param_set_int(val, kp); + + hashsize = simple_strtol(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = alloc_hashtable(hashsize, &vmalloced); + if (!hash) + return -ENOMEM; + + /* We have to rehash for the new table anyway, so we also can + * use a new random seed */ + get_random_bytes(&rnd, 4); + + write_lock_bh(&ip_conntrack_lock); + for (i = 0; i < ip_conntrack_htable_size; i++) { + while (!list_empty(&ip_conntrack_hash[i])) { + h = list_entry(ip_conntrack_hash[i].next, + struct ip_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + list_add_tail(&h->list, &hash[bucket]); + } + } + old_size = ip_conntrack_htable_size; + old_vmalloced = ip_conntrack_vmalloc; + old_hash = ip_conntrack_hash; + + ip_conntrack_htable_size = hashsize; + ip_conntrack_vmalloc = vmalloced; + ip_conntrack_hash = hash; + ip_conntrack_hash_rnd = rnd; + write_unlock_bh(&ip_conntrack_lock); + + free_conntrack_hash(old_hash, old_vmalloced, old_size); + return 0; +} + +module_param_call(hashsize, set_hashsize, param_get_uint, + &ip_conntrack_htable_size, 0600); int __init ip_conntrack_init(void) { @@ -1392,9 +1466,7 @@ int __init ip_conntrack_init(void) /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ - if (hashsize) { - ip_conntrack_htable_size = hashsize; - } else { + if (!ip_conntrack_htable_size) { ip_conntrack_htable_size = (((num_physpages << PAGE_SHIFT) / 16384) / sizeof(struct list_head)); @@ -1416,20 +1488,8 @@ int __init ip_conntrack_init(void) return ret; } - /* AK: the hash table is twice as big than needed because it - uses list_head. it would be much nicer to caches to use a - single pointer list head here. */ - ip_conntrack_vmalloc = 0; - ip_conntrack_hash - =(void*)__get_free_pages(GFP_KERNEL, - get_order(sizeof(struct list_head) - *ip_conntrack_htable_size)); - if (!ip_conntrack_hash) { - ip_conntrack_vmalloc = 1; - printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n"); - ip_conntrack_hash = vmalloc(sizeof(struct list_head) - * ip_conntrack_htable_size); - } + ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, + &ip_conntrack_vmalloc); if (!ip_conntrack_hash) { printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); goto err_unreg_sockopt; @@ -1461,9 +1521,6 @@ int __init ip_conntrack_init(void) ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; write_unlock_bh(&ip_conntrack_lock); - for (i = 0; i < ip_conntrack_htable_size; i++) - INIT_LIST_HEAD(&ip_conntrack_hash[i]); - /* For use by ipt_REJECT */ ip_ct_attach = ip_conntrack_attach; @@ -1478,7 +1535,8 @@ int __init ip_conntrack_init(void) err_free_conntrack_slab: kmem_cache_destroy(ip_conntrack_cachep); err_free_hash: - free_conntrack_hash(); + free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, + ip_conntrack_htable_size); err_unreg_sockopt: nf_unregister_sockopt(&so_getorigdst); -- cgit v1.2.3-70-g09d2 From 1371e37da299d4df6267ad0ddf010435782c28e9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 15 Oct 2005 09:42:39 +1000 Subject: [IPV4]: Kill redundant rcu_dereference on fa_info This patch kills a redundant rcu_dereference on fa->fa_info in fib_trie.c. As this dereference directly follows a list_for_each_entry_rcu line, we have already taken a read barrier with respect to getting an entry from the list. This read barrier guarantees that all values read out of fa are valid. In particular, the contents of structure pointed to by fa->fa_info is initialised before fa->fa_info is actually set (see fn_trie_insert); the setting of fa->fa_info itself is further separated with a write barrier from the insertion of fa into the list. Therefore by taking a read barrier after obtaining fa from the list (which is given by list_for_each_entry_rcu), we can be sure that fa->fa_info contains a valid pointer, as well as the fact that the data pointed to by fa->fa_info is itself valid. Signed-off-by: Herbert Xu Acked-by: Paul E. McKenney Signed-off-by: David S. Miller Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0093ea08c7f..66247f38b37 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2404,7 +2404,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) prefix = htonl(l->key); list_for_each_entry_rcu(fa, &li->falh, fa_list) { - const struct fib_info *fi = rcu_dereference(fa->fa_info); + const struct fib_info *fi = fa->fa_info; unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); if (fa->fa_type == RTN_BROADCAST -- cgit v1.2.3-70-g09d2 From 0d0d2bba97cb091218ea0bcb3d8debcc7bf48397 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Thu, 13 Oct 2005 11:43:02 -0700 Subject: [IPV4]: Remove dead code from ip_output.c skb_prev is assigned from skb, which cannot be NULL. This patch removes the unnecessary NULL check. Signed-off-by: Jayachandran C. Acked-by: James Morris Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/ip_output.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1ad5202e556..87e350069ab 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1023,10 +1023,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, int alloclen; skb_prev = skb; - if (skb_prev) - fraggap = skb_prev->len - maxfraglen; - else - fraggap = 0; + fraggap = skb_prev->len - maxfraglen; alloclen = fragheaderlen + hh_len + fraggap + 15; skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); -- cgit v1.2.3-70-g09d2 From dcab5e1eeccf5e226c771ecc013631cde157435f Mon Sep 17 00:00:00 2001 From: David Engel Date: Fri, 21 Oct 2005 22:09:16 -0500 Subject: [IPV4]: Fix setting broadcast for SIOCSIFNETMASK Fix setting of the broadcast address when the netmask is set via SIOCSIFNETMASK in Linux 2.6. The code wanted the old value of ifa->ifa_mask but used it after it had already been overwritten with the new value. Signed-off-by: David Engel Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/devinet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 74f2207e131..4ec4b2ca6ab 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -715,6 +715,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { + u32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); @@ -728,7 +729,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == - (ifa->ifa_local|~ifa->ifa_mask))) { + (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } -- cgit v1.2.3-70-g09d2