From 645ca708f936b2fbeb79e52d7823e3eb2c0905f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Oct 2008 01:41:45 -0700 Subject: udp: introduce struct udp_table and multiple spinlocks UDP sockets are hashed in a 128 slots hash table. This hash table is protected by *one* rwlock. This rwlock is readlocked each time an incoming UDP message is handled. This rwlock is writelocked each time a socket must be inserted in hash table (bind time), or deleted from this table (close time) This is not scalable on SMP machines : 1) Even in read mode, lock() and unlock() are atomic operations and must dirty a contended cache line, shared by all cpus. 2) A writer might be starved if many readers are 'in flight'. This can happen on a machine with some NIC receiving many UDP messages. User process can be delayed a long time at socket creation/dismantle time. This patch prepares RCU migration, by introducing 'struct udp_table and struct udp_hslot', and using one spinlock per chain, to reduce contention on central rwlock. Introducing one spinlock per chain reduces latencies, for port randomization on heavily loaded UDP servers. This also speedup bindings to specific ports. udp_lib_unhash() was uninlined, becoming to big. Some cleanups were done to ease review of following patch (RCUification of UDP Unicast lookups) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 207 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 123 insertions(+), 84 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2095abc3cab..2a6c491f97d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -104,12 +104,8 @@ #include #include "udp_impl.h" -/* - * Snmp MIB for the UDP layer - */ - -struct hlist_head udp_hash[UDP_HTABLE_SIZE]; -DEFINE_RWLOCK(udp_hash_lock); +struct udp_table udp_table; +EXPORT_SYMBOL(udp_table); int sysctl_udp_mem[3] __read_mostly; int sysctl_udp_rmem_min __read_mostly; @@ -123,7 +119,7 @@ atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); static int udp_lib_lport_inuse(struct net *net, __u16 num, - const struct hlist_head udptable[], + const struct udp_hslot *hslot, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, const struct sock *sk2)) @@ -131,7 +127,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, struct sock *sk2; struct hlist_node *node; - sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)]) + sk_for_each(sk2, node, &hslot->head) if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_hash == num && @@ -154,12 +150,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, const struct sock *sk2 ) ) { - struct hlist_head *udptable = sk->sk_prot->h.udp_hash; + struct udp_hslot *hslot; + struct udp_table *udptable = sk->sk_prot->h.udp_table; int error = 1; struct net *net = sock_net(sk); - write_lock_bh(&udp_hash_lock); - if (!snum) { int low, high, remaining; unsigned rand; @@ -171,26 +166,34 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, rand = net_random(); snum = first = rand % remaining + low; rand |= 1; - while (udp_lib_lport_inuse(net, snum, udptable, sk, - saddr_comp)) { + for (;;) { + hslot = &udptable->hash[udp_hashfn(net, snum)]; + spin_lock_bh(&hslot->lock); + if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp)) + break; + spin_unlock_bh(&hslot->lock); do { snum = snum + rand; } while (snum < low || snum > high); if (snum == first) goto fail; } - } else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp)) - goto fail; - + } else { + hslot = &udptable->hash[udp_hashfn(net, snum)]; + spin_lock_bh(&hslot->lock); + if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp)) + goto fail_unlock; + } inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - sk_add_node(sk, &udptable[udp_hashfn(net, snum)]); + sk_add_node(sk, &hslot->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; +fail_unlock: + spin_unlock_bh(&hslot->lock); fail: - write_unlock_bh(&udp_hash_lock); return error; } @@ -208,63 +211,73 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); } +static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, + unsigned short hnum, + __be16 sport, __be32 daddr, __be16 dport, int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && + !ipv6_only_sock(sk)) { + struct inet_sock *inet = inet_sk(sk); + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + return -1; + score += 2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + return -1; + score += 2; + } + if (inet->dport) { + if (inet->dport != sport) + return -1; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 2; + } + } + return score; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, - int dif, struct hlist_head udptable[]) + int dif, struct udp_table *udptable) { struct sock *sk, *result = NULL; struct hlist_node *node; unsigned short hnum = ntohs(dport); - int badness = -1; - - read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { - struct inet_sock *inet = inet_sk(sk); - - if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && - !ipv6_only_sock(sk)) { - int score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) - continue; - score+=2; - } - if (inet->daddr) { - if (inet->daddr != saddr) - continue; - score+=2; - } - if (inet->dport) { - if (inet->dport != sport) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 9) { - result = sk; - break; - } else if (score > badness) { - result = sk; - badness = score; - } + unsigned int hash = udp_hashfn(net, hnum); + struct udp_hslot *hslot = &udptable->hash[hash]; + int score, badness = -1; + + spin_lock(&hslot->lock); + sk_for_each(sk, node, &hslot->head) { + score = compute_score(sk, net, saddr, hnum, sport, + daddr, dport, dif); + if (score > badness) { + result = sk; + badness = score; } } if (result) sock_hold(result); - read_unlock(&udp_hash_lock); + spin_unlock(&hslot->lock); return result; } static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, - struct hlist_head udptable[]) + struct udp_table *udptable) { struct sock *sk; const struct iphdr *iph = ip_hdr(skb); @@ -280,7 +293,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash); + return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); } EXPORT_SYMBOL_GPL(udp4_lib_lookup); @@ -323,7 +336,7 @@ found: * to find the appropriate port. */ -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) +void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; struct iphdr *iph = (struct iphdr*)skb->data; @@ -392,7 +405,7 @@ out: void udp_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, udp_hash); + __udp4_lib_err(skb, info, &udp_table); } /* @@ -933,6 +946,21 @@ int udp_disconnect(struct sock *sk, int flags) return 0; } +void udp_lib_unhash(struct sock *sk) +{ + struct udp_table *udptable = sk->sk_prot->h.udp_table; + unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); + struct udp_hslot *hslot = &udptable->hash[hash]; + + spin_lock(&hslot->lock); + if (sk_del_node_init(sk)) { + inet_sk(sk)->num = 0; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + } + spin_unlock(&hslot->lock); +} +EXPORT_SYMBOL(udp_lib_unhash); + static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int is_udplite = IS_UDPLITE(sk); @@ -1071,13 +1099,14 @@ drop: static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, - struct hlist_head udptable[]) + struct udp_table *udptable) { struct sock *sk; + struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; int dif; - read_lock(&udp_hash_lock); - sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); + spin_lock(&hslot->lock); + sk = sk_head(&hslot->head); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { @@ -1102,7 +1131,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, } while (sknext); } else kfree_skb(skb); - read_unlock(&udp_hash_lock); + spin_unlock(&hslot->lock); return 0; } @@ -1148,7 +1177,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, * All we need to do is get the socket, and then do a checksum. */ -int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], +int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { struct sock *sk; @@ -1246,7 +1275,7 @@ drop: int udp_rcv(struct sk_buff *skb) { - return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); + return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); } void udp_destroy_sock(struct sock *sk) @@ -1488,7 +1517,7 @@ struct proto udp_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), - .h.udp_hash = udp_hash, + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, @@ -1498,20 +1527,23 @@ struct proto udp_prot = { /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -static struct sock *udp_get_first(struct seq_file *seq) +static struct sock *udp_get_first(struct seq_file *seq, int start) { struct sock *sk; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { + for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { struct hlist_node *node; - sk_for_each(sk, node, state->hashtable + state->bucket) { + struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; + spin_lock_bh(&hslot->lock); + sk_for_each(sk, node, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == state->family) goto found; } + spin_unlock_bh(&hslot->lock); } sk = NULL; found: @@ -1525,20 +1557,18 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) do { sk = sk_next(sk); -try_again: - ; } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); - if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { - sk = sk_head(state->hashtable + state->bucket); - goto try_again; + if (!sk) { + spin_unlock(&state->udp_table->hash[state->bucket].lock); + return udp_get_first(seq, state->bucket + 1); } return sk; } static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) { - struct sock *sk = udp_get_first(seq); + struct sock *sk = udp_get_first(seq, 0); if (sk) while (pos && (sk = udp_get_next(seq, sk)) != NULL) @@ -1547,9 +1577,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) } static void *udp_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(udp_hash_lock) { - read_lock(&udp_hash_lock); return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } @@ -1567,9 +1595,11 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void udp_seq_stop(struct seq_file *seq, void *v) - __releases(udp_hash_lock) { - read_unlock(&udp_hash_lock); + struct udp_iter_state *state = seq->private; + + if (state->bucket < UDP_HTABLE_SIZE) + spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); } static int udp_seq_open(struct inode *inode, struct file *file) @@ -1585,7 +1615,7 @@ static int udp_seq_open(struct inode *inode, struct file *file) s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; - s->hashtable = afinfo->hashtable; + s->udp_table = afinfo->udp_table; return err; } @@ -1657,7 +1687,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) static struct udp_seq_afinfo udp4_seq_afinfo = { .name = "udp", .family = AF_INET, - .hashtable = udp_hash, + .udp_table = &udp_table, .seq_fops = { .owner = THIS_MODULE, }, @@ -1692,10 +1722,21 @@ void udp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ +void __init udp_table_init(struct udp_table *table) +{ + int i; + + for (i = 0; i < UDP_HTABLE_SIZE; i++) { + INIT_HLIST_HEAD(&table->hash[i].head); + spin_lock_init(&table->hash[i].lock); + } +} + void __init udp_init(void) { unsigned long limit; + udp_table_init(&udp_table); /* Set the pressure threshold up by the same strategy of TCP. It is a * fraction of global memory that is up to 1/2 at 256 MB, decreasing * toward zero with the amount of memory, with a floor of 128 pages. @@ -1712,8 +1753,6 @@ void __init udp_init(void) } EXPORT_SYMBOL(udp_disconnect); -EXPORT_SYMBOL(udp_hash); -EXPORT_SYMBOL(udp_hash_lock); EXPORT_SYMBOL(udp_ioctl); EXPORT_SYMBOL(udp_prot); EXPORT_SYMBOL(udp_sendmsg); -- cgit v1.2.3-70-g09d2 From 271b72c7fa82c2c7a795bc16896149933110672d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Oct 2008 02:11:14 -0700 Subject: udp: RCU handling for Unicast packets. Goals are : 1) Optimizing handling of incoming Unicast UDP frames, so that no memory writes should happen in the fast path. Note: Multicasts and broadcasts still will need to take a lock, because doing a full lockless lookup in this case is difficult. 2) No expensive operations in the socket bind/unhash phases : - No expensive synchronize_rcu() calls. - No added rcu_head in socket structure, increasing memory needs, but more important, forcing us to use call_rcu() calls, that have the bad property of making sockets structure cold. (rcu grace period between socket freeing and its potential reuse make this socket being cold in CPU cache). David did a previous patch using call_rcu() and noticed a 20% impact on TCP connection rates. Quoting Cristopher Lameter : "Right. That results in cacheline cooldown. You'd want to recycle the object as they are cache hot on a per cpu basis. That is screwed up by the delayed regular rcu processing. We have seen multiple regressions due to cacheline cooldown. The only choice in cacheline hot sensitive areas is to deal with the complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU." - Because udp sockets are allocated from dedicated kmem_cache, use of SLAB_DESTROY_BY_RCU can help here. Theory of operation : --------------------- As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()), special attention must be taken by readers and writers. Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed, reused, inserted in a different chain or in worst case in the same chain while readers could do lookups in the same time. In order to avoid loops, a reader must check each socket found in a chain really belongs to the chain the reader was traversing. If it finds a mismatch, lookup must start again at the begining. This *restart* loop is the reason we had to use rdlock for the multicast case, because we dont want to send same message several times to the same socket. We use RCU only for fast path. Thus, /proc/net/udp still takes spinlocks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 37 ++++++++++++++++++++++++++++++++++++- net/core/sock.c | 3 ++- net/ipv4/udp.c | 35 ++++++++++++++++++++++++++--------- net/ipv4/udplite.c | 1 + net/ipv6/udp.c | 31 ++++++++++++++++++++++++------- net/ipv6/udplite.c | 1 + 6 files changed, 90 insertions(+), 18 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/net/sock.h b/include/net/sock.h index d200dfbe1ef..0bea25db547 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -363,6 +363,27 @@ static __inline__ int sk_del_node_init(struct sock *sk) return rc; } +static __inline__ int __sk_del_node_init_rcu(struct sock *sk) +{ + if (sk_hashed(sk)) { + hlist_del_init_rcu(&sk->sk_node); + return 1; + } + return 0; +} + +static __inline__ int sk_del_node_init_rcu(struct sock *sk) +{ + int rc = __sk_del_node_init_rcu(sk); + + if (rc) { + /* paranoid for a while -acme */ + WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + __sock_put(sk); + } + return rc; +} + static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); @@ -374,6 +395,17 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) __sk_add_node(sk, list); } +static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +{ + hlist_add_head_rcu(&sk->sk_node, list); +} + +static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +{ + sock_hold(sk); + __sk_add_node_rcu(sk, list); +} + static __inline__ void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); @@ -387,6 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk, #define sk_for_each(__sk, node, list) \ hlist_for_each_entry(__sk, node, list, sk_node) +#define sk_for_each_rcu(__sk, node, list) \ + hlist_for_each_entry_rcu(__sk, node, list, sk_node) #define sk_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_from(__sk, node, sk_node) @@ -589,8 +623,9 @@ struct proto { int *sysctl_rmem; int max_header; - struct kmem_cache *slab; + struct kmem_cache *slab; unsigned int obj_size; + int slab_flags; atomic_t *orphan_count; diff --git a/net/core/sock.c b/net/core/sock.c index 5e2a3132a8c..ded1eb5d2fd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2042,7 +2042,8 @@ int proto_register(struct proto *prot, int alloc_slab) if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | prot->slab_flags, + NULL); if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2a6c491f97d..0ea974bf796 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -187,7 +187,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - sk_add_node(sk, &hslot->head); + sk_add_node_rcu(sk, &hslot->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; @@ -253,15 +253,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, struct udp_table *udptable) { - struct sock *sk, *result = NULL; + struct sock *sk, *result; struct hlist_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; - int score, badness = -1; + int score, badness; - spin_lock(&hslot->lock); - sk_for_each(sk, node, &hslot->head) { + rcu_read_lock(); +begin: + result = NULL; + badness = -1; + sk_for_each_rcu(sk, node, &hslot->head) { + /* + * lockless reader, and SLAB_DESTROY_BY_RCU items: + * We must check this item was not moved to another chain + */ + if (udp_hashfn(net, sk->sk_hash) != hash) + goto begin; score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { @@ -269,9 +278,16 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, badness = score; } } - if (result) - sock_hold(result); - spin_unlock(&hslot->lock); + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, saddr, hnum, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); return result; } @@ -953,7 +969,7 @@ void udp_lib_unhash(struct sock *sk) struct udp_hslot *hslot = &udptable->hash[hash]; spin_lock(&hslot->lock); - if (sk_del_node_init(sk)) { + if (sk_del_node_init_rcu(sk)) { inet_sk(sk)->num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } @@ -1517,6 +1533,7 @@ struct proto udp_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index d8ea8e5f5ea..c784891cb7e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -51,6 +51,7 @@ struct proto udplite_prot = { .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, .obj_size = sizeof(struct udp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ccee7244ca0..1d9790e43df 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -97,24 +97,40 @@ static struct sock *__udp6_lib_lookup(struct net *net, struct in6_addr *daddr, __be16 dport, int dif, struct udp_table *udptable) { - struct sock *sk, *result = NULL; + struct sock *sk, *result; struct hlist_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; - int score, badness = -1; + int score, badness; - spin_lock(&hslot->lock); - sk_for_each(sk, node, &hslot->head) { + rcu_read_lock(); +begin: + result = NULL; + badness = -1; + sk_for_each_rcu(sk, node, &hslot->head) { + /* + * lockless reader, and SLAB_DESTROY_BY_RCU items: + * We must check this item was not moved to another chain + */ + if (udp_hashfn(net, sk->sk_hash) != hash) + goto begin; score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); if (score > badness) { result = sk; badness = score; } } - if (result) - sock_hold(result); - spin_unlock(&hslot->lock); + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, saddr, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); return result; } @@ -1062,6 +1078,7 @@ struct proto udpv6_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index f1e892a99e0..ba162a82458 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -49,6 +49,7 @@ struct proto udplitev6_prot = { .unhash = udp_lib_unhash, .get_port = udp_v6_get_port, .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, -- cgit v1.2.3-70-g09d2 From 8203efb3c612743fecb1ed67cf1daf9d9c127462 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Oct 2008 02:32:32 -0700 Subject: udp: calculate udp_mem based on low memory instead of all memory This patch mimics commit 57413ebc4e0f1e471a3b4db4aff9a85c083d090e (tcp: calculate tcp_mem based on low memory instead of all memory) The udp_mem array which contains limits on the total amount of memory used by UDP sockets is calculated based on nr_all_pages. On a 32 bits x86 system, we should base this on the number of lowmem pages. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0ea974bf796..5ba03401b91 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -81,6 +81,8 @@ #include #include #include +#include +#include #include #include #include @@ -1751,15 +1753,16 @@ void __init udp_table_init(struct udp_table *table) void __init udp_init(void) { - unsigned long limit; + unsigned long nr_pages, limit; udp_table_init(&udp_table); /* Set the pressure threshold up by the same strategy of TCP. It is a * fraction of global memory that is up to 1/2 at 256 MB, decreasing * toward zero with the amount of memory, with a floor of 128 pages. */ - limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + nr_pages = totalram_pages - totalhigh_pages; + limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); + limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; -- cgit v1.2.3-70-g09d2 From f52b5054ec108aaa9e903850d6b62af8ae3fe6ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Oct 2008 11:19:11 -0700 Subject: udp: udp_get_next() should use spin_unlock_bh() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5ba03401b91..ced820318f9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1579,7 +1579,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { - spin_unlock(&state->udp_table->hash[state->bucket].lock); + spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); return udp_get_first(seq, state->bucket + 1); } return sk; -- cgit v1.2.3-70-g09d2 From 96631ed16c514cf8b28fab991a076985ce378c26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Oct 2008 11:19:58 -0700 Subject: udp: introduce sk_for_each_rcu_safenext() Corey Minyard found a race added in commit 271b72c7fa82c2c7a795bc16896149933110672d (udp: RCU handling for Unicast packets.) "If the socket is moved from one list to another list in-between the time the hash is calculated and the next field is accessed, and the socket has moved to the end of the new list, the traversal will not complete properly on the list it should have, since the socket will be on the end of the new list and there's not a way to tell it's on a new list and restart the list traversal. I think that this can be solved by pre-fetching the "next" field (with proper barriers) before checking the hash." This patch corrects this problem, introducing a new sk_for_each_rcu_safenext() macro. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rculist.h | 17 +++++++++++++++++ include/net/sock.h | 4 ++-- net/ipv4/udp.c | 4 ++-- net/ipv6/udp.c | 4 ++-- 4 files changed, 23 insertions(+), 6 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e649bd3f2c9..3ba2998b22b 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -383,5 +383,22 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference(pos->next)) +/** + * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * @next: the &struct hlist_node to use as a next cursor + * + * Special version of hlist_for_each_entry_rcu that make sure + * each next pointer is fetched before each iteration. + */ +#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \ + for (pos = rcu_dereference((head)->first); \ + pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference(next)) + #endif /* __KERNEL__ */ #endif diff --git a/include/net/sock.h b/include/net/sock.h index 0bea25db547..a4f6d3fc047 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -419,8 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk, #define sk_for_each(__sk, node, list) \ hlist_for_each_entry(__sk, node, list, sk_node) -#define sk_for_each_rcu(__sk, node, list) \ - hlist_for_each_entry_rcu(__sk, node, list, sk_node) +#define sk_for_each_rcu_safenext(__sk, node, list, next) \ + hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next) #define sk_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_from(__sk, node, sk_node) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ced820318f9..c3ecec8a9e1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -256,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node; + struct hlist_node *node, *next; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -266,7 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, begin: result = NULL; badness = -1; - sk_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { /* * lockless reader, and SLAB_DESTROY_BY_RCU items: * We must check this item was not moved to another chain diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1d9790e43df..32d914db6c4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node; + struct hlist_node *node, *next; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -108,7 +108,7 @@ static struct sock *__udp6_lib_lookup(struct net *net, begin: result = NULL; badness = -1; - sk_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { /* * lockless reader, and SLAB_DESTROY_BY_RCU items: * We must check this item was not moved to another chain -- cgit v1.2.3-70-g09d2 From c8db3fec5b02f4cefe441903fe1c142ff14e1771 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Oct 2008 14:00:53 -0700 Subject: udp: Should use spin_lock_bh()/spin_unlock_bh() in udp_lib_unhash() Spotted by Alexander Beregalov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c3ecec8a9e1..f760b86e753 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -970,12 +970,12 @@ void udp_lib_unhash(struct sock *sk) unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); struct udp_hslot *hslot = &udptable->hash[hash]; - spin_lock(&hslot->lock); + spin_lock_bh(&hslot->lock); if (sk_del_node_init_rcu(sk)) { inet_sk(sk)->num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } - spin_unlock(&hslot->lock); + spin_unlock_bh(&hslot->lock); } EXPORT_SYMBOL(udp_lib_unhash); -- cgit v1.2.3-70-g09d2 From 673d57e72398edfedc93fb50ff58048077c9d587 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 31 Oct 2008 00:53:57 -0700 Subject: net: replace NIPQUAD() in net/ipv4/ net/ipv6/ Using NIPQUAD() with NIPQUAD_FMT, %d.%d.%d.%d or %u.%u.%u.%u can be replaced with %pI4 Signed-off-by: Harvey Harrison Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 ++----- net/ipv4/arp.c | 4 ++-- net/ipv4/fib_trie.c | 6 +++--- net/ipv4/icmp.c | 24 ++++++++++-------------- net/ipv4/ip_fragment.c | 5 ++--- net/ipv4/ip_input.c | 6 ++---- net/ipv4/ipcomp.c | 4 ++-- net/ipv4/ipconfig.c | 40 ++++++++++++++++++---------------------- net/ipv4/route.c | 40 +++++++++++++++++----------------------- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_ipv4.c | 13 +++++-------- net/ipv4/tcp_probe.c | 7 +++---- net/ipv4/tcp_timer.c | 4 ++-- net/ipv4/udp.c | 12 ++++++------ net/ipv6/netfilter/ip6t_LOG.c | 5 ++--- 15 files changed, 78 insertions(+), 103 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1fbff5fa424..e3286814c8d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1070,11 +1070,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) return 0; if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "%s(): shifting inet->" - "saddr from " NIPQUAD_FMT " to " NIPQUAD_FMT "\n", - __func__, - NIPQUAD(old_saddr), - NIPQUAD(new_saddr)); + printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n", + __func__, &old_saddr, &new_saddr); } inet->saddr = inet->rcv_saddr = new_saddr; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1a9dd66511f..051a62fc818 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1308,7 +1308,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) } #endif - sprintf(tbuf, NIPQUAD_FMT, NIPQUAD(*(u32*)n->primary_key)); + sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); @@ -1321,7 +1321,7 @@ static void arp_format_pneigh_entry(struct seq_file *seq, int hatype = dev ? dev->type : 0; char tbuf[16]; - sprintf(tbuf, NIPQUAD_FMT, NIPQUAD(*(u32*)n->key)); + sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5cb72786a8a..ec0ae490f0b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2399,8 +2399,8 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); seq_indent(seq, iter->depth-1); - seq_printf(seq, " +-- " NIPQUAD_FMT "/%d %d %d %d\n", - NIPQUAD(prf), tn->pos, tn->bits, tn->full_children, + seq_printf(seq, " +-- %pI4/%d %d %d %d\n", + &prf, tn->pos, tn->bits, tn->full_children, tn->empty_children); } else { @@ -2410,7 +2410,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) __be32 val = htonl(l->key); seq_indent(seq, iter->depth); - seq_printf(seq, " |-- " NIPQUAD_FMT "\n", NIPQUAD(val)); + seq_printf(seq, " |-- %pI4\n", &val); hlist_for_each_entry_rcu(li, node, &l->list, hlist) { struct fib_alias *fa; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e9d6ea0b49c..21e497efbd7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -683,10 +683,8 @@ static void icmp_unreach(struct sk_buff *skb) break; case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { - LIMIT_NETDEBUG(KERN_INFO "ICMP: " NIPQUAD_FMT ": " - "fragmentation needed " - "and DF set.\n", - NIPQUAD(iph->daddr)); + LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n", + &iph->daddr); } else { info = ip_rt_frag_needed(net, iph, ntohs(icmph->un.frag.mtu), @@ -696,9 +694,8 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG(KERN_INFO "ICMP: " NIPQUAD_FMT ": Source " - "Route Failed.\n", - NIPQUAD(iph->daddr)); + LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n", + &iph->daddr); break; default: break; @@ -729,12 +726,12 @@ static void icmp_unreach(struct sk_buff *skb) if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) - printk(KERN_WARNING NIPQUAD_FMT " sent an invalid ICMP " + printk(KERN_WARNING "%pI4 sent an invalid ICMP " "type %u, code %u " - "error to a broadcast: " NIPQUAD_FMT " on %s\n", - NIPQUAD(ip_hdr(skb)->saddr), + "error to a broadcast: %pI4 on %s\n", + &ip_hdr(skb)->saddr, icmph->type, icmph->code, - NIPQUAD(iph->daddr), + &iph->daddr, skb->dev->name); goto out; } @@ -952,9 +949,8 @@ static void icmp_address_reply(struct sk_buff *skb) break; } if (!ifa && net_ratelimit()) { - printk(KERN_INFO "Wrong address mask " NIPQUAD_FMT " from " - "%s/" NIPQUAD_FMT "\n", - NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); + printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", + mp, dev->name, &rt->rt_src); } } rcu_read_unlock(); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e4f81f54bef..8c495e34d56 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -559,9 +559,8 @@ out_nomem: goto out_fail; out_oversize: if (net_ratelimit()) - printk(KERN_INFO - "Oversized IP packet from " NIPQUAD_FMT ".\n", - NIPQUAD(qp->saddr)); + printk(KERN_INFO "Oversized IP packet from %pI4.\n", + &qp->saddr); out_fail: IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); return err; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 861978a4f1a..1b6475c05eb 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -294,10 +294,8 @@ static inline int ip_rcv_options(struct sk_buff *skb) if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "source route option " - NIPQUAD_FMT " -> " NIPQUAD_FMT "\n", - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); + printk(KERN_INFO "source route option %pI4 -> %pI4\n", + &iph->saddr, &iph->daddr); in_dev_put(in_dev); goto drop; } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 38ccb6dfb02..ec8264ae45c 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -39,8 +39,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIPQUAD_FMT "\n", - spi, NIPQUAD(iph->daddr)); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", + spi, &iph->daddr); xfrm_state_put(x); } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 42065fff46c..42a0f3dd3fd 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -374,7 +374,7 @@ static int __init ic_defaults(void) */ if (!ic_host_name_set) - sprintf(init_utsname()->nodename, NIPQUAD_FMT, NIPQUAD(ic_myaddr)); + sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr); if (root_server_addr == NONE) root_server_addr = ic_servaddr; @@ -387,11 +387,11 @@ static int __init ic_defaults(void) else if (IN_CLASSC(ntohl(ic_myaddr))) ic_netmask = htonl(IN_CLASSC_NET); else { - printk(KERN_ERR "IP-Config: Unable to guess netmask for address " NIPQUAD_FMT "\n", - NIPQUAD(ic_myaddr)); + printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n", + &ic_myaddr); return -1; } - printk("IP-Config: Guessing netmask " NIPQUAD_FMT "\n", NIPQUAD(ic_netmask)); + printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); } return 0; @@ -979,10 +979,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_myaddr = b->your_ip; ic_servaddr = server_id; #ifdef IPCONFIG_DEBUG - printk("DHCP: Offered address " NIPQUAD_FMT, - NIPQUAD(ic_myaddr)); - printk(" by server " NIPQUAD_FMT "\n", - NIPQUAD(ic_servaddr)); + printk("DHCP: Offered address %pI4 by server %pI4\n", + &ic_myaddr, &ic_servaddr); #endif /* The DHCP indicated server address takes * precedence over the bootp header one if @@ -1177,11 +1175,11 @@ static int __init ic_dynamic(void) return -1; } - printk("IP-Config: Got %s answer from " NIPQUAD_FMT ", ", + printk("IP-Config: Got %s answer from %pI4, ", ((ic_got_reply & IC_RARP) ? "RARP" : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), - NIPQUAD(ic_servaddr)); - printk("my address is " NIPQUAD_FMT "\n", NIPQUAD(ic_myaddr)); + &ic_servaddr); + printk("my address is %pI4\n", &ic_myaddr); return 0; } @@ -1206,14 +1204,12 @@ static int pnp_seq_show(struct seq_file *seq, void *v) "domain %s\n", ic_domain); for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { if (ic_nameservers[i] != NONE) - seq_printf(seq, - "nameserver " NIPQUAD_FMT "\n", - NIPQUAD(ic_nameservers[i])); + seq_printf(seq, "nameserver %pI4\n", + &ic_nameservers[i]); } if (ic_servaddr != NONE) - seq_printf(seq, - "bootserver " NIPQUAD_FMT "\n", - NIPQUAD(ic_servaddr)); + seq_printf(seq, "bootserver %pI4\n", + &ic_servaddr); return 0; } @@ -1387,13 +1383,13 @@ static int __init ip_auto_config(void) */ printk("IP-Config: Complete:"); printk("\n device=%s", ic_dev->name); - printk(", addr=" NIPQUAD_FMT, NIPQUAD(ic_myaddr)); - printk(", mask=" NIPQUAD_FMT, NIPQUAD(ic_netmask)); - printk(", gw=" NIPQUAD_FMT, NIPQUAD(ic_gateway)); + printk(", addr=%pI4", &ic_myaddr); + printk(", mask=%pI4", &ic_netmask); + printk(", gw=%pI4", &ic_gateway); printk(",\n host=%s, domain=%s, nis-domain=%s", utsname()->nodename, ic_domain, utsname()->domainname); - printk(",\n bootserver=" NIPQUAD_FMT, NIPQUAD(ic_servaddr)); - printk(", rootserver=" NIPQUAD_FMT, NIPQUAD(root_server_addr)); + printk(",\n bootserver=%pI4", &ic_servaddr); + printk(", rootserver=%pI4", &root_server_addr); printk(", rootpath=%s", root_server_path); printk("\n"); #endif /* !SILENT */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e59b4dcf677..f47b9db0db7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1214,10 +1214,9 @@ restart: #if RT_CACHE_DEBUG >= 2 if (rt->u.dst.rt_next) { struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash, - NIPQUAD(rt->rt_dst)); + printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst); for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) - printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst)); + printk(" . %pI4", &trt->rt_dst); printk("\n"); } #endif @@ -1444,11 +1443,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about " - NIPQUAD_FMT " ignored.\n" - " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n", - NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), - NIPQUAD(saddr), NIPQUAD(daddr)); + printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" + " Advised path = %pI4 -> %pI4\n", + &old_gw, dev->name, &new_gw, + &saddr, &daddr); #endif in_dev_put(in_dev); } @@ -1468,9 +1466,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) rt->fl.oif, rt_genid(dev_net(dst->dev))); #if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ipv4_negative_advice: redirect to " - NIPQUAD_FMT "/%02x dropped\n", - NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); + printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", + &rt->rt_dst, rt->fl.fl4_tos); #endif rt_del(hash, rt); ret = NULL; @@ -1534,10 +1531,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (IN_DEV_LOG_MARTIANS(in_dev) && rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) - printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores " - "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n", - NIPQUAD(rt->rt_src), rt->rt_iif, - NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); + printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", + &rt->rt_src, rt->rt_iif, + &rt->rt_dst, &rt->rt_gateway); #endif } out: @@ -1730,8 +1726,8 @@ static void ipv4_link_failure(struct sk_buff *skb) static int ip_rt_bug(struct sk_buff *skb) { - printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n", - NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr), + printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); return 0; @@ -1908,9 +1904,8 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ - printk(KERN_WARNING "martian source " NIPQUAD_FMT " from " - NIPQUAD_FMT", on dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", + &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; const unsigned char *p = skb_mac_header(skb); @@ -2219,9 +2214,8 @@ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from " - NIPQUAD_FMT ", dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", + &daddr, &saddr, dev->name); #endif e_hostunreach: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 04909e4b3c4..097294b7da3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2336,9 +2336,9 @@ static void DBGUNDO(struct sock *sk, const char *msg) struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n", + printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, - NIPQUAD(inet->daddr), ntohs(inet->dport), + &inet->daddr, ntohs(inet->dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5c8fa7f1e32..ef33246e6a6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1139,10 +1139,9 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) if (genhash || memcmp(hash_location, newhash, 16) != 0) { if (net_ratelimit()) { - printk(KERN_INFO "MD5 Hash failed for " - "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n", - NIPQUAD(iph->saddr), ntohs(th->source), - NIPQUAD(iph->daddr), ntohs(th->dest), + printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", + &iph->saddr, ntohs(th->source), + &iph->daddr, ntohs(th->dest), genhash ? " tcp_v4_calc_md5_hash failed" : ""); } return 1; @@ -1297,10 +1296,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * to destinations, already remembered * to the moment of synflood. */ - LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " - "request from " NIPQUAD_FMT "/%u\n", - NIPQUAD(saddr), - ntohs(tcp_hdr(skb)->source)); + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n", + &saddr, ntohs(tcp_hdr(skb)->source)); goto drop_and_release; } diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 7ddc30f0744..25524d4e372 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -153,12 +153,11 @@ static int tcpprobe_sprint(char *tbuf, int n) = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); return snprintf(tbuf, n, - "%lu.%09lu " NIPQUAD_FMT ":%u " NIPQUAD_FMT ":%u" - " %d %#x %#x %u %u %u %u\n", + "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec, - NIPQUAD(p->saddr), ntohs(p->sport), - NIPQUAD(p->daddr), ntohs(p->dport), + &p->saddr, ntohs(p->sport), + &p->daddr, ntohs(p->dport), p->length, p->snd_nxt, p->snd_una, p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 979c9d604eb..9843f6cb40a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -299,8 +299,8 @@ static void tcp_retransmit_timer(struct sock *sk) #ifdef TCP_DEBUG struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer " NIPQUAD_FMT ":%u/%u shrinks window %u:%u. Repaired.\n", - NIPQUAD(inet->daddr), ntohs(inet->dport), + LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer %pI4:%u/%u shrinks window %u:%u. Repaired.\n", + &inet->daddr, ntohs(inet->dport), inet->num, tp->snd_una, tp->snd_nxt); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f760b86e753..bcb5def2b09 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1263,13 +1263,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From " NIPQUAD_FMT ":%u %d/%d to " NIPQUAD_FMT ":%u\n", + LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", proto == IPPROTO_UDPLITE ? "-Lite" : "", - NIPQUAD(saddr), + &saddr, ntohs(uh->source), ulen, skb->len, - NIPQUAD(daddr), + &daddr, ntohs(uh->dest)); goto drop; @@ -1278,11 +1278,11 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From " NIPQUAD_FMT ":%u to " NIPQUAD_FMT ":%u ulen %d\n", + LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", proto == IPPROTO_UDPLITE ? "-Lite" : "", - NIPQUAD(saddr), + &saddr, ntohs(uh->source), - NIPQUAD(daddr), + &daddr, ntohs(uh->dest), ulen); drop: diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 02885e8bb69..7c668c63f70 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -424,9 +424,8 @@ ip6t_log_packet(u_int8_t pf, if (skb->dev->type == ARPHRD_SIT) { const struct iphdr *iph = (struct iphdr *)skb_mac_header(skb); - printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ", - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); + printk("TUNNEL=%pI4->%pI4 ", + &iph->saddr, &iph->daddr); } } else printk(" "); -- cgit v1.2.3-70-g09d2 From c37ccc0d4e2a4ee52f1a40cff1be0049f2104bba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Nov 2008 21:19:18 -0700 Subject: udp: add a missing smp_wmb() in udp_lib_get_port() Corey Minyard spotted a missing memory barrier in udp_lib_get_port() We need to make sure a reader cannot read the new 'sk->sk_next' value and previous value of 'sk->sk_hash'. Or else, an item could be deleted from a chain, and inserted into another chain. If new chain was empty before the move, 'next' pointer is NULL, and lockless reader can not detect it missed following items in original chain. This patch is temporary, since we expect an upcoming patch to introduce another way of handling the problem. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bcb5def2b09..7e4d9c87115 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -189,6 +189,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { + /* + * We need that previous write to sk->sk_hash committed + * before write to sk->next done in following add_node() variant + */ + smp_wmb(); sk_add_node_rcu(sk, &hslot->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } -- cgit v1.2.3-70-g09d2 From 88ab1932eac721c6e7336708558fa5ed02c85c80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Nov 2008 19:39:21 -0800 Subject: udp: Use hlist_nulls in UDP RCU code This is a straightforward patch, using hlist_nulls infrastructure. RCUification already done on UDP two weeks ago. Using hlist_nulls permits us to avoid some memory barriers, both at lookup time and delete time. Patch is large because it adds new macros to include/net/sock.h. These macros will be used by TCP & DCCP in next patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rculist.h | 17 --------------- include/net/sock.h | 57 +++++++++++++++++++++++++++++++++++++++---------- include/net/udp.h | 2 +- net/ipv4/udp.c | 47 +++++++++++++++++++--------------------- net/ipv6/udp.c | 26 +++++++++++----------- 5 files changed, 83 insertions(+), 66 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 3ba2998b22b..e649bd3f2c9 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -383,22 +383,5 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference(pos->next)) -/** - * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct hlist_node to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. - * @next: the &struct hlist_node to use as a next cursor - * - * Special version of hlist_for_each_entry_rcu that make sure - * each next pointer is fetched before each iteration. - */ -#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \ - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference(next)) - #endif /* __KERNEL__ */ #endif diff --git a/include/net/sock.h b/include/net/sock.h index 8b2b82131b6..0a638948868 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -52,6 +53,7 @@ #include #include +#include #include #include @@ -106,6 +108,7 @@ struct net; * @skc_reuse: %SO_REUSEADDR setting * @skc_bound_dev_if: bound device index if != 0 * @skc_node: main hash linkage for various protocol lookup tables + * @skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_refcnt: reference count * @skc_hash: hash value used with various protocol lookup tables @@ -120,7 +123,10 @@ struct sock_common { volatile unsigned char skc_state; unsigned char skc_reuse; int skc_bound_dev_if; - struct hlist_node skc_node; + union { + struct hlist_node skc_node; + struct hlist_nulls_node skc_nulls_node; + }; struct hlist_node skc_bind_node; atomic_t skc_refcnt; unsigned int skc_hash; @@ -206,6 +212,7 @@ struct sock { #define sk_reuse __sk_common.skc_reuse #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_node __sk_common.skc_node +#define sk_nulls_node __sk_common.skc_nulls_node #define sk_bind_node __sk_common.skc_bind_node #define sk_refcnt __sk_common.skc_refcnt #define sk_hash __sk_common.skc_hash @@ -300,12 +307,30 @@ static inline struct sock *sk_head(const struct hlist_head *head) return hlist_empty(head) ? NULL : __sk_head(head); } +static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) +{ + return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); +} + +static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) +{ + return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); +} + static inline struct sock *sk_next(const struct sock *sk) { return sk->sk_node.next ? hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; } +static inline struct sock *sk_nulls_next(const struct sock *sk) +{ + return (!is_a_nulls(sk->sk_nulls_node.next)) ? + hlist_nulls_entry(sk->sk_nulls_node.next, + struct sock, sk_nulls_node) : + NULL; +} + static inline int sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); @@ -321,6 +346,11 @@ static __inline__ void sk_node_init(struct hlist_node *node) node->pprev = NULL; } +static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node) +{ + node->pprev = NULL; +} + static __inline__ void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); @@ -367,18 +397,18 @@ static __inline__ int sk_del_node_init(struct sock *sk) return rc; } -static __inline__ int __sk_del_node_init_rcu(struct sock *sk) +static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { - hlist_del_init_rcu(&sk->sk_node); + hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return 1; } return 0; } -static __inline__ int sk_del_node_init_rcu(struct sock *sk) +static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk) { - int rc = __sk_del_node_init_rcu(sk); + int rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ @@ -399,15 +429,15 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) __sk_add_node(sk, list); } -static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { - hlist_add_head_rcu(&sk->sk_node, list); + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } -static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); - __sk_add_node_rcu(sk, list); + __sk_nulls_add_node_rcu(sk, list); } static __inline__ void __sk_del_bind_node(struct sock *sk) @@ -423,11 +453,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk, #define sk_for_each(__sk, node, list) \ hlist_for_each_entry(__sk, node, list, sk_node) -#define sk_for_each_rcu_safenext(__sk, node, list, next) \ - hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next) +#define sk_nulls_for_each(__sk, node, list) \ + hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) +#define sk_nulls_for_each_rcu(__sk, node, list) \ + hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_from(__sk, node, sk_node) +#define sk_nulls_for_each_from(__sk, node) \ + if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ + hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_continue(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_continue(__sk, node, sk_node) diff --git a/include/net/udp.h b/include/net/udp.h index df2bfe54537..90e6ce56be6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -51,7 +51,7 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) struct udp_hslot { - struct hlist_head head; + struct hlist_nulls_head head; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); struct udp_table { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 54badc9a019..fea2d873dd4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct sock *sk2)) { struct sock *sk2; - struct hlist_node *node; + struct hlist_nulls_node *node; - sk_for_each(sk2, node, &hslot->head) + sk_nulls_for_each(sk2, node, &hslot->head) if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_hash == num && @@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - /* - * We need that previous write to sk->sk_hash committed - * before write to sk->next done in following add_node() variant - */ - smp_wmb(); - sk_add_node_rcu(sk, &hslot->head); + sk_nulls_add_node_rcu(sk, &hslot->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; @@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node, *next; + struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, begin: result = NULL; badness = -1; - sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { - /* - * lockless reader, and SLAB_DESTROY_BY_RCU items: - * We must check this item was not moved to another chain - */ - if (udp_hashfn(net, sk->sk_hash) != hash) - goto begin; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { @@ -285,6 +274,14 @@ begin: badness = score; } } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != hash) + goto begin; + if (result) { if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; @@ -325,11 +322,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, __be16 rmt_port, __be32 rmt_addr, int dif) { - struct hlist_node *node; + struct hlist_nulls_node *node; struct sock *s = sk; unsigned short hnum = ntohs(loc_port); - sk_for_each_from(s, node) { + sk_nulls_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (!net_eq(sock_net(s), net) || @@ -977,7 +974,7 @@ void udp_lib_unhash(struct sock *sk) struct udp_hslot *hslot = &udptable->hash[hash]; spin_lock_bh(&hslot->lock); - if (sk_del_node_init_rcu(sk)) { + if (sk_nulls_del_node_init_rcu(sk)) { inet_sk(sk)->num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } @@ -1130,7 +1127,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif; spin_lock(&hslot->lock); - sk = sk_head(&hslot->head); + sk = sk_nulls_head(&hslot->head); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { @@ -1139,7 +1136,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, do { struct sk_buff *skb1 = skb; - sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest, + sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, uh->source, saddr, dif); if (sknext) @@ -1560,10 +1557,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { - struct hlist_node *node; + struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; spin_lock_bh(&hslot->lock); - sk_for_each(sk, node, &hslot->head) { + sk_nulls_for_each(sk, node, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == state->family) @@ -1582,7 +1579,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_next(sk); + sk = sk_nulls_next(sk); } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { @@ -1753,7 +1750,7 @@ void __init udp_table_init(struct udp_table *table) int i; for (i = 0; i < UDP_HTABLE_SIZE; i++) { - INIT_HLIST_HEAD(&table->hash[i].head); + INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); spin_lock_init(&table->hash[i].lock); } } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8dafa36b1ba..fd2d9ad4a8a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node, *next; + struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -108,19 +108,21 @@ static struct sock *__udp6_lib_lookup(struct net *net, begin: result = NULL; badness = -1; - sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { - /* - * lockless reader, and SLAB_DESTROY_BY_RCU items: - * We must check this item was not moved to another chain - */ - if (udp_hashfn(net, sk->sk_hash) != hash) - goto begin; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); if (score > badness) { result = sk; badness = score; } } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != hash) + goto begin; + if (result) { if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; @@ -374,11 +376,11 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, __be16 rmt_port, struct in6_addr *rmt_addr, int dif) { - struct hlist_node *node; + struct hlist_nulls_node *node; struct sock *s = sk; unsigned short num = ntohs(loc_port); - sk_for_each_from(s, node) { + sk_nulls_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (!net_eq(sock_net(s), net)) @@ -423,7 +425,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif; spin_lock(&hslot->lock); - sk = sk_head(&hslot->head); + sk = sk_nulls_head(&hslot->head); dif = inet6_iif(skb); sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); if (!sk) { @@ -432,7 +434,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, } sk2 = sk; - while ((sk2 = udp_v6_mcast_next(net, sk_next(sk2), uh->dest, daddr, + while ((sk2 = udp_v6_mcast_next(net, sk_nulls_next(sk2), uh->dest, daddr, uh->source, saddr, dif))) { struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); if (buff) { -- cgit v1.2.3-70-g09d2 From 2e77d89b2fa8e3f8325b8ce7893ec3645f41aff5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Nov 2008 15:52:46 -0800 Subject: net: avoid a pair of dst_hold()/dst_release() in ip_append_data() We can reduce pressure on dst entry refcount that slowdown UDP transmit path on SMP machines. This pressure is visible on RTP servers when delivering content to mediagateways, especially big ones, handling thousand of streams. Several cpus send UDP frames to the same destination, hence use the same dst entry. This patch makes ip_append_data() eventually steal the refcount its callers had to take on the dst entry. This doesnt avoid all refcounting, but still gives speedups on SMP, on UDP/RAW transmit path Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- net/ipv4/icmp.c | 8 ++++---- net/ipv4/ip_output.c | 11 ++++++++--- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- 5 files changed, 15 insertions(+), 10 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/include/net/ip.h b/include/net/ip.h index bc026ecb513..ddef10c22e3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -110,7 +110,7 @@ extern int ip_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int len, int protolen, struct ipcm_cookie *ipc, - struct rtable *rt, + struct rtable **rt, unsigned int flags); extern int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb); extern ssize_t ip_append_page(struct sock *sk, struct page *page, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 21e497efbd7..7b88be9803b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -321,12 +321,12 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, } static void icmp_push_reply(struct icmp_bxm *icmp_param, - struct ipcm_cookie *ipc, struct rtable *rt) + struct ipcm_cookie *ipc, struct rtable **rt) { struct sock *sk; struct sk_buff *skb; - sk = icmp_sk(dev_net(rt->u.dst.dev)); + sk = icmp_sk(dev_net((*rt)->u.dst.dev)); if (ip_append_data(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, @@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) } if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, icmp_param->data.icmph.code)) - icmp_push_reply(icmp_param, &ipc, rt); + icmp_push_reply(icmp_param, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); @@ -635,7 +635,7 @@ route_done: icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); - icmp_push_reply(&icmp_param, &ipc, rt); + icmp_push_reply(&icmp_param, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 46d7be233ea..5516825a075 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -778,7 +778,7 @@ int ip_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable *rt, + struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); @@ -793,6 +793,7 @@ int ip_append_data(struct sock *sk, int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; + struct rtable *rt; if (flags&MSG_PROBE) return 0; @@ -812,7 +813,11 @@ int ip_append_data(struct sock *sk, inet->cork.flags |= IPCORK_OPT; inet->cork.addr = ipc->addr; } - dst_hold(&rt->u.dst); + rt = *rtp; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); @@ -1391,7 +1396,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, - &ipc, rt, MSG_DONTWAIT); + &ipc, &rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) *((__sum16 *)skb_transport_header(skb) + diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 998fcffc9e1..dff8bc4e0fa 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -572,7 +572,7 @@ back_from_confirm: ipc.addr = rt->rt_dst; lock_sock(sk); err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, - &ipc, rt, msg->msg_flags); + &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index da869ce041d..549114472db 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -719,7 +719,7 @@ do_append_data: up->len += ulen; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, - sizeof(struct udphdr), &ipc, rt, + sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); -- cgit v1.2.3-70-g09d2 From 723b46108f8ee75b61ce703d0c9225e4f537bc46 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Nov 2008 13:55:15 -0800 Subject: net: udp_unhash() can test if sk is hashed Impact: Optimization Like done in inet_unhash(), we can avoid taking a chain lock if socket is not hashed in udp_unhash() Triggered by close(socket(AF_INET, SOCK_DGRAM, 0)); Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 549114472db..cf5ab0581eb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -970,16 +970,18 @@ int udp_disconnect(struct sock *sk, int flags) void udp_lib_unhash(struct sock *sk) { - struct udp_table *udptable = sk->sk_prot->h.udp_table; - unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); - struct udp_hslot *hslot = &udptable->hash[hash]; + if (sk_hashed(sk)) { + struct udp_table *udptable = sk->sk_prot->h.udp_table; + unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); + struct udp_hslot *hslot = &udptable->hash[hash]; - spin_lock_bh(&hslot->lock); - if (sk_nulls_del_node_init_rcu(sk)) { - inet_sk(sk)->num = 0; - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + spin_lock_bh(&hslot->lock); + if (sk_nulls_del_node_init_rcu(sk)) { + inet_sk(sk)->num = 0; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + } + spin_unlock_bh(&hslot->lock); } - spin_unlock_bh(&hslot->lock); } EXPORT_SYMBOL(udp_lib_unhash); -- cgit v1.2.3-70-g09d2