From 645ca708f936b2fbeb79e52d7823e3eb2c0905f8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 29 Oct 2008 01:41:45 -0700
Subject: udp: introduce struct udp_table and multiple spinlocks

UDP sockets are hashed in a 128-slot hash table. This hash table is
protected by *one* rwlock. This rwlock is read-locked each time an
incoming UDP message is handled, and write-locked each time a socket
must be inserted in the hash table (bind time) or deleted from it
(close time).

This is not scalable on SMP machines:

1) Even in read mode, lock() and unlock() are atomic operations and
must dirty a contended cache line, shared by all cpus.

2) A writer might be starved if many readers are 'in flight'. This can
happen on a machine with some NIC receiving many UDP messages. User
processes can then be delayed a long time at socket creation/dismantle
time.

This patch prepares RCU migration by introducing 'struct udp_table'
and 'struct udp_hslot', and using one spinlock per chain, to reduce
contention on the central rwlock.

Introducing one spinlock per chain reduces latencies for port
randomization on heavily loaded UDP servers. This also speeds up
binding to specific ports.

udp_lib_unhash() was uninlined, as it was becoming too big.

Some cleanups were done to ease review of the following patch
(RCUification of UDP unicast lookups).

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
net/ipv6/udp.c | 112 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 50 deletions(-) (limited to 'net/ipv6/udp.c')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e51da8c092f..ccee7244ca0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -54,62 +54,73 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal); } +static inline int compute_score(struct sock *sk, struct net *net, + unsigned short hnum, + struct in6_addr *saddr, __be16 sport, + struct in6_addr *daddr, __be16 dport, + int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && + sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + + score = 0; + if (inet->dport) { + if (inet->dport != sport) + return -1; + score++; + } + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, saddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + static struct sock *__udp6_lib_lookup(struct net *net, struct in6_addr *saddr, __be16 sport, struct in6_addr *daddr, __be16 dport, - int dif, struct hlist_head udptable[]) + int dif, struct udp_table *udptable) { struct sock *sk, *result = NULL; struct hlist_node *node; unsigned short hnum = ntohs(dport); - int badness = -1; - - read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { - struct inet_sock *inet = inet_sk(sk); - - if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && - sk->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - int score = 0; - if (inet->dport) { - if (inet->dport != sport) - continue; - score++; - } - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) - continue; - score++; - } - if (!ipv6_addr_any(&np->daddr)) { - if (!ipv6_addr_equal(&np->daddr, saddr)) - continue; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score++; - } - if (score == 4) { - result = sk; - break; - } else if (score > badness) { - result = sk; - badness = score; - } + unsigned int hash = udp_hashfn(net, hnum); + struct udp_hslot *hslot = &udptable->hash[hash]; + int score, badness = -1; + + spin_lock(&hslot->lock); + sk_for_each(sk, node, &hslot->head) { + score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); + if (score > badness) { + result = sk; + badness = score; } } if (result) sock_hold(result); - read_unlock(&udp_hash_lock); + spin_unlock(&hslot->lock); return result; } static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, - struct hlist_head udptable[]) + struct udp_table *udptable) { struct sock *sk; struct ipv6hdr *iph = ipv6_hdr(skb); @@ -239,7 +250,7 @@ csum_copy_err: void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __be32 info, - struct hlist_head udptable[] ) + struct udp_table *udptable) { struct ipv6_pinfo *np; struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; @@ -275,7 +286,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __be32 info ) { - __udp6_lib_err(skb, opt, type, code, offset, info, udp_hash); + __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) @@ -374,14 +385,15 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, */ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct in6_addr *saddr, struct in6_addr *daddr, - struct hlist_head udptable[]) + struct udp_table *udptable) { struct sock *sk, *sk2; const struct udphdr *uh = udp_hdr(skb); + struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; int dif; - read_lock(&udp_hash_lock); - sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); + spin_lock(&hslot->lock); + sk = sk_head(&hslot->head); dif = inet6_iif(skb); sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (!sk) { @@ -409,7 +421,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk_add_backlog(sk, skb); bh_unlock_sock(sk); out: - read_unlock(&udp_hash_lock); + spin_unlock(&hslot->lock); return 0; } @@ -447,7 +459,7 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, return 0; } -int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], +int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { struct sock *sk; @@ -544,7 +556,7 @@ discard: static __inline__ int udpv6_rcv(struct sk_buff *skb) { - return __udp6_lib_rcv(skb, udp_hash, IPPROTO_UDP); + return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); } /* @@ -1008,7 +1020,7 @@ int udp6_seq_show(struct seq_file *seq, void *v) static struct udp_seq_afinfo udp6_seq_afinfo = { .name = "udp6", .family = AF_INET6, - .hashtable = udp_hash, + .udp_table = &udp_table, .seq_fops = { .owner = THIS_MODULE, }, @@ -1050,7 +1062,7 @@ struct proto udpv6_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), - .h.udp_hash = udp_hash, + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt,
-- cgit v1.2.3-70-g09d2
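To make the locking change concrete, here is a minimal user-space sketch of the scheme the patch above introduces: one spinlock per hash chain instead of a single global rwlock. The names and the pthread-based locking are illustrative assumptions, not the kernel's own code.

#include <pthread.h>
#include <stddef.h>

#define HTABLE_SIZE 128			/* same slot count as the UDP hash */

struct node {
	struct node *next;
	unsigned short port;
};

struct hslot {				/* analogue of struct udp_hslot */
	struct node *head;
	pthread_spinlock_t lock;	/* protects only this chain */
};

struct htable {				/* analogue of struct udp_table */
	struct hslot hash[HTABLE_SIZE];
};

static void table_init(struct htable *t)
{
	for (int i = 0; i < HTABLE_SIZE; i++) {
		t->hash[i].head = NULL;
		pthread_spin_init(&t->hash[i].lock, PTHREAD_PROCESS_PRIVATE);
	}
}

/* "bind": a writer contends only with users of the same chain */
static void table_insert(struct htable *t, struct node *n)
{
	struct hslot *slot = &t->hash[n->port & (HTABLE_SIZE - 1)];

	pthread_spin_lock(&slot->lock);
	n->next = slot->head;
	slot->head = n;
	pthread_spin_unlock(&slot->lock);
}

With this layout, two sockets whose ports hash to different slots never touch the same lock, which removes the cache-line ping-pong on the old global udp_hash_lock.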
From 271b72c7fa82c2c7a795bc16896149933110672d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 29 Oct 2008 02:11:14 -0700
Subject: udp: RCU handling for Unicast packets.

Goals are:

1) Optimize handling of incoming unicast UDP frames, so that no memory
writes happen in the fast path.

Note: multicasts and broadcasts will still need to take a lock, because
doing a full lockless lookup in this case is difficult.

2) No expensive operations in the socket bind/unhash phases:

- No expensive synchronize_rcu() calls.

- No added rcu_head in the socket structure, which would increase memory
needs and, more importantly, force us to use call_rcu() calls, which
have the bad property of making socket structures cold. (The rcu grace
period between socket freeing and its potential reuse makes the socket
cold in the CPU cache.) David did a previous patch using call_rcu() and
noticed a 20% impact on TCP connection rates. Quoting Christoph Lameter:

"Right. That results in cacheline cooldown. You'd want to recycle the
object as they are cache hot on a per cpu basis. That is screwed up by
the delayed regular rcu processing. We have seen multiple regressions
due to cacheline cooldown. The only choice in cacheline hot sensitive
areas is to deal with the complexity that comes with SLAB_DESTROY_BY_RCU
or give up on RCU."

- Because udp sockets are allocated from a dedicated kmem_cache, use of
SLAB_DESTROY_BY_RCU can help here.

Theory of operation:
--------------------

As the lookup is lock-free (using rcu_read_lock()/rcu_read_unlock()),
special care must be taken by readers and writers.

Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, and inserted in a different chain, or in the worst case in the
same chain, while readers are doing lookups at the same time.

In order to avoid loops, a reader must check that each socket found in a
chain really belongs to the chain the reader was traversing. If it finds
a mismatch, the lookup must start again at the beginning. This *restart*
loop is the reason we kept the per-slot spinlock for the multicast case:
we don't want to send the same message several times to the same socket.

We use RCU only for the fast path. Thus, /proc/net/udp still takes
spinlocks.

Signed-off-by: Eric Dumazet
Signed-off-by: David S.
Miller --- include/net/sock.h | 37 ++++++++++++++++++++++++++++++++++++- net/core/sock.c | 3 ++- net/ipv4/udp.c | 35 ++++++++++++++++++++++++++--------- net/ipv4/udplite.c | 1 + net/ipv6/udp.c | 31 ++++++++++++++++++++++++------- net/ipv6/udplite.c | 1 + 6 files changed, 90 insertions(+), 18 deletions(-) (limited to 'net/ipv6/udp.c') diff --git a/include/net/sock.h b/include/net/sock.h index d200dfbe1ef..0bea25db547 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -363,6 +363,27 @@ static __inline__ int sk_del_node_init(struct sock *sk) return rc; } +static __inline__ int __sk_del_node_init_rcu(struct sock *sk) +{ + if (sk_hashed(sk)) { + hlist_del_init_rcu(&sk->sk_node); + return 1; + } + return 0; +} + +static __inline__ int sk_del_node_init_rcu(struct sock *sk) +{ + int rc = __sk_del_node_init_rcu(sk); + + if (rc) { + /* paranoid for a while -acme */ + WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + __sock_put(sk); + } + return rc; +} + static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); @@ -374,6 +395,17 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) __sk_add_node(sk, list); } +static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +{ + hlist_add_head_rcu(&sk->sk_node, list); +} + +static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +{ + sock_hold(sk); + __sk_add_node_rcu(sk, list); +} + static __inline__ void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); @@ -387,6 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk, #define sk_for_each(__sk, node, list) \ hlist_for_each_entry(__sk, node, list, sk_node) +#define sk_for_each_rcu(__sk, node, list) \ + hlist_for_each_entry_rcu(__sk, node, list, sk_node) #define sk_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_from(__sk, node, sk_node) @@ -589,8 +623,9 @@ struct proto { int *sysctl_rmem; int max_header; - struct kmem_cache *slab; + struct kmem_cache *slab; unsigned int obj_size; + int slab_flags; atomic_t *orphan_count; diff --git a/net/core/sock.c b/net/core/sock.c index 5e2a3132a8c..ded1eb5d2fd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2042,7 +2042,8 @@ int proto_register(struct proto *prot, int alloc_slab) if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | prot->slab_flags, + NULL); if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2a6c491f97d..0ea974bf796 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -187,7 +187,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - sk_add_node(sk, &hslot->head); + sk_add_node_rcu(sk, &hslot->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; @@ -253,15 +253,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, struct udp_table *udptable) { - struct sock *sk, *result = NULL; + struct sock *sk, *result; struct hlist_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; - int score, badness = -1; + int score, badness; - spin_lock(&hslot->lock); - sk_for_each(sk, node, &hslot->head) { + rcu_read_lock(); +begin: + result = 
NULL; + badness = -1; + sk_for_each_rcu(sk, node, &hslot->head) { + /* + * lockless reader, and SLAB_DESTROY_BY_RCU items: + * We must check this item was not moved to another chain + */ + if (udp_hashfn(net, sk->sk_hash) != hash) + goto begin; score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { @@ -269,9 +278,16 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, badness = score; } } - if (result) - sock_hold(result); - spin_unlock(&hslot->lock); + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, saddr, hnum, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); return result; } @@ -953,7 +969,7 @@ void udp_lib_unhash(struct sock *sk) struct udp_hslot *hslot = &udptable->hash[hash]; spin_lock(&hslot->lock); - if (sk_del_node_init(sk)) { + if (sk_del_node_init_rcu(sk)) { inet_sk(sk)->num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } @@ -1517,6 +1533,7 @@ struct proto udp_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index d8ea8e5f5ea..c784891cb7e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -51,6 +51,7 @@ struct proto udplite_prot = { .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, .obj_size = sizeof(struct udp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ccee7244ca0..1d9790e43df 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -97,24 +97,40 @@ static struct sock *__udp6_lib_lookup(struct net *net, struct in6_addr *daddr, __be16 dport, int dif, struct udp_table *udptable) { - struct sock *sk, *result = NULL; + struct sock *sk, *result; struct hlist_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; - int score, badness = -1; + int score, badness; - spin_lock(&hslot->lock); - sk_for_each(sk, node, &hslot->head) { + rcu_read_lock(); +begin: + result = NULL; + badness = -1; + sk_for_each_rcu(sk, node, &hslot->head) { + /* + * lockless reader, and SLAB_DESTROY_BY_RCU items: + * We must check this item was not moved to another chain + */ + if (udp_hashfn(net, sk->sk_hash) != hash) + goto begin; score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); if (score > badness) { result = sk; badness = score; } } - if (result) - sock_hold(result); - spin_unlock(&hslot->lock); + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, saddr, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); return result; } @@ -1062,6 +1078,7 @@ struct proto udpv6_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index f1e892a99e0..ba162a82458 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ 
-49,6 +49,7 @@ struct proto udplitev6_prot = { .unhash = udp_lib_unhash, .get_port = udp_v6_get_port, .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt,
-- cgit v1.2.3-70-g09d2
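The trickiest part of the new lookup is taking a reference on a socket that may be concurrently freed and recycled by SLAB_DESTROY_BY_RCU. The following self-contained C11 sketch models the "grab a reference only if non-zero, then re-validate identity" pattern; the types and names are illustrative assumptions, only the logic mirrors the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct obj {
	atomic_uint refcnt;	/* 0 means "being destroyed/recycled" */
	unsigned short hash;	/* identity, re-checked after the grab */
};

/* User-space equivalent of the kernel's atomic_inc_not_zero() */
static bool get_ref_not_zero(struct obj *o)
{
	unsigned int c = atomic_load(&o->refcnt);

	while (c != 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &c, c + 1))
			return true;	/* got a reference */
	}
	return false;			/* object was concurrently freed */
}

static struct obj *lookup_validate(struct obj *candidate, unsigned short hash)
{
	if (!get_ref_not_zero(candidate))
		return NULL;		/* lost the race: caller restarts */
	if (candidate->hash != hash) {
		/* recycled into another chain: drop ref, caller restarts */
		atomic_fetch_sub(&candidate->refcnt, 1);
		return NULL;
	}
	return candidate;
}

In the patch itself this corresponds to atomic_inc_not_zero() on sk_refcnt followed by re-running compute_score(), with a goto begin when either step fails.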
From 96631ed16c514cf8b28fab991a076985ce378c26 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 29 Oct 2008 11:19:58 -0700
Subject: udp: introduce sk_for_each_rcu_safenext()

Corey Minyard found a race added in commit
271b72c7fa82c2c7a795bc16896149933110672d (udp: RCU handling for Unicast
packets.):

"If the socket is moved from one list to another list in-between the
time the hash is calculated and the next field is accessed, and the
socket has moved to the end of the new list, the traversal will not
complete properly on the list it should have, since the socket will be
on the end of the new list and there's not a way to tell it's on a new
list and restart the list traversal. I think that this can be solved by
pre-fetching the "next" field (with proper barriers) before checking
the hash."

This patch corrects this problem, introducing a new
sk_for_each_rcu_safenext() macro.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
include/linux/rculist.h | 17 +++++++++++++++++ include/net/sock.h | 4 ++-- net/ipv4/udp.c | 4 ++-- net/ipv6/udp.c | 4 ++-- 4 files changed, 23 insertions(+), 6 deletions(-) (limited to 'net/ipv6/udp.c')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e649bd3f2c9..3ba2998b22b 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -383,5 +383,22 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference(pos->next)) +/** + * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * @next: the &struct hlist_node to use as a next cursor + * + * Special version of hlist_for_each_entry_rcu that make sure + * each next pointer is fetched before each iteration. + */ +#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \ + for (pos = rcu_dereference((head)->first); \ + pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference(next)) + #endif /* __KERNEL__ */ #endif
diff --git a/include/net/sock.h b/include/net/sock.h index 0bea25db547..a4f6d3fc047 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -419,8 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk, #define sk_for_each(__sk, node, list) \ hlist_for_each_entry(__sk, node, list, sk_node) -#define sk_for_each_rcu(__sk, node, list) \ - hlist_for_each_entry_rcu(__sk, node, list, sk_node) +#define sk_for_each_rcu_safenext(__sk, node, list, next) \ + hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next) #define sk_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_from(__sk, node, sk_node)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ced820318f9..c3ecec8a9e1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -256,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node; + struct hlist_node *node, *next; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -266,7 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, begin: result = NULL; badness = -1; - sk_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { /* * lockless reader, and SLAB_DESTROY_BY_RCU items: * We must check this item was not moved to another chain
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1d9790e43df..32d914db6c4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node; + struct hlist_node *node, *next; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -108,7 +108,7 @@ static struct sock *__udp6_lib_lookup(struct net *net, begin: result = NULL; badness = -1; - sk_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { /* * lockless reader, and SLAB_DESTROY_BY_RCU items: * We must check this item was not moved to another chain
-- cgit v1.2.3-70-g09d2
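The fix is easier to see in a stripped-down form. The sketch below shows the shape of a "safenext" traversal: the next pointer is loaded before the current element is examined, so even if that element is concurrently unlinked and re-inserted elsewhere, the walk continues from a pointer captured while the element was still on this chain. This is a simplified single-threaded model without the RCU and barrier details (rcu_dereference(), smp_rmb(), prefetch()) that the real macro needs; all names are illustrative.

#include <stddef.h>

struct node {
	struct node *next;
	unsigned int chain;	/* which bucket this node believes it is in */
	unsigned int key;
};

/* Fetch 'nxt' before the loop body runs, mirroring the macro above. */
#define for_each_safenext(pos, nxt, head)				\
	for ((pos) = (head); (pos) && (((nxt) = (pos)->next), 1);	\
	     (pos) = (nxt))

static struct node *find(struct node *head, unsigned int chain, unsigned int key)
{
	struct node *pos, *nxt;

	for_each_safenext(pos, nxt, head) {
		if (pos->chain != chain)
			return NULL;	/* item moved: caller restarts lookup */
		if (pos->key == key)
			return pos;
	}
	return NULL;
}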
From 88ab1932eac721c6e7336708558fa5ed02c85c80 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 16 Nov 2008 19:39:21 -0800
Subject: udp: Use hlist_nulls in UDP RCU code

This is a straightforward patch, using the hlist_nulls infrastructure;
RCUification was already done on UDP two weeks ago.

Using hlist_nulls permits us to avoid some memory barriers, both at
lookup time and delete time.

The patch is large because it adds new macros to include/net/sock.h.
These macros will be used by TCP & DCCP in the next patch.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
include/linux/rculist.h | 17 --------------- include/net/sock.h | 57 +++++++++++++++++++++++++++++++++++++++---------- include/net/udp.h | 2 +- net/ipv4/udp.c | 47 +++++++++++++++++++--------------------- net/ipv6/udp.c | 26 +++++++++++----------- 5 files changed, 83 insertions(+), 66 deletions(-) (limited to 'net/ipv6/udp.c')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 3ba2998b22b..e649bd3f2c9 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -383,22 +383,5 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference(pos->next)) -/** - * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct hlist_node to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. - * @next: the &struct hlist_node to use as a next cursor - * - * Special version of hlist_for_each_entry_rcu that make sure - * each next pointer is fetched before each iteration. - */ -#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \ - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference(next)) - #endif /* __KERNEL__ */ #endif
diff --git a/include/net/sock.h b/include/net/sock.h index 8b2b82131b6..0a638948868 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -52,6 +53,7 @@ #include #include +#include #include #include @@ -106,6 +108,7 @@ struct net; * @skc_reuse: %SO_REUSEADDR setting * @skc_bound_dev_if: bound device index if != 0 * @skc_node: main hash linkage for various protocol lookup tables + * @skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_refcnt: reference count * @skc_hash: hash value used with various protocol lookup tables @@ -120,7 +123,10 @@ struct sock_common { volatile unsigned char skc_state; unsigned char skc_reuse; int skc_bound_dev_if; - struct hlist_node skc_node; + union { + struct hlist_node skc_node; + struct hlist_nulls_node skc_nulls_node; + }; struct hlist_node skc_bind_node; atomic_t skc_refcnt; unsigned int skc_hash; @@ -206,6 +212,7 @@ struct sock { #define sk_reuse __sk_common.skc_reuse #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_node __sk_common.skc_node +#define sk_nulls_node __sk_common.skc_nulls_node #define sk_bind_node __sk_common.skc_bind_node #define sk_refcnt __sk_common.skc_refcnt #define sk_hash __sk_common.skc_hash @@ -300,12 +307,30 @@ static inline struct sock *sk_head(const struct hlist_head *head) return hlist_empty(head) ? NULL : __sk_head(head); } +static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) +{ + return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); +} + +static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) +{ + return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); +} + static inline struct sock *sk_next(const struct sock *sk) { return sk->sk_node.next ? hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; } +static inline struct sock *sk_nulls_next(const struct sock *sk) +{ + return (!is_a_nulls(sk->sk_nulls_node.next)) ?
+ hlist_nulls_entry(sk->sk_nulls_node.next, + struct sock, sk_nulls_node) : + NULL; +} + static inline int sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); @@ -321,6 +346,11 @@ static __inline__ void sk_node_init(struct hlist_node *node) node->pprev = NULL; } +static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node) +{ + node->pprev = NULL; +} + static __inline__ void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); @@ -367,18 +397,18 @@ static __inline__ int sk_del_node_init(struct sock *sk) return rc; } -static __inline__ int __sk_del_node_init_rcu(struct sock *sk) +static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { - hlist_del_init_rcu(&sk->sk_node); + hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return 1; } return 0; } -static __inline__ int sk_del_node_init_rcu(struct sock *sk) +static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk) { - int rc = __sk_del_node_init_rcu(sk); + int rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ @@ -399,15 +429,15 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) __sk_add_node(sk, list); } -static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { - hlist_add_head_rcu(&sk->sk_node, list); + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } -static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) +static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); - __sk_add_node_rcu(sk, list); + __sk_nulls_add_node_rcu(sk, list); } static __inline__ void __sk_del_bind_node(struct sock *sk) @@ -423,11 +453,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk, #define sk_for_each(__sk, node, list) \ hlist_for_each_entry(__sk, node, list, sk_node) -#define sk_for_each_rcu_safenext(__sk, node, list, next) \ - hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next) +#define sk_nulls_for_each(__sk, node, list) \ + hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) +#define sk_nulls_for_each_rcu(__sk, node, list) \ + hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_from(__sk, node, sk_node) +#define sk_nulls_for_each_from(__sk, node) \ + if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ + hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_continue(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_continue(__sk, node, sk_node) diff --git a/include/net/udp.h b/include/net/udp.h index df2bfe54537..90e6ce56be6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -51,7 +51,7 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) struct udp_hslot { - struct hlist_head head; + struct hlist_nulls_head head; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); struct udp_table { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 54badc9a019..fea2d873dd4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct sock *sk2)) { struct sock *sk2; - struct hlist_node *node; + struct hlist_nulls_node *node; - sk_for_each(sk2, node, &hslot->head) + sk_nulls_for_each(sk2, node, 
&hslot->head) if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_hash == num && @@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - /* - * We need that previous write to sk->sk_hash committed - * before write to sk->next done in following add_node() variant - */ - smp_wmb(); - sk_add_node_rcu(sk, &hslot->head); + sk_nulls_add_node_rcu(sk, &hslot->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; @@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node, *next; + struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, begin: result = NULL; badness = -1; - sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { - /* - * lockless reader, and SLAB_DESTROY_BY_RCU items: - * We must check this item was not moved to another chain - */ - if (udp_hashfn(net, sk->sk_hash) != hash) - goto begin; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { @@ -285,6 +274,14 @@ begin: badness = score; } } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != hash) + goto begin; + if (result) { if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; @@ -325,11 +322,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, __be16 rmt_port, __be32 rmt_addr, int dif) { - struct hlist_node *node; + struct hlist_nulls_node *node; struct sock *s = sk; unsigned short hnum = ntohs(loc_port); - sk_for_each_from(s, node) { + sk_nulls_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (!net_eq(sock_net(s), net) || @@ -977,7 +974,7 @@ void udp_lib_unhash(struct sock *sk) struct udp_hslot *hslot = &udptable->hash[hash]; spin_lock_bh(&hslot->lock); - if (sk_del_node_init_rcu(sk)) { + if (sk_nulls_del_node_init_rcu(sk)) { inet_sk(sk)->num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } @@ -1130,7 +1127,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif; spin_lock(&hslot->lock); - sk = sk_head(&hslot->head); + sk = sk_nulls_head(&hslot->head); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { @@ -1139,7 +1136,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, do { struct sk_buff *skb1 = skb; - sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest, + sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, uh->source, saddr, dif); if (sknext) @@ -1560,10 +1557,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { - struct hlist_node *node; + struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; spin_lock_bh(&hslot->lock); - sk_for_each(sk, node, &hslot->head) { + sk_nulls_for_each(sk, node, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == 
state->family) @@ -1582,7 +1579,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_next(sk); + sk = sk_nulls_next(sk); } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { @@ -1753,7 +1750,7 @@ void __init udp_table_init(struct udp_table *table) int i; for (i = 0; i < UDP_HTABLE_SIZE; i++) { - INIT_HLIST_HEAD(&table->hash[i].head); + INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); spin_lock_init(&table->hash[i].lock); } }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8dafa36b1ba..fd2d9ad4a8a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net, int dif, struct udp_table *udptable) { struct sock *sk, *result; - struct hlist_node *node, *next; + struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash = udp_hashfn(net, hnum); struct udp_hslot *hslot = &udptable->hash[hash]; @@ -108,19 +108,21 @@ static struct sock *__udp6_lib_lookup(struct net *net, begin: result = NULL; badness = -1; - sk_for_each_rcu_safenext(sk, node, &hslot->head, next) { - /* - * lockless reader, and SLAB_DESTROY_BY_RCU items: - * We must check this item was not moved to another chain - */ - if (udp_hashfn(net, sk->sk_hash) != hash) - goto begin; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); if (score > badness) { result = sk; badness = score; } } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != hash) + goto begin; + if (result) { if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; @@ -374,11 +376,11 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, __be16 rmt_port, struct in6_addr *rmt_addr, int dif) { - struct hlist_node *node; + struct hlist_nulls_node *node; struct sock *s = sk; unsigned short num = ntohs(loc_port); - sk_for_each_from(s, node) { + sk_nulls_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (!net_eq(sock_net(s), net)) @@ -423,7 +425,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif; spin_lock(&hslot->lock); - sk = sk_head(&hslot->head); + sk = sk_nulls_head(&hslot->head); dif = inet6_iif(skb); sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); if (!sk) { @@ -432,7 +434,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, } sk2 = sk; - while ((sk2 = udp_v6_mcast_next(net, sk_next(sk2), uh->dest, daddr, + while ((sk2 = udp_v6_mcast_next(net, sk_nulls_next(sk2), uh->dest, daddr, uh->source, saddr, dif))) { struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); if (buff) {
-- cgit v1.2.3-70-g09d2
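The nulls trick used above can be modeled in a few lines: the list terminator is not NULL but a tagged pointer that encodes the bucket number, so a lockless reader can tell which chain it actually ended on. If the end marker does not match the bucket the walk started in, some node was recycled into another chain mid-walk and the lookup restarts. The following is a self-contained user-space sketch with illustrative names, not the kernel's hlist_nulls implementation.

#include <stdint.h>
#include <stddef.h>

#define NULLS_MARKER(v)	((void *)(((uintptr_t)(v) << 1) | 1))

static inline int is_a_nulls_ptr(void *p)
{
	return (uintptr_t)p & 1;	/* low bit set: end-of-list marker */
}

static inline unsigned long nulls_value(void *p)
{
	return (uintptr_t)p >> 1;	/* recover the encoded bucket number */
}

struct nnode {
	void *next;			/* real node, or a nulls marker */
	unsigned int key;
};

struct bucket {
	void *first;
};

static void buckets_init(struct bucket *tbl, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++)
		tbl[i].first = NULLS_MARKER(i);	/* each chain ends in its own id */
}

static struct nnode *lookup(struct bucket *tbl, unsigned int nbuckets,
			    unsigned int key)
{
	unsigned int b = key % nbuckets;
	void *p;

begin:
	for (p = tbl[b].first; !is_a_nulls_ptr(p); p = ((struct nnode *)p)->next)
		if (((struct nnode *)p)->key == key)
			return p;
	if (nulls_value(p) != b)	/* walk ended on the wrong chain */
		goto begin;
	return NULL;
}

In the patch this corresponds to INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i) at setup time and the get_nulls_value(node) != hash check after the walk.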
From 52479b623d3d41df84c499325b6a8c7915413032 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Tue, 25 Nov 2008 17:35:18 -0800
Subject: netns xfrm: lookup in netns

Pass netns to xfrm_lookup()/__xfrm_lookup(). To do that, pass netns to
flow_cache_lookup() and the resolver callback, taking it from the
socket or netdevice. Stub DECnet to init_net.

Signed-off-by: Alexey Dobriyan
Signed-off-by: David S. Miller
---
include/net/dst.h | 16 ++++++++-------- include/net/flow.h | 9 +++++---- net/core/flow.c | 4 ++-- net/dccp/ipv6.c | 10 +++++----- net/decnet/dn_route.c | 6 +++--- net/ipv4/icmp.c | 4 ++-- net/ipv4/netfilter.c | 4 ++-- net/ipv4/route.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 3 ++- net/ipv6/icmp.c | 6 +++--- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_tunnel.c | 5 +++-- net/ipv6/mcast.c | 4 ++-- net/ipv6/ndisc.c | 4 ++-- net/ipv6/netfilter.c | 2 +- net/ipv6/netfilter/ip6t_REJECT.c | 2 +- net/ipv6/raw.c | 3 ++- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 11 ++++++----- net/ipv6/udp.c | 3 ++- net/xfrm/xfrm_policy.c | 38 ++++++++++++++++++++------------------ 22 files changed, 75 insertions(+), 67 deletions(-) (limited to 'net/ipv6/udp.c')

diff --git a/include/net/dst.h b/include/net/dst.h index 6c778799bf1..6be3b082a07 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -291,21 +291,21 @@ enum { struct flowi; #ifndef CONFIG_XFRM -static inline int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, - struct sock *sk, int flags) +static inline int xfrm_lookup(struct net *net, struct dst_entry **dst_p, + struct flowi *fl, struct sock *sk, int flags) { return 0; } -static inline int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, - struct sock *sk, int flags) +static inline int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, + struct flowi *fl, struct sock *sk, int flags) { return 0; } #else -extern int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, - struct sock *sk, int flags); -extern int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, - struct sock *sk, int flags); +extern int xfrm_lookup(struct net *net, struct dst_entry **dst_p, + struct flowi *fl, struct sock *sk, int flags); +extern int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, + struct flowi *fl, struct sock *sk, int flags); #endif #endif
diff --git a/include/net/flow.h b/include/net/flow.h index b45a5e4fcad..809970b7dfe 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -84,12 +84,13 @@ struct flowi { #define FLOW_DIR_OUT 1 #define FLOW_DIR_FWD 2 +struct net; struct sock; -typedef int (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, - void **objp, atomic_t **obj_refp); +typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family, + u8 dir, void **objp, atomic_t **obj_refp); -extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver); +extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, + u8 dir, flow_resolve_t resolver); extern void flow_cache_flush(void); extern atomic_t flow_cache_genid;
diff --git a/net/core/flow.c b/net/core/flow.c index d323388dd1b..96015871ece 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -165,7 +165,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, +void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, flow_resolve_t resolver) { struct flow_cache_entry *fle, **head; @@ -225,7 +225,7 @@ nocache: void *obj; atomic_t *obj_ref; - err = resolver(key, family, dir, &obj, &obj_ref); + err = resolver(net, key, family, dir, &obj, &obj_ref); if (fle && !err) { fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index f033e845bb0..b963f35c65f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -168,7 +168,7 @@ static void dccp_v6_err(struct sk_buff
*skb, struct inet6_skb_parm *opt, goto out; } - err = xfrm_lookup(&dst, &fl, sk, 0); + err = xfrm_lookup(net, &dst, &fl, sk, 0); if (err < 0) { sk->sk_err_soft = -err; goto out; @@ -279,7 +279,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - err = xfrm_lookup(&dst, &fl, sk, 0); + err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0); if (err < 0) goto done; @@ -343,7 +343,7 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) /* sk = NULL, but it is safe for now. RST socket required. */ if (!ip6_dst_lookup(ctl_sk, &skb->dst, &fl)) { - if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + if (xfrm_lookup(net, &skb->dst, &fl, NULL, 0) >= 0) { ip6_xmit(ctl_sk, skb, &fl, NULL, 0); DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); @@ -569,7 +569,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) goto out; } @@ -1004,7 +1004,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT); + err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); if (err < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 768df000523..eeaa3d819f9 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1184,7 +1184,7 @@ static int dn_route_output_key(struct dst_entry **pprt, struct flowi *flp, int f err = __dn_route_output_key(pprt, flp, flags); if (err == 0 && flp->proto) { - err = xfrm_lookup(pprt, flp, NULL, 0); + err = xfrm_lookup(&init_net, pprt, flp, NULL, 0); } return err; } @@ -1195,8 +1195,8 @@ int dn_route_output_sock(struct dst_entry **pprt, struct flowi *fl, struct sock err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD); if (err == 0 && fl->proto) { - err = xfrm_lookup(pprt, fl, sk, (flags & MSG_DONTWAIT) ? - 0 : XFRM_LOOKUP_WAIT); + err = xfrm_lookup(&init_net, pprt, fl, sk, + (flags & MSG_DONTWAIT) ? 0 : XFRM_LOOKUP_WAIT); } return err; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7b88be9803b..705b33b184a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -562,7 +562,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* No need to clone since we're just using its address. 
*/ rt2 = rt; - err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); + err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); switch (err) { case 0: if (rt != rt2) @@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (err) goto relookup_failed; - err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL, + err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL, XFRM_LOOKUP_ICMP); switch (err) { case 0: diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 6efdb70b3eb..c99eecf89da 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -66,7 +66,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && xfrm_decode_session(skb, &fl, AF_INET) == 0) - if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) + if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0)) return -1; #endif @@ -97,7 +97,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - if (xfrm_lookup(&dst, &fl, skb->sk, 0) < 0) + if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) return -1; dst_release(skb->dst); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4e6959c2981..77bfba97595 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2761,7 +2761,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, flp->fl4_src = (*rp)->rt_src; if (!flp->fl4_dst) flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, + err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, flags ? XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) err = ipv4_dst_blackhole(net, rp, flp); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 01edac88851..437b750b98f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -637,7 +637,7 @@ int inet6_sk_rebuild_header(struct sock *sk) if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) { sk->sk_err_soft = -err; return err; } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e44deb8d4df..e2bdc6d83a4 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -175,7 +175,8 @@ ipv4_connected: if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { + err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); + if (err < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a77b8d10380..4f433847d95 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -427,7 +427,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, /* No need to clone since we're just using its address. 
*/ dst2 = dst; - err = xfrm_lookup(&dst, &fl, sk, 0); + err = xfrm_lookup(net, &dst, &fl, sk, 0); switch (err) { case 0: if (dst != dst2) @@ -446,7 +446,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, if (ip6_dst_lookup(sk, &dst2, &fl)) goto relookup_failed; - err = xfrm_lookup(&dst2, &fl, sk, XFRM_LOOKUP_ICMP); + err = xfrm_lookup(net, &dst2, &fl, sk, XFRM_LOOKUP_ICMP); switch (err) { case 0: dst_release(dst); @@ -552,7 +552,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) err = ip6_dst_lookup(sk, &dst, &fl); if (err) goto out; - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + if ((err = xfrm_lookup(net, &dst, &fl, sk, 0)) < 0) goto out; if (ipv6_addr_is_multicast(&fl.fl6_dst)) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 16d43f20b32..3c3732d50c1 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -219,7 +219,7 @@ int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) { sk->sk_route_caps = 0; kfree_skb(skb); return err; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ef249ab5c93..58e2b0d9375 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -846,6 +846,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, int encap_limit, __u32 *pmtu) { + struct net *net = dev_net(dev); struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -861,9 +862,9 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, if ((dst = ip6_tnl_dst_check(t)) != NULL) dst_hold(dst); else { - dst = ip6_route_output(dev_net(dev), NULL, fl); + dst = ip6_route_output(net, NULL, fl); - if (dst->error || xfrm_lookup(&dst, fl, NULL, 0) < 0) + if (dst->error || xfrm_lookup(net, &dst, fl, NULL, 0) < 0) goto tx_err_link_failure; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 870a1d64605..0f389603283 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1466,7 +1466,7 @@ static void mld_sendpack(struct sk_buff *skb) &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); - err = xfrm_lookup(&skb->dst, &fl, NULL, 0); + err = xfrm_lookup(net, &skb->dst, &fl, NULL, 0); if (err) goto err_out; @@ -1831,7 +1831,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); - err = xfrm_lookup(&skb->dst, &fl, NULL, 0); + err = xfrm_lookup(net, &skb->dst, &fl, NULL, 0); if (err) goto err_out; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index af6705f03b5..e4acc212345 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -524,7 +524,7 @@ void ndisc_send_skb(struct sk_buff *skb, return; } - err = xfrm_lookup(&dst, &fl, NULL, 0); + err = xfrm_lookup(net, &dst, &fl, NULL, 0); if (err < 0) { kfree_skb(skb); return; @@ -1524,7 +1524,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, if (dst == NULL) return; - err = xfrm_lookup(&dst, &fl, NULL, 0); + err = xfrm_lookup(net, &dst, &fl, NULL, 0); if (err) return; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index fd5b3a4e332..627e21db65d 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -29,7 +29,7 @@ int ip6_route_me_harder(struct sk_buff *skb) #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(skb, &fl, AF_INET6) == 0) - if 
(xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) + if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0)) return -1; #endif diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 0981b4ccb8b..5a2d0a41694 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -97,7 +97,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) dst = ip6_route_output(net, NULL, &fl); if (dst == NULL) return; - if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0)) + if (dst->error || xfrm_lookup(net, &dst, &fl, NULL, 0)) return; hh_len = (dst->dev->hard_header_len + 15)&~15; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 2ba04d41dc2..61f6827e590 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -860,7 +860,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { + err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); + if (err < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 676c80b5b14..711175e0571 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -259,7 +259,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) goto out_free; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a5d750acd79..f259c9671f3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -260,7 +260,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { + err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); + if (err < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) @@ -390,7 +391,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + if ((err = xfrm_lookup(net, &dst, &fl, sk, 0)) < 0) { sk->sk_err_soft = -err; goto out; } @@ -492,7 +493,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req) goto done; if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) goto done; skb = tcp_make_synack(sk, dst, req); @@ -1018,7 +1019,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, * namespace */ if (!ip6_dst_lookup(ctl_sk, &buff->dst, &fl)) { - if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { + if (xfrm_lookup(net, &buff->dst, &fl, NULL, 0) >= 0) { ip6_xmit(ctl_sk, buff, &fl, NULL, 0); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); if (rst) @@ -1316,7 +1317,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) goto out; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index fd2d9ad4a8a..38390dd1963 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -849,7 +849,8 @@ do_udp_sendmsg: if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { + err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, 
XFRM_LOOKUP_WAIT); + if (err < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6165218fd7c..7c88a25c7af 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -940,7 +940,8 @@ static int xfrm_policy_match(struct xfrm_policy *pol, struct flowi *fl, return ret; } -static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl, +static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, + struct flowi *fl, u16 family, u8 dir) { int err; @@ -956,7 +957,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl, return NULL; read_lock_bh(&xfrm_policy_lock); - chain = policy_hash_direct(&init_net, daddr, saddr, family, dir); + chain = policy_hash_direct(net, daddr, saddr, family, dir); ret = NULL; hlist_for_each_entry(pol, entry, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir); @@ -973,7 +974,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl, break; } } - chain = &init_net.xfrm.policy_inexact[dir]; + chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, entry, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { @@ -996,14 +997,14 @@ fail: return ret; } -static int xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, - void **objp, atomic_t **obj_refp) +static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, + u8 dir, void **objp, atomic_t **obj_refp) { struct xfrm_policy *pol; int err = 0; #ifdef CONFIG_XFRM_SUB_POLICY - pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_SUB, fl, family, dir); + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); if (IS_ERR(pol)) { err = PTR_ERR(pol); pol = NULL; @@ -1011,7 +1012,7 @@ static int xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, if (pol || err) goto end; #endif - pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, fl, family, dir); + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); if (IS_ERR(pol)) { err = PTR_ERR(pol); pol = NULL; @@ -1537,7 +1538,7 @@ static int stale_bundle(struct dst_entry *dst); * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. */ -int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, +int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, struct sock *sk, int flags) { struct xfrm_policy *policy; @@ -1575,10 +1576,10 @@ restart: if (!policy) { /* To accelerate a bit... 
*/ if ((dst_orig->flags & DST_NOXFRM) || - !init_net.xfrm.policy_count[XFRM_POLICY_OUT]) + !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - policy = flow_cache_lookup(fl, dst_orig->ops->family, + policy = flow_cache_lookup(net, fl, dst_orig->ops->family, dir, xfrm_policy_lookup); err = PTR_ERR(policy); if (IS_ERR(policy)) { @@ -1635,7 +1636,8 @@ restart: #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { - pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, + pols[1] = xfrm_policy_lookup_bytype(net, + XFRM_POLICY_TYPE_MAIN, fl, family, XFRM_POLICY_OUT); if (pols[1]) { @@ -1683,11 +1685,11 @@ restart: if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) { DECLARE_WAITQUEUE(wait, current); - add_wait_queue(&init_net.xfrm.km_waitq, &wait); + add_wait_queue(&net->xfrm.km_waitq, &wait); set_current_state(TASK_INTERRUPTIBLE); schedule(); set_current_state(TASK_RUNNING); - remove_wait_queue(&init_net.xfrm.km_waitq, &wait); + remove_wait_queue(&net->xfrm.km_waitq, &wait); nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); @@ -1781,10 +1783,10 @@ nopol: } EXPORT_SYMBOL(__xfrm_lookup); -int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, +int xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, struct sock *sk, int flags) { - int err = __xfrm_lookup(dst_p, fl, sk, flags); + int err = __xfrm_lookup(net, dst_p, fl, sk, flags); if (err == -EREMOTE) { dst_release(*dst_p); @@ -1936,7 +1938,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) - pol = flow_cache_lookup(&fl, family, fl_dir, + pol = flow_cache_lookup(&init_net, &fl, family, fl_dir, xfrm_policy_lookup); if (IS_ERR(pol)) { @@ -1959,7 +1961,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, npols ++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { - pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, + pols[1] = xfrm_policy_lookup_bytype(&init_net, XFRM_POLICY_TYPE_MAIN, &fl, family, XFRM_POLICY_IN); if (pols[1]) { @@ -2049,7 +2051,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) return 0; } - return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0; + return xfrm_lookup(&init_net, &skb->dst, &fl, NULL, 0) == 0; } EXPORT_SYMBOL(__xfrm_route_forward);
-- cgit v1.2.3-70-g09d2
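The mechanical pattern of this conversion, separated from the xfrm details: a function that used to consult the global init_net gains an explicit struct net argument, and every caller passes the namespace it obtained from its socket or device. A tiny self-contained sketch, with made-up types standing in for the real ones:

struct net { int id; };		/* stand-in for the real struct net */
struct flow { int key; };

/* Previously this resolved against a global init_net; now the
 * namespace is an explicit parameter. */
static int policy_lookup(const struct net *net, const struct flow *fl)
{
	/* resolve fl against this namespace's policies */
	return net->id ^ fl->key;
}

/* Callers thread the namespace through instead of relying on a global;
 * sock_net(sk) or dev_net(dev) plays this role in the kernel patch. */
static int do_output_lookup(const struct net *caller_net, const struct flow *fl)
{
	return policy_lookup(caller_net, fl);
}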
From 9f690db7ff4cb32493c0b0b13334cc4f5fd49a6b Mon Sep 17 00:00:00 2001
From: Yang Hongyang
Date: Tue, 16 Dec 2008 02:08:29 -0800
Subject: ipv6: fix the outgoing interface selection order in udpv6_sendmsg()

When no interface is specified in an IPV6_PKTINFO ancillary data item,
the interface specified in an IPV6_PKTINFO sticky option is used.

RFC 3542, 6.7. Summary of Outgoing Interface Selection:

This document and [RFC-3493] specify various methods that affect the
selection of the packet's outgoing interface. This subsection
summarizes the ordering among those in order to ensure deterministic
behavior. For a given outgoing packet on a given socket, the outgoing
interface is determined in the following order:

1. if an interface is specified in an IPV6_PKTINFO ancillary data item,
the interface is used.

2. otherwise, if an interface is specified in an IPV6_PKTINFO sticky
option, the interface is used.

Signed-off-by: Yang Hongyang
Signed-off-by: David S. Miller
---
net/ipv6/udp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv6/udp.c')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 38390dd1963..84b1a296eec 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -793,6 +793,9 @@ do_udp_sendmsg: if (!fl.oif) fl.oif = sk->sk_bound_dev_if; + if (!fl.oif) + fl.oif = np->sticky_pktinfo.ipi6_ifindex; + if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions));
-- cgit v1.2.3-70-g09d2
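The three-line change above builds a fallback chain for the flow's outgoing interface index. Pulled out of udpv6_sendmsg() and simplified into plain C (illustrative names, not the kernel code), the order at this point in the function is:

/* Returns the outgoing interface index (0 = let routing decide). */
static int pick_oif(int oif_so_far, int sk_bound_dev_if, int sticky_ifindex)
{
	int oif = oif_so_far;		/* e.g. already set from the address scope */

	if (!oif)
		oif = sk_bound_dev_if;	/* device the socket is bound to */
	if (!oif)
		oif = sticky_ifindex;	/* IPV6_PKTINFO sticky option: the new fallback */
	return oif;
}

An IPV6_PKTINFO ancillary data item, parsed just below this hunk from msg_controllen, can still override the value, which preserves the RFC 3542 ordering.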