From 50624c934db18ab90aaea4908f60dd39aab4e6e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Sep 2013 21:19:49 -0700 Subject: net: Delay default_device_exit_batch until no devices are unregistering v2 There is currently serialization network namespaces exiting and network devices exiting as the final part of netdev_run_todo does not happen under the rtnl_lock. This is compounded by the fact that the only list of devices unregistering in netdev_run_todo is local to the netdev_run_todo. This lack of serialization in extreme cases results in network devices unregistering in netdev_run_todo after the loopback device of their network namespace has been freed (making dst_ifdown unsafe), and after the their network namespace has exited (making the NETDEV_UNREGISTER, and NETDEV_UNREGISTER_FINAL callbacks unsafe). Add the missing serialization by a per network namespace count of how many network devices are unregistering and having a wait queue that is woken up whenever the count is decreased. The count and wait queue allow default_device_exit_batch to wait until all of the unregistration activity for a network namespace has finished before proceeding to unregister the loopback device and then allowing the network namespace to exit. Only a single global wait queue is used because there is a single global lock, and there is a single waiter, per network namespace wait queues would be a waste of resources. The per network namespace count of unregistering devices gives a progress guarantee because the number of network devices unregistering in an exiting network namespace must ultimately drop to zero (assuming network device unregistration completes). The basic logic remains the same as in v1. This patch is now half comment and half rtnl_lock_unregistering an expanded version of wait_event performs no extra work in the common case where no network devices are unregistering when we get to default_device_exit_batch. Reported-by: Francesco Ruggeri Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5c713f2239c..65f829cfd92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5247,10 +5247,12 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); +static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) @@ -5918,6 +5920,12 @@ void netdev_run_todo(void) if (dev->destructor) dev->destructor(dev); + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + /* Free network device */ kobject_put(&dev->dev.kobj); } @@ -6603,6 +6611,34 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list. + */ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network @@ -6614,7 +6650,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - rtnl_lock(); + /* To prevent network device cleanup code from dereferencing + * loopback devices or network devices that have been freed + * wait here for all pending unregistrations to complete, + * before unregistring the loopback device and allowing the + * network namespace be freed. + * + * The netdev todo list containing all network devices + * unregistrations that happen in default_device_exit_batch + * will run in the rtnl_unlock() at the end of + * default_device_exit_batch. + */ + rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops) -- cgit v1.2.3-70-g09d2 From 9a3bab6b05383f1e4c3716b3615500c51285959e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2013 06:19:57 -0700 Subject: net: net_secret should not depend on TCP A host might need net_secret[] and never open a single socket. Problem added in commit aebda156a570782 ("net: defer net_secret[] initialization") Based on prior patch from Hannes Frederic Sowa. Reported-by: Hannes Frederic Sowa Signed-off-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/secure_seq.h | 1 - net/core/secure_seq.c | 27 ++++++++++++++++++++++++--- net/ipv4/af_inet.c | 4 +--- 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 6ca975bebd3..c2e542b27a5 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -3,7 +3,6 @@ #include -extern void net_secret_init(void); extern __u32 secure_ip_id(__be32 daddr); extern __u32 secure_ipv6_id(const __be32 daddr[4]); extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 6a2f13cee86..3f1ec1586ae 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -10,11 +10,24 @@ #include -static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; +#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) -void net_secret_init(void) +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; + +static void net_secret_init(void) { - get_random_bytes(net_secret, sizeof(net_secret)); + u32 tmp; + int i; + + if (likely(net_secret[0])) + return; + + for (i = NET_SECRET_SIZE; i > 0;) { + do { + get_random_bytes(&tmp, sizeof(tmp)); + } while (!tmp); + cmpxchg(&net_secret[--i], 0, tmp); + } } #ifdef CONFIG_INET @@ -42,6 +55,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32)daddr[i]; @@ -63,6 +77,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32) daddr[i]; @@ -82,6 +97,7 @@ __u32 secure_ip_id(__be32 daddr) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force __u32) daddr; hash[1] = net_secret[13]; hash[2] = net_secret[14]; @@ -96,6 +112,7 @@ __u32 secure_ipv6_id(const __be32 daddr[4]) { __u32 hash[4]; + net_secret_init(); memcpy(hash, daddr, 16); md5_transform(hash, net_secret); @@ -107,6 +124,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -121,6 +139,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = (__force u32)dport ^ net_secret[14]; @@ -140,6 +159,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, u32 hash[MD5_DIGEST_WORDS]; u64 seq; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -164,6 +184,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, u64 seq; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + daddr[i]; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7a1874b7b8f..cfeb85cff4f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -263,10 +263,8 @@ void build_ehash_secret(void) get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) { + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); - net_secret_init(); - } } EXPORT_SYMBOL(build_ehash_secret); -- cgit v1.2.3-70-g09d2 From b86783587b3d1d552326d955acee37eac48800f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Sep 2013 08:44:06 -0700 Subject: net: flow_dissector: fix thoff for IPPROTO_AH In commit 8ed781668dd49 ("flow_keys: include thoff into flow_keys for later usage"), we missed that existing code was using nhoff as a temporary variable that could not always contain transport header offset. This is not a problem for TCP/UDP because port offset (@poff) is 0 for these protocols. Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Cc: Nikolay Aleksandrov Acked-by: Nikolay Aleksandrov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1929af87b26..8d7d0dd72db 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -154,8 +154,8 @@ ipv6: if (poff >= 0) { __be32 *ports, _ports; - nhoff += poff; - ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); + ports = skb_header_pointer(skb, nhoff + poff, + sizeof(_ports), &_ports); if (ports) flow->ports = *ports; } -- cgit v1.2.3-70-g09d2