From 7bd86cc282a458b66c41e3f6676de6656c99b8db Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 12 Aug 2012 20:09:59 +0000 Subject: ipv4: Cache local output routes Commit caacf05e5ad1abf causes big drop of UDP loop back performance. The cause of the regression is that we do not cache the local output routes. Each time we send a datagram from unconnected UDP socket, the kernel allocates a dst_entry and adds it to the rt_uncached_list. It creates lock contention on the rt_uncached_lock. Reported-by: Alex Shi Signed-off-by: Yan, Zheng Signed-off-by: David S. Miller --- net/ipv4/route.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e4ba974f143..fd9ecb52c66 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2028,7 +2028,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) } dev_out = net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; - res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } -- cgit v1.2.3-70-g09d2 From 9b04f350057863d1fad1ba071e09362a1da3503e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 20:48:29 +0000 Subject: ipv4: properly update pmtu Sylvain Munault reported following info : - TCP connection get "stuck" with data in send queue when doing "large" transfers ( like typing 'ps ax' on a ssh connection ) - Only happens on path where the PMTU is lower than the MTU of the interface - Is not present right after boot, it only appears 10-20min after boot or so. (and that's inside the _same_ TCP connection, it works fine at first and then in the same ssh session, it'll get stuck) - Definitely seems related to fragments somehow since I see a router sending ICMP message saying fragmentation is needed. - Exact same setup works fine with kernel 3.5.1 Problem happens when the 10 minutes (ip_rt_mtu_expires) expiration period is over. ip_rt_update_pmtu() calls dst_set_expires() to rearm a new expiration, but dst_set_expires() does nothing because dst.expires is already set. It seems we want to set the expires field to a new value, regardless of prior one. With help from Julian Anastasov. Reported-by: Sylvain Munaut Signed-off-by: Eric Dumazet CC: Julian Anastasov Tested-by: Sylvain Munaut Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fd9ecb52c66..8c8c748ebb2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -956,7 +956,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, dst->obsolete = DST_OBSOLETE_KILL; } else { rt->rt_pmtu = mtu; - dst_set_expires(&rt->dst, ip_rt_mtu_expires); + rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires); } } -- cgit v1.2.3-70-g09d2 From 78df76a065ae3b5dbcb9a29912adc02f697de498 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Aug 2012 05:40:47 +0000 Subject: ipv4: take rt_uncached_lock only if needed Multicast traffic allocates dst with DST_NOCACHE, but dst is not inserted into rt_uncached_list. This slowdown multicast workloads on SMP because rt_uncached_lock is contended. Change the test before taking the lock to actually check the dst was inserted into rt_uncached_list. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8c8c748ebb2..24fd4c59664 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1263,7 +1263,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; - if (dst->flags & DST_NOCACHE) { + if (!list_empty(&rt->rt_uncached)) { spin_lock_bh(&rt_uncached_lock); list_del(&rt->rt_uncached); spin_unlock_bh(&rt_uncached_lock); -- cgit v1.2.3-70-g09d2 From c5ae7d41927dbd208c885d4794e30617ad6cdf12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Aug 2012 12:33:07 +0000 Subject: ipv4: must use rcu protection while calling fib_lookup Following lockdep splat was reported by Pavel Roskin : [ 1570.586223] =============================== [ 1570.586225] [ INFO: suspicious RCU usage. ] [ 1570.586228] 3.6.0-rc3-wl-main #98 Not tainted [ 1570.586229] ------------------------------- [ 1570.586231] /home/proski/src/linux/net/ipv4/route.c:645 suspicious rcu_dereference_check() usage! [ 1570.586233] [ 1570.586233] other info that might help us debug this: [ 1570.586233] [ 1570.586236] [ 1570.586236] rcu_scheduler_active = 1, debug_locks = 0 [ 1570.586238] 2 locks held by Chrome_IOThread/4467: [ 1570.586240] #0: (slock-AF_INET){+.-...}, at: [] release_sock+0x2c/0xa0 [ 1570.586253] #1: (fnhe_lock){+.-...}, at: [] update_or_create_fnhe+0x2c/0x270 [ 1570.586260] [ 1570.586260] stack backtrace: [ 1570.586263] Pid: 4467, comm: Chrome_IOThread Not tainted 3.6.0-rc3-wl-main #98 [ 1570.586265] Call Trace: [ 1570.586271] [] lockdep_rcu_suspicious+0xfd/0x130 [ 1570.586275] [] update_or_create_fnhe+0x15c/0x270 [ 1570.586278] [] __ip_rt_update_pmtu+0x73/0xb0 [ 1570.586282] [] ip_rt_update_pmtu+0x29/0x90 [ 1570.586285] [] inet_csk_update_pmtu+0x2c/0x80 [ 1570.586290] [] tcp_v4_mtu_reduced+0x2e/0xc0 [ 1570.586293] [] tcp_release_cb+0xa4/0xb0 [ 1570.586296] [] release_sock+0x55/0xa0 [ 1570.586300] [] tcp_sendmsg+0x4af/0xf50 [ 1570.586305] [] inet_sendmsg+0x120/0x230 [ 1570.586308] [] ? inet_sk_rebuild_header+0x40/0x40 [ 1570.586312] [] ? sock_update_classid+0xbd/0x3b0 [ 1570.586315] [] ? sock_update_classid+0x130/0x3b0 [ 1570.586320] [] do_sock_write+0xc5/0xe0 [ 1570.586323] [] sock_aio_write+0x53/0x80 [ 1570.586328] [] do_sync_write+0xa3/0xe0 [ 1570.586332] [] vfs_write+0x165/0x180 [ 1570.586335] [] sys_write+0x45/0x90 [ 1570.586340] [] system_call_fastpath+0x16/0x1b Signed-off-by: Eric Dumazet Reported-by: Pavel Roskin Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 24fd4c59664..82cf2a722b2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -934,12 +934,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires); } + rcu_read_unlock(); return mtu; } -- cgit v1.2.3-70-g09d2 From bafa6d9d89072c1a18853afe9ee5de05c491c13a Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 7 Sep 2012 00:45:29 +0000 Subject: ipv4/route: arg delay is useless in rt_cache_flush() Since route cache deletion (89aef8921bfbac22f), delay is no more used. Remove it. Signed-off-by: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/route.h | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/devinet.c | 6 +++--- net/ipv4/fib_frontend.c | 20 ++++++++++---------- net/ipv4/fib_rules.c | 2 +- net/ipv4/fib_trie.c | 6 +++--- net/ipv4/route.c | 19 +++---------------- 7 files changed, 22 insertions(+), 35 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/include/net/route.h b/include/net/route.h index 776a27f1ab7..da22243d276 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -108,7 +108,7 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); -extern void rt_cache_flush(struct net *net, int how); +extern void rt_cache_flush(struct net *net); extern void rt_flush_dev(struct net_device *dev); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 77e87aff419..47800459e4c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1225,7 +1225,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev)); break; default: break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 44bf82e3aef..9b55b6f5a58 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1503,7 +1503,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) - rt_cache_flush(net, 0); + rt_cache_flush(net); } return ret; @@ -1537,7 +1537,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, dev_disable_lro(idev->dev); } rtnl_unlock(); - rt_cache_flush(net, 0); + rt_cache_flush(net); } } @@ -1554,7 +1554,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write, struct net *net = ctl->extra2; if (write && *valp != val) - rt_cache_flush(net, 0); + rt_cache_flush(net); return ret; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c43ae3fba79..8e2b475da9f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -148,7 +148,7 @@ static void fib_flush(struct net *net) } if (flushed) - rt_cache_flush(net, -1); + rt_cache_flush(net); } /* @@ -999,11 +999,11 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force, int delay) +static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev), delay); + rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } @@ -1020,7 +1020,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -1029,9 +1029,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, 1, 0); + fib_disable_ip(dev, 1); } else { - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); } break; } @@ -1045,7 +1045,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2, -1); + fib_disable_ip(dev, 2); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1062,14 +1062,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0, 0); + fib_disable_ip(dev, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev)); break; case NETDEV_UNREGISTER_BATCH: break; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a83d74e498d..274309d3ade 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -259,7 +259,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { - rt_cache_flush(ops->fro_net, -1); + rt_cache_flush(ops->fro_net); } static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 57bd978483e..d1b93595b4a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1286,7 +1286,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1333,7 +1333,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); succeeded: @@ -1708,7 +1708,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 82cf2a722b2..f6436d3b207 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -461,11 +461,7 @@ static void rt_cache_invalidate(struct net *net) atomic_add(shuffle + 1U, &net->ipv4.rt_genid); } -/* - * delay < 0 : invalidate cache (fast : entries will be deleted later) - * delay >= 0 : invalidate & flush cache (can be long) - */ -void rt_cache_flush(struct net *net, int delay) +void rt_cache_flush(struct net *net) { rt_cache_invalidate(net); } @@ -2345,7 +2341,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(dev_net(in_dev->dev), 0); + rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL @@ -2354,16 +2350,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, size_t *lenp, loff_t *ppos) { if (write) { - int flush_delay; - ctl_table ctl; - struct net *net; - - memcpy(&ctl, __ctl, sizeof(ctl)); - ctl.data = &flush_delay; - proc_dointvec(&ctl, write, buffer, lenp, ppos); - - net = (struct net *)__ctl->extra1; - rt_cache_flush(net, flush_delay); + rt_cache_flush((struct net *)__ctl->extra1); return 0; } -- cgit v1.2.3-70-g09d2 From 2885da72966fcb89f48d554339d347fb02b5ea78 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Sep 2012 22:27:11 +0200 Subject: net: rt_cache_flush() cleanup We dont use jhash anymore since route cache removal, so we can get rid of get_random_bytes() calls for rt_genid changes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6436d3b207..be27cfa96e8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -447,23 +447,9 @@ static inline bool rt_is_expired(const struct rtable *rth) return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } -/* - * Perturbation of rt_genid by a small quantity [1..256] - * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() - * many times (2^24) without giving recent rt_genid. - * Jenkins hash is strong enough that litle changes of rt_genid are OK. - */ -static void rt_cache_invalidate(struct net *net) -{ - unsigned char shuffle; - - get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &net->ipv4.rt_genid); -} - void rt_cache_flush(struct net *net) { - rt_cache_invalidate(net); + atomic_inc(&net->ipv4.rt_genid); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -2520,8 +2506,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - get_random_bytes(&net->ipv4.rt_genid, - sizeof(net->ipv4.rt_genid)); + atomic_set(&net->ipv4.rt_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; -- cgit v1.2.3-70-g09d2 From b42664f898c976247f7f609b8bb9c94d7475ca10 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 10 Sep 2012 22:09:44 +0000 Subject: netns: move net->ipv4.rt_genid to net->rt_genid This commit prepares the use of rt_genid by both IPv4 and IPv6. Initialization is left in IPv4 part. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ include/net/netns/ipv4.h | 1 - net/ipv4/route.c | 9 ++------- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index ae1cd6c9ba5..fd87963a0ea 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -102,6 +102,7 @@ struct net { #endif struct netns_ipvs *ipvs; struct sock *diag_nlsk; + atomic_t rt_genid; }; @@ -300,5 +301,14 @@ static inline void unregister_net_sysctl_table(struct ctl_table_header *header) } #endif +static inline int rt_genid(struct net *net) +{ + return atomic_read(&net->rt_genid); +} + +static inline void rt_genid_bump(struct net *net) +{ + atomic_inc(&net->rt_genid); +} #endif /* __NET_NET_NAMESPACE_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1474dd65c66..eb24dbccd81 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -65,7 +65,6 @@ struct netns_ipv4 { unsigned int sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; - atomic_t rt_genid; atomic_t dev_addr_genid; #ifdef CONFIG_IP_MROUTE diff --git a/net/ipv4/route.c b/net/ipv4/route.c index be27cfa96e8..fd9af60397b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -202,11 +202,6 @@ EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) -static inline int rt_genid(struct net *net) -{ - return atomic_read(&net->ipv4.rt_genid); -} - #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { @@ -449,7 +444,7 @@ static inline bool rt_is_expired(const struct rtable *rth) void rt_cache_flush(struct net *net) { - atomic_inc(&net->ipv4.rt_genid); + rt_genid_bump(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -2506,7 +2501,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->ipv4.rt_genid, 0); + atomic_set(&net->rt_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; -- cgit v1.2.3-70-g09d2