From 6fac262526ee91ee66210b8919a4297dcf7d544e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 17 Jun 2012 19:47:34 -0700 Subject: ipv4: Cap ADVMSS metric in the FIB rather than the routing cache. It makes no sense to execute this limit test every time we create a routing cache entry. We can't simply error out on these things since we've silently accepted and truncated them forever. Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e5b7182fa09..415f8230fc8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -779,9 +779,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int type = nla_type(nla); if (type) { + u32 val; + if (type > RTAX_MAX) goto err_inval; - fi->fib_metrics[type - 1] = nla_get_u32(nla); + val = nla_get_u32(nla); + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + fi->fib_metrics[type - 1] = val; } } } -- cgit v1.2.3-70-g09d2 From 7a9bc9b81a5bc6e44ebc80ef781332e4385083f2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Jun 2012 01:32:45 -0700 Subject: ipv4: Elide fib_validate_source() completely when possible. If rpfilter is off (or the SKB has an IPSEC path) and there are not tclassid users, we don't have to do anything at all when fib_validate_source() is invoked besides setting the itag to zero. We monitor tclassid uses with a counter (modified only under RTNL and marked __read_mostly) and we protect the fib_validate_source() real work with a test against this counter and whether rpfilter is to be done. Having a way to know whether we need no tclassid processing or not also opens the door for future optimized rpfilter algorithms that do not perform full FIB lookups. Signed-off-by: David S. Miller --- include/net/fib_rules.h | 1 + include/net/ip_fib.h | 5 +++++ net/core/fib_rules.c | 4 ++++ net/ipv4/fib_frontend.c | 32 ++++++++++++++++++++++++-------- net/ipv4/fib_rules.c | 16 +++++++++++++++- net/ipv4/fib_semantics.c | 10 ++++++++++ 6 files changed, 59 insertions(+), 9 deletions(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 075f1e3a0fe..e361f488242 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -52,6 +52,7 @@ struct fib_rules_ops { struct sk_buff *, struct fib_rule_hdr *, struct nlattr **); + void (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 619f68a7185..3dc7c96bbea 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -235,6 +235,11 @@ extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); extern void fib_select_default(struct fib_result *res); +#ifdef CONFIG_IP_ROUTE_CLASSID +extern int fib_num_tclassid_users; +#else +#define fib_num_tclassid_users 0 +#endif /* Exported by fib_semantics.c */ extern int ip_fib_check_default(__be32 gw, struct net_device *dev); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 72cceb79d0d..ab7db83236c 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -151,6 +151,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); } } @@ -499,6 +501,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c84cff52021..ae528d1b293 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +218,10 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } +#ifdef CONFIG_IP_ROUTE_CLASSID +int fib_num_tclassid_users __read_mostly; +#endif + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -225,11 +230,11 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, - int oif, struct net_device *dev, struct in_device *idev, - u32 *itag) +static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + int rpf, struct in_device *idev, u32 *itag) { - int ret, no_addr, rpf, accept_local; + int ret, no_addr, accept_local; struct fib_result res; struct flowi4 fl4; struct net *net; @@ -242,12 +247,9 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - no_addr = rpf = accept_local = 0; + no_addr = accept_local = 0; no_addr = idev->ifa_list == NULL; - /* Ignore rp_filter for packets protected by IPsec. */ - rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - accept_local = IN_DEV_ACCEPT_LOCAL(idev); fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; @@ -303,6 +305,20 @@ e_rpf: return -EXDEV; } +/* Ignore rp_filter for packets protected by IPsec. */ +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + + if (!r && !fib_num_tclassid_users) { + *itag = 0; + return 0; + } + return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); +} + static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2d043f71ef7..b23fd952c84 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -169,8 +169,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dst = nla_get_be32(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID - if (tb[FRA_FLOW]) + if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); + if (rule4->tclassid) + fib_num_tclassid_users++; + } #endif rule4->src_len = frh->src_len; @@ -184,6 +187,16 @@ errout: return err; } +static void fib4_rule_delete(struct fib_rule *rule) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (rule4->tclassid) + fib_num_tclassid_users--; +#endif +} + static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { .action = fib4_rule_action, .match = fib4_rule_match, .configure = fib4_rule_configure, + .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .default_pref = fib_default_rule_pref, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 415f8230fc8..c46c20b6b0b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -163,6 +163,12 @@ void free_fib_info(struct fib_info *fi) return; } fib_info_cnt--; +#ifdef CONFIG_IP_ROUTE_CLASSID + change_nexthops(fi) { + if (nexthop_nh->nh_tclassid) + fib_num_tclassid_users--; + } endfor_nexthops(fi); +#endif call_rcu(&fi->rcu, free_fib_info_rcu); } @@ -421,6 +427,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; + if (nexthop_nh->nh_tclassid) + fib_num_tclassid_users++; #endif } @@ -815,6 +823,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; + if (nh->nh_tclassid) + fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; -- cgit v1.2.3-70-g09d2 From f4530fa574df4d833506c53697ed1daa0d390bf4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 5 Jul 2012 22:13:13 -0700 Subject: ipv4: Avoid overhead when no custom FIB rules are installed. If the user hasn't actually installed any custom rules, or fiddled with the default ones, don't go through the whole FIB rules layer. It's just pure overhead. Instead do what we do with CONFIG_IP_MULTIPLE_TABLES disabled, check the individual tables by hand, one by one. Also, move fib_num_tclassid_users into the ipv4 network namespace. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 36 ++++++++++++++++++++++++++++++++---- include/net/netns/ipv4.h | 8 ++++++++ net/ipv4/fib_frontend.c | 27 ++++++++++++++++++++++----- net/ipv4/fib_rules.c | 12 ++++++++---- net/ipv4/fib_semantics.c | 6 +++--- 5 files changed, 73 insertions(+), 16 deletions(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 3dc7c96bbea..539c6721f81 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -220,11 +220,33 @@ extern void __net_exit fib4_rules_exit(struct net *net); extern u32 fib_rules_tclass(const struct fib_result *res); #endif -extern int fib_lookup(struct net *n, struct flowi4 *flp, struct fib_result *res); - extern struct fib_table *fib_new_table(struct net *net, u32 id); extern struct fib_table *fib_get_table(struct net *net, u32 id); +extern int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res); + +static inline int fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res) +{ + if (!net->ipv4.fib_has_custom_rules) { + if (net->ipv4.fib_local && + !fib_table_lookup(net->ipv4.fib_local, flp, res, + FIB_LOOKUP_NOREF)) + return 0; + if (net->ipv4.fib_main && + !fib_table_lookup(net->ipv4.fib_main, flp, res, + FIB_LOOKUP_NOREF)) + return 0; + if (net->ipv4.fib_default && + !fib_table_lookup(net->ipv4.fib_default, flp, res, + FIB_LOOKUP_NOREF)) + return 0; + return -ENETUNREACH; + } + return __fib_lookup(net, flp, res); +} + #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ @@ -236,9 +258,15 @@ extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct in_device *idev, u32 *itag); extern void fib_select_default(struct fib_result *res); #ifdef CONFIG_IP_ROUTE_CLASSID -extern int fib_num_tclassid_users; +static inline int fib_num_tclassid_users(struct net *net) +{ + return net->ipv4.fib_num_tclassid_users; +} #else -#define fib_num_tclassid_users 0 +static inline int fib_num_tclassid_users(struct net *net) +{ + return 0; +} #endif /* Exported by fib_semantics.c */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 227f0cd9d3f..599e48fa97c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -11,6 +11,7 @@ struct ctl_table_header; struct ipv4_devconf; struct fib_rules_ops; struct hlist_head; +struct fib_table; struct sock; struct netns_ipv4 { @@ -24,6 +25,13 @@ struct netns_ipv4 { struct ipv4_devconf *devconf_dflt; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; + bool fib_has_custom_rules; + struct fib_table *fib_local; + struct fib_table *fib_main; + struct fib_table *fib_default; +#endif +#ifdef CONFIG_IP_ROUTE_CLASSID + int fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; struct sock *fibnl; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3e11ea225da..81f85716a89 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -86,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id) tb = fib_trie_table(id); if (!tb) return NULL; + + switch (id) { + case RT_TABLE_LOCAL: + net->ipv4.fib_local = tb; + break; + + case RT_TABLE_MAIN: + net->ipv4.fib_main = tb; + break; + + case RT_TABLE_DEFAULT: + net->ipv4.fib_default = tb; + break; + + default: + break; + } + h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; @@ -218,10 +236,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } -#ifdef CONFIG_IP_ROUTE_CLASSID -int fib_num_tclassid_users __read_mostly; -#endif - /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -312,7 +326,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - if (!r && !fib_num_tclassid_users) { + if (!r && !fib_num_tclassid_users(dev_net(dev))) { *itag = 0; return 0; } @@ -1134,6 +1148,9 @@ static int __net_init fib_net_init(struct net *net) { int error; +#ifdef CONFIG_IP_ROUTE_CLASSID + net->ipv4.fib_num_tclassid_users = 0; +#endif error = ip_fib_net_init(net); if (error < 0) goto out; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b23fd952c84..c06da93b0b7 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -54,7 +54,7 @@ u32 fib_rules_tclass(const struct fib_result *res) } #endif -int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, @@ -67,7 +67,7 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) return err; } -EXPORT_SYMBOL_GPL(fib_lookup); +EXPORT_SYMBOL_GPL(__fib_lookup); static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) @@ -172,7 +172,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); if (rule4->tclassid) - fib_num_tclassid_users++; + net->ipv4.fib_num_tclassid_users++; } #endif @@ -182,6 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dstmask = inet_make_mask(rule4->dst_len); rule4->tos = frh->tos; + net->ipv4.fib_has_custom_rules = true; err = 0; errout: return err; @@ -189,12 +190,14 @@ errout: static void fib4_rule_delete(struct fib_rule *rule) { + struct net *net = rule->fr_net; #ifdef CONFIG_IP_ROUTE_CLASSID struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (rule4->tclassid) - fib_num_tclassid_users--; + net->ipv4.fib_num_tclassid_users--; #endif + net->ipv4.fib_has_custom_rules = true; } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, @@ -309,6 +312,7 @@ int __net_init fib4_rules_init(struct net *net) if (err < 0) goto fail; net->ipv4.rules_ops = ops; + net->ipv4.fib_has_custom_rules = false; return 0; fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c46c20b6b0b..ae301c897a1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -166,7 +166,7 @@ void free_fib_info(struct fib_info *fi) #ifdef CONFIG_IP_ROUTE_CLASSID change_nexthops(fi) { if (nexthop_nh->nh_tclassid) - fib_num_tclassid_users--; + fi->fib_net->ipv4.fib_num_tclassid_users--; } endfor_nexthops(fi); #endif call_rcu(&fi->rcu, free_fib_info_rcu); @@ -428,7 +428,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; if (nexthop_nh->nh_tclassid) - fib_num_tclassid_users++; + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif } @@ -824,7 +824,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) - fib_num_tclassid_users++; + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; -- cgit v1.2.3-70-g09d2 From 710ab6c03122cf464510f8c86eb0a179e80b2d61 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 07:02:09 -0700 Subject: ipv4: Enforce max MTU metric at route insertion time. Rather than at every struct rtable creation. Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 ++ net/ipv4/route.c | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ae301c897a1..d71bfbdc0bf 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -794,6 +794,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) val = nla_get_u32(nla); if (type == RTAX_ADVMSS && val > 65535 - 40) val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; fi->fib_metrics[type - 1] = val; } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 677d65253e4..1678b575165 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1763,21 +1763,16 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, const struct fib_result *res, struct fib_info *fi, u16 type, u32 itag) { - struct dst_entry *dst = &rt->dst; - if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); rt_init_metrics(rt, fl4, fi); #ifdef CONFIG_IP_ROUTE_CLASSID - dst->tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } - if (dst_mtu(dst) > IP_MAX_MTU) - dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); - #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); -- cgit v1.2.3-70-g09d2 From 4895c771c7f006b4b90f9d6b1d2210939ba57b38 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 04:19:00 -0700 Subject: ipv4: Add FIB nexthop exceptions. In a regime where we have subnetted route entries, we need a way to store persistent storage about destination specific learned values such as redirects and PMTU values. This is implemented here via nexthop exceptions. The initial implementation is a 2048 entry hash table with relaiming starting at chain length 5. A more sophisticated scheme can be devised if that proves necessary. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 18 ++++ net/ipv4/fib_semantics.c | 23 +++++ net/ipv4/route.c | 256 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 266 insertions(+), 31 deletions(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 5697acefeba..e9ee1ca0708 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -46,6 +47,22 @@ struct fib_config { struct fib_info; +struct fib_nh_exception { + struct fib_nh_exception __rcu *fnhe_next; + __be32 fnhe_daddr; + u32 fnhe_pmtu; + u32 fnhe_gw; + unsigned long fnhe_expires; + unsigned long fnhe_stamp; +}; + +struct fnhe_hash_bucket { + struct fib_nh_exception __rcu *chain; +}; + +#define FNHE_HASH_SIZE 2048 +#define FNHE_RECLAIM_DEPTH 5 + struct fib_nh { struct net_device *nh_dev; struct hlist_node nh_hash; @@ -63,6 +80,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; + struct fnhe_hash_bucket *nh_exceptions; }; /* diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d71bfbdc0bf..1e09852df51 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; +static void free_nh_exceptions(struct fib_nh *nh) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + fnhe = rcu_dereference(hash[i].chain); + while (fnhe) { + struct fib_nh_exception *next; + + next = rcu_dereference(fnhe->fnhe_next); + kfree(fnhe); + + fnhe = next; + } + } + kfree(hash); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + if (nexthop_nh->nh_exceptions) + free_nh_exceptions(nexthop_nh); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b35d3bfc66c..a5bd0b4acc6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1275,14 +1275,130 @@ static void rt_del(unsigned int hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } -static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) +static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, + const struct iphdr *iph, + int oif, u8 tos, + u8 prot, u32 mark, int flow_flags) +{ + if (sk) { + const struct inet_sock *inet = inet_sk(sk); + + oif = sk->sk_bound_dev_if; + mark = sk->sk_mark; + tos = RT_CONN_FLAGS(sk); + prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; + } + flowi4_init_output(fl4, oif, mark, tos, + RT_SCOPE_UNIVERSE, prot, + flow_flags, + iph->daddr, iph->saddr, 0, 0); +} + +static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; + + __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); +} + +static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk), + daddr, inet->inet_saddr, 0, 0); + rcu_read_unlock(); +} + +static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk, + struct sk_buff *skb) +{ + if (skb) + build_skb_flow_key(fl4, skb, sk); + else + build_sk_flow_key(fl4, sk); +} + +static DEFINE_SPINLOCK(fnhe_lock); + +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr) +{ + struct fib_nh_exception *fnhe, *oldest; + + oldest = rcu_dereference(hash->chain); + for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + oldest = fnhe; + } + return oldest; +} + +static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + int depth; + u32 hval; + + if (!hash) { + hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), + GFP_ATOMIC); + if (!hash) + return NULL; + } + + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); + hash += hval; + + depth = 0; + for (fnhe = rcu_dereference(hash->chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + goto out; + depth++; + } + + if (depth > FNHE_RECLAIM_DEPTH) { + fnhe = fnhe_oldest(hash + hval, daddr); + goto out_daddr; + } + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + return NULL; + + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); + +out_daddr: + fnhe->fnhe_daddr = daddr; +out: + fnhe->fnhe_stamp = jiffies; + return fnhe; +} + +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; + struct fib_result res; struct neighbour *n; - struct rtable *rt; struct net *net; switch (icmp_hdr(skb)->code & 7) { @@ -1296,7 +1412,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf return; } - rt = (struct rtable *) dst; if (rt->rt_gateway != old_gw) return; @@ -1320,11 +1435,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf goto reject_redirect; } - n = ipv4_neigh_lookup(dst, NULL, &new_gw); + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); if (n) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { + if (fib_lookup(net, fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_exception *fnhe; + + spin_lock_bh(&fnhe_lock); + fnhe = find_or_create_fnhe(nh, fl4->daddr); + if (fnhe) + fnhe->fnhe_gw = new_gw; + spin_unlock_bh(&fnhe_lock); + } rt->rt_gateway = new_gw; rt->rt_flags |= RTCF_REDIRECTED; call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); @@ -1349,6 +1474,17 @@ reject_redirect: ; } +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl4; + + rt = (struct rtable *) dst; + + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_do_redirect(rt, skb, &fl4); +} + static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; @@ -1508,33 +1644,51 @@ out: kfree_skb(skb); return 0; } -static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { - struct rtable *rt = (struct rtable *) dst; - - dst_confirm(dst); + struct fib_result res; if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_exception *fnhe; + + spin_lock_bh(&fnhe_lock); + fnhe = find_or_create_fnhe(nh, fl4->daddr); + if (fnhe) { + fnhe->fnhe_pmtu = mtu; + fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires; + } + spin_unlock_bh(&fnhe_lock); + } rt->rt_pmtu = mtu; dst_set_expires(&rt->dst, ip_rt_mtu_expires); } +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + struct rtable *rt = (struct rtable *) dst; + struct flowi4 fl4; + + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_rt_update_pmtu(rt, &fl4, mtu); +} + void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u32 mark, u8 protocol, int flow_flags) { - const struct iphdr *iph = (const struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, - protocol, flow_flags, - iph->daddr, iph->saddr, 0, 0); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { - ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu); + __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } @@ -1542,27 +1696,31 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu); void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct inet_sock *inet = inet_sk(sk); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - return ipv4_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk)); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); + } } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, u8 protocol, int flow_flags) { - const struct iphdr *iph = (const struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, - protocol, flow_flags, iph->daddr, iph->saddr, 0, 0); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { - ip_do_redirect(&rt->dst, NULL, skb); + __ip_do_redirect(rt, skb, &fl4); ip_rt_put(rt); } } @@ -1570,12 +1728,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - const struct inet_sock *inet = inet_sk(sk); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, - sk->sk_mark, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk)); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4); + ip_rt_put(rt); + } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); @@ -1722,14 +1884,46 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, fi->fib_metrics, true); } +static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + u32 hval; + + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); + + for (fnhe = rcu_dereference(hash[hval].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) { + if (fnhe->fnhe_pmtu) { + unsigned long expires = fnhe->fnhe_expires; + unsigned long diff = jiffies - expires; + + if (time_before(jiffies, expires)) { + rt->rt_pmtu = fnhe->fnhe_pmtu; + dst_set_expires(&rt->dst, diff); + } + } + if (fnhe->fnhe_gw) + rt->rt_gateway = fnhe->fnhe_gw; + fnhe->fnhe_stamp = jiffies; + break; + } + } +} + static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, const struct fib_result *res, struct fib_info *fi, u16 type, u32 itag) { if (fi) { - if (FIB_RES_GW(*res) && - FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = FIB_RES_GW(*res); + struct fib_nh *nh = &FIB_RES_NH(*res); + + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = nh->nh_gw; + if (unlikely(nh->nh_exceptions)) + rt_bind_exception(rt, nh, fl4->daddr); rt_init_metrics(rt, fl4, fi); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; -- cgit v1.2.3-70-g09d2 From 5abf7f7e0f6bdbfcac737f636497d7016d9507eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 22:42:13 +0200 Subject: ipv4: fix rcu splat free_nh_exceptions() should use rcu_dereference_protected(..., 1) since its called after one RCU grace period. Also add some const-ification in recent code. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/route.c | 13 +++++++------ 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1e09852df51..2b57d768240 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -148,11 +148,11 @@ static void free_nh_exceptions(struct fib_nh *nh) for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; - fnhe = rcu_dereference(hash[i].chain); + fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; - next = rcu_dereference(fnhe->fnhe_next); + next = rcu_dereference_protected(fnhe->fnhe_next, 1); kfree(fnhe); fnhe = next; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3ea465286a3..c7a4de05ca0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -806,8 +806,8 @@ EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { - struct inet_sock *inet = inet_sk(sk); - struct ip_options_rcu *inet_opt; + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 812e4447a22..f67e7023672 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1275,7 +1275,7 @@ static void rt_del(unsigned int hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } -static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, u8 prot, u32 mark, int flow_flags) @@ -1294,7 +1294,8 @@ static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, iph->daddr, iph->saddr, 0, 0); } -static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk) +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, + const struct sock *sk) { const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; @@ -1305,10 +1306,10 @@ static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct s __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); } -static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); - struct ip_options_rcu *inet_opt; + const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; rcu_read_lock(); @@ -1323,8 +1324,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) rcu_read_unlock(); } -static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk, - struct sk_buff *skb) +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct sk_buff *skb) { if (skb) build_skb_flow_key(fl4, skb, sk); -- cgit v1.2.3-70-g09d2 From f2bb4bedf35d5167a073dcdddf16543f351ef3ae Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 12:20:47 -0700 Subject: ipv4: Cache output routes in fib_info nexthops. If we have an output route that lacks nexthop exceptions, we can cache it in the FIB info nexthop. Such routes will have DST_HOST cleared because such routes refer to a family of destinations, rather than just one. The sequence of the handling of exceptions during route lookup is adjusted to make the logic work properly. Before we allocate the route, we lookup the exception. Then we know if we will cache this route or not, and therefore whether DST_HOST should be set on the allocated route. Then we use DST_HOST to key off whether we should store the resulting route, during rt_set_nexthop(), in the FIB nexthop cache. With help from Eric Dumazet. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 + net/ipv4/fib_semantics.c | 2 + net/ipv4/route.c | 140 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 101 insertions(+), 43 deletions(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 2daf096dfc6..fb62c590360 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -46,6 +46,7 @@ struct fib_config { }; struct fib_info; +struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; @@ -80,6 +81,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; + struct rtable *nh_rth_output; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2b57d768240..83d0f42b619 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -171,6 +171,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); + if (nexthop_nh->nh_rth_output) + dst_release(&nexthop_nh->nh_rth_output->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d52f7699c2f..8a0260010ea 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1158,8 +1158,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu; } -static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, - struct fib_info *fi) +static void rt_init_metrics(struct rtable *rt, struct fib_info *fi) { if (fi->fib_metrics != (u32 *) dst_default_metrics) { rt->fi = fi; @@ -1168,50 +1167,83 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, fi->fib_metrics, true); } -static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr) +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) { struct fnhe_hash_bucket *hash = nh->nh_exceptions; struct fib_nh_exception *fnhe; u32 hval; + if (!hash) + return NULL; + hval = fnhe_hashfun(daddr); -restart: for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { - __be32 fnhe_daddr, gw; - unsigned long expires; - unsigned int seq; - u32 pmtu; - - seq = read_seqbegin(&fnhe_seqlock); - fnhe_daddr = fnhe->fnhe_daddr; - gw = fnhe->fnhe_gw; - pmtu = fnhe->fnhe_pmtu; - expires = fnhe->fnhe_expires; - if (read_seqretry(&fnhe_seqlock, seq)) - goto restart; - if (daddr != fnhe_daddr) - continue; - if (pmtu) { - unsigned long diff = expires - jiffies; + if (fnhe->fnhe_daddr == daddr) + return fnhe; + } + return NULL; +} - if (time_before(jiffies, expires)) { - rt->rt_pmtu = pmtu; - dst_set_expires(&rt->dst, diff); - } - } - if (gw) { - rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = gw; +static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, + __be32 daddr) +{ + __be32 fnhe_daddr, gw; + unsigned long expires; + unsigned int seq; + u32 pmtu; + +restart: + seq = read_seqbegin(&fnhe_seqlock); + fnhe_daddr = fnhe->fnhe_daddr; + gw = fnhe->fnhe_gw; + pmtu = fnhe->fnhe_pmtu; + expires = fnhe->fnhe_expires; + if (read_seqretry(&fnhe_seqlock, seq)) + goto restart; + + if (daddr != fnhe_daddr) + return; + + if (pmtu) { + unsigned long diff = expires - jiffies; + + if (time_before(jiffies, expires)) { + rt->rt_pmtu = pmtu; + dst_set_expires(&rt->dst, diff); } - fnhe->fnhe_stamp = jiffies; - break; + } + if (gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = gw; + } + fnhe->fnhe_stamp = jiffies; +} + +static inline void rt_release_rcu(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_release(dst); +} + +static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ + struct rtable *orig, *prev, **p = &nh->nh_rth_output; + + orig = *p; + + prev = cmpxchg(p, orig, rt); + if (prev == orig) { + dst_clone(&rt->dst); + if (orig) + call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu); } } -static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, +static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, + struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag) { if (fi) { @@ -1219,12 +1251,15 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) rt->rt_gateway = nh->nh_gw; - if (unlikely(nh->nh_exceptions)) - rt_bind_exception(rt, nh, fl4->daddr); - rt_init_metrics(rt, fl4, fi); + if (unlikely(fnhe)) + rt_bind_exception(rt, fnhe, daddr); + rt_init_metrics(rt, fi); #ifdef CONFIG_IP_ROUTE_CLASSID - rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = nh->nh_tclassid; #endif + if (!(rt->dst.flags & DST_HOST) && + rt_is_output_route(rt)) + rt_cache_route(nh, rt); } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -1236,10 +1271,10 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, } static struct rtable *rt_dst_alloc(struct net_device *dev, - bool nopolicy, bool noxfrm) + bool nopolicy, bool noxfrm, bool will_cache) { return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - DST_HOST | DST_NOCACHE | + (will_cache ? 0 : DST_HOST) | DST_NOCACHE | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); } @@ -1276,7 +1311,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_err; } rth = rt_dst_alloc(dev_net(dev)->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1349,6 +1384,7 @@ static int __mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { + struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; @@ -1395,9 +1431,13 @@ static int __mkroute_input(struct sk_buff *skb, } } + fnhe = NULL; + if (res->fi) + fnhe = find_exception(&FIB_RES_NH(*res), daddr); + rth = rt_dst_alloc(out_dev->dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM)); + IN_DEV_CONF_GET(out_dev, NOXFRM), false); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -1416,7 +1456,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.input = ip_forward; rth->dst.output = ip_output; - rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); *result = rth; err = 0; @@ -1558,7 +1598,7 @@ brd_input: local_input: rth = rt_dst_alloc(net->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; @@ -1672,6 +1712,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, unsigned int flags) { struct fib_info *fi = res->fi; + struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; @@ -1710,9 +1751,22 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fi = NULL; } + fnhe = NULL; + if (fi) { + fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); + if (!fnhe) { + rth = FIB_RES_NH(*res).nh_rth_output; + if (rth && + rth->dst.obsolete == DST_OBSOLETE_FORCE_CHK) { + dst_use(&rth->dst, jiffies); + return rth; + } + } + } rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(in_dev, NOXFRM)); + IN_DEV_CONF_GET(in_dev, NOXFRM), + fi && !fnhe); if (!rth) return ERR_PTR(-ENOBUFS); @@ -1749,7 +1803,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, #endif } - rt_set_nexthop(rth, fl4, res, fi, type, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE) rth->dst.flags |= DST_NOCACHE; -- cgit v1.2.3-70-g09d2 From d2d68ba9fe8b38eb03124b3176a013bb8aa2b5e5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 12:58:50 -0700 Subject: ipv4: Cache input routes in fib_info nexthops. Caching input routes is slightly simpler than output routes, since we don't need to be concerned with nexthop exceptions. (locally destined, and routed packets, never trigger PMTU events or redirects that will be processed by us). However, we have to elide caching for the DIRECTSRC and non-zero itag cases. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + net/ipv4/fib_semantics.c | 2 ++ net/ipv4/route.c | 55 +++++++++++++++++++++++++++++++++++++----------- 3 files changed, 46 insertions(+), 12 deletions(-) (limited to 'net/ipv4/fib_semantics.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index fb62c590360..e69c3a47153 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -82,6 +82,7 @@ struct fib_nh { __be32 nh_saddr; int nh_saddr_genid; struct rtable *nh_rth_output; + struct rtable *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 83d0f42b619..e55171f184f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -173,6 +173,8 @@ static void free_fib_info_rcu(struct rcu_head *head) free_nh_exceptions(nexthop_nh); if (nexthop_nh->nh_rth_output) dst_release(&nexthop_nh->nh_rth_output->dst); + if (nexthop_nh->nh_rth_input) + dst_release(&nexthop_nh->nh_rth_input->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8a0260010ea..97cca8a03d9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1231,6 +1231,9 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p = &nh->nh_rth_output; + if (rt_is_input_route(rt)) + p = &nh->nh_rth_input; + orig = *p; prev = cmpxchg(p, orig, rt); @@ -1241,6 +1244,11 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) } } +static bool rt_cache_valid(struct rtable *rt) +{ + return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK); +} + static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, @@ -1257,8 +1265,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (!(rt->dst.flags & DST_HOST) && - rt_is_output_route(rt)) + if (!(rt->dst.flags & DST_HOST)) rt_cache_route(nh, rt); } @@ -1384,11 +1391,11 @@ static int __mkroute_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { - struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; unsigned int flags = 0; + bool do_cache; u32 itag; /* get a working reference to the output device */ @@ -1431,13 +1438,21 @@ static int __mkroute_input(struct sk_buff *skb, } } - fnhe = NULL; - if (res->fi) - fnhe = find_exception(&FIB_RES_NH(*res), daddr); + do_cache = false; + if (res->fi) { + if (!(flags & RTCF_DIRECTSRC) && !itag) { + rth = FIB_RES_NH(*res).nh_rth_input; + if (rt_cache_valid(rth)) { + dst_use(&rth->dst, jiffies); + goto out; + } + do_cache = true; + } + } rth = rt_dst_alloc(out_dev->dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM), false); + IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -1456,8 +1471,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.input = ip_forward; rth->dst.output = ip_output; - rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - + rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); +out: *result = rth; err = 0; cleanup: @@ -1509,6 +1524,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable *rth; int err = -EINVAL; struct net *net = dev_net(dev); + bool do_cache; /* IP on this device is disabled. */ @@ -1522,6 +1538,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; + res.fi = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1597,8 +1614,20 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: + do_cache = false; + if (res.fi) { + if (!(flags & RTCF_DIRECTSRC) && !itag) { + rth = FIB_RES_NH(res).nh_rth_input; + if (rt_cache_valid(rth)) { + dst_use(&rth->dst, jiffies); + goto set_and_out; + } + do_cache = true; + } + } + rth = rt_dst_alloc(net->loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; @@ -1622,6 +1651,9 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } + if (do_cache) + rt_cache_route(&FIB_RES_NH(res), rth); +set_and_out: skb_dst_set(skb, &rth->dst); err = 0; goto out; @@ -1756,8 +1788,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); if (!fnhe) { rth = FIB_RES_NH(*res).nh_rth_output; - if (rth && - rth->dst.obsolete == DST_OBSOLETE_FORCE_CHK) { + if (rt_cache_valid(rth)) { dst_use(&rth->dst, jiffies); return rth; } -- cgit v1.2.3-70-g09d2