From 35ebf65e851c6d9731abc6362b189858eb59f4d3 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Thu, 28 Jun 2012 03:59:11 -0700
Subject: ipv4: Create and use fib_compute_spec_dst() helper.

The specific destination is the host we direct unicast replies to.
Usually this is the original packet source address, but if we are
responding to a multicast or broadcast packet we have to use something
different.

Specifically we must use the source address we would use if we were to
send a packet to the unicast source of the original packet.

The routing cache precomputes this value, but we want to remove that
precomputation because it creates a hard dependency on the expensive
rpfilter source address validation which we'd like to make cheaper.

There are only three places where this matters:

1) ICMP replies.
2) pktinfo CMSG
3) IP options

Now there will be no real users of rt->rt_spec_dst and we can simply
remove it altogether.

Signed-off-by: David S. Miller
---
 net/ipv4/fib_frontend.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'net/ipv4/fib_frontend.c')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411fa37..451939b60c5 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -180,6 +180,35 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct fib_result res;
+	struct flowi4 fl4;
+	struct net *net;
+
+	if (skb->pkt_type != PACKET_BROADCAST &&
+	    skb->pkt_type != PACKET_MULTICAST)
+		return ip_hdr(skb)->daddr;
+
+	in_dev = __in_dev_get_rcu(dev);
+	BUG_ON(!in_dev);
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = 0;
+	fl4.daddr = ip_hdr(skb)->saddr;
+	fl4.saddr = ip_hdr(skb)->daddr;
+	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+
+	net = dev_net(dev);
+	if (!fib_lookup(net, &fl4, &res))
+		return FIB_RES_PREFSRC(net, res);
+	else
+		return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+}
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
-- 
cgit v1.2.3-70-g09d2

From 41347dcdd81988b8e60853257b2875285cc17a4e Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Thu, 28 Jun 2012 04:05:27 -0700
Subject: ipv4: Kill rt->rt_spec_dst, no longer used.

Signed-off-by: David S.
Miller --- include/net/ip_fib.h | 2 +- include/net/route.h | 1 - net/ipv4/fib_frontend.c | 9 ++------- net/ipv4/route.c | 38 +++++++++----------------------------- net/ipv4/xfrm4_policy.c | 1 - 5 files changed, 12 insertions(+), 39 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1687b3de54f..9e6c26d4ba4 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -233,7 +233,7 @@ extern void ip_fib_init(void); extern __be32 fib_compute_spec_dst(struct sk_buff *skb); extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, - __be32 *spec_dst, u32 *itag); + u32 *itag); extern void fib_select_default(struct fib_result *res); /* Exported by fib_semantics.c */ diff --git a/include/net/route.h b/include/net/route.h index 47eb25ac1f7..211e2665139 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -65,7 +65,6 @@ struct rtable { __be32 rt_gateway; /* Miscellaneous cached information */ - __be32 rt_spec_dst; /* RFC1122 specific destination */ u32 rt_peer_genid; unsigned long _peer; /* long-living peer info */ struct fib_info *fi; /* for client ref to shared metrics */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 451939b60c5..63b11ca54d9 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -218,8 +218,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) * called with rcu_read_lock() */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, - int oif, struct net_device *dev, __be32 *spec_dst, - u32 *itag) + int oif, struct net_device *dev, u32 *itag) { struct in_device *in_dev; struct flowi4 fl4; @@ -258,7 +257,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -287,17 +285,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, ret = 0; if (fib_lookup(net, &fl4, &res) == 0) { - if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(net, res); + if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; - } } return ret; last_resort: if (rpf) goto e_rpf; - *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 81533e3a23d..83d56a01662 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -440,7 +440,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) r->rt_key_tos, -1, HHUptod, - r->rt_spec_dst, &len); + 0, &len); seq_printf(seq, "%*s\n", 127 - len, ""); } @@ -1978,7 +1978,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { unsigned int hash; struct rtable *rth; - __be32 spec_dst; struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; int err; @@ -1999,10 +1998,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) goto e_inval; - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag); if (err < 0) goto e_err; } @@ -2029,7 +2026,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_oif = 0; rth->rt_mark = skb->mark; rth->rt_gateway = daddr; - rth->rt_spec_dst= 
spec_dst; rth->rt_peer_genid = 0; rt_init_peer(rth, dev_net(dev)->ipv4.peers); rth->fi = NULL; @@ -2093,7 +2089,6 @@ static int __mkroute_input(struct sk_buff *skb, int err; struct in_device *out_dev; unsigned int flags = 0; - __be32 spec_dst; u32 itag; /* get a working reference to the output device */ @@ -2105,7 +2100,7 @@ static int __mkroute_input(struct sk_buff *skb, err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2157,7 +2152,6 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_oif = 0; rth->rt_mark = skb->mark; rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; rth->rt_peer_genid = 0; rt_init_peer(rth, &res->table->tb_peers); rth->fi = NULL; @@ -2223,7 +2217,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 itag = 0; struct rtable *rth; unsigned int hash; - __be32 spec_dst; int err = -EINVAL; struct net *net = dev_net(dev); @@ -2281,12 +2274,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + dev, &itag); if (err < 0) goto martian_source_keep_err; if (err) flags |= RTCF_DIRECTSRC; - spec_dst = daddr; goto local_input; } @@ -2302,11 +2294,8 @@ brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ipv4_is_zeronet(saddr)) - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + if (!ipv4_is_zeronet(saddr)) { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2344,7 +2333,6 @@ local_input: rth->rt_oif = 0; rth->rt_mark = skb->mark; rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; rth->rt_peer_genid = 0; rt_init_peer(rth, net->ipv4.peers); rth->fi = NULL; @@ -2362,7 +2350,6 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; if (err == -ESRCH) err = -ENETUNREACH; @@ -2545,7 +2532,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_oif = orig_oif; rth->rt_mark = fl4->flowi4_mark; rth->rt_gateway = fl4->daddr; - rth->rt_spec_dst= fl4->saddr; rth->rt_peer_genid = 0; rt_init_peer(rth, (res->table ? 
&res->table->tb_peers : @@ -2554,12 +2540,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) { + if (flags & RTCF_LOCAL) rth->dst.input = ip_local_deliver; - rth->rt_spec_dst = fl4->daddr; - } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl4->saddr; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; @@ -2890,7 +2873,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_dst = ort->rt_dst; rt->rt_src = ort->rt_src; rt->rt_gateway = ort->rt_gateway; - rt->rt_spec_dst = ort->rt_spec_dst; rt_transfer_peer(rt, ort); rt->fi = ort->fi; if (rt->fi) @@ -2965,10 +2947,8 @@ static int rt_fill_info(struct net *net, nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif - if (rt_is_input_route(rt)) { - if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) - goto nla_put_failure; - } else if (rt->rt_src != rt->rt_key_src) { + if (!rt_is_input_route(rt) && + rt->rt_src != rt->rt_key_src) { if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) goto nla_put_failure; } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8855d826855..9815ea0bca7 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -100,7 +100,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_src = rt->rt_src; xdst->u.rt.rt_dst = rt->rt_dst; xdst->u.rt.rt_gateway = rt->rt_gateway; - xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; return 0; } -- cgit v1.2.3-70-g09d2 From a207a4b2e8067cbc7f33924e7f2c0fa4ef43b459 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 28 Jun 2012 18:33:24 -0700 Subject: ipv4: Fix bugs in fib_compute_spec_dst(). Based upon feedback from Julian Anastasov. 1) Use route flags to determine multicast/broadcast, not the packet flags. 2) Leave saddr unspecified in flow key. 3) Adjust how we invoke inet_select_addr(). Pass ip_hdr(skb)->saddr as second arg, and if it was zeronet use link scope. 4) Use loopback as input interface in flow key. Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 63b11ca54d9..1d13217e01f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -185,28 +185,36 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; + struct rtable *rt; struct flowi4 fl4; struct net *net; + int scope; - if (skb->pkt_type != PACKET_BROADCAST && - skb->pkt_type != PACKET_MULTICAST) + rt = skb_rtable(skb); + if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); BUG_ON(!in_dev); - fl4.flowi4_oif = 0; - fl4.flowi4_iif = 0; - fl4.daddr = ip_hdr(skb)->saddr; - fl4.saddr = ip_hdr(skb)->daddr; - fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); - fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? 
skb->mark : 0; net = dev_net(dev); - if (!fib_lookup(net, &fl4, &res)) - return FIB_RES_PREFSRC(net, res); - else - return inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + + scope = RT_SCOPE_UNIVERSE; + if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { + fl4.flowi4_oif = 0; + fl4.flowi4_iif = net->loopback_dev->ifindex; + fl4.daddr = ip_hdr(skb)->saddr; + fl4.saddr = 0; + fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_scope = scope; + fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; + if (!fib_lookup(net, &fl4, &res)) + return FIB_RES_PREFSRC(net, res); + } else { + scope = RT_SCOPE_LINK; + } + + return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } /* Given (packet source, input interface) and optional (dst, oif, tos): -- cgit v1.2.3-70-g09d2 From 9e56e3800ea42e78b7c816bdd2d87d047be80541 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 28 Jun 2012 18:54:02 -0700 Subject: ipv4: Adjust in_dev handling in fib_validate_source() Checking for in_dev being NULL is pointless. In fact, all of our callers have in_dev precomputed already, so just pass it in and remove the NULL checking. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- net/ipv4/fib_frontend.c | 27 ++++++++++----------------- net/ipv4/route.c | 10 ++++++---- 3 files changed, 17 insertions(+), 22 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9e6c26d4ba4..619f68a7185 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -233,7 +233,7 @@ extern void ip_fib_init(void); extern __be32 fib_compute_spec_dst(struct sk_buff *skb); extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, - u32 *itag); + struct in_device *idev, u32 *itag); extern void fib_select_default(struct fib_result *res); /* Exported by fib_semantics.c */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1d13217e01f..c84cff52021 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -226,15 +226,14 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) * called with rcu_read_lock() */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, - int oif, struct net_device *dev, u32 *itag) + int oif, struct net_device *dev, struct in_device *idev, + u32 *itag) { - struct in_device *in_dev; - struct flowi4 fl4; + int ret, no_addr, rpf, accept_local; struct fib_result res; - int no_addr, rpf, accept_local; - bool dev_match; - int ret; + struct flowi4 fl4; struct net *net; + bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_iif = oif; @@ -244,19 +243,13 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; no_addr = rpf = accept_local = 0; - in_dev = __in_dev_get_rcu(dev); - if (in_dev) { - no_addr = in_dev->ifa_list == NULL; - - /* Ignore rp_filter for packets protected by IPsec. */ - rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); + no_addr = idev->ifa_list == NULL; - accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); - fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - } + /* Ignore rp_filter for packets protected by IPsec. */ + rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - if (in_dev == NULL) - goto e_inval; + accept_local = IN_DEV_ACCEPT_LOCAL(idev); + fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? 
skb->mark : 0; net = dev_net(dev); if (fib_lookup(net, &fl4, &res)) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 83d56a01662..919d69e60ba 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1999,7 +1999,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (!ipv4_is_local_multicast(daddr)) goto e_inval; } else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) goto e_err; } @@ -2100,7 +2101,7 @@ static int __mkroute_input(struct sk_buff *skb, err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &itag); + in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2274,7 +2275,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &itag); + dev, in_dev, &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2295,7 +2296,8 @@ brd_input: goto e_inval; if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &itag); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) goto martian_source_keep_err; if (err) -- cgit v1.2.3-70-g09d2 From 7a9bc9b81a5bc6e44ebc80ef781332e4385083f2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Jun 2012 01:32:45 -0700 Subject: ipv4: Elide fib_validate_source() completely when possible. If rpfilter is off (or the SKB has an IPSEC path) and there are not tclassid users, we don't have to do anything at all when fib_validate_source() is invoked besides setting the itag to zero. We monitor tclassid uses with a counter (modified only under RTNL and marked __read_mostly) and we protect the fib_validate_source() real work with a test against this counter and whether rpfilter is to be done. Having a way to know whether we need no tclassid processing or not also opens the door for future optimized rpfilter algorithms that do not perform full FIB lookups. Signed-off-by: David S. 
Miller --- include/net/fib_rules.h | 1 + include/net/ip_fib.h | 5 +++++ net/core/fib_rules.c | 4 ++++ net/ipv4/fib_frontend.c | 32 ++++++++++++++++++++++++-------- net/ipv4/fib_rules.c | 16 +++++++++++++++- net/ipv4/fib_semantics.c | 10 ++++++++++ 6 files changed, 59 insertions(+), 9 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 075f1e3a0fe..e361f488242 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -52,6 +52,7 @@ struct fib_rules_ops { struct sk_buff *, struct fib_rule_hdr *, struct nlattr **); + void (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 619f68a7185..3dc7c96bbea 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -235,6 +235,11 @@ extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); extern void fib_select_default(struct fib_result *res); +#ifdef CONFIG_IP_ROUTE_CLASSID +extern int fib_num_tclassid_users; +#else +#define fib_num_tclassid_users 0 +#endif /* Exported by fib_semantics.c */ extern int ip_fib_check_default(__be32 gw, struct net_device *dev); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 72cceb79d0d..ab7db83236c 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -151,6 +151,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); } } @@ -499,6 +501,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c84cff52021..ae528d1b293 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +218,10 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } +#ifdef CONFIG_IP_ROUTE_CLASSID +int fib_num_tclassid_users __read_mostly; +#endif + /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -225,11 +230,11 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, - int oif, struct net_device *dev, struct in_device *idev, - u32 *itag) +static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + int rpf, struct in_device *idev, u32 *itag) { - int ret, no_addr, rpf, accept_local; + int ret, no_addr, accept_local; struct fib_result res; struct flowi4 fl4; struct net *net; @@ -242,12 +247,9 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - no_addr = rpf = accept_local = 0; + no_addr = accept_local = 0; no_addr = idev->ifa_list == NULL; - /* Ignore rp_filter for packets protected by IPsec. */ - rpf = secpath_exists(skb) ? 
0 : IN_DEV_RPFILTER(idev); - accept_local = IN_DEV_ACCEPT_LOCAL(idev); fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; @@ -303,6 +305,20 @@ e_rpf: return -EXDEV; } +/* Ignore rp_filter for packets protected by IPsec. */ +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, + u8 tos, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + + if (!r && !fib_num_tclassid_users) { + *itag = 0; + return 0; + } + return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); +} + static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2d043f71ef7..b23fd952c84 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -169,8 +169,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dst = nla_get_be32(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID - if (tb[FRA_FLOW]) + if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); + if (rule4->tclassid) + fib_num_tclassid_users++; + } #endif rule4->src_len = frh->src_len; @@ -184,6 +187,16 @@ errout: return err; } +static void fib4_rule_delete(struct fib_rule *rule) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (rule4->tclassid) + fib_num_tclassid_users--; +#endif +} + static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { .action = fib4_rule_action, .match = fib4_rule_match, .configure = fib4_rule_configure, + .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .default_pref = fib_default_rule_pref, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 415f8230fc8..c46c20b6b0b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -163,6 +163,12 @@ void free_fib_info(struct fib_info *fi) return; } fib_info_cnt--; +#ifdef CONFIG_IP_ROUTE_CLASSID + change_nexthops(fi) { + if (nexthop_nh->nh_tclassid) + fib_num_tclassid_users--; + } endfor_nexthops(fi); +#endif call_rcu(&fi->rcu, free_fib_info_rcu); } @@ -421,6 +427,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; + if (nexthop_nh->nh_tclassid) + fib_num_tclassid_users++; #endif } @@ -815,6 +823,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; + if (nh->nh_tclassid) + fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; -- cgit v1.2.3-70-g09d2 From a31f2d17b331db970259e875b7223d3aba7e3821 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Jun 2012 06:15:21 +0000 Subject: netlink: add netlink_kernel_cfg parameter to netlink_kernel_create This patch adds the following structure: struct netlink_kernel_cfg { unsigned int groups; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; }; That can be passed to netlink_kernel_create to set optional configurations for netlink kernel sockets. I've populated this structure by looking for NULL and zero parameters at the existing code. The remaining parameters that always need to be set are still left in the original interface. 
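As a rough illustration (not taken from the patch itself), a typical caller of the new interface looks like the sketch below; the example_* names and the MY_PROTO unit are placeholders, while struct netlink_kernel_cfg and the new netlink_kernel_create() signature are the ones introduced here (needs linux/netlink.h, net/sock.h and net/net_namespace.h):

/* Hypothetical caller sketch: example_* and MY_PROTO are placeholders. */
static void example_nl_rcv(struct sk_buff *skb)
{
	/* handle messages received on this kernel netlink socket */
}

static struct sock *example_nlsk;

static int __init example_nl_init(void)
{
	struct netlink_kernel_cfg cfg = {
		.groups	= 1,		/* optional; values below 32 are raised to 32 */
		.input	= example_nl_rcv,	/* optional receive callback */
		/* .cb_mutex left NULL, as most callers converted below do */
	};

	/* MY_PROTO stands in for a real NETLINK_* protocol number */
	example_nlsk = netlink_kernel_create(&init_net, MY_PROTO,
					     THIS_MODULE, &cfg);
	return example_nlsk ? 0 : -ENOMEM;
}
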
That includes optional parameters for the netlink socket creation. This allows easy extensibility of this interface in the future. This patch also adapts all callers to use this new interface. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- crypto/crypto_user.c | 7 +++++-- drivers/connector/connector.c | 13 +++++++++---- drivers/infiniband/core/netlink.c | 7 +++++-- drivers/scsi/scsi_netlink.c | 7 +++++-- drivers/scsi/scsi_transport_iscsi.c | 9 ++++++--- drivers/staging/gdm72xx/netlink_k.c | 6 ++++-- include/linux/netlink.h | 15 ++++++++++----- kernel/audit.c | 7 +++++-- lib/kobject_uevent.c | 5 ++++- net/bridge/netfilter/ebt_ulog.c | 6 ++++-- net/core/rtnetlink.c | 9 +++++++-- net/core/sock_diag.c | 8 ++++++-- net/decnet/netfilter/dn_rtmsg.c | 8 +++++--- net/ipv4/fib_frontend.c | 7 +++++-- net/ipv4/netfilter/ipt_ULOG.c | 8 +++++--- net/netfilter/nfnetlink.c | 7 +++++-- net/netlink/af_netlink.c | 16 ++++++++++------ net/netlink/genetlink.c | 10 +++++++--- net/xfrm/xfrm_user.c | 7 +++++-- security/selinux/netlink.c | 6 +++++- 20 files changed, 117 insertions(+), 51 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index 5a37eadb4e5..ba2c611154a 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -496,9 +496,12 @@ static void crypto_netlink_rcv(struct sk_buff *skb) static int __init crypto_user_init(void) { + struct netlink_kernel_cfg cfg = { + .input = crypto_netlink_rcv, + }; + crypto_nlsk = netlink_kernel_create(&init_net, NETLINK_CRYPTO, - 0, crypto_netlink_rcv, - NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!crypto_nlsk) return -ENOMEM; diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 34e0e9e4d91..116cf8d0283 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -251,15 +251,20 @@ static const struct file_operations cn_file_ops = { .release = single_release }; +static struct cn_dev cdev = { + .input = cn_rx_skb, +}; + static int __devinit cn_init(void) { struct cn_dev *dev = &cdev; - - dev->input = cn_rx_skb; + struct netlink_kernel_cfg cfg = { + .groups = CN_NETLINK_USERS + 0xf, + .input = dev->input, + }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, - CN_NETLINK_USERS + 0xf, - dev->input, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!dev->nls) return -EIO; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 1e691dca182..3ae2bfd3101 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -173,8 +173,11 @@ static void ibnl_rcv(struct sk_buff *skb) int __init ibnl_init(void) { - nls = netlink_kernel_create(&init_net, NETLINK_RDMA, 0, ibnl_rcv, - NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = ibnl_rcv, + }; + + nls = netlink_kernel_create(&init_net, NETLINK_RDMA, THIS_MODULE, &cfg); if (!nls) { pr_warn("Failed to create netlink socket\n"); return -ENOMEM; diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index c77628afbf9..8818dd681c1 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -486,6 +486,10 @@ void scsi_netlink_init(void) { int error; + struct netlink_kernel_cfg cfg = { + .input = scsi_nl_rcv_msg, + .groups = SCSI_NL_GRP_CNT, + }; INIT_LIST_HEAD(&scsi_nl_drivers); @@ -497,8 +501,7 @@ scsi_netlink_init(void) } scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT, - SCSI_NL_GRP_CNT, scsi_nl_rcv_msg, NULL, - THIS_MODULE); + THIS_MODULE, &cfg); if (!scsi_nl_sock) 
{ printk(KERN_ERR "%s: register of receive handler failed\n", __func__); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 1cf640e575d..6042954d8f3 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2936,7 +2936,10 @@ EXPORT_SYMBOL_GPL(iscsi_unregister_transport); static __init int iscsi_transport_init(void) { int err; - + struct netlink_kernel_cfg cfg = { + .groups = 1, + .input = iscsi_if_rx, + }; printk(KERN_INFO "Loading iSCSI transport class v%s.\n", ISCSI_TRANSPORT_VERSION); @@ -2966,8 +2969,8 @@ static __init int iscsi_transport_init(void) if (err) goto unregister_conn_class; - nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, 1, iscsi_if_rx, - NULL, THIS_MODULE); + nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, + THIS_MODULE, &cfg); if (!nls) { err = -ENOBUFS; goto unregister_session_class; diff --git a/drivers/staging/gdm72xx/netlink_k.c b/drivers/staging/gdm72xx/netlink_k.c index 2489bb5597c..87c3a07ed80 100644 --- a/drivers/staging/gdm72xx/netlink_k.c +++ b/drivers/staging/gdm72xx/netlink_k.c @@ -88,13 +88,15 @@ struct sock *netlink_init(int unit, void (*cb)(struct net_device *dev, u16 type, void *msg, int len)) { struct sock *sock; + struct netlink_kernel_cfg cfg = { + .input = netlink_rcv, + }; #if !defined(DEFINE_MUTEX) init_MUTEX(&netlink_mutex); #endif - sock = netlink_kernel_create(&init_net, unit, 0, netlink_rcv, NULL, - THIS_MODULE); + sock = netlink_kernel_create(&init_net, unit, THIS_MODULE, &cfg); if (sock) rcv_cb = cb; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index ed33f0901bc..6085e4919cb 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -174,11 +174,16 @@ struct netlink_skb_parms { extern void netlink_table_grab(void); extern void netlink_table_ungrab(void); -extern struct sock *netlink_kernel_create(struct net *net, - int unit,unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, - struct module *module); +/* optional Netlink kernel configuration parameters */ +struct netlink_kernel_cfg { + unsigned int groups; + void (*input)(struct sk_buff *skb); + struct mutex *cb_mutex; +}; + +extern struct sock *netlink_kernel_create(struct net *net, int unit, + struct module *module, + struct netlink_kernel_cfg *cfg); extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/kernel/audit.c b/kernel/audit.c index 30b252a1fb6..4a3f28d2ca6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -962,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) static int __init audit_init(void) { int i; + struct netlink_kernel_cfg cfg = { + .input = audit_receive, + }; if (audit_initialized == AUDIT_DISABLED) return 0; printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? 
"enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, + THIS_MODULE, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 1a91efa6d12..0401d2916d9 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -373,13 +373,16 @@ EXPORT_SYMBOL_GPL(add_uevent_var); static int uevent_net_init(struct net *net) { struct uevent_sock *ue_sk; + struct netlink_kernel_cfg cfg = { + .groups = 1, + }; ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL); if (!ue_sk) return -ENOMEM; ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, - 1, NULL, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (!ue_sk->sk) { printk(KERN_ERR "kobject_uevent: unable to create netlink socket!\n"); diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 1bd173218f7..374bdcd7703 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -282,6 +282,9 @@ static int __init ebt_ulog_init(void) { int ret; int i; + struct netlink_kernel_cfg cfg = { + .groups = EBT_ULOG_MAXNLGROUPS, + }; if (nlbufsiz >= 128*1024) { pr_warning("Netlink buffer has to be <= 128kB," @@ -296,8 +299,7 @@ static int __init ebt_ulog_init(void) } ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, - EBT_ULOG_MAXNLGROUPS, NULL, NULL, - THIS_MODULE); + THIS_MODULE, &cfg); if (!ebtulognl) ret = -ENOMEM; else if ((ret = xt_register_target(&ebt_ulog_tg_reg)) != 0) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc8a1cdaac9..2b325c340b4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2353,8 +2353,13 @@ static struct notifier_block rtnetlink_dev_notifier = { static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .groups = RTNLGRP_MAX, + .input = rtnetlink_rcv, + .cb_mutex = &rtnl_mutex, + }; + + sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index ff2967acbfa..07a29eb34a4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -171,8 +171,12 @@ EXPORT_SYMBOL_GPL(sock_diag_nlsk); static int __init sock_diag_init(void) { - sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0, - sock_diag_rcv, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = sock_diag_rcv, + }; + + sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, + THIS_MODULE, &cfg); return sock_diag_nlsk == NULL ? 
-ENOMEM : 0; } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index b8f7f5b8c35..11db0ecf342 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -125,11 +125,13 @@ static struct nf_hook_ops dnrmg_ops __read_mostly = { static int __init dn_rtmsg_init(void) { int rv = 0; + struct netlink_kernel_cfg cfg = { + .groups = DNRNG_NLGRP_MAX, + .input = dnrmg_receive_user_skb, + }; dnrmg = netlink_kernel_create(&init_net, - NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, - dnrmg_receive_user_skb, - NULL, THIS_MODULE); + NETLINK_DNRTMSG, THIS_MODULE, &cfg); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ae528d1b293..3e11ea225da 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -976,8 +976,11 @@ static void nl_fib_input(struct sk_buff *skb) static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, - nl_fib_input, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = nl_fib_input, + }; + + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg); if (sk == NULL) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 99b3f53f16a..1109f7f6c25 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -381,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = { static int __init ulog_tg_init(void) { int ret, i; + struct netlink_kernel_cfg cfg = { + .groups = ULOG_MAXNLGROUPS, + }; pr_debug("init module\n"); @@ -393,9 +396,8 @@ static int __init ulog_tg_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(&init_net, - NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, - NULL, THIS_MODULE); + nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, + THIS_MODULE, &cfg); if (!nflognl) return -ENOMEM; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 3e797d1fcb9..700e4616a09 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -203,9 +203,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) static int __net_init nfnetlink_net_init(struct net *net) { struct sock *nfnl; + struct netlink_kernel_cfg cfg = { + .groups = NFNLGRP_MAX, + .input = nfnetlink_rcv, + }; - nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, - nfnetlink_rcv, NULL, THIS_MODULE); + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, THIS_MODULE, &cfg); if (!nfnl) return -ENOMEM; net->nfnl_stash = nfnl; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b3025a603d5..43a124feaad 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1503,14 +1503,16 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(struct net *net, int unit, unsigned int groups, - void (*input)(struct sk_buff *skb), - struct mutex *cb_mutex, struct module *module) +netlink_kernel_create(struct net *net, int unit, + struct module *module, + struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; + struct mutex *cb_mutex = cfg ? 
cfg->cb_mutex : NULL; + unsigned int groups; BUG_ON(!nl_table); @@ -1532,16 +1534,18 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, sk = sock->sk; sk_change_net(sk, net); - if (groups < 32) + if (!cfg || cfg->groups < 32) groups = 32; + else + groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; - if (input) - nlk_sk(sk)->netlink_rcv = input; + if (cfg && cfg->input) + nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, net, 0)) goto out_sock_release; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2cc7c1ee769..32761b53015 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -915,10 +915,14 @@ static struct genl_multicast_group notify_grp = { static int __net_init genl_pernet_init(struct net *net) { + struct netlink_kernel_cfg cfg = { + .input = genl_rcv, + .cb_mutex = &genl_mutex, + }; + /* we'll bump the group number right afterwards */ - net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0, - genl_rcv, &genl_mutex, - THIS_MODULE); + net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, + THIS_MODULE, &cfg); if (!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 540762726aa..e75d8e47f35 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2959,9 +2959,12 @@ static struct xfrm_mgr netlink_mgr = { static int __net_init xfrm_user_net_init(struct net *net) { struct sock *nlsk; + struct netlink_kernel_cfg cfg = { + .groups = XFRMNLGRP_MAX, + .input = xfrm_netlink_rcv, + }; - nlsk = netlink_kernel_create(net, NETLINK_XFRM, XFRMNLGRP_MAX, - xfrm_netlink_rcv, NULL, THIS_MODULE); + nlsk = netlink_kernel_create(net, NETLINK_XFRM, THIS_MODULE, &cfg); if (nlsk == NULL) return -ENOMEM; net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */ diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 8a23a35b9c5..8a77725423e 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -111,8 +111,12 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { + struct netlink_kernel_cfg cfg = { + .groups = SELNLGRP_MAX, + }; + selnl = netlink_kernel_create(&init_net, NETLINK_SELINUX, - SELNLGRP_MAX, NULL, NULL, THIS_MODULE); + THIS_MODULE, &cfg); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); -- cgit v1.2.3-70-g09d2 From f4530fa574df4d833506c53697ed1daa0d390bf4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 5 Jul 2012 22:13:13 -0700 Subject: ipv4: Avoid overhead when no custom FIB rules are installed. If the user hasn't actually installed any custom rules, or fiddled with the default ones, don't go through the whole FIB rules layer. It's just pure overhead. Instead do what we do with CONFIG_IP_MULTIPLE_TABLES disabled, check the individual tables by hand, one by one. Also, move fib_num_tclassid_users into the ipv4 network namespace. Signed-off-by: David S. 
Miller --- include/net/ip_fib.h | 36 ++++++++++++++++++++++++++++++++---- include/net/netns/ipv4.h | 8 ++++++++ net/ipv4/fib_frontend.c | 27 ++++++++++++++++++++++----- net/ipv4/fib_rules.c | 12 ++++++++---- net/ipv4/fib_semantics.c | 6 +++--- 5 files changed, 73 insertions(+), 16 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 3dc7c96bbea..539c6721f81 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -220,11 +220,33 @@ extern void __net_exit fib4_rules_exit(struct net *net); extern u32 fib_rules_tclass(const struct fib_result *res); #endif -extern int fib_lookup(struct net *n, struct flowi4 *flp, struct fib_result *res); - extern struct fib_table *fib_new_table(struct net *net, u32 id); extern struct fib_table *fib_get_table(struct net *net, u32 id); +extern int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res); + +static inline int fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res) +{ + if (!net->ipv4.fib_has_custom_rules) { + if (net->ipv4.fib_local && + !fib_table_lookup(net->ipv4.fib_local, flp, res, + FIB_LOOKUP_NOREF)) + return 0; + if (net->ipv4.fib_main && + !fib_table_lookup(net->ipv4.fib_main, flp, res, + FIB_LOOKUP_NOREF)) + return 0; + if (net->ipv4.fib_default && + !fib_table_lookup(net->ipv4.fib_default, flp, res, + FIB_LOOKUP_NOREF)) + return 0; + return -ENETUNREACH; + } + return __fib_lookup(net, flp, res); +} + #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ @@ -236,9 +258,15 @@ extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct in_device *idev, u32 *itag); extern void fib_select_default(struct fib_result *res); #ifdef CONFIG_IP_ROUTE_CLASSID -extern int fib_num_tclassid_users; +static inline int fib_num_tclassid_users(struct net *net) +{ + return net->ipv4.fib_num_tclassid_users; +} #else -#define fib_num_tclassid_users 0 +static inline int fib_num_tclassid_users(struct net *net) +{ + return 0; +} #endif /* Exported by fib_semantics.c */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 227f0cd9d3f..599e48fa97c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -11,6 +11,7 @@ struct ctl_table_header; struct ipv4_devconf; struct fib_rules_ops; struct hlist_head; +struct fib_table; struct sock; struct netns_ipv4 { @@ -24,6 +25,13 @@ struct netns_ipv4 { struct ipv4_devconf *devconf_dflt; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; + bool fib_has_custom_rules; + struct fib_table *fib_local; + struct fib_table *fib_main; + struct fib_table *fib_default; +#endif +#ifdef CONFIG_IP_ROUTE_CLASSID + int fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; struct sock *fibnl; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3e11ea225da..81f85716a89 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -86,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id) tb = fib_trie_table(id); if (!tb) return NULL; + + switch (id) { + case RT_TABLE_LOCAL: + net->ipv4.fib_local = tb; + break; + + case RT_TABLE_MAIN: + net->ipv4.fib_main = tb; + break; + + case RT_TABLE_DEFAULT: + net->ipv4.fib_default = tb; + break; + + default: + break; + } + h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; @@ -218,10 +236,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } -#ifdef 
CONFIG_IP_ROUTE_CLASSID -int fib_num_tclassid_users __read_mostly; -#endif - /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. @@ -312,7 +326,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - if (!r && !fib_num_tclassid_users) { + if (!r && !fib_num_tclassid_users(dev_net(dev))) { *itag = 0; return 0; } @@ -1134,6 +1148,9 @@ static int __net_init fib_net_init(struct net *net) { int error; +#ifdef CONFIG_IP_ROUTE_CLASSID + net->ipv4.fib_num_tclassid_users = 0; +#endif error = ip_fib_net_init(net); if (error < 0) goto out; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b23fd952c84..c06da93b0b7 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -54,7 +54,7 @@ u32 fib_rules_tclass(const struct fib_result *res) } #endif -int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, @@ -67,7 +67,7 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) return err; } -EXPORT_SYMBOL_GPL(fib_lookup); +EXPORT_SYMBOL_GPL(__fib_lookup); static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) @@ -172,7 +172,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); if (rule4->tclassid) - fib_num_tclassid_users++; + net->ipv4.fib_num_tclassid_users++; } #endif @@ -182,6 +182,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->dstmask = inet_make_mask(rule4->dst_len); rule4->tos = frh->tos; + net->ipv4.fib_has_custom_rules = true; err = 0; errout: return err; @@ -189,12 +190,14 @@ errout: static void fib4_rule_delete(struct fib_rule *rule) { + struct net *net = rule->fr_net; #ifdef CONFIG_IP_ROUTE_CLASSID struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (rule4->tclassid) - fib_num_tclassid_users--; + net->ipv4.fib_num_tclassid_users--; #endif + net->ipv4.fib_has_custom_rules = true; } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, @@ -309,6 +312,7 @@ int __net_init fib4_rules_init(struct net *net) if (err < 0) goto fail; net->ipv4.rules_ops = ops; + net->ipv4.fib_has_custom_rules = false; return 0; fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c46c20b6b0b..ae301c897a1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -166,7 +166,7 @@ void free_fib_info(struct fib_info *fi) #ifdef CONFIG_IP_ROUTE_CLASSID change_nexthops(fi) { if (nexthop_nh->nh_tclassid) - fib_num_tclassid_users--; + fi->fib_net->ipv4.fib_num_tclassid_users--; } endfor_nexthops(fi); #endif call_rcu(&fi->rcu, free_fib_info_rcu); @@ -428,7 +428,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? 
nla_get_u32(nla) : 0; if (nexthop_nh->nh_tclassid) - fib_num_tclassid_users++; + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif } @@ -824,7 +824,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) - fib_num_tclassid_users++; + fi->fib_net->ipv4.fib_num_tclassid_users++; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; -- cgit v1.2.3-70-g09d2 From 85b91b0339e764f7e56ff5968fa10d85451378b4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 13 Jul 2012 08:21:29 -0700 Subject: ipv4: Don't store a rule pointer in fib_result. We only use it to fetch the rule's tclassid, so just store the tclassid there instead. This also decreases the size of fib_result by a full 8 bytes on 64-bit. On 32-bits it's a wash. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 12 +++--------- net/ipv4/fib_frontend.c | 8 -------- net/ipv4/fib_rules.c | 15 ++++++--------- net/ipv4/route.c | 6 ++---- 4 files changed, 11 insertions(+), 30 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e91fedd22db..5697acefeba 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -106,12 +106,10 @@ struct fib_result { unsigned char nh_sel; unsigned char type; unsigned char scope; + u32 tclassid; struct fib_info *fi; struct fib_table *table; struct list_head *fa_head; -#ifdef CONFIG_IP_MULTIPLE_TABLES - struct fib_rule *r; -#endif }; struct fib_result_nl { @@ -215,10 +213,6 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, extern int __net_init fib4_rules_init(struct net *net); extern void __net_exit fib4_rules_exit(struct net *net); -#ifdef CONFIG_IP_ROUTE_CLASSID -extern u32 fib_rules_tclass(const struct fib_result *res); -#endif - extern struct fib_table *fib_new_table(struct net *net, u32 id); extern struct fib_table *fib_get_table(struct net *net, u32 id); @@ -229,7 +223,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { if (!net->ipv4.fib_has_custom_rules) { - res->r = NULL; + res->tclassid = 0; if (net->ipv4.fib_local && !fib_table_lookup(net->ipv4.fib_local, flp, res, FIB_LOOKUP_NOREF)) @@ -289,7 +283,7 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) #endif *itag = FIB_RES_NH(*res).nh_tclassid<<16; #ifdef CONFIG_IP_MULTIPLE_TABLES - rtag = fib_rules_tclass(res); + rtag = res->tclassid; if (*itag == 0) *itag = (rtag<<16); *itag |= (rtag>>16); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 81f85716a89..7a31194ec63 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -169,10 +169,6 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, if (ipv4_is_multicast(addr)) return RTN_MULTICAST; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; @@ -934,10 +930,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) .flowi4_scope = frn->fl_scope, }; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - frn->err = -ENOENT; if (tb) { local_bh_disable(); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index c06da93b0b7..a83d74e498d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,13 +47,6 @@ struct fib4_rule { #endif }; -#ifdef CONFIG_IP_ROUTE_CLASSID -u32 fib_rules_tclass(const struct fib_result *res) -{ - return res->r ? 
((struct fib4_rule *) res->r)->tclassid : 0; -} -#endif - int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { @@ -63,8 +56,12 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) int err; err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); - res->r = arg.rule; - +#ifdef CONFIG_IP_ROUTE_CLASSID + if (arg.rule) + res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; + else + res->tclassid = 0; +#endif return err; } EXPORT_SYMBOL_GPL(__fib_lookup); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9319bf1f835..aad21819316 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1735,7 +1735,7 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES - set_class_tag(rt, fib_rules_tclass(res)); + set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif @@ -2353,11 +2353,9 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) __be32 orig_saddr; int orig_oif; + res.tclassid = 0; res.fi = NULL; res.table = NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif orig_daddr = fl4->daddr; orig_saddr = fl4->saddr; -- cgit v1.2.3-70-g09d2 From 0cc535a29916c6a0e6e6af0f3d42c2fe3b0b145d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 18 Jul 2012 21:35:03 +0000 Subject: ipv4: fix address selection in fib_compute_spec_dst ip_options_compile can be called for forwarded packets, make sure the specific-destionation address is a local one as specified in RFC 1812, 4.2.2.2 Addresses in Options Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7a31194ec63..b83203658ee 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -206,7 +206,8 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) int scope; rt = skb_rtable(skb); - if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) + if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == + RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); -- cgit v1.2.3-70-g09d2 From 89aef8921bfbac22f00e04f8450f6e447db13e42 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 11:00:09 -0700 Subject: ipv4: Delete routing cache. The ipv4 routing cache is non-deterministic, performance wise, and is subject to reasonably easy to launch denial of service attacks. The routing cache works great for well behaved traffic, and the world was a much friendlier place when the tradeoffs that led to the routing cache's design were considered. What it boils down to is that the performance of the routing cache is a product of the traffic patterns seen by a system rather than being a product of the contents of the routing tables. The former of which is controllable by external entitites. Even for "well behaved" legitimate traffic, high volume sites can see hit rates in the routing cache of only ~%10. Signed-off-by: David S. 
Miller --- include/net/route.h | 1 - net/ipv4/fib_frontend.c | 5 - net/ipv4/route.c | 940 +----------------------------------------------- 3 files changed, 13 insertions(+), 933 deletions(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/include/net/route.h b/include/net/route.h index ace3cb44251..5dcfeb621e0 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); extern void rt_cache_flush(struct net *net, int how); -extern void rt_cache_flush_batch(struct net *net); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b83203658ee..f277cf0e632 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1072,11 +1072,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(dev_net(dev), 0); break; case NETDEV_UNREGISTER_BATCH: - /* The batch unregister is only called on the first - * device in the list of devices being unregistered. - * Therefore we should not pass dev_net(dev) in here. - */ - rt_cache_flush_batch(NULL); break; } return NOTIFY_DONE; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d547f6fae20..6d6146d31f2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int rt_chain_length_max __read_mostly = 20; - -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; /* * Interface to generic destination cache. @@ -152,7 +148,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt_garbage_collect(struct dst_ops *ops); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) @@ -172,7 +167,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), - .gc = rt_garbage_collect, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, @@ -209,184 +203,30 @@ const __u8 ip_tos2prio[16] = { }; EXPORT_SYMBOL(ip_tos2prio); -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. - * 2) Only writers remove entries, and they hold the lock - * as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - * they do so with atomic increments and with the - * lock held. - */ - -struct rt_hash_bucket { - struct rtable __rcu *chain; -}; - -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. 
- * (on lockdep we have a quite big spinlock_t, so keep the size down there) - */ -#ifdef CONFIG_LOCKDEP -# define RT_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define RT_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define RT_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define RT_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define RT_HASH_LOCK_SZ 512 -# else -# define RT_HASH_LOCK_SZ 256 -# endif -#endif - -static spinlock_t *rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] - -static __init void rt_hash_lock_init(void) -{ - int i; - - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, - GFP_KERNEL); - if (!rt_hash_locks) - panic("IP: failed to allocate rt_hash_locks\n"); - - for (i = 0; i < RT_HASH_LOCK_SZ; i++) - spin_lock_init(&rt_hash_locks[i]); -} -#else -# define rt_hash_lock_addr(slot) NULL - -static inline void rt_hash_lock_init(void) -{ -} -#endif - -static struct rt_hash_bucket *rt_hash_table __read_mostly; -static unsigned int rt_hash_mask __read_mostly; -static unsigned int rt_hash_log __read_mostly; - static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) -{ - return jhash_3words((__force u32)daddr, (__force u32)saddr, - idx, genid) - & rt_hash_mask; -} - static inline int rt_genid(struct net *net) { return atomic_read(&net->ipv4.rt_genid); } #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { - struct seq_net_private p; - int bucket; - int genid; -}; - -static struct rtable *rt_cache_get_first(struct seq_file *seq) -{ - struct rt_cache_iter_state *st = seq->private; - struct rtable *r = NULL; - - for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rcu_access_pointer(rt_hash_table[st->bucket].chain)) - continue; - rcu_read_lock_bh(); - r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); - while (r) { - if (dev_net(r->dst.dev) == seq_file_net(seq) && - r->rt_genid == st->genid) - return r; - r = rcu_dereference_bh(r->dst.rt_next); - } - rcu_read_unlock_bh(); - } - return r; -} - -static struct rtable *__rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - - r = rcu_dereference_bh(r->dst.rt_next); - while (!r) { - rcu_read_unlock_bh(); - do { - if (--st->bucket < 0) - return NULL; - } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain)); - rcu_read_lock_bh(); - r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); - } - return r; -} - -static struct rtable *rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - while ((r = __rt_cache_get_next(seq, r)) != NULL) { - if (dev_net(r->dst.dev) != seq_file_net(seq)) - continue; - if (r->rt_genid == st->genid) - break; - } - return r; -} - -static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) -{ - struct rtable *r = rt_cache_get_first(seq); - - if (r) - while (pos && (r = rt_cache_get_next(seq, r))) - --pos; - return pos ? 
NULL : r; -} - static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { - struct rt_cache_iter_state *st = seq->private; if (*pos) - return rt_cache_get_idx(seq, *pos - 1); - st->genid = rt_genid(seq_file_net(seq)); + return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct rtable *r; - - if (v == SEQ_START_TOKEN) - r = rt_cache_get_first(seq); - else - r = rt_cache_get_next(seq, v); ++*pos; - return r; + return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { - if (v && v != SEQ_START_TOKEN) - rcu_read_unlock_bh(); } static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -396,24 +236,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); - else { - struct rtable *r = v; - int len; - - seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" - "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->dst.dev ? r->dst.dev->name : "*", - (__force u32)r->rt_dst, - (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->dst.__refcnt), - r->dst.__use, 0, (__force u32)r->rt_src, - dst_metric_advmss(&r->dst) + 40, - dst_metric(&r->dst, RTAX_WINDOW), 0, - r->rt_key_tos, - -1, 0, 0, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); - } return 0; } @@ -426,8 +248,7 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &rt_cache_seq_ops, - sizeof(struct rt_cache_iter_state)); + return seq_open(file, &rt_cache_seq_ops); } static const struct file_operations rt_cache_seq_fops = { @@ -435,7 +256,7 @@ static const struct file_operations rt_cache_seq_fops = { .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_net, + .release = seq_release, }; @@ -625,262 +446,11 @@ static inline int ip_rt_proc_init(void) } #endif /* CONFIG_PROC_FS */ -static inline void rt_free(struct rtable *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - -static inline void rt_drop(struct rtable *rt) -{ - ip_rt_put(rt); - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - -static inline int rt_fast_clean(struct rtable *rth) -{ - /* Kill broadcast/multicast entries very aggresively, if they - collide in hash table with more useful entries */ - return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rt_is_input_route(rth) && rth->dst.rt_next; -} - -static inline int rt_valuable(struct rtable *rth) -{ - return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->dst.expires; -} - -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) -{ - unsigned long age; - int ret = 0; - - if (atomic_read(&rth->dst.__refcnt)) - goto out; - - age = jiffies - rth->dst.lastuse; - if ((age <= tmo1 && !rt_fast_clean(rth)) || - (age <= tmo2 && rt_valuable(rth))) - goto out; - ret = 1; -out: return ret; -} - -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter - */ -static inline u32 rt_score(struct rtable *rt) -{ - u32 score = jiffies - rt->dst.lastuse; - - score = ~score & ~(3<<30); - - if (rt_valuable(rt)) - score |= (1<<31); - - if (rt_is_output_route(rt) || - !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) - score |= (1<<30); - - return score; -} - -static inline bool rt_caching(const struct net *net) -{ - return net->ipv4.current_rt_cache_rebuild_count <= - 
net->ipv4.sysctl_rt_cache_rebuild_count; -} - -static inline bool compare_hash_inputs(const struct rtable *rt1, - const struct rtable *rt2) -{ - return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | - ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0); -} - -static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) -{ - return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | - ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | - (rt1->rt_mark ^ rt2->rt_mark) | - (rt1->rt_key_tos ^ rt2->rt_key_tos) | - (rt1->rt_route_iif ^ rt2->rt_route_iif) | - (rt1->rt_oif ^ rt2->rt_oif)) == 0; -} - -static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) -{ - return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); -} - static inline int rt_is_expired(struct rtable *rth) { return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } -/* - * Perform a full scan of hash table and free all entries. - * Can be called by a softirq or a process. - * In the later case, we want to be reschedule if necessary - */ -static void rt_do_flush(struct net *net, int process_context) -{ - unsigned int i; - struct rtable *rth, *next; - - for (i = 0; i <= rt_hash_mask; i++) { - struct rtable __rcu **pprev; - struct rtable *list; - - if (process_context && need_resched()) - cond_resched(); - rth = rcu_access_pointer(rt_hash_table[i].chain); - if (!rth) - continue; - - spin_lock_bh(rt_hash_lock_addr(i)); - - list = NULL; - pprev = &rt_hash_table[i].chain; - rth = rcu_dereference_protected(*pprev, - lockdep_is_held(rt_hash_lock_addr(i))); - - while (rth) { - next = rcu_dereference_protected(rth->dst.rt_next, - lockdep_is_held(rt_hash_lock_addr(i))); - - if (!net || - net_eq(dev_net(rth->dst.dev), net)) { - rcu_assign_pointer(*pprev, next); - rcu_assign_pointer(rth->dst.rt_next, list); - list = rth; - } else { - pprev = &rth->dst.rt_next; - } - rth = next; - } - - spin_unlock_bh(rt_hash_lock_addr(i)); - - for (; list; list = next) { - next = rcu_dereference_protected(list->dst.rt_next, 1); - rt_free(list); - } - } -} - -/* - * While freeing expired entries, we compute average chain length - * and standard deviation, using fixed-point arithmetic. - * This to have an estimation of rt_chain_length_max - * rt_chain_length_max = max(elasticity, AVG + 4*SD) - * We use 3 bits for frational part, and 29 (or 61) for magnitude. - */ - -#define FRACT_BITS 3 -#define ONE (1UL << FRACT_BITS) - -/* - * Given a hash chain and an item in this hash chain, - * find if a previous entry has the same hash_inputs - * (but differs on tos, mark or oif) - * Returns 0 if an alias is found. - * Returns ONE if rth has no alias before itself. 
- */ -static int has_noalias(const struct rtable *head, const struct rtable *rth) -{ - const struct rtable *aux = head; - - while (aux != rth) { - if (compare_hash_inputs(aux, rth)) - return 0; - aux = rcu_dereference_protected(aux->dst.rt_next, 1); - } - return ONE; -} - -static void rt_check_expire(void) -{ - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth) || - rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - - /* We only count entries on a chain with equal - * hash inputs once so that entries for - * different QOS levels, and other non-hash - * input attributes don't unfairly skew the - * length computation - */ - tmo >>= 1; - rthp = &rth->dst.rt_next; - length += has_noalias(rt_hash_table[i].chain, rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} - -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - /* * Perturbation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -902,167 +472,6 @@ static void rt_cache_invalidate(struct net *net) void rt_cache_flush(struct net *net, int delay) { rt_cache_invalidate(net); - if (delay >= 0) - rt_do_flush(net, !in_softirq()); -} - -/* Flush previous cache invalidated entries from the cache */ -void rt_cache_flush_batch(struct net *net) -{ - rt_do_flush(net, !in_softirq()); -} - -static void rt_emergency_hash_rebuild(struct net *net) -{ - net_warn_ratelimited("Route hash chain too long!\n"); - rt_cache_invalidate(net); -} - -/* - Short description of GC goals. - - We want to build algorithm, which will keep routing cache - at some equilibrium point, when number of aged off entries - is kept approximately equal to newly generated ones. - - Current expiration strength is variable "expire". - We try to adjust it dynamically, so that if networking - is idle expires is large enough to keep enough of warm entries, - and when load increases it reduces to limit cache size. 
- */ - -static int rt_garbage_collect(struct dst_ops *ops) -{ - static unsigned long expire = RT_GC_TIMEOUT; - static unsigned long last_gc; - static int rover; - static int equilibrium; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long now = jiffies; - int goal; - int entries = dst_entries_get_fast(&ipv4_dst_ops); - - /* - * Garbage collection is pretty expensive, - * do not make it too frequently. - */ - - RT_CACHE_STAT_INC(gc_total); - - if (now - last_gc < ip_rt_gc_min_interval && - entries < ip_rt_max_size) { - RT_CACHE_STAT_INC(gc_ignored); - goto out; - } - - entries = dst_entries_get_slow(&ipv4_dst_ops); - /* Calculate number of entries, which we want to expire now. */ - goal = entries - (ip_rt_gc_elasticity << rt_hash_log); - if (goal <= 0) { - if (equilibrium < ipv4_dst_ops.gc_thresh) - equilibrium = ipv4_dst_ops.gc_thresh; - goal = entries - equilibrium; - if (goal > 0) { - equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); - goal = entries - equilibrium; - } - } else { - /* We are in dangerous area. Try to reduce cache really - * aggressively. - */ - goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); - equilibrium = entries - goal; - } - - if (now - last_gc >= ip_rt_gc_min_interval) - last_gc = now; - - if (goal <= 0) { - equilibrium += goal; - goto work_done; - } - - do { - int i, k; - - for (i = rt_hash_mask, k = rover; i >= 0; i--) { - unsigned long tmo = expire; - - k = (k + 1) & rt_hash_mask; - rthp = &rt_hash_table[k].chain; - spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { - if (!rt_is_expired(rth) && - !rt_may_expire(rth, tmo, expire)) { - tmo >>= 1; - rthp = &rth->dst.rt_next; - continue; - } - *rthp = rth->dst.rt_next; - rt_free(rth); - goal--; - } - spin_unlock_bh(rt_hash_lock_addr(k)); - if (goal <= 0) - break; - } - rover = k; - - if (goal <= 0) - goto work_done; - - /* Goal is not achieved. We stop process if: - - - if expire reduced to zero. Otherwise, expire is halfed. - - if table is not full. - - if we are called from interrupt. - - jiffies check is just fallback/debug loop breaker. - We will not spin here for long time in any case. 
- */ - - RT_CACHE_STAT_INC(gc_goal_miss); - - if (expire == 0) - break; - - expire >>= 1; - - if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - } while (!in_softirq() && time_before_eq(jiffies, now)); - - if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) - goto out; - net_warn_ratelimited("dst cache overflow\n"); - RT_CACHE_STAT_INC(gc_dst_overflow); - return 1; - -work_done: - expire += ip_rt_gc_min_interval; - if (expire > ip_rt_gc_timeout || - dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || - dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; -out: return 0; -} - -/* - * Returns number of entries in a hash chain that have different hash_inputs - */ -static int slow_chain_length(const struct rtable *head) -{ - int length = 0; - const struct rtable *rth = head; - - while (rth) { - length += has_noalias(head, rth); - rth = rcu_dereference_protected(rth->dst.rt_next, 1); - } - return length >> FRACT_BITS; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -1086,139 +495,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } -static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt, - struct sk_buff *skb, int ifindex) -{ - struct rtable *rth, *cand; - struct rtable __rcu **rthp, **candp; - unsigned long now; - u32 min_score; - int chain_length; - -restart: - chain_length = 0; - min_score = ~(u32)0; - cand = NULL; - candp = NULL; - now = jiffies; - - if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) { - /* - * If we're not caching, just tell the caller we - * were successful and don't touch the route. The - * caller hold the sole reference to the cache entry, and - * it will be released when the caller is done with it. - * If we drop it here, the callers have no way to resolve routes - * when we're not caching. Instead, just point *rp at rt, so - * the caller gets a single use out of the route - * Note that we do rt_free on this new route entry, so that - * once its refcount hits zero, we are still able to reap it - * (Thanks Alexey) - * Note: To avoid expensive rcu stuff for this uncached dst, - * we set DST_NOCACHE so that dst_release() can free dst without - * waiting a grace period. - */ - - rt->dst.flags |= DST_NOCACHE; - goto skip_hashing; - } - - rthp = &rt_hash_table[hash].chain; - - spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (compare_keys(rth, rt) && compare_netns(rth, rt)) { - /* Put it first */ - *rthp = rth->dst.rt_next; - /* - * Since lookup is lockfree, the deletion - * must be visible to another weakly ordered CPU before - * the insertion at the start of the hash chain. - */ - rcu_assign_pointer(rth->dst.rt_next, - rt_hash_table[hash].chain); - /* - * Since lookup is lockfree, the update writes - * must be ordered for consistency on SMP. 
- */ - rcu_assign_pointer(rt_hash_table[hash].chain, rth); - - dst_use(&rth->dst, now); - spin_unlock_bh(rt_hash_lock_addr(hash)); - - rt_drop(rt); - if (skb) - skb_dst_set(skb, &rth->dst); - return rth; - } - - if (!atomic_read(&rth->dst.__refcnt)) { - u32 score = rt_score(rth); - - if (score <= min_score) { - cand = rth; - candp = rthp; - min_score = score; - } - } - - chain_length++; - - rthp = &rth->dst.rt_next; - } - - if (cand) { - /* ip_rt_gc_elasticity used to be average length of chain - * length, when exceeded gc becomes really aggressive. - * - * The second limit is less certain. At the moment it allows - * only 2 entries per bucket. We will see. - */ - if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->dst.rt_next; - rt_free(cand); - } - } else { - if (chain_length > rt_chain_length_max && - slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { - struct net *net = dev_net(rt->dst.dev); - int num = ++net->ipv4.current_rt_cache_rebuild_count; - if (!rt_caching(net)) { - pr_warn("%s: %d rebuilds is over limit, route caching disabled\n", - rt->dst.dev->name, num); - } - rt_emergency_hash_rebuild(net); - spin_unlock_bh(rt_hash_lock_addr(hash)); - - hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, - ifindex, rt_genid(net)); - goto restart; - } - } - - rt->dst.rt_next = rt_hash_table[hash].chain; - - /* - * Since lookup is lockfree, we must make sure - * previous writes to rt are committed to memory - * before making rt visible to other CPUS. - */ - rcu_assign_pointer(rt_hash_table[hash].chain, rt); - - spin_unlock_bh(rt_hash_lock_addr(hash)); - -skip_hashing: - if (skb) - skb_dst_set(skb, &rt->dst); - return rt; -} - /* * Peer allocation may fail only in serious out-of-memory conditions. However * we still can generate some output. @@ -1255,26 +531,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) } EXPORT_SYMBOL(__ip_select_ident); -static void rt_del(unsigned int hash, struct rtable *rt) -{ - struct rtable __rcu **rthp; - struct rtable *aux; - - rthp = &rt_hash_table[hash].chain; - spin_lock_bh(rt_hash_lock_addr(hash)); - ip_rt_put(rt); - while ((aux = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { - if (aux == rt || rt_is_expired(aux)) { - *rthp = aux->dst.rt_next; - rt_free(aux); - continue; - } - rthp = &aux->dst.rt_next; - } - spin_unlock_bh(rt_hash_lock_addr(hash)); -} - static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, u8 tos, @@ -1518,10 +774,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || rt->dst.expires) { - unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, - rt->rt_oif, - rt_genid(dev_net(dst->dev))); - rt_del(hash, rt); + ip_rt_put(rt); ret = NULL; } } @@ -1969,7 +1222,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm) { return dst_alloc(&ipv4_dst_ops, dev, 1, -1, - DST_HOST | + DST_HOST | DST_NOCACHE | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? 
DST_NOXFRM : 0)); } @@ -1978,7 +1231,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned int hash; struct rtable *rth; struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; @@ -2042,9 +1294,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif RT_CACHE_STAT_INC(in_slow_mc); - hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - rth = rt_intern_hash(hash, rth, skb, dev->ifindex); - return IS_ERR(rth) ? PTR_ERR(rth) : 0; + skb_dst_set(skb, &rth->dst); + return 0; e_nobufs: return -ENOBUFS; @@ -2176,7 +1427,6 @@ static int ip_mkroute_input(struct sk_buff *skb, { struct rtable *rth = NULL; int err; - unsigned int hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) @@ -2188,12 +1438,7 @@ static int ip_mkroute_input(struct sk_buff *skb, if (err) return err; - /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl4->flowi4_iif, - rt_genid(dev_net(rth->dst.dev))); - rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); - if (IS_ERR(rth)) - return PTR_ERR(rth); + skb_dst_set(skb, &rth->dst); return 0; } @@ -2217,7 +1462,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned int flags = 0; u32 itag = 0; struct rtable *rth; - unsigned int hash; int err = -EINVAL; struct net *net = dev_net(dev); @@ -2339,11 +1583,8 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); - rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); + skb_dst_set(skb, &rth->dst); err = 0; - if (IS_ERR(rth)) - err = PTR_ERR(rth); goto out; no_route: @@ -2382,46 +1623,10 @@ martian_source_keep_err: int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, bool noref) { - struct rtable *rth; - unsigned int hash; - int iif = dev->ifindex; - struct net *net; int res; - net = dev_net(dev); - rcu_read_lock(); - if (!rt_caching(net)) - goto skip_cache; - - tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif, rt_genid(net)); - - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->dst.rt_next)) { - if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | - ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | - (rth->rt_route_iif ^ iif) | - (rth->rt_key_tos ^ tos)) == 0 && - rth->rt_mark == skb->mark && - net_eq(dev_net(rth->dst.dev), net) && - !rt_is_expired(rth)) { - if (noref) { - dst_use_noref(&rth->dst, jiffies); - skb_dst_set_noref(skb, &rth->dst); - } else { - dst_use(&rth->dst, jiffies); - skb_dst_set(skb, &rth->dst); - } - RT_CACHE_STAT_INC(in_hit); - rcu_read_unlock(); - return 0; - } - RT_CACHE_STAT_INC(in_hlist_search); - } - -skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2563,10 +1768,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, /* * Major route resolver routine. 
- * called with rcu_read_lock(); */ -static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2746,57 +1950,11 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) make_route: rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, tos, dev_out, flags); - if (!IS_ERR(rth)) { - unsigned int hash; - - hash = rt_hash(orig_daddr, orig_saddr, orig_oif, - rt_genid(dev_net(dev_out))); - rth = rt_intern_hash(hash, rth, NULL, orig_oif); - } out: rcu_read_unlock(); return rth; } - -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) -{ - struct rtable *rth; - unsigned int hash; - - if (!rt_caching(net)) - goto slow_output; - - hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); - - rcu_read_lock_bh(); - for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; - rth = rcu_dereference_bh(rth->dst.rt_next)) { - if (rth->rt_key_dst == flp4->daddr && - rth->rt_key_src == flp4->saddr && - rt_is_output_route(rth) && - rth->rt_oif == flp4->flowi4_oif && - rth->rt_mark == flp4->flowi4_mark && - !((rth->rt_key_tos ^ flp4->flowi4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK)) && - net_eq(dev_net(rth->dst.dev), net) && - !rt_is_expired(rth)) { - dst_use(&rth->dst, jiffies); - RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); - if (!flp4->saddr) - flp4->saddr = rth->rt_src; - if (!flp4->daddr) - flp4->daddr = rth->rt_dst; - return rth; - } - RT_CACHE_STAT_INC(out_hlist_search); - } - rcu_read_unlock_bh(); - -slow_output: - return ip_route_output_slow(net, flp4); -} EXPORT_SYMBOL_GPL(__ip_route_output_key); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) @@ -3106,43 +2264,6 @@ errout_free: int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct rtable *rt; - int h, s_h; - int idx, s_idx; - struct net *net; - - net = sock_net(skb->sk); - - s_h = cb->args[0]; - if (s_h < 0) - s_h = 0; - s_idx = idx = cb->args[1]; - for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { - if (!rt_hash_table[h].chain) - continue; - rcu_read_lock_bh(); - for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { - if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) - continue; - if (rt_is_expired(rt)) - continue; - skb_dst_set_noref(skb, &rt->dst); - if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) <= 0) { - skb_dst_drop(skb); - rcu_read_unlock_bh(); - goto done; - } - skb_dst_drop(skb); - } - rcu_read_unlock_bh(); - } - -done: - cb->args[0] = h; - cb->args[1] = idx; return skb->len; } @@ -3376,22 +2497,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ -static __initdata unsigned long rhash_entries; -static int __init set_rhash_entries(char *str) -{ - ssize_t ret; - - if (!str) - return 0; - - ret = kstrtoul(str, 0, &rhash_entries); - if (ret) - return 0; - - return 1; -} -__setup("rhash_entries=", set_rhash_entries); - int __init ip_rt_init(void) { int rc = 0; @@ -3414,31 +2519,12 @@ int __init ip_rt_init(void) if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); - rt_hash_table = (struct rt_hash_bucket *) - alloc_large_system_hash("IP 
route cache", - sizeof(struct rt_hash_bucket), - rhash_entries, - (totalram_pages >= 128 * 1024) ? - 15 : 17, - 0, - &rt_hash_log, - &rt_hash_mask, - 0, - rhash_entries ? 0 : 512 * 1024); - memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); - rt_hash_lock_init(); - - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; + ipv4_dst_ops.gc_thresh = ~0; + ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM -- cgit v1.2.3-70-g09d2 From 8fe5cb873b7ef7f4fa49477455e8f2e3d555230e Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 23 Jul 2012 04:11:21 +0000 Subject: ipv4: Remove redundant assignment It is redundant to set no_addr and accept_local to 0 and then set them with other values just after that. Signed-off-by: Lin Ming Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4/fib_frontend.c') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f277cf0e632..8732cc7920e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -258,7 +258,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - no_addr = accept_local = 0; no_addr = idev->ifa_list == NULL; accept_local = IN_DEV_ACCEPT_LOCAL(idev); -- cgit v1.2.3-70-g09d2