diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/fib_semantics.c | 23 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 46 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 2 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 267 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 13 | ||||
-rw-r--r-- | net/ipv4/xfrm4_policy.c | 10 |
7 files changed, 313 insertions, 50 deletions
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d71bfbdc0bf..1e09852df51 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; +static void free_nh_exceptions(struct fib_nh *nh) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + fnhe = rcu_dereference(hash[i].chain); + while (fnhe) { + struct fib_nh_exception *next; + + next = rcu_dereference(fnhe->fnhe_next); + kfree(fnhe); + + fnhe = next; + } + } + kfree(hash); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + if (nexthop_nh->nh_exceptions) + free_nh_exceptions(nexthop_nh); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 76825be3b64..3ea465286a3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -803,3 +803,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); #endif + +static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; + struct rtable *rt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + fl4 = &fl->u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_protocol, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + if (IS_ERR(rt)) + rt = NULL; + if (rt) + sk_setup_caps(sk, &rt->dst); + rcu_read_unlock(); + + return &rt->dst; +} + +struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + struct inet_sock *inet = inet_sk(sk); + + if (!dst) { + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); + if (!dst) + goto out; + } + dst->ops->update_pmtu(dst, sk, NULL, mtu); + + dst = __sk_dst_check(sk, 0); + if (!dst) + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); +out: + return dst; +} +EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0c3123566d7..42c44b1403c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -833,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { df |= (old_iph->frag_off&htons(IP_DF)); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c2d0e6d8baa..2c2c35bace7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -519,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index aad21819316..a5bd0b4acc6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -148,8 +148,10 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb); +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); static int rt_garbage_collect(struct dst_ops *ops); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -1273,14 +1275,130 @@ static void rt_del(unsigned int hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } -static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void __build_flow_key(struct flowi4 *fl4, struct sock *sk, + const struct iphdr *iph, + int oif, u8 tos, + u8 prot, u32 mark, int flow_flags) +{ + if (sk) { + const struct inet_sock *inet = inet_sk(sk); + + oif = sk->sk_bound_dev_if; + mark = sk->sk_mark; + tos = RT_CONN_FLAGS(sk); + prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; + } + flowi4_init_output(fl4, oif, mark, tos, + RT_SCOPE_UNIVERSE, prot, + flow_flags, + iph->daddr, iph->saddr, 0, 0); +} + +static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; + + __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); +} + +static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk), + daddr, inet->inet_saddr, 0, 0); + rcu_read_unlock(); +} + +static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk, + struct sk_buff *skb) +{ + if (skb) + build_skb_flow_key(fl4, skb, sk); + else + build_sk_flow_key(fl4, sk); +} + +static DEFINE_SPINLOCK(fnhe_lock); + +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr) +{ + struct fib_nh_exception *fnhe, *oldest; + + oldest = rcu_dereference(hash->chain); + for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + oldest = fnhe; + } + return oldest; +} + +static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + int depth; + u32 hval; + + if (!hash) { + hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), + GFP_ATOMIC); + if (!hash) + return NULL; + } + + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); + hash += hval; + + depth = 0; + for (fnhe = rcu_dereference(hash->chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + goto out; + depth++; + } + + if (depth > FNHE_RECLAIM_DEPTH) { + fnhe = fnhe_oldest(hash + hval, daddr); + goto out_daddr; + } + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + return NULL; + + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); + +out_daddr: + fnhe->fnhe_daddr = daddr; +out: + fnhe->fnhe_stamp = jiffies; + return fnhe; +} + +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; + struct fib_result res; struct neighbour *n; - struct rtable *rt; struct net *net; switch (icmp_hdr(skb)->code & 7) { @@ -1294,7 +1412,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb) return; } - rt = (struct rtable *) dst; if (rt->rt_gateway != old_gw) return; @@ -1318,11 +1435,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb) goto reject_redirect; } - n = ipv4_neigh_lookup(dst, NULL, &new_gw); + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); if (n) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { + if (fib_lookup(net, fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_exception *fnhe; + + spin_lock_bh(&fnhe_lock); + fnhe = find_or_create_fnhe(nh, fl4->daddr); + if (fnhe) + fnhe->fnhe_gw = new_gw; + spin_unlock_bh(&fnhe_lock); + } rt->rt_gateway = new_gw; rt->rt_flags |= RTCF_REDIRECTED; call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); @@ -1347,6 +1474,17 @@ reject_redirect: ; } +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl4; + + rt = (struct rtable *) dst; + + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_do_redirect(rt, skb, &fl4); +} + static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; @@ -1506,32 +1644,51 @@ out: kfree_skb(skb); return 0; } -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { - struct rtable *rt = (struct rtable *) dst; - - dst_confirm(dst); + struct fib_result res; if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_exception *fnhe; + + spin_lock_bh(&fnhe_lock); + fnhe = find_or_create_fnhe(nh, fl4->daddr); + if (fnhe) { + fnhe->fnhe_pmtu = mtu; + fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires; + } + spin_unlock_bh(&fnhe_lock); + } rt->rt_pmtu = mtu; dst_set_expires(&rt->dst, ip_rt_mtu_expires); } +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + struct rtable *rt = (struct rtable *) dst; + struct flowi4 fl4; + + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_rt_update_pmtu(rt, &fl4, mtu); +} + void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u32 mark, u8 protocol, int flow_flags) { - const struct iphdr *iph = (const struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, - protocol, flow_flags, - iph->daddr, iph->saddr, 0, 0); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { - ip_rt_update_pmtu(&rt->dst, mtu); + __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } @@ -1539,27 +1696,31 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu); void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct inet_sock *inet = inet_sk(sk); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - return ipv4_update_pmtu(skb, sock_net(sk), mtu, - sk->sk_bound_dev_if, sk->sk_mark, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk)); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); + } } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, u8 protocol, int flow_flags) { - const struct iphdr *iph = (const struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, - protocol, flow_flags, iph->daddr, iph->saddr, 0, 0); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { - ip_do_redirect(&rt->dst, skb); + __ip_do_redirect(rt, skb, &fl4); ip_rt_put(rt); } } @@ -1567,12 +1728,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - const struct inet_sock *inet = inet_sk(sk); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, - sk->sk_mark, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk)); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4); + ip_rt_put(rt); + } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); @@ -1719,14 +1884,46 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, fi->fib_metrics, true); } +static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + u32 hval; + + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); + + for (fnhe = rcu_dereference(hash[hval].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) { + if (fnhe->fnhe_pmtu) { + unsigned long expires = fnhe->fnhe_expires; + unsigned long diff = jiffies - expires; + + if (time_before(jiffies, expires)) { + rt->rt_pmtu = fnhe->fnhe_pmtu; + dst_set_expires(&rt->dst, diff); + } + } + if (fnhe->fnhe_gw) + rt->rt_gateway = fnhe->fnhe_gw; + fnhe->fnhe_stamp = jiffies; + break; + } + } +} + static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, const struct fib_result *res, struct fib_info *fi, u16 type, u32 itag) { if (fi) { - if (FIB_RES_GW(*res) && - FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = FIB_RES_GW(*res); + struct fib_nh *nh = &FIB_RES_NH(*res); + + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = nh->nh_gw; + if (unlikely(nh->nh_exceptions)) + rt_bind_exception(rt, nh, fl4->daddr); rt_init_metrics(rt, fl4, fi); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; @@ -2587,11 +2784,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) return mtu ? : dst->dev->mtu; } -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { } -static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7a0062cb4ed..d9caf5c07aa 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -289,17 +289,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) if (sk->sk_state == TCP_LISTEN) return; - /* We don't check in the destentry if pmtu discovery is forbidden - * on this route. We just assume that no packet_to_big packets - * are send back when pmtu discovery is not active. - * There is a small race when the user changes this flag in the - * route, but I think that's acceptable. - */ - if ((dst = __sk_dst_check(sk, 0)) == NULL) + dst = inet_csk_update_pmtu(sk, mtu); + if (!dst) return; - dst->ops->update_pmtu(dst, mtu); - /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ @@ -326,7 +319,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) struct dst_entry *dst = __sk_dst_check(sk, 0); if (dst) - dst->ops->redirect(dst, skb); + dst->ops->redirect(dst, sk, skb); } /* diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 737131cef37..fcf7678bc00 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -194,20 +194,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); } -static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, mtu); + path->ops->update_pmtu(path, sk, skb, mtu); } -static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->redirect(path, skb); + path->ops->redirect(path, sk, skb); } static void xfrm4_dst_destroy(struct dst_entry *dst) |