summary refs log tree commit diff stats
path: root/net/ipv4/route.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- net/ipv4/route.c 308
1 files changed, 225 insertions, 83 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 155138d8ec8..94cdbc55ca7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -91,6 +91,7 @@
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
+#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
@@ -112,7 +113,7 @@
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
- ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
#define IP_MAX_MTU 0xFFF0
@@ -120,6 +121,7 @@
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
@@ -131,6 +133,10 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int rt_chain_length_max __read_mostly = 20;
+static int redirect_genid;
+
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
/*
* Interface to generic destination cache.
@@ -138,7 +144,7 @@ static int rt_chain_length_max __read_mostly = 20;
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
-static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
+static unsigned int ipv4_mtu(const struct dst_entry *dst);
static void ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
@@ -193,7 +199,7 @@ static struct dst_ops ipv4_dst_ops = {
.gc = rt_garbage_collect,
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
- .default_mtu = ipv4_default_mtu,
+ .mtu = ipv4_mtu,
.cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
@@ -416,9 +422,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
else {
struct rtable *r = v;
struct neighbour *n;
- int len;
+ int len, HHUptod;
+ rcu_read_lock();
n = dst_get_neighbour(&r->dst);
+ HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
+ rcu_read_unlock();
+
seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
"%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
r->dst.dev ? r->dst.dev->name : "*",
@@ -432,7 +442,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
dst_metric(&r->dst, RTAX_RTTVAR)),
r->rt_key_tos,
-1,
- (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
+ HHUptod,
r->rt_spec_dst, &len);
seq_printf(seq, "%*s\n", 127 - len, "");
@@ -825,6 +835,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
return ONE;
}
+static void rt_check_expire(void) /* Scan a slice of rt_hash_table: free expired/aged entries and recompute rt_chain_length_max. */
+{
+ static unsigned int rover; /* bucket index where the previous call stopped; restored into i below */
+ unsigned int i = rover, goal;
+ struct rtable *rth;
+ struct rtable __rcu **rthp;
+ unsigned long samples = 0; /* number of buckets visited in this call */
+ unsigned long sum = 0, sum2 = 0; /* running sum and sum of squares of per-bucket length */
+ unsigned long delta;
+ u64 mult;
+
+ delta = jiffies - expires_ljiffies; /* jiffies elapsed since the previous scan */
+ expires_ljiffies = jiffies;
+ mult = ((u64)delta) << rt_hash_log; /* NOTE(review): presumably delta * table size, if the table has 1 << rt_hash_log buckets - confirm */
+ if (ip_rt_gc_timeout > 1) /* the division is skipped for timeouts <= 1, which also avoids dividing by zero */
+ do_div(mult, ip_rt_gc_timeout);
+ goal = (unsigned int)mult; /* number of buckets to visit in this call */
+ if (goal > rt_hash_mask)
+ goal = rt_hash_mask + 1; /* cap the work at rt_hash_mask + 1 buckets */
+ for (; goal > 0; goal--) {
+ unsigned long tmo = ip_rt_gc_timeout; /* reset per bucket; halved at nofree for every entry kept */
+ unsigned long length;
+
+ i = (i + 1) & rt_hash_mask; /* advance to the next bucket, wrapping via the mask */
+ rthp = &rt_hash_table[i].chain;
+
+ if (need_resched())
+ cond_resched();
+
+ samples++; /* counted before the empty-bucket test, so empty buckets are sampled too */
+
+ if (rcu_dereference_raw(*rthp) == NULL) /* peek without the bucket lock; empty buckets are skipped */
+ continue;
+ length = 0;
+ spin_lock_bh(rt_hash_lock_addr(i));
+ while ((rth = rcu_dereference_protected(*rthp,
+ lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+ prefetch(rth->dst.rt_next);
+ if (rt_is_expired(rth)) {
+ *rthp = rth->dst.rt_next; /* unlink; rthp itself is not advanced, so the loop re-reads the same slot */
+ rt_free(rth);
+ continue;
+ }
+ if (rth->dst.expires) {
+ /* Entry is expired even if it is in use */
+ if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree: /* shared "keep this entry" path, also reached via the goto below */
+ tmo >>= 1;
+ rthp = &rth->dst.rt_next; /* step past the kept entry */
+ /*
+ * We only count entries on
+ * a chain with equal hash inputs once
+ * so that entries for different QOS
+ * levels, and other non-hash input
+ * attributes don't unfairly skew
+ * the length computation
+ */
+ length += has_noalias(rt_hash_table[i].chain, rth);
+ continue;
+ }
+ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) /* entries without dst.expires are judged by rt_may_expire() */
+ goto nofree;
+
+ /* Cleanup aged off entries. */
+ *rthp = rth->dst.rt_next;
+ rt_free(rth);
+ }
+ spin_unlock_bh(rt_hash_lock_addr(i));
+ sum += length;
+ sum2 += length*length;
+ }
+ if (samples) {
+ unsigned long avg = sum / samples; /* mean per-bucket length over the visited buckets */
+ unsigned long sd = int_sqrt(sum2 / samples - avg*avg); /* sqrt(E[x^2] - E[x]^2): standard deviation estimate */
+ rt_chain_length_max = max_t(unsigned long, /* new limit is the larger of ip_rt_gc_elasticity and (avg + 4*sd) >> FRACT_BITS */
+ ip_rt_gc_elasticity,
+ (avg + 4*sd) >> FRACT_BITS); /* NOTE(review): the shift suggests lengths are fixed-point scaled by FRACT_BITS - confirm in has_noalias() */
+ }
+ rover = i; /* remember the stopping point for the next call */
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * We call rt_check_expire() to scan part of the hash table, then requeue expires_work after ip_rt_gc_interval.
+ */
+static void rt_worker_func(struct work_struct *work) /* 'work' is not used in the body */
+{
+ rt_check_expire();
+ schedule_delayed_work(&expires_work, ip_rt_gc_interval); /* re-arm: the work item reschedules itself on every run */
+}
+
/*
* Perturbation of rt_genid by a small quantity [1..256]
* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -837,6 +938,7 @@ static void rt_cache_invalidate(struct net *net)
get_random_bytes(&shuffle, sizeof(shuffle));
atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
+ redirect_genid++;
}
/*
@@ -1265,7 +1367,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
struct rtable *rt = (struct rtable *) dst;
- if (rt) {
+ if (rt && !(rt->dst.flags & DST_NOPEER)) {
if (rt->peer == NULL)
rt_bind_peer(rt, rt->rt_dst, 1);
@@ -1276,7 +1378,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
iph->id = htons(inet_getid(rt->peer, more));
return;
}
- } else
+ } else if (!rt)
printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
__builtin_return_address(0));
@@ -1304,16 +1406,40 @@ static void rt_del(unsigned hash, struct rtable *rt)
spin_unlock_bh(rt_hash_lock_addr(hash));
}
+static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) /* Switch the route to the gateway stored in peer->redirect_learned.a4 and install the matching neighbour. */
+{
+ struct rtable *rt = (struct rtable *) dst;
+ __be32 orig_gw = rt->rt_gateway; /* saved so the gateway can be restored if the lookup fails */
+ struct neighbour *n, *old_n;
+
+ dst_confirm(&rt->dst);
+
+ rt->rt_gateway = peer->redirect_learned.a4; /* set before the lookup, which is keyed on &rt->rt_gateway */
+
+ n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
+ if (IS_ERR(n)) {
+ rt->rt_gateway = orig_gw; /* lookup failed: undo the gateway change and leave the route as it was */
+ return;
+ }
+ old_n = xchg(&rt->dst._neighbour, n); /* install the new neighbour and fetch the previous one in a single exchange */
+ if (old_n)
+ neigh_release(old_n);
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL); /* NOTE(review): presumably triggers resolution of the new gateway - confirm; RTCF_REDIRECTED is not set on this path */
+ } else {
+ rt->rt_flags |= RTCF_REDIRECTED; /* flagged only when the neighbour is already NUD_VALID */
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+ }
+}
+
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
{
int s, i;
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct rtable *rt;
__be32 skeys[2] = { saddr, 0 };
int ikeys[2] = { dev->ifindex, 0 };
- struct flowi4 fl4;
struct inet_peer *peer;
struct net *net;
@@ -1336,33 +1462,44 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
goto reject_redirect;
}
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = daddr;
for (s = 0; s < 2; s++) {
for (i = 0; i < 2; i++) {
- fl4.flowi4_oif = ikeys[i];
- fl4.saddr = skeys[s];
- rt = __ip_route_output_key(net, &fl4);
- if (IS_ERR(rt))
- continue;
-
- if (rt->dst.error || rt->dst.dev != dev ||
- rt->rt_gateway != old_gw) {
- ip_rt_put(rt);
- continue;
- }
-
- if (!rt->peer)
- rt_bind_peer(rt, rt->rt_dst, 1);
+ unsigned int hash;
+ struct rtable __rcu **rthp;
+ struct rtable *rt;
+
+ hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
+
+ rthp = &rt_hash_table[hash].chain;
+
+ while ((rt = rcu_dereference(*rthp)) != NULL) {
+ rthp = &rt->dst.rt_next;
+
+ if (rt->rt_key_dst != daddr ||
+ rt->rt_key_src != skeys[s] ||
+ rt->rt_oif != ikeys[i] ||
+ rt_is_input_route(rt) ||
+ rt_is_expired(rt) ||
+ !net_eq(dev_net(rt->dst.dev), net) ||
+ rt->dst.error ||
+ rt->dst.dev != dev ||
+ rt->rt_gateway != old_gw)
+ continue;
- peer = rt->peer;
- if (peer) {
- peer->redirect_learned.a4 = new_gw;
- atomic_inc(&__rt_peer_genid);
+ if (!rt->peer)
+ rt_bind_peer(rt, rt->rt_dst, 1);
+
+ peer = rt->peer;
+ if (peer) {
+ if (peer->redirect_learned.a4 != new_gw ||
+ peer->redirect_genid != redirect_genid) {
+ peer->redirect_learned.a4 = new_gw;
+ peer->redirect_genid = redirect_genid;
+ atomic_inc(&__rt_peer_genid);
+ }
+ check_peer_redir(&rt->dst, peer);
+ }
}
-
- ip_rt_put(rt);
- return;
}
}
return;
@@ -1649,40 +1786,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
}
}
-static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
-{
- struct rtable *rt = (struct rtable *) dst;
- __be32 orig_gw = rt->rt_gateway;
- struct neighbour *n, *old_n;
-
- dst_confirm(&rt->dst);
-
- rt->rt_gateway = peer->redirect_learned.a4;
- n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
- if (IS_ERR(n))
- return PTR_ERR(n);
- old_n = xchg(&rt->dst._neighbour, n);
- if (old_n)
- neigh_release(old_n);
- if (!n || !(n->nud_state & NUD_VALID)) {
- if (n)
- neigh_event_send(n, NULL);
- rt->rt_gateway = orig_gw;
- return -EAGAIN;
- } else {
- rt->rt_flags |= RTCF_REDIRECTED;
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
- }
- return 0;
-}
-
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+static void ipv4_validate_peer(struct rtable *rt)
{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt_is_expired(rt))
- return NULL;
if (rt->rt_peer_genid != rt_peer_genid()) {
struct inet_peer *peer;
@@ -1691,17 +1797,26 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
peer = rt->peer;
if (peer) {
- check_peer_pmtu(dst, peer);
+ check_peer_pmtu(&rt->dst, peer);
+ if (peer->redirect_genid != redirect_genid)
+ peer->redirect_learned.a4 = 0;
if (peer->redirect_learned.a4 &&
- peer->redirect_learned.a4 != rt->rt_gateway) {
- if (check_peer_redir(dst, peer))
- return NULL;
- }
+ peer->redirect_learned.a4 != rt->rt_gateway)
+ check_peer_redir(&rt->dst, peer);
}
rt->rt_peer_genid = rt_peer_genid();
}
+}
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) /* .check hook of ipv4_dst_ops; 'cookie' is not used in the body */
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt_is_expired(rt))
+ return NULL; /* an expired route is reported as invalid */
+ ipv4_validate_peer(rt); /* returns void, so peer validation can no longer make this check fail */
return dst;
}
@@ -1806,12 +1921,17 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
return advmss;
}
-static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
+static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
- unsigned int mtu = dst->dev->mtu;
+ const struct rtable *rt = (const struct rtable *) dst;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ if (mtu && rt_is_output_route(rt))
+ return mtu;
+
+ mtu = dst->dev->mtu;
if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
- const struct rtable *rt = (const struct rtable *) dst;
if (rt->rt_gateway != rt->rt_dst && mtu > 576)
mtu = 576;
@@ -1844,6 +1964,8 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
dst_init_metrics(&rt->dst, peer->metrics, false);
check_peer_pmtu(&rt->dst, peer);
+ if (peer->redirect_genid != redirect_genid)
+ peer->redirect_learned.a4 = 0;
if (peer->redirect_learned.a4 &&
peer->redirect_learned.a4 != rt->rt_gateway) {
rt->rt_gateway = peer->redirect_learned.a4;
@@ -2349,6 +2471,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_mark == skb->mark &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
+ ipv4_validate_peer(rth);
if (noref) {
dst_use_noref(&rth->dst, jiffies);
skb_dst_set_noref(skb, &rth->dst);
@@ -2407,11 +2530,11 @@ EXPORT_SYMBOL(ip_route_input_common);
static struct rtable *__mkroute_output(const struct fib_result *res,
const struct flowi4 *fl4,
__be32 orig_daddr, __be32 orig_saddr,
- int orig_oif, struct net_device *dev_out,
+ int orig_oif, __u8 orig_rtos,
+ struct net_device *dev_out,
unsigned int flags)
{
struct fib_info *fi = res->fi;
- u32 tos = RT_FL_TOS(fl4);
struct in_device *in_dev;
u16 type = res->type;
struct rtable *rth;
@@ -2462,7 +2585,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth->rt_genid = rt_genid(dev_net(dev_out));
rth->rt_flags = flags;
rth->rt_type = type;
- rth->rt_key_tos = tos;
+ rth->rt_key_tos = orig_rtos;
rth->rt_dst = fl4->daddr;
rth->rt_src = fl4->saddr;
rth->rt_route_iif = 0;
@@ -2512,7 +2635,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
struct net_device *dev_out = NULL;
- u32 tos = RT_FL_TOS(fl4);
+ __u8 tos = RT_FL_TOS(fl4);
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
@@ -2688,7 +2811,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
make_route:
rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
- dev_out, flags);
+ tos, dev_out, flags);
if (!IS_ERR(rth)) {
unsigned int hash;
@@ -2724,6 +2847,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
+ ipv4_validate_peer(rth);
dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
@@ -2747,9 +2871,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
return NULL;
}
-static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
+static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) /* .mtu hook of ipv4_dst_blackhole_ops */
{
- return 0;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); /* raw RTAX_MTU metric; zero is treated as "not set" below */
+
+ return mtu ? : dst->dev->mtu; /* GNU "?:" extension: the metric when non-zero, otherwise the device MTU */
}
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -2767,7 +2893,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.destroy = ipv4_dst_destroy,
.check = ipv4_blackhole_dst_check,
- .default_mtu = ipv4_blackhole_default_mtu,
+ .mtu = ipv4_blackhole_mtu,
.default_advmss = ipv4_default_advmss,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
.cow_metrics = ipv4_rt_blackhole_cow_metrics,
@@ -2845,7 +2971,7 @@ static int rt_fill_info(struct net *net,
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
struct nlmsghdr *nlh;
- long expires = 0;
+ unsigned long expires = 0;
const struct inet_peer *peer = rt->peer;
u32 id = 0, ts = 0, tsage = 0, error;
@@ -2902,8 +3028,12 @@ static int rt_fill_info(struct net *net,
tsage = get_seconds() - peer->tcp_ts_stamp;
}
expires = ACCESS_ONCE(peer->pmtu_expires);
- if (expires)
- expires -= jiffies;
+ if (expires) {
+ if (time_before(jiffies, expires))
+ expires -= jiffies;
+ else
+ expires = 0;
+ }
}
if (rt_is_input_route(rt)) {
@@ -3145,6 +3275,13 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
+ .procname = "gc_interval",
+ .data = &ip_rt_gc_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
.procname = "redirect_load",
.data = &ip_rt_redirect_load,
.maxlen = sizeof(int),
@@ -3354,6 +3491,11 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
+ INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+ expires_ljiffies = jiffies;
+ schedule_delayed_work(&expires_work,
+ net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
if (ip_rt_proc_init())
printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM