summaryrefslogtreecommitdiffstats
path: root/net/ipv6/route.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv6/route.c')
-rw-r--r--net/ipv6/route.c210
1 files changed, 163 insertions, 47 deletions
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b1e6cf0b95f..e229a3bc345 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -57,6 +57,7 @@
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
+#include <net/nexthop.h>
#include <asm/uaccess.h>
@@ -289,6 +290,8 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
rt->rt6i_genid = rt_genid(net);
+ INIT_LIST_HEAD(&rt->rt6i_siblings);
+ rt->rt6i_nsiblings = 0;
}
return rt;
}
@@ -318,13 +321,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
}
}
-static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt6_peer_genid(void)
-{
- return atomic_read(&__rt6_peer_genid);
-}
-
void rt6_bind_peer(struct rt6_info *rt, int create)
{
struct inet_peer_base *base;
@@ -338,8 +334,6 @@ void rt6_bind_peer(struct rt6_info *rt, int create)
if (peer) {
if (!rt6_set_peer(rt, peer))
inet_putpeer(peer);
- else
- rt->rt6i_peer_genid = rt6_peer_genid();
}
}
@@ -385,6 +379,69 @@ static bool rt6_need_strict(const struct in6_addr *daddr)
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}
+/* Multipath route selection:
+ * Hash based function using packet header and flowlabel.
+ * Adapted from fib_info_hashfn()
+ */
+static int rt6_info_hash_nhsfn(unsigned int candidate_count,
+ const struct flowi6 *fl6)
+{
+ unsigned int val = fl6->flowi6_proto;
+
+ val ^= (__force u32)fl6->daddr.s6_addr32[0];
+ val ^= (__force u32)fl6->daddr.s6_addr32[1];
+ val ^= (__force u32)fl6->daddr.s6_addr32[2];
+ val ^= (__force u32)fl6->daddr.s6_addr32[3];
+
+ val ^= (__force u32)fl6->saddr.s6_addr32[0];
+ val ^= (__force u32)fl6->saddr.s6_addr32[1];
+ val ^= (__force u32)fl6->saddr.s6_addr32[2];
+ val ^= (__force u32)fl6->saddr.s6_addr32[3];
+
+ /* Work only if this not encapsulated */
+ switch (fl6->flowi6_proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ val ^= (__force u16)fl6->fl6_sport;
+ val ^= (__force u16)fl6->fl6_dport;
+ break;
+
+ case IPPROTO_ICMPV6:
+ val ^= (__force u16)fl6->fl6_icmp_type;
+ val ^= (__force u16)fl6->fl6_icmp_code;
+ break;
+ }
+ /* RFC6438 recommands to use flowlabel */
+ val ^= (__force u32)fl6->flowlabel;
+
+ /* Perhaps, we need to tune, this function? */
+ val = val ^ (val >> 7) ^ (val >> 12);
+ return val % candidate_count;
+}
+
+static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+ struct flowi6 *fl6)
+{
+ struct rt6_info *sibling, *next_sibling;
+ int route_choosen;
+
+ route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+ /* Don't change the route, if route_choosen == 0
+ * (siblings does not include ourself)
+ */
+ if (route_choosen)
+ list_for_each_entry_safe(sibling, next_sibling,
+ &match->rt6i_siblings, rt6i_siblings) {
+ route_choosen--;
+ if (route_choosen == 0) {
+ match = sibling;
+ break;
+ }
+ }
+ return match;
+}
+
/*
* Route lookup. Any table->tb6_lock is implied.
*/
@@ -487,35 +544,32 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif)
return 0;
}
-static inline int rt6_check_neigh(struct rt6_info *rt)
+static inline bool rt6_check_neigh(struct rt6_info *rt)
{
struct neighbour *neigh;
- int m;
+ bool ret = false;
neigh = rt->n;
if (rt->rt6i_flags & RTF_NONEXTHOP ||
!(rt->rt6i_flags & RTF_GATEWAY))
- m = 1;
+ ret = true;
else if (neigh) {
read_lock_bh(&neigh->lock);
if (neigh->nud_state & NUD_VALID)
- m = 2;
+ ret = true;
#ifdef CONFIG_IPV6_ROUTER_PREF
- else if (neigh->nud_state & NUD_FAILED)
- m = 0;
+ else if (!(neigh->nud_state & NUD_FAILED))
+ ret = true;
#endif
- else
- m = 1;
read_unlock_bh(&neigh->lock);
- } else
- m = 0;
- return m;
+ }
+ return ret;
}
static int rt6_score_route(struct rt6_info *rt, int oif,
int strict)
{
- int m, n;
+ int m;
m = rt6_check_dev(rt, oif);
if (!m && (strict & RT6_LOOKUP_F_IFACE))
@@ -523,8 +577,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
#ifdef CONFIG_IPV6_ROUTER_PREF
m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
- n = rt6_check_neigh(rt);
- if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
+ if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE))
return -1;
return m;
}
@@ -666,7 +719,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
else
rt6_set_expires(rt, jiffies + HZ * lifetime);
- dst_release(&rt->dst);
+ ip6_rt_put(rt);
}
return 0;
}
@@ -702,6 +755,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
restart:
rt = fn->leaf;
rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
+ if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+ rt = rt6_multipath_select(rt, fl6);
BACKTRACK(net, &fl6->saddr);
out:
dst_use(&rt->dst, jiffies);
@@ -863,7 +918,8 @@ restart_2:
restart:
rt = rt6_select(fn, oif, strict | reachable);
-
+ if (rt->rt6i_nsiblings && oif == 0)
+ rt = rt6_multipath_select(rt, fl6);
BACKTRACK(net, &fl6->saddr);
if (rt == net->ipv6.ip6_null_entry ||
rt->rt6i_flags & RTF_CACHE)
@@ -879,7 +935,7 @@ restart:
else
goto out2;
- dst_release(&rt->dst);
+ ip6_rt_put(rt);
rt = nrt ? : net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
@@ -896,7 +952,7 @@ restart:
* Race condition! In the gap, when table->tb6_lock was
* released someone could insert this route. Relookup.
*/
- dst_release(&rt->dst);
+ ip6_rt_put(rt);
goto relookup;
out:
@@ -1030,14 +1086,9 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev)))
return NULL;
- if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
- if (rt->rt6i_peer_genid != rt6_peer_genid()) {
- if (!rt6_has_peer(rt))
- rt6_bind_peer(rt, 0);
- rt->rt6i_peer_genid = rt6_peer_genid();
- }
+ if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
return dst;
- }
+
return NULL;
}
@@ -1316,12 +1367,6 @@ out:
return entries > rt_max_size;
}
-/* Clean host part of a prefix. Not necessary in radix tree,
- but results in cleaner routing tables.
-
- Remove it only when all the things will work!
- */
-
int ip6_dst_hoplimit(struct dst_entry *dst)
{
int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
@@ -1507,7 +1552,7 @@ int ip6_route_add(struct fib6_config *cfg)
goto out;
if (dev) {
if (dev != grt->dst.dev) {
- dst_release(&grt->dst);
+ ip6_rt_put(grt);
goto out;
}
} else {
@@ -1518,7 +1563,7 @@ int ip6_route_add(struct fib6_config *cfg)
}
if (!(grt->rt6i_flags & RTF_GATEWAY))
err = 0;
- dst_release(&grt->dst);
+ ip6_rt_put(grt);
if (err)
goto out;
@@ -1604,7 +1649,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
write_unlock_bh(&table->tb6_lock);
out:
- dst_release(&rt->dst);
+ ip6_rt_put(rt);
return err;
}
@@ -1987,7 +2032,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch(cmd) {
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
err = copy_from_user(&rtmsg, arg,
sizeof(struct in6_rtmsg));
@@ -2249,6 +2294,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_IIF] = { .type = NLA_U32 },
[RTA_PRIORITY] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2326,11 +2372,71 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_TABLE])
cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
+ if (tb[RTA_MULTIPATH]) {
+ cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
+ cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+ }
+
err = 0;
errout:
return err;
}
+static int ip6_route_multipath(struct fib6_config *cfg, int add)
+{
+ struct fib6_config r_cfg;
+ struct rtnexthop *rtnh;
+ int remaining;
+ int attrlen;
+ int err = 0, last_err = 0;
+
+beginning:
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+ remaining = cfg->fc_mp_len;
+
+ /* Parse a Multipath Entry */
+ while (rtnh_ok(rtnh, remaining)) {
+ memcpy(&r_cfg, cfg, sizeof(*cfg));
+ if (rtnh->rtnh_ifindex)
+ r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+ r_cfg.fc_flags |= RTF_GATEWAY;
+ }
+ }
+ err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
+ if (err) {
+ last_err = err;
+ /* If we are trying to remove a route, do not stop the
+ * loop when ip6_route_del() fails (because next hop is
+ * already gone), we should try to remove all next hops.
+ */
+ if (add) {
+ /* If add fails, we should try to delete all
+ * next hops that have been already added.
+ */
+ add = 0;
+ goto beginning;
+ }
+ }
+ /* Because each route is added like a single route we remove
+ * this flag after the first nexthop (if there is a collision,
+ * we have already fail to add the first nexthop:
+ * fib6_add_rt2node() has reject it).
+ */
+ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return last_err;
+}
+
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
struct fib6_config cfg;
@@ -2340,7 +2446,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
if (err < 0)
return err;
- return ip6_route_del(&cfg);
+ if (cfg.fc_mp)
+ return ip6_route_multipath(&cfg, 0);
+ else
+ return ip6_route_del(&cfg);
}
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -2352,7 +2461,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
if (err < 0)
return err;
- return ip6_route_add(&cfg);
+ if (cfg.fc_mp)
+ return ip6_route_multipath(&cfg, 1);
+ else
+ return ip6_route_add(&cfg);
}
static inline size_t rt6_nlmsg_size(void)
@@ -2596,7 +2708,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
- dst_release(&rt->dst);
+ ip6_rt_put(rt);
err = -ENOBUFS;
goto errout;
}
@@ -2873,6 +2985,10 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
}
return table;