summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/net/tun.c150
-rw-r--r--include/linux/if_tun.h10
-rw-r--r--include/net/fib_rules.h2
-rw-r--r--include/net/netns/ipv4.h4
-rw-r--r--include/net/route.h2
-rw-r--r--net/core/fib_rules.c2
-rw-r--r--net/decnet/dn_rules.c2
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/devinet.c8
-rw-r--r--net/ipv4/fib_frontend.c17
-rw-r--r--net/ipv4/fib_hash.c6
-rw-r--r--net/ipv4/fib_rules.c4
-rw-r--r--net/ipv4/fib_trie.c6
-rw-r--r--net/ipv4/route.c255
14 files changed, 381 insertions, 89 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7ab94c825b5..aa4ee4439f0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -63,6 +63,7 @@
#include <linux/if_tun.h>
#include <linux/crc32.h>
#include <linux/nsproxy.h>
+#include <linux/virtio_net.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
@@ -283,6 +284,7 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
struct sk_buff *skb;
size_t len = count, align = 0;
+ struct virtio_net_hdr gso = { 0 };
if (!(tun->flags & TUN_NO_PI)) {
if ((len -= sizeof(pi)) > count)
@@ -292,6 +294,17 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
return -EFAULT;
}
+ if (tun->flags & TUN_VNET_HDR) {
+ if ((len -= sizeof(gso)) > count)
+ return -EINVAL;
+
+ if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso)))
+ return -EFAULT;
+
+ if (gso.hdr_len > len)
+ return -EINVAL;
+ }
+
if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
align = NET_IP_ALIGN;
if (unlikely(len < ETH_HLEN))
@@ -311,6 +324,16 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
return -EFAULT;
}
+ if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ if (!skb_partial_csum_set(skb, gso.csum_start,
+ gso.csum_offset)) {
+ tun->dev->stats.rx_frame_errors++;
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ } else if (tun->flags & TUN_NOCHECKSUM)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
switch (tun->flags & TUN_TYPE_MASK) {
case TUN_TUN_DEV:
if (tun->flags & TUN_NO_PI) {
@@ -337,8 +360,35 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
break;
};
- if (tun->flags & TUN_NOCHECKSUM)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ pr_debug("GSO!\n");
+ switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+ break;
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+ break;
+ default:
+ tun->dev->stats.rx_frame_errors++;
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+ skb_shinfo(skb)->gso_size = gso.gso_size;
+ if (skb_shinfo(skb)->gso_size == 0) {
+ tun->dev->stats.rx_frame_errors++;
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
netif_rx_ni(skb);
tun->dev->last_rx = jiffies;
@@ -384,6 +434,39 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
total += sizeof(pi);
}
+ if (tun->flags & TUN_VNET_HDR) {
+ struct virtio_net_hdr gso = { 0 }; /* no info leak */
+ if ((len -= sizeof(gso)) < 0)
+ return -EINVAL;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ /* This is a hint as to how much should be linear. */
+ gso.hdr_len = skb_headlen(skb);
+ gso.gso_size = sinfo->gso_size;
+ if (sinfo->gso_type & SKB_GSO_TCPV4)
+ gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else if (sinfo->gso_type & SKB_GSO_TCPV6)
+ gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ else
+ BUG();
+ if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+ gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+ } else
+ gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ gso.csum_start = skb->csum_start - skb_headroom(skb);
+ gso.csum_offset = skb->csum_offset;
+ } /* else everything is zero */
+
+ if (unlikely(memcpy_toiovec(iv, (void *)&gso, sizeof(gso))))
+ return -EFAULT;
+ total += sizeof(gso);
+ }
+
len = min_t(int, skb->len, len);
skb_copy_datagram_iovec(skb, 0, iv, len);
@@ -598,6 +681,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
else
tun->flags &= ~TUN_ONE_QUEUE;
+ if (ifr->ifr_flags & IFF_VNET_HDR)
+ tun->flags |= TUN_VNET_HDR;
+ else
+ tun->flags &= ~TUN_VNET_HDR;
+
file->private_data = tun;
tun->attached = 1;
get_net(dev_net(tun->dev));
@@ -611,6 +699,46 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
return err;
}
+/* This is like a cut-down ethtool ops, except done via tun fd so no
+ * privs required. */
+static int set_offload(struct net_device *dev, unsigned long arg)
+{
+ unsigned int old_features, features;
+
+ old_features = dev->features;
+ /* Unset features, set them as we chew on the arg. */
+ features = (old_features & ~(NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST
+ |NETIF_F_TSO_ECN|NETIF_F_TSO|NETIF_F_TSO6));
+
+ if (arg & TUN_F_CSUM) {
+ features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
+ arg &= ~TUN_F_CSUM;
+
+ if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
+ if (arg & TUN_F_TSO_ECN) {
+ features |= NETIF_F_TSO_ECN;
+ arg &= ~TUN_F_TSO_ECN;
+ }
+ if (arg & TUN_F_TSO4)
+ features |= NETIF_F_TSO;
+ if (arg & TUN_F_TSO6)
+ features |= NETIF_F_TSO6;
+ arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
+ }
+ }
+
+ /* This gives the user a way to test for new features in future by
+ * trying to set them. */
+ if (arg)
+ return -EINVAL;
+
+ dev->features = features;
+ if (old_features != dev->features)
+ netdev_features_change(dev);
+
+ return 0;
+}
+
static int tun_chr_ioctl(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg)
{
@@ -640,6 +768,15 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
return 0;
}
+ if (cmd == TUNGETFEATURES) {
+ /* Currently this just means: "what IFF flags are valid?".
+ * This is needed because we never checked for invalid flags on
+ * TUNSETIFF. */
+ return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
+ IFF_VNET_HDR,
+ (unsigned int __user*)argp);
+ }
+
if (!tun)
return -EBADFD;
@@ -707,6 +844,15 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
break;
#endif
+ case TUNSETOFFLOAD:
+ {
+ int ret;
+ rtnl_lock();
+ ret = set_offload(tun->dev, arg);
+ rtnl_unlock();
+ return ret;
+ }
+
case SIOCGIFFLAGS:
ifr.ifr_flags = tun->if_flags;
if (copy_to_user( argp, &ifr, sizeof ifr))
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 18f31b6187a..563fae542da 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -31,6 +31,7 @@
#define TUN_NO_PI 0x0040
#define TUN_ONE_QUEUE 0x0080
#define TUN_PERSIST 0x0100
+#define TUN_VNET_HDR 0x0200
/* Ioctl defines */
#define TUNSETNOCSUM _IOW('T', 200, int)
@@ -40,12 +41,21 @@
#define TUNSETOWNER _IOW('T', 204, int)
#define TUNSETLINK _IOW('T', 205, int)
#define TUNSETGROUP _IOW('T', 206, int)
+#define TUNGETFEATURES _IOR('T', 207, unsigned int)
+#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
#define IFF_TAP 0x0002
#define IFF_NO_PI 0x1000
#define IFF_ONE_QUEUE 0x2000
+#define IFF_VNET_HDR 0x4000
+
+/* Features for GSO (TUNSETOFFLOAD). */
+#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
+#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */
+#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
+#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
struct tun_pi {
unsigned short flags;
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index a5c6ccc5bb1..c2bb5cae651 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -62,7 +62,7 @@ struct fib_rules_ops
/* Called after modifications to the rules set, must flush
* the route cache if one exists. */
- void (*flush_cache)(void);
+ void (*flush_cache)(struct fib_rules_ops *ops);
int nlgroup;
const struct nla_policy *policy;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 6ef90b5fafb..a6ed83853dc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -18,6 +18,7 @@ struct netns_ipv4 {
struct ctl_table_header *forw_hdr;
struct ctl_table_header *frags_hdr;
struct ctl_table_header *ipv4_hdr;
+ struct ctl_table_header *route_hdr;
#endif
struct ipv4_devconf *devconf_all;
struct ipv4_devconf *devconf_dflt;
@@ -45,5 +46,8 @@ struct netns_ipv4 {
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_errors_use_inbound_ifaddr;
+
+ struct timer_list rt_secret_timer;
+ atomic_t rt_genid;
};
#endif
diff --git a/include/net/route.h b/include/net/route.h
index fc836ff824c..3140cc50085 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -111,7 +111,7 @@ struct in_device;
extern int ip_rt_init(void);
extern void ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
__be32 src, struct net_device *dev);
-extern void rt_cache_flush(int how);
+extern void rt_cache_flush(struct net *net, int how);
extern int __ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index e3e9ab0f74e..1c2943a119f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -69,7 +69,7 @@ static void rules_ops_put(struct fib_rules_ops *ops)
static void flush_route_cache(struct fib_rules_ops *ops)
{
if (ops->flush_cache)
- ops->flush_cache();
+ ops->flush_cache(ops);
}
int fib_rules_register(struct fib_rules_ops *ops)
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 5b7539b7fe0..14fbca55e90 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -229,7 +229,7 @@ static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops)
return 0;
}
-static void dn_fib_rule_flush_cache(void)
+static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
{
dn_rt_cache_flush(-1);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 20c515a1be2..29df75a6bcc 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1197,7 +1197,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&arp_tbl, dev);
- rt_cache_flush(0);
+ rt_cache_flush(dev_net(dev), 0);
break;
default:
break;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 9de2514946c..2e667e2f90d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1348,7 +1348,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
dev_disable_lro(idev->dev);
}
rtnl_unlock();
- rt_cache_flush(0);
+ rt_cache_flush(net, 0);
}
}
@@ -1362,9 +1362,10 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
int *valp = ctl->data;
int val = *valp;
int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ struct net *net = ctl->extra2;
if (write && *valp != val)
- rt_cache_flush(0);
+ rt_cache_flush(net, 0);
return ret;
}
@@ -1375,9 +1376,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
{
int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp,
newval, newlen);
+ struct net *net = table->extra2;
if (ret == 1)
- rt_cache_flush(0);
+ rt_cache_flush(net, 0);
return ret;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 5ad01d63f83..65c1503f8cc 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -144,7 +144,7 @@ static void fib_flush(struct net *net)
}
if (flushed)
- rt_cache_flush(-1);
+ rt_cache_flush(net, -1);
}
/*
@@ -897,21 +897,22 @@ static void fib_disable_ip(struct net_device *dev, int force)
{
if (fib_sync_down_dev(dev, force))
fib_flush(dev_net(dev));
- rt_cache_flush(0);
+ rt_cache_flush(dev_net(dev), 0);
arp_ifdown(dev);
}
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
switch (event) {
case NETDEV_UP:
fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(ifa->ifa_dev->dev);
+ fib_sync_up(dev);
#endif
- rt_cache_flush(-1);
+ rt_cache_flush(dev_net(dev), -1);
break;
case NETDEV_DOWN:
fib_del_ifaddr(ifa);
@@ -919,9 +920,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
/* Last address was deleted from this interface.
Disable IP.
*/
- fib_disable_ip(ifa->ifa_dev->dev, 1);
+ fib_disable_ip(dev, 1);
} else {
- rt_cache_flush(-1);
+ rt_cache_flush(dev_net(dev), -1);
}
break;
}
@@ -949,14 +950,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
- rt_cache_flush(-1);
+ rt_cache_flush(dev_net(dev), -1);
break;
case NETDEV_DOWN:
fib_disable_ip(dev, 0);
break;
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
- rt_cache_flush(0);
+ rt_cache_flush(dev_net(dev), 0);
break;
}
return NOTIFY_DONE;
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index eeec4bf982b..c8cac6c7f88 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -472,7 +472,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
&cfg->fc_nlinfo, NLM_F_REPLACE);
return 0;
@@ -532,7 +532,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
if (new_f)
fz->fz_nent++;
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -614,7 +614,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
write_unlock_bh(&fib_hash_lock);
if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
fn_free_alias(fa, f);
if (kill_fn) {
fn_free_node(f);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 1fb56876be5..6080d712082 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -258,9 +258,9 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+ nla_total_size(4); /* flow */
}
-static void fib4_rule_flush_cache(void)
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
{
- rt_cache_flush(-1);
+ rt_cache_flush(ops->fro_net, -1);
}
static struct fib_rules_ops fib4_rules_ops_template = {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 394db9c941a..d16ae4623be 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1271,7 +1271,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
@@ -1316,7 +1316,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : fa_head));
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
succeeded:
@@ -1664,7 +1664,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
trie_leaf_remove(t, l);
if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fe3a0223728..113cd2512ba 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -132,7 +132,6 @@ static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
-static struct timer_list rt_secret_timer;
/*
* Interface to generic destination cache.
@@ -251,20 +250,25 @@ static inline void rt_hash_lock_init(void)
static struct rt_hash_bucket *rt_hash_table __read_mostly;
static unsigned rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;
-static atomic_t rt_genid __read_mostly;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
(__raw_get_cpu_var(rt_cache_stat).field++)
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
+static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
+ int genid)
{
return jhash_3words((__force u32)(__be32)(daddr),
(__force u32)(__be32)(saddr),
- idx, atomic_read(&rt_genid))
+ idx, genid)
& rt_hash_mask;
}
+static inline int rt_genid(struct net *net)
+{
+ return atomic_read(&net->ipv4.rt_genid);
+}
+
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
struct seq_net_private p;
@@ -334,7 +338,7 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
struct rt_cache_iter_state *st = seq->private;
if (*pos)
return rt_cache_get_idx(seq, *pos - 1);
- st->genid = atomic_read(&rt_genid);
+ st->genid = rt_genid(seq_file_net(seq));
return SEQ_START_TOKEN;
}
@@ -681,6 +685,11 @@ static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}
+static inline int rt_is_expired(struct rtable *rth)
+{
+ return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
+}
+
/*
* Perform a full scan of hash table and free all entries.
* Can be called by a softirq or a process.
@@ -690,6 +699,7 @@ static void rt_do_flush(int process_context)
{
unsigned int i;
struct rtable *rth, *next;
+ struct rtable * tail;
for (i = 0; i <= rt_hash_mask; i++) {
if (process_context && need_resched())
@@ -699,11 +709,39 @@ static void rt_do_flush(int process_context)
continue;
spin_lock_bh(rt_hash_lock_addr(i));
+#ifdef CONFIG_NET_NS
+ {
+ struct rtable ** prev, * p;
+
+ rth = rt_hash_table[i].chain;
+
+ /* defer releasing the head of the list after spin_unlock */
+ for (tail = rth; tail; tail = tail->u.dst.rt_next)
+ if (!rt_is_expired(tail))
+ break;
+ if (rth != tail)
+ rt_hash_table[i].chain = tail;
+
+ /* call rt_free on entries after the tail requiring flush */
+ prev = &rt_hash_table[i].chain;
+ for (p = *prev; p; p = next) {
+ next = p->u.dst.rt_next;
+ if (!rt_is_expired(p)) {
+ prev = &p->u.dst.rt_next;
+ } else {
+ *prev = next;
+ rt_free(p);
+ }
+ }
+ }
+#else
rth = rt_hash_table[i].chain;
rt_hash_table[i].chain = NULL;
+ tail = NULL;
+#endif
spin_unlock_bh(rt_hash_lock_addr(i));
- for (; rth; rth = next) {
+ for (; rth != tail; rth = next) {
next = rth->u.dst.rt_next;
rt_free(rth);
}
@@ -736,7 +774,7 @@ static void rt_check_expire(void)
continue;
spin_lock_bh(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) {
- if (rth->rt_genid != atomic_read(&rt_genid)) {
+ if (rt_is_expired(rth)) {
*rthp = rth->u.dst.rt_next;
rt_free(rth);
continue;
@@ -779,21 +817,21 @@ static void rt_worker_func(struct work_struct *work)
* many times (2^24) without giving recent rt_genid.
* Jenkins hash is strong enough that litle changes of rt_genid are OK.
*/
-static void rt_cache_invalidate(void)
+static void rt_cache_invalidate(struct net *net)
{
unsigned char shuffle;
get_random_bytes(&shuffle, sizeof(shuffle));
- atomic_add(shuffle + 1U, &rt_genid);
+ atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
/*
* delay < 0 : invalidate cache (fast : entries will be deleted later)
* delay >= 0 : invalidate & flush cache (can be long)
*/
-void rt_cache_flush(int delay)
+void rt_cache_flush(struct net *net, int delay)
{
- rt_cache_invalidate();
+ rt_cache_invalidate(net);
if (delay >= 0)
rt_do_flush(!in_softirq());
}
@@ -801,10 +839,11 @@ void rt_cache_flush(int delay)
/*
* We change rt_genid and let gc do the cleanup
*/
-static void rt_secret_rebuild(unsigned long dummy)
+static void rt_secret_rebuild(unsigned long __net)
{
- rt_cache_invalidate();
- mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
+ struct net *net = (struct net *)__net;
+ rt_cache_invalidate(net);
+ mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}
/*
@@ -880,7 +919,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
rthp = &rt_hash_table[k].chain;
spin_lock_bh(rt_hash_lock_addr(k));
while ((rth = *rthp) != NULL) {
- if (rth->rt_genid == atomic_read(&rt_genid) &&
+ if (!rt_is_expired(rth) &&
!rt_may_expire(rth, tmo, expire)) {
tmo >>= 1;
rthp = &rth->u.dst.rt_next;
@@ -962,7 +1001,7 @@ restart:
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
- if (rth->rt_genid != atomic_read(&rt_genid)) {
+ if (rt_is_expired(rth)) {
*rthp = rth->u.dst.rt_next;
rt_free(rth);
continue;
@@ -1138,7 +1177,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
spin_lock_bh(rt_hash_lock_addr(hash));
ip_rt_put(rt);
while ((aux = *rthp) != NULL) {
- if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
+ if (aux == rt || rt_is_expired(aux)) {
*rthp = aux->u.dst.rt_next;
rt_free(aux);
continue;
@@ -1180,7 +1219,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
for (i = 0; i < 2; i++) {
for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
+ unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
+ rt_genid(net));
rthp=&rt_hash_table[hash].chain;
@@ -1192,7 +1232,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rth->fl.fl4_src != skeys[i] ||
rth->fl.oif != ikeys[k] ||
rth->fl.iif != 0 ||
- rth->rt_genid != atomic_read(&rt_genid) ||
+ rt_is_expired(rth) ||
!net_eq(dev_net(rth->u.dst.dev), net)) {
rthp = &rth->u.dst.rt_next;
continue;
@@ -1231,7 +1271,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rt->u.dst.neighbour = NULL;
rt->u.dst.hh = NULL;
rt->u.dst.xfrm = NULL;
- rt->rt_genid = atomic_read(&rt_genid);
+ rt->rt_genid = rt_genid(net);
rt->rt_flags |= RTCF_REDIRECTED;
/* Gateway is different ... */
@@ -1295,7 +1335,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
rt->u.dst.expires) {
unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
- rt->fl.oif);
+ rt->fl.oif,
+ rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
NIPQUAD_FMT "/%02x dropped\n",
@@ -1444,7 +1485,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
for (k = 0; k < 2; k++) {
for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
+ unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
+ rt_genid(net));
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
@@ -1459,7 +1501,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
rth->fl.iif != 0 ||
dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
!net_eq(dev_net(rth->u.dst.dev), net) ||
- rth->rt_genid != atomic_read(&rt_genid))
+ !rt_is_expired(rth))
continue;
if (new_mtu < 68 || new_mtu >= old_mtu) {
@@ -1694,7 +1736,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->fl.oif = 0;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
- rth->rt_genid = atomic_read(&rt_genid);
+ rth->rt_genid = rt_genid(dev_net(dev));
rth->rt_flags = RTCF_MULTICAST;
rth->rt_type = RTN_MULTICAST;
if (our) {
@@ -1709,7 +1751,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
RT_CACHE_STAT_INC(in_slow_mc);
in_dev_put(in_dev);
- hash = rt_hash(daddr, saddr, dev->ifindex);
+ hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
return rt_intern_hash(hash, rth, &skb->rtable);
e_nobufs:
@@ -1835,7 +1877,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
- rth->rt_genid = atomic_read(&rt_genid);
+ rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
rt_set_nexthop(rth, res, itag);
@@ -1870,7 +1912,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
return err;
/* put it into the cache */
- hash = rt_hash(daddr, saddr, fl->iif);
+ hash = rt_hash(daddr, saddr, fl->iif,
+ rt_genid(dev_net(rth->u.dst.dev)));
return rt_intern_hash(hash, rth, &skb->rtable);
}
@@ -1996,7 +2039,7 @@ local_input:
goto e_nobufs;
rth->u.dst.output= ip_rt_bug;
- rth->rt_genid = atomic_read(&rt_genid);
+ rth->rt_genid = rt_genid(net);
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
@@ -2026,7 +2069,7 @@ local_input:
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
- hash = rt_hash(daddr, saddr, fl.iif);
+ hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, &skb->rtable);
goto done;
@@ -2077,7 +2120,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
net = dev_net(dev);
tos &= IPTOS_RT_MASK;
- hash = rt_hash(daddr, saddr, iif);
+ hash = rt_hash(daddr, saddr, iif, rt_genid(net));
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
@@ -2089,7 +2132,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
(rth->fl.fl4_tos ^ tos)) == 0 &&
rth->fl.mark == skb->mark &&
net_eq(dev_net(rth->u.dst.dev), net) &&
- rth->rt_genid == atomic_read(&rt_genid)) {
+ !rt_is_expired(rth)) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
@@ -2217,7 +2260,7 @@ static int __mkroute_output(struct rtable **result,
rth->rt_spec_dst= fl->fl4_src;
rth->u.dst.output=ip_output;
- rth->rt_genid = atomic_read(&rt_genid);
+ rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2266,7 +2309,8 @@ static int ip_mkroute_output(struct rtable **rp,
int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
unsigned hash;
if (err == 0) {
- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
+ hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
+ rt_genid(dev_net(dev_out)));
err = rt_intern_hash(hash, rth, rp);
}
@@ -2478,7 +2522,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
unsigned hash;
struct rtable *rth;
- hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
+ hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
rcu_read_lock_bh();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
@@ -2491,7 +2535,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
!((rth->fl.fl4_tos ^ flp->fl4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->u.dst.dev), net) &&
- rth->rt_genid == atomic_read(&rt_genid)) {
+ !rt_is_expired(rth)) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
@@ -2522,7 +2566,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
};
-static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
+static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
struct rtable *ort = *rp;
struct rtable *rt = (struct rtable *)
@@ -2546,7 +2590,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
rt->idev = ort->idev;
if (rt->idev)
in_dev_hold(rt->idev);
- rt->rt_genid = atomic_read(&rt_genid);
+ rt->rt_genid = rt_genid(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
rt->rt_dst = ort->rt_dst;
@@ -2582,7 +2626,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
flags ? XFRM_LOOKUP_WAIT : 0);
if (err == -EREMOTE)
- err = ipv4_dst_blackhole(rp, flp);
+ err = ipv4_dst_blackhole(net, rp, flp);
return err;
}
@@ -2801,7 +2845,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
continue;
- if (rt->rt_genid != atomic_read(&rt_genid))
+ if (rt_is_expired(rt))
continue;
skb->dst = dst_clone(&rt->u.dst);
if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
@@ -2825,19 +2869,27 @@ done:
void ip_rt_multicast_event(struct in_device *in_dev)
{
- rt_cache_flush(0);
+ rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
-static int flush_delay;
-
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
if (write) {
+ int flush_delay;
+ struct net *net;
+ static DEFINE_MUTEX(flush_mutex);
+
+ mutex_lock(&flush_mutex);
+ ctl->data = &flush_delay;
proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
- rt_cache_flush(flush_delay);
+ ctl->data = NULL;
+ mutex_unlock(&flush_mutex);
+
+ net = (struct net *)ctl->extra1;
+ rt_cache_flush(net, flush_delay);
return 0;
}
@@ -2853,25 +2905,18 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
size_t newlen)
{
int delay;
+ struct net *net;
if (newlen != sizeof(int))
return -EINVAL;
if (get_user(delay, (int __user *)newval))
return -EFAULT;
- rt_cache_flush(delay);
+ net = (struct net *)table->extra1;
+ rt_cache_flush(net, delay);
return 0;
}
ctl_table ipv4_route_table[] = {
{
- .ctl_name = NET_IPV4_ROUTE_FLUSH,
- .procname = "flush",
- .data = &flush_delay,
- .maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = &ipv4_sysctl_rtcache_flush,
- .strategy = &ipv4_sysctl_rtcache_flush_strategy,
- },
- {
.ctl_name = NET_IPV4_ROUTE_GC_THRESH,
.procname = "gc_thresh",
.data = &ipv4_dst_ops.gc_thresh,
@@ -3009,8 +3054,97 @@ ctl_table ipv4_route_table[] = {
},
{ .ctl_name = 0 }
};
+
+static __net_initdata struct ctl_path ipv4_route_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
+ { },
+};
+
+
+static struct ctl_table ipv4_route_flush_table[] = {
+ {
+ .ctl_name = NET_IPV4_ROUTE_FLUSH,
+ .procname = "flush",
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = &ipv4_sysctl_rtcache_flush,
+ .strategy = &ipv4_sysctl_rtcache_flush_strategy,
+ },
+ { .ctl_name = 0 },
+};
+
+static __net_init int sysctl_route_net_init(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = ipv4_route_flush_table;
+ if (net != &init_net) {
+ tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_dup;
+ }
+ tbl[0].extra1 = net;
+
+ net->ipv4.route_hdr =
+ register_net_sysctl_table(net, ipv4_route_path, tbl);
+ if (net->ipv4.route_hdr == NULL)
+ goto err_reg;
+ return 0;
+
+err_reg:
+ if (tbl != ipv4_route_flush_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
+
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.route_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.route_hdr);
+ BUG_ON(tbl == ipv4_route_flush_table);
+ kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+ .init = sysctl_route_net_init,
+ .exit = sysctl_route_net_exit,
+};
#endif
+
+static __net_init int rt_secret_timer_init(struct net *net)
+{
+ atomic_set(&net->ipv4.rt_genid,
+ (int) ((num_physpages ^ (num_physpages>>8)) ^
+ (jiffies ^ (jiffies >> 7))));
+
+ net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
+ net->ipv4.rt_secret_timer.data = (unsigned long)net;
+ init_timer_deferrable(&net->ipv4.rt_secret_timer);
+
+ net->ipv4.rt_secret_timer.expires =
+ jiffies + net_random() % ip_rt_secret_interval +
+ ip_rt_secret_interval;
+ add_timer(&net->ipv4.rt_secret_timer);
+ return 0;
+}
+
+static __net_exit void rt_secret_timer_exit(struct net *net)
+{
+ del_timer_sync(&net->ipv4.rt_secret_timer);
+}
+
+static __net_initdata struct pernet_operations rt_secret_timer_ops = {
+ .init = rt_secret_timer_init,
+ .exit = rt_secret_timer_exit,
+};
+
+
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
@@ -3029,9 +3163,6 @@ int __init ip_rt_init(void)
{
int rc = 0;
- atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
- (jiffies ^ (jiffies >> 7))));
-
#ifdef CONFIG_NET_CLS_ROUTE
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
if (!ip_rt_acct)
@@ -3063,19 +3194,14 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
- rt_secret_timer.function = rt_secret_rebuild;
- rt_secret_timer.data = 0;
- init_timer_deferrable(&rt_secret_timer);
-
/* All the timers, started at system startup tend
to synchronize. Perturb it a bit.
*/
schedule_delayed_work(&expires_work,
net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
- rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
- ip_rt_secret_interval;
- add_timer(&rt_secret_timer);
+ if (register_pernet_subsys(&rt_secret_timer_ops))
+ printk(KERN_ERR "Unable to setup rt_secret_timer\n");
if (ip_rt_proc_init())
printk(KERN_ERR "Unable to create route proc files\n");
@@ -3085,6 +3211,9 @@ int __init ip_rt_init(void)
#endif
rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
+#ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&sysctl_route_ops);
+#endif
return rc;
}