diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 4 | ||||
-rw-r--r-- | net/core/dev.c | 871 | ||||
-rw-r--r-- | net/core/dev_mcast.c | 46 | ||||
-rw-r--r-- | net/core/dst.c | 202 | ||||
-rw-r--r-- | net/core/ethtool.c | 351 | ||||
-rw-r--r-- | net/core/fib_rules.c | 30 | ||||
-rw-r--r-- | net/core/neighbour.c | 97 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 42 | ||||
-rw-r--r-- | net/core/net_namespace.c | 317 | ||||
-rw-r--r-- | net/core/netpoll.c | 87 | ||||
-rw-r--r-- | net/core/pktgen.c | 155 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 179 | ||||
-rw-r--r-- | net/core/scm.c | 9 | ||||
-rw-r--r-- | net/core/skbuff.c | 18 | ||||
-rw-r--r-- | net/core/sock.c | 18 |
15 files changed, 1630 insertions, 796 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 4751613e1b5..b1332f6d004 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,7 +3,7 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ - gen_stats.o gen_estimator.o + gen_stats.o gen_estimator.o net_namespace.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o @@ -11,7 +11,7 @@ obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o obj-$(CONFIG_XFRM) += flow.o -obj-$(CONFIG_SYSFS) += net-sysfs.o +obj-y += net-sysfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NET_DMA) += user_dma.o diff --git a/net/core/dev.c b/net/core/dev.c index a76021c7120..1e169a541ce 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -92,6 +92,7 @@ #include <linux/etherdevice.h> #include <linux/notifier.h> #include <linux/skbuff.h> +#include <net/net_namespace.h> #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> @@ -189,25 +190,50 @@ static struct net_dma net_dma = { * unregister_netdevice(), which must be called with the rtnl * semaphore held. */ -LIST_HEAD(dev_base_head); DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_head); EXPORT_SYMBOL(dev_base_lock); #define NETDEV_HASHBITS 8 -static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; -static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; +#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) -static inline struct hlist_head *dev_name_hash(const char *name) +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; + return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; } -static inline struct hlist_head *dev_index_hash(int ifindex) +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { - return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; + return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; +} + +/* Device list insertion */ +static int list_netdevice(struct net_device *dev) +{ + struct net *net = dev->nd_net; + + ASSERT_RTNL(); + + write_lock_bh(&dev_base_lock); + list_add_tail(&dev->dev_list, &net->dev_base_head); + hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); + write_unlock_bh(&dev_base_lock); + return 0; +} + +/* Device list removal */ +static void unlist_netdevice(struct net_device *dev) +{ + ASSERT_RTNL(); + + /* Unlink dev from the device chain */ + write_lock_bh(&dev_base_lock); + list_del(&dev->dev_list); + hlist_del(&dev->name_hlist); + hlist_del(&dev->index_hlist); + write_unlock_bh(&dev_base_lock); } /* @@ -220,17 +246,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain); * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL }; -#ifdef CONFIG_SYSFS -extern int netdev_sysfs_init(void); -extern int netdev_register_sysfs(struct net_device *); -extern void netdev_unregister_sysfs(struct net_device *); -#else -#define netdev_sysfs_init() (0) -#define netdev_register_sysfs(dev) (0) -#define netdev_unregister_sysfs(dev) do { } while(0) -#endif +DEFINE_PER_CPU(struct softnet_data, softnet_data); + +extern int netdev_kobject_init(void); +extern int netdev_register_kobject(struct net_device *); +extern void netdev_unregister_kobject(struct net_device *); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* @@ -490,7 +511,7 @@ unsigned long netdev_boot_base(const char *prefix, int unit) * If device already registered then return base of 1 * to indicate not to probe for this interface */ - if (__dev_get_by_name(name)) + if (__dev_get_by_name(&init_net, name)) return 1; for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) @@ -545,11 +566,11 @@ __setup("netdev=", netdev_boot_setup); * careful with locks. */ -struct net_device *__dev_get_by_name(const char *name) +struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct hlist_node *p; - hlist_for_each(p, dev_name_hash(name)) { + hlist_for_each(p, dev_name_hash(net, name)) { struct net_device *dev = hlist_entry(p, struct net_device, name_hlist); if (!strncmp(dev->name, name, IFNAMSIZ)) @@ -569,12 +590,12 @@ struct net_device *__dev_get_by_name(const char *name) * matching device is found. */ -struct net_device *dev_get_by_name(const char *name) +struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; read_lock(&dev_base_lock); - dev = __dev_get_by_name(name); + dev = __dev_get_by_name(net, name); if (dev) dev_hold(dev); read_unlock(&dev_base_lock); @@ -592,11 +613,11 @@ struct net_device *dev_get_by_name(const char *name) * or @dev_base_lock. */ -struct net_device *__dev_get_by_index(int ifindex) +struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct hlist_node *p; - hlist_for_each(p, dev_index_hash(ifindex)) { + hlist_for_each(p, dev_index_hash(net, ifindex)) { struct net_device *dev = hlist_entry(p, struct net_device, index_hlist); if (dev->ifindex == ifindex) @@ -616,12 +637,12 @@ struct net_device *__dev_get_by_index(int ifindex) * dev_put to indicate they have finished with it. */ -struct net_device *dev_get_by_index(int ifindex) +struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev) dev_hold(dev); read_unlock(&dev_base_lock); @@ -642,13 +663,13 @@ struct net_device *dev_get_by_index(int ifindex) * If the API was consistent this would be __dev_get_by_hwaddr */ -struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) { struct net_device *dev; ASSERT_RTNL(); - for_each_netdev(dev) + for_each_netdev(&init_net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; @@ -658,12 +679,12 @@ struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) EXPORT_SYMBOL(dev_getbyhwaddr); -struct net_device *__dev_getfirstbyhwtype(unsigned short type) +struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev; ASSERT_RTNL(); - for_each_netdev(dev) + for_each_netdev(net, dev) if (dev->type == type) return dev; @@ -672,12 +693,12 @@ struct net_device *__dev_getfirstbyhwtype(unsigned short type) EXPORT_SYMBOL(__dev_getfirstbyhwtype); -struct net_device *dev_getfirstbyhwtype(unsigned short type) +struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev; rtnl_lock(); - dev = __dev_getfirstbyhwtype(type); + dev = __dev_getfirstbyhwtype(net, type); if (dev) dev_hold(dev); rtnl_unlock(); @@ -697,13 +718,13 @@ EXPORT_SYMBOL(dev_getfirstbyhwtype); * dev_put to indicate they have finished with it. */ -struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) +struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ret = NULL; read_lock(&dev_base_lock); - for_each_netdev(dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { dev_hold(dev); ret = dev; @@ -740,9 +761,10 @@ int dev_valid_name(const char *name) } /** - * dev_alloc_name - allocate a name for a device - * @dev: device + * __dev_alloc_name - allocate a name for a device + * @net: network namespace to allocate the device name in * @name: name format string + * @buf: scratch buffer and result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses @@ -753,13 +775,12 @@ int dev_valid_name(const char *name) * Returns the number of the unit assigned or a negative errno code. */ -int dev_alloc_name(struct net_device *dev, const char *name) +static int __dev_alloc_name(struct net *net, const char *name, char *buf) { int i = 0; - char buf[IFNAMSIZ]; const char *p; const int max_netdevices = 8*PAGE_SIZE; - long *inuse; + unsigned long *inuse; struct net_device *d; p = strnchr(name, IFNAMSIZ-1, '%'); @@ -773,18 +794,18 @@ int dev_alloc_name(struct net_device *dev, const char *name) return -EINVAL; /* Use one page as a bit array of possible slots */ - inuse = (long *) get_zeroed_page(GFP_ATOMIC); + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); if (!inuse) return -ENOMEM; - for_each_netdev(d) { + for_each_netdev(net, d) { if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ - snprintf(buf, sizeof(buf), name, i); + snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) set_bit(i, inuse); } @@ -793,11 +814,9 @@ int dev_alloc_name(struct net_device *dev, const char *name) free_page((unsigned long) inuse); } - snprintf(buf, sizeof(buf), name, i); - if (!__dev_get_by_name(buf)) { - strlcpy(dev->name, buf, IFNAMSIZ); + snprintf(buf, IFNAMSIZ, name, i); + if (!__dev_get_by_name(net, buf)) return i; - } /* It is possible to run out of possible slots * when the name is long and there isn't enough space left @@ -806,6 +825,34 @@ int dev_alloc_name(struct net_device *dev, const char *name) return -ENFILE; } +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + char buf[IFNAMSIZ]; + struct net *net; + int ret; + + BUG_ON(!dev->nd_net); + net = dev->nd_net; + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} + /** * dev_change_name - change name of a device @@ -820,9 +867,12 @@ int dev_change_name(struct net_device *dev, char *newname) char oldname[IFNAMSIZ]; int err = 0; int ret; + struct net *net; ASSERT_RTNL(); + BUG_ON(!dev->nd_net); + net = dev->nd_net; if (dev->flags & IFF_UP) return -EBUSY; @@ -837,7 +887,7 @@ int dev_change_name(struct net_device *dev, char *newname) return err; strcpy(newname, dev->name); } - else if (__dev_get_by_name(newname)) + else if (__dev_get_by_name(net, newname)) return -EEXIST; else strlcpy(dev->name, newname, IFNAMSIZ); @@ -847,10 +897,10 @@ rollback: write_lock_bh(&dev_base_lock); hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); + hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); write_unlock_bh(&dev_base_lock); - ret = raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { @@ -876,7 +926,7 @@ rollback: */ void netdev_features_change(struct net_device *dev) { - raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev); + call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); @@ -891,8 +941,7 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - raw_notifier_call_chain(&netdev_chain, - NETDEV_CHANGE, dev); + call_netdevice_notifiers(NETDEV_CHANGE, dev); rtmsg_ifinfo(RTM_NEWLINK, dev, 0); } } @@ -906,26 +955,18 @@ void netdev_state_change(struct net_device *dev) * available in this kernel then it becomes a nop. */ -void dev_load(const char *name) +void dev_load(struct net *net, const char *name) { struct net_device *dev; read_lock(&dev_base_lock); - dev = __dev_get_by_name(name); + dev = __dev_get_by_name(net, name); read_unlock(&dev_base_lock); if (!dev && capable(CAP_SYS_MODULE)) request_module("%s", name); } -static int default_rebuild_header(struct sk_buff *skb) -{ - printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", - skb->dev ? skb->dev->name : "NULL!!!"); - kfree_skb(skb); - return 1; -} - /** * dev_open - prepare an interface for use. * @dev: device to open @@ -988,7 +1029,7 @@ int dev_open(struct net_device *dev) /* * ... and announce new interface. */ - raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + call_netdevice_notifiers(NETDEV_UP, dev); } return ret; } @@ -1004,6 +1045,8 @@ int dev_open(struct net_device *dev) */ int dev_close(struct net_device *dev) { + might_sleep(); + if (!(dev->flags & IFF_UP)) return 0; @@ -1011,23 +1054,19 @@ int dev_close(struct net_device *dev) * Tell people we are going down, so that they can * prepare to death, when device is still operating. */ - raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); dev_deactivate(dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, - * it can be even on different cpu. So just clear netif_running(), - * and wait when poll really will happen. Actually, the best place - * for this is inside dev->stop() after device stopped its irq - * engine, but this requires more changes in devices. */ - + * it can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ smp_mb__after_clear_bit(); /* Commit netif_running(). */ - while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { - /* No hurry. */ - msleep(1); - } /* * Call the device specific close. This cannot fail. @@ -1048,12 +1087,14 @@ int dev_close(struct net_device *dev) /* * Tell people we are down */ - raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + call_netdevice_notifiers(NETDEV_DOWN, dev); return 0; } +static int dev_boot_phase = 1; + /* * Device change register/unregister. These are not inline or static * as we export them to the world. @@ -1077,23 +1118,27 @@ int register_netdevice_notifier(struct notifier_block *nb) { struct net_device *dev; struct net_device *last; + struct net *net; int err; rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; + if (dev_boot_phase) + goto unlock; + for_each_net(net) { + for_each_netdev(net, dev) { + err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + goto rollback; + + if (!(dev->flags & IFF_UP)) + continue; - for_each_netdev(dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - nb->notifier_call(nb, NETDEV_UP, dev); + nb->notifier_call(nb, NETDEV_UP, dev); + } } unlock: @@ -1102,15 +1147,17 @@ unlock: rollback: last = dev; - for_each_netdev(dev) { - if (dev == last) - break; + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev == last) + break; - if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + if (dev->flags & IFF_UP) { + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); + nb->notifier_call(nb, NETDEV_DOWN, dev); + } + nb->notifier_call(nb, NETDEV_UNREGISTER, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); } goto unlock; } @@ -1144,9 +1191,9 @@ int unregister_netdevice_notifier(struct notifier_block *nb) * are as for raw_notifier_call_chain(). */ -int call_netdevice_notifiers(unsigned long val, void *v) +int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - return raw_notifier_call_chain(&netdev_chain, val, v); + return raw_notifier_call_chain(&netdev_chain, val, dev); } /* When > 0 there are consumers of rx skb time stamps */ @@ -1233,21 +1280,21 @@ void __netif_schedule(struct net_device *dev) } EXPORT_SYMBOL(__netif_schedule); -void __netif_rx_schedule(struct net_device *dev) +void dev_kfree_skb_irq(struct sk_buff *skb) { - unsigned long flags; + if (atomic_dec_and_test(&skb->users)) { + struct softnet_data *sd; + unsigned long flags; - local_irq_save(flags); - dev_hold(dev); - list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list); - if (dev->quota < 0) - dev->quota += dev->weight; - else - dev->quota = dev->weight; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - local_irq_restore(flags); + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + skb->next = sd->completion_queue; + sd->completion_queue = skb; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); + } } -EXPORT_SYMBOL(__netif_rx_schedule); +EXPORT_SYMBOL(dev_kfree_skb_irq); void dev_kfree_skb_any(struct sk_buff *skb) { @@ -1259,7 +1306,12 @@ void dev_kfree_skb_any(struct sk_buff *skb) EXPORT_SYMBOL(dev_kfree_skb_any); -/* Hot-plugging. */ +/** + * netif_device_detach - mark device as removed + * @dev: network device + * + * Mark device as removed from system and therefore no longer available. + */ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && @@ -1269,6 +1321,12 @@ void netif_device_detach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_detach); +/** + * netif_device_attach - mark device as attached + * @dev: network device + * + * Mark device as attached from system and restart if needed. + */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && @@ -1501,18 +1559,6 @@ out_kfree_skb: return 0; } -#define HARD_TX_LOCK(dev, cpu) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ - netif_tx_lock(dev); \ - } \ -} - -#define HARD_TX_UNLOCK(dev) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ - netif_tx_unlock(dev); \ - } \ -} - /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -1730,7 +1776,7 @@ enqueue: return NET_RX_SUCCESS; } - netif_rx_schedule(&queue->backlog_dev); + napi_schedule(&queue->backlog); goto enqueue; } @@ -1771,6 +1817,7 @@ static inline struct net_device *skb_bond(struct sk_buff *skb) return dev; } + static void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = &__get_cpu_var(softnet_data); @@ -1927,7 +1974,7 @@ int netif_receive_skb(struct sk_buff *skb) __be16 type; /* if we've gotten here through NAPI, check netpoll */ - if (skb->dev->poll && netpoll_rx(skb)) + if (netpoll_receive_skb(skb)) return NET_RX_DROP; if (!skb->tstamp.tv64) @@ -2017,22 +2064,25 @@ out: return ret; } -static int process_backlog(struct net_device *backlog_dev, int *budget) +static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; - int quota = min(backlog_dev->quota, *budget); struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; - backlog_dev->weight = weight_p; - for (;;) { + napi->weight = weight_p; + do { struct sk_buff *skb; struct net_device *dev; local_irq_disable(); skb = __skb_dequeue(&queue->input_pkt_queue); - if (!skb) - goto job_done; + if (!skb) { + __napi_complete(napi); + local_irq_enable(); + break; + } + local_irq_enable(); dev = skb->dev; @@ -2040,67 +2090,86 @@ static int process_backlog(struct net_device *backlog_dev, int *budget) netif_receive_skb(skb); dev_put(dev); + } while (++work < quota && jiffies == start_time); - work++; - - if (work >= quota || jiffies - start_time > 1) - break; - - } - - backlog_dev->quota -= work; - *budget -= work; - return -1; - -job_done: - backlog_dev->quota -= work; - *budget -= work; + return work; +} - list_del(&backlog_dev->poll_list); - smp_mb__before_clear_bit(); - netif_poll_enable(backlog_dev); +/** + * __napi_schedule - schedule for receive + * @napi: entry to schedule + * + * The entry's receive function will be scheduled to run + */ +void fastcall __napi_schedule(struct napi_struct *n) +{ + unsigned long flags; - local_irq_enable(); - return 0; + local_irq_save(flags); + list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + local_irq_restore(flags); } +EXPORT_SYMBOL(__napi_schedule); + static void net_rx_action(struct softirq_action *h) { - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct list_head *list = &__get_cpu_var(softnet_data).poll_list; unsigned long start_time = jiffies; int budget = netdev_budget; void *have; local_irq_disable(); - while (!list_empty(&queue->poll_list)) { - struct net_device *dev; + while (!list_empty(list)) { + struct napi_struct *n; + int work, weight; - if (budget <= 0 || jiffies - start_time > 1) + /* If softirq window is exhuasted then punt. + * + * Note that this is a slight policy change from the + * previous NAPI code, which would allow up to 2 + * jiffies to pass before breaking out. The test + * used to be "jiffies - start_time > 1". + */ + if (unlikely(budget <= 0 || jiffies != start_time)) goto softnet_break; local_irq_enable(); - dev = list_entry(queue->poll_list.next, - struct net_device, poll_list); - have = netpoll_poll_lock(dev); + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. + */ + n = list_entry(list->next, struct napi_struct, poll_list); - if (dev->quota <= 0 || dev->poll(dev, &budget)) { - netpoll_poll_unlock(have); - local_irq_disable(); - list_move_tail(&dev->poll_list, &queue->poll_list); - if (dev->quota < 0) - dev->quota += dev->weight; - else - dev->quota = dev->weight; - } else { - netpoll_poll_unlock(have); - dev_put(dev); - local_irq_disable(); - } + have = netpoll_poll_lock(n); + + weight = n->weight; + + work = n->poll(n, weight); + + WARN_ON_ONCE(work > weight); + + budget -= work; + + local_irq_disable(); + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(work == weight)) + list_move_tail(&n->poll_list, list); + + netpoll_poll_unlock(have); } out: local_irq_enable(); + #ifdef CONFIG_NET_DMA /* * There may not be any more sk_buffs coming right now, so push @@ -2115,6 +2184,7 @@ out: } } #endif + return; softnet_break: @@ -2154,7 +2224,7 @@ int register_gifconf(unsigned int family, gifconf_func_t * gifconf) * match. --pb */ -static int dev_ifname(struct ifreq __user *arg) +static int dev_ifname(struct net *net, struct ifreq __user *arg) { struct net_device *dev; struct ifreq ifr; @@ -2167,7 +2237,7 @@ static int dev_ifname(struct ifreq __user *arg) return -EFAULT; read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifr.ifr_ifindex); + dev = __dev_get_by_index(net, ifr.ifr_ifindex); if (!dev) { read_unlock(&dev_base_lock); return -ENODEV; @@ -2187,7 +2257,7 @@ static int dev_ifname(struct ifreq __user *arg) * Thus we will need a 'compatibility mode'. */ -static int dev_ifconf(char __user *arg) +static int dev_ifconf(struct net *net, char __user *arg) { struct ifconf ifc; struct net_device *dev; @@ -2211,7 +2281,7 @@ static int dev_ifconf(char __user *arg) */ total = 0; - for_each_netdev(dev) { + for_each_netdev(net, dev) { for (i = 0; i < NPROTO; i++) { if (gifconf_list[i]) { int done; @@ -2245,6 +2315,7 @@ static int dev_ifconf(char __user *arg) */ void *dev_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq->private; loff_t off; struct net_device *dev; @@ -2253,7 +2324,7 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) return SEQ_START_TOKEN; off = 1; - for_each_netdev(dev) + for_each_netdev(net, dev) if (off++ == *pos) return dev; @@ -2262,9 +2333,10 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq->private; ++*pos; return v == SEQ_START_TOKEN ? - first_net_device() : next_net_device((struct net_device *)v); + first_net_device(net) : next_net_device((struct net_device *)v); } void dev_seq_stop(struct seq_file *seq, void *v) @@ -2360,7 +2432,26 @@ static const struct seq_operations dev_seq_ops = { static int dev_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &dev_seq_ops); + struct seq_file *seq; + int res; + res = seq_open(file, &dev_seq_ops); + if (!res) { + seq = file->private_data; + seq->private = get_proc_net(inode); + if (!seq->private) { + seq_release(inode, file); + res = -ENXIO; + } + } + return res; +} + +static int dev_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct net *net = seq->private; + put_net(net); + return seq_release(inode, file); } static const struct file_operations dev_seq_fops = { @@ -2368,7 +2459,7 @@ static const struct file_operations dev_seq_fops = { .open = dev_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = dev_seq_release, }; static const struct seq_operations softnet_seq_ops = { @@ -2520,30 +2611,49 @@ static const struct file_operations ptype_seq_fops = { }; -static int __init dev_proc_init(void) +static int __net_init dev_proc_net_init(struct net *net) { int rc = -ENOMEM; - if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) + if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) goto out; - if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) + if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) goto out_dev; - if (!proc_net_fops_create("ptype", S_IRUGO, &ptype_seq_fops)) - goto out_dev2; - - if (wext_proc_init()) + if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) goto out_softnet; + + if (wext_proc_init(net)) + goto out_ptype; rc = 0; out: return rc; +out_ptype: + proc_net_remove(net, "ptype"); out_softnet: - proc_net_remove("ptype"); -out_dev2: - proc_net_remove("softnet_stat"); + proc_net_remove(net, "softnet_stat"); out_dev: - proc_net_remove("dev"); + proc_net_remove(net, "dev"); goto out; } + +static void __net_exit dev_proc_net_exit(struct net *net) +{ + wext_proc_exit(net); + + proc_net_remove(net, "ptype"); + proc_net_remove(net, "softnet_stat"); + proc_net_remove(net, "dev"); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { + .init = dev_proc_net_init, + .exit = dev_proc_net_exit, +}; + +static int __init dev_proc_init(void) +{ + return register_pernet_subsys(&dev_proc_ops); +} #else #define dev_proc_init() 0 #endif /* CONFIG_PROC_FS */ @@ -2906,8 +3016,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags) if (dev->flags & IFF_UP && ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - raw_notifier_call_chain(&netdev_chain, - NETDEV_CHANGE, dev); + call_netdevice_notifiers(NETDEV_CHANGE, dev); if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? +1 : -1; @@ -2953,8 +3062,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) else dev->mtu = new_mtu; if (!err && dev->flags & IFF_UP) - raw_notifier_call_chain(&netdev_chain, - NETDEV_CHANGEMTU, dev); + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); return err; } @@ -2970,18 +3078,17 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) return -ENODEV; err = dev->set_mac_address(dev, sa); if (!err) - raw_notifier_call_chain(&netdev_chain, - NETDEV_CHANGEADDR, dev); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } /* - * Perform the SIOCxIFxxx calls. + * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) */ -static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; - struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) return -ENODEV; @@ -2991,25 +3098,15 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ifr->ifr_flags = dev_get_flags(dev); return 0; - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); - case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ ifr->ifr_metric = 0; return 0; - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ - return -EOPNOTSUPP; - case SIOCGIFMTU: /* Get the MTU of a device */ ifr->ifr_mtu = dev->mtu; return 0; - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); - case SIOCGIFHWADDR: if (!dev->addr_len) memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); @@ -3019,17 +3116,9 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ifr->ifr_hwaddr.sa_family = dev->type; return 0; - case SIOCSIFHWADDR: - return dev_set_mac_address(dev, &ifr->ifr_hwaddr); - - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - raw_notifier_call_chain(&netdev_chain, - NETDEV_CHANGEADDR, dev); - return 0; + case SIOCGIFSLAVE: + err = -EINVAL; + break; case SIOCGIFMAP: ifr->ifr_map.mem_start = dev->mem_start; @@ -3040,6 +3129,59 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ifr->ifr_map.port = dev->if_port; return 0; + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -EINVAL; + break; + + } + return err; +} + +/* + * Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; + case SIOCSIFMAP: if (dev->set_config) { if (!netif_device_present(dev)) @@ -3066,14 +3208,6 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; - - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; - case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; @@ -3134,7 +3268,7 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) * positive or a negative errno code on error. */ -int dev_ioctl(unsigned int cmd, void __user *arg) +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct ifreq ifr; int ret; @@ -3147,12 +3281,12 @@ int dev_ioctl(unsigned int cmd, void __user *arg) if (cmd == SIOCGIFCONF) { rtnl_lock(); - ret = dev_ifconf((char __user *) arg); + ret = dev_ifconf(net, (char __user *) arg); rtnl_unlock(); return ret; } if (cmd == SIOCGIFNAME) - return dev_ifname((struct ifreq __user *)arg); + return dev_ifname(net, (struct ifreq __user *)arg); if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; @@ -3182,9 +3316,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg) case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: - dev_load(ifr.ifr_name); + dev_load(net, ifr.ifr_name); read_lock(&dev_base_lock); - ret = dev_ifsioc(&ifr, cmd); + ret = dev_ifsioc_locked(net, &ifr, cmd); read_unlock(&dev_base_lock); if (!ret) { if (colon) @@ -3196,9 +3330,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg) return ret; case SIOCETHTOOL: - dev_load(ifr.ifr_name); + dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ethtool(&ifr); + ret = dev_ethtool(net, &ifr); rtnl_unlock(); if (!ret) { if (colon) @@ -3220,9 +3354,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg) case SIOCSIFNAME: if (!capable(CAP_NET_ADMIN)) return -EPERM; - dev_load(ifr.ifr_name); + dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); + ret = dev_ifsioc(net, &ifr, cmd); rtnl_unlock(); if (!ret) { if (colon) @@ -3261,9 +3395,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg) /* fall through */ case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: - dev_load(ifr.ifr_name); + dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); + ret = dev_ifsioc(net, &ifr, cmd); rtnl_unlock(); return ret; @@ -3283,9 +3417,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg) if (cmd == SIOCWANDEV || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(ifr.ifr_name); + dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); + ret = dev_ifsioc(net, &ifr, cmd); rtnl_unlock(); if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) @@ -3294,7 +3428,7 @@ int dev_ioctl(unsigned int cmd, void __user *arg) } /* Take care of Wireless Extensions */ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(&ifr, cmd, arg); + return wext_handle_ioctl(net, &ifr, cmd, arg); return -EINVAL; } } @@ -3307,19 +3441,17 @@ int dev_ioctl(unsigned int cmd, void __user *arg) * number. The caller must hold the rtnl semaphore or the * dev_base_lock to be sure it remains unique. */ -static int dev_new_index(void) +static int dev_new_index(struct net *net) { static int ifindex; for (;;) { if (++ifindex <= 0) ifindex = 1; - if (!__dev_get_by_index(ifindex)) + if (!__dev_get_by_index(net, ifindex)) return ifindex; } } -static int dev_boot_phase = 1; - /* Delayed registration/unregisteration */ static DEFINE_SPINLOCK(net_todo_list_lock); static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); @@ -3353,6 +3485,7 @@ int register_netdevice(struct net_device *dev) struct hlist_head *head; struct hlist_node *p; int ret; + struct net *net; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -3361,6 +3494,8 @@ int register_netdevice(struct net_device *dev) /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + BUG_ON(!dev->nd_net); + net = dev->nd_net; spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); @@ -3385,12 +3520,12 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - dev->ifindex = dev_new_index(); + dev->ifindex = dev_new_index(net); if (dev->iflink == -1) dev->iflink = dev->ifindex; /* Check for existence of name */ - head = dev_name_hash(dev->name); + head = dev_name_hash(net, dev->name); hlist_for_each(p, head) { struct net_device *d = hlist_entry(p, struct net_device, name_hlist); @@ -3446,15 +3581,7 @@ int register_netdevice(struct net_device *dev) } } - /* - * nil rebuild_header routine, - * that should be never called and used as just bug trap. - */ - - if (!dev->rebuild_header) - dev->rebuild_header = default_rebuild_header; - - ret = netdev_register_sysfs(dev); + ret = netdev_register_kobject(dev); if (ret) goto err_uninit; dev->reg_state = NETREG_REGISTERED; @@ -3467,15 +3594,11 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); dev_init_scheduler(dev); - write_lock_bh(&dev_base_lock); - list_add_tail(&dev->dev_list, &dev_base_head); - hlist_add_head(&dev->name_hlist, head); - hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); dev_hold(dev); - write_unlock_bh(&dev_base_lock); + list_netdevice(dev); /* Notify protocols, that a new device appeared. */ - ret = raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) unregister_netdevice(dev); @@ -3546,8 +3669,7 @@ static void netdev_wait_allrefs(struct net_device *dev) rtnl_lock(); /* Rebroadcast unregister notification */ - raw_notifier_call_chain(&netdev_chain, - NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -3692,6 +3814,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); dev->padded = (char *)dev - (char *)p; + dev->nd_net = &init_net; if (sizeof_priv) { dev->priv = ((char *)dev + @@ -3704,6 +3827,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->egress_subqueue_count = queue_count; dev->get_stats = internal_stats; + netpoll_netdev_init(dev); setup(dev); strcpy(dev->name, name); return dev; @@ -3720,7 +3844,6 @@ EXPORT_SYMBOL(alloc_netdev_mq); */ void free_netdev(struct net_device *dev) { -#ifdef CONFIG_SYSFS /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); @@ -3732,9 +3855,6 @@ void free_netdev(struct net_device *dev) /* will free via device release */ put_device(&dev->dev); -#else - kfree((char *)dev - dev->padded); -#endif } /* Synchronize with packet receive processing. */ @@ -3773,15 +3893,10 @@ void unregister_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_REGISTERED); /* If device is running, close it first. */ - if (dev->flags & IFF_UP) - dev_close(dev); + dev_close(dev); /* And unlink it from device chain. */ - write_lock_bh(&dev_base_lock); - list_del(&dev->dev_list); - hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); + unlist_netdevice(dev); dev->reg_state = NETREG_UNREGISTERING; @@ -3794,7 +3909,7 @@ void unregister_netdevice(struct net_device *dev) /* Notify protocols, that we are about to destroy this device. They should clean all the things. */ - raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); /* * Flush the unicast and multicast chains @@ -3807,8 +3922,8 @@ void unregister_netdevice(struct net_device *dev) /* Notifier chain MUST detach us from master device. */ BUG_TRAP(!dev->master); - /* Remove entries from sysfs */ - netdev_unregister_sysfs(dev); + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); /* Finish processing unregister after unlock */ net_set_todo(dev); @@ -3839,6 +3954,121 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); +/** + * dev_change_net_namespace - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + */ + +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + char buf[IFNAMSIZ]; + const char *destname; + int err; + + ASSERT_RTNL(); + + /* Don't allow namespace local devices to be moved. */ + err = -EINVAL; + if (dev->features & NETIF_F_NETNS_LOCAL) + goto out; + + /* Ensure the device has been registrered */ + err = -EINVAL; + if (dev->reg_state != NETREG_REGISTERED) + goto out; + + /* Get out if there is nothing todo */ + err = 0; + if (dev->nd_net == net) + goto out; + + /* Pick the destination device name, and ensure + * we can use it in the destination network namespace. + */ + err = -EEXIST; + destname = dev->name; + if (__dev_get_by_name(net, destname)) { + /* We get here if we can't use the current device name */ + if (!pat) + goto out; + if (!dev_valid_name(pat)) + goto out; + if (strchr(pat, '%')) { + if (__dev_alloc_name(net, pat, buf) < 0) + goto out; + destname = buf; + } else + destname = pat; + if (__dev_get_by_name(net, destname)) + goto out; + } + + /* + * And now a mini version of register_netdevice unregister_netdevice. + */ + + /* If device is running close it first. */ + dev_close(dev); + + /* And unlink it from device chain */ + err = -ENODEV; + unlist_netdevice(dev); + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + /* + * Flush the unicast and multicast chains + */ + dev_addr_discard(dev); + + /* Actually switch the network namespace */ + dev->nd_net = net; + + /* Assign the new device name */ + if (destname != dev->name) + strcpy(dev->name, destname); + + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) { + int iflink = (dev->iflink == dev->ifindex); + dev->ifindex = dev_new_index(net); + if (iflink) + dev->iflink = dev->ifindex; + } + + /* Fixup kobjects */ + err = device_rename(&dev->dev, dev->name); + WARN_ON(err); + + /* Add the device back in the hashes */ + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ + call_netdevice_notifiers(NETDEV_REGISTER, dev); + + synchronize_net(); + err = 0; +out: + return err; +} + static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) @@ -4032,6 +4262,82 @@ int netdev_compute_features(unsigned long all, unsigned long one) } EXPORT_SYMBOL(netdev_compute_features); +static struct hlist_head *netdev_create_hash(void) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < NETDEV_HASHENTRIES; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + +/* Initialize per network namespace state */ +static int __net_init netdev_init(struct net *net) +{ + INIT_LIST_HEAD(&net->dev_base_head); + rwlock_init(&dev_base_lock); + + net->dev_name_head = netdev_create_hash(); + if (net->dev_name_head == NULL) + goto err_name; + + net->dev_index_head = netdev_create_hash(); + if (net->dev_index_head == NULL) + goto err_idx; + + return 0; + +err_idx: + kfree(net->dev_name_head); +err_name: + return -ENOMEM; +} + +static void __net_exit netdev_exit(struct net *net) +{ + kfree(net->dev_name_head); + kfree(net->dev_index_head); +} + +static struct pernet_operations __net_initdata netdev_net_ops = { + .init = netdev_init, + .exit = netdev_exit, +}; + +static void __net_exit default_device_exit(struct net *net) +{ + struct net_device *dev, *next; + /* + * Push all migratable of the network devices back to the + * initial network namespace + */ + rtnl_lock(); + for_each_netdev_safe(net, dev, next) { + int err; + + /* Ignore unmoveable devices (i.e. loopback) */ + if (dev->features & NETIF_F_NETNS_LOCAL) + continue; + + /* Push remaing network devices to init_net */ + err = dev_change_net_namespace(dev, &init_net, "dev%d"); + if (err) { + printk(KERN_WARNING "%s: failed to move %s to init_net: %d\n", + __func__, dev->name, err); + unregister_netdevice(dev); + } + } + rtnl_unlock(); +} + +static struct pernet_operations __net_initdata default_device_ops = { + .exit = default_device_exit, +}; + /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -4052,18 +4358,18 @@ static int __init net_dev_init(void) if (dev_proc_init()) goto out; - if (netdev_sysfs_init()) + if (netdev_kobject_init()) goto out; INIT_LIST_HEAD(&ptype_all); for (i = 0; i < 16; i++) INIT_LIST_HEAD(&ptype_base[i]); - for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) - INIT_HLIST_HEAD(&dev_name_head[i]); + if (register_pernet_subsys(&netdev_net_ops)) + goto out; - for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) - INIT_HLIST_HEAD(&dev_index_head[i]); + if (register_pernet_device(&default_device_ops)) + goto out; /* * Initialise the packet receive queues. @@ -4076,10 +4382,9 @@ static int __init net_dev_init(void) skb_queue_head_init(&queue->input_pkt_queue); queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); - set_bit(__LINK_STATE_START, &queue->backlog_dev.state); - queue->backlog_dev.weight = weight_p; - queue->backlog_dev.poll = process_backlog; - atomic_set(&queue->backlog_dev.refcnt, 1); + + queue->backlog.poll = process_backlog; + queue->backlog.weight = weight_p; } netdev_dma_register(); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 20330c57261..15241cf48af 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -41,6 +41,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> +#include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <linux/skbuff.h> @@ -186,11 +187,12 @@ EXPORT_SYMBOL(dev_mc_unsync); #ifdef CONFIG_PROC_FS static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq->private; struct net_device *dev; loff_t off = 0; read_lock(&dev_base_lock); - for_each_netdev(dev) { + for_each_netdev(net, dev) { if (off++ == *pos) return dev; } @@ -239,7 +241,26 @@ static const struct seq_operations dev_mc_seq_ops = { static int dev_mc_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &dev_mc_seq_ops); + struct seq_file *seq; + int res; + res = seq_open(file, &dev_mc_seq_ops); + if (!res) { + seq = file->private_data; + seq->private = get_proc_net(inode); + if (!seq->private) { + seq_release(inode, file); + res = -ENXIO; + } + } + return res; +} + +static int dev_mc_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct net *net = seq->private; + put_net(net); + return seq_release(inode, file); } static const struct file_operations dev_mc_seq_fops = { @@ -247,14 +268,31 @@ static const struct file_operations dev_mc_seq_fops = { .open = dev_mc_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = dev_mc_seq_release, }; #endif +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + proc_net_remove(net, "dev_mcast"); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + void __init dev_mcast_init(void) { - proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); + register_pernet_subsys(&dev_mc_net_ops); } EXPORT_SYMBOL(dev_mc_add); diff --git a/net/core/dst.c b/net/core/dst.c index c6a05879d58..16958e64e57 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -9,59 +9,84 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/workqueue.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> +#include <net/net_namespace.h> +#include <net/net_namespace.h> #include <net/dst.h> -/* Locking strategy: - * 1) Garbage collection state of dead destination cache - * entries is protected by dst_lock. - * 2) GC is run only from BH context, and is the only remover - * of entries. - * 3) Entries are added to the garbage list from both BH - * and non-BH context, so local BH disabling is needed. - * 4) All operations modify state, so a spinlock is used. +/* + * Theory of operations: + * 1) We use a list, protected by a spinlock, to add + * new entries from both BH and non-BH context. + * 2) In order to keep spinlock held for a small delay, + * we use a second list where are stored long lived + * entries, that are handled by the garbage collect thread + * fired by a workqueue. + * 3) This list is guarded by a mutex, + * so that the gc_task and dst_dev_event() can be synchronized. */ -static struct dst_entry *dst_garbage_list; #if RT_CACHE_DEBUG >= 2 static atomic_t dst_total = ATOMIC_INIT(0); #endif -static DEFINE_SPINLOCK(dst_lock); -static unsigned long dst_gc_timer_expires; -static unsigned long dst_gc_timer_inc = DST_GC_MAX; -static void dst_run_gc(unsigned long); +/* + * We want to keep lock & list close together + * to dirty as few cache lines as possible in __dst_free(). + * As this is not a very strong hint, we dont force an alignment on SMP. + */ +static struct { + spinlock_t lock; + struct dst_entry *list; + unsigned long timer_inc; + unsigned long timer_expires; +} dst_garbage = { + .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), + .timer_inc = DST_GC_MAX, +}; +static void dst_gc_task(struct work_struct *work); static void ___dst_free(struct dst_entry * dst); -static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0); +static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); -static void dst_run_gc(unsigned long dummy) +static DEFINE_MUTEX(dst_gc_mutex); +/* + * long lived entries are maintained in this list, guarded by dst_gc_mutex + */ +static struct dst_entry *dst_busy_list; + +static void dst_gc_task(struct work_struct *work) { int delayed = 0; - int work_performed; - struct dst_entry * dst, **dstp; + int work_performed = 0; + unsigned long expires = ~0L; + struct dst_entry *dst, *next, head; + struct dst_entry *last = &head; +#if RT_CACHE_DEBUG >= 2 + ktime_t time_start = ktime_get(); + struct timespec elapsed; +#endif - if (!spin_trylock(&dst_lock)) { - mod_timer(&dst_gc_timer, jiffies + HZ/10); - return; - } + mutex_lock(&dst_gc_mutex); + next = dst_busy_list; - del_timer(&dst_gc_timer); - dstp = &dst_garbage_list; - work_performed = 0; - while ((dst = *dstp) != NULL) { - if (atomic_read(&dst->__refcnt)) { - dstp = &dst->next; +loop: + while ((dst = next) != NULL) { + next = dst->next; + prefetch(&next->next); + if (likely(atomic_read(&dst->__refcnt))) { + last->next = dst; + last = dst; delayed++; continue; } - *dstp = dst->next; - work_performed = 1; + work_performed++; dst = dst_destroy(dst); if (dst) { @@ -77,38 +102,56 @@ static void dst_run_gc(unsigned long dummy) continue; ___dst_free(dst); - dst->next = *dstp; - *dstp = dst; - dstp = &dst->next; + dst->next = next; + next = dst; } } - if (!dst_garbage_list) { - dst_gc_timer_inc = DST_GC_MAX; - goto out; + + spin_lock_bh(&dst_garbage.lock); + next = dst_garbage.list; + if (next) { + dst_garbage.list = NULL; + spin_unlock_bh(&dst_garbage.lock); + goto loop; } - if (!work_performed) { - if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) - dst_gc_timer_expires = DST_GC_MAX; - dst_gc_timer_inc += DST_GC_INC; - } else { - dst_gc_timer_inc = DST_GC_INC; - dst_gc_timer_expires = DST_GC_MIN; + last->next = NULL; + dst_busy_list = head.next; + if (!dst_busy_list) + dst_garbage.timer_inc = DST_GC_MAX; + else { + /* + * if we freed less than 1/10 of delayed entries, + * we can sleep longer. + */ + if (work_performed <= delayed/10) { + dst_garbage.timer_expires += dst_garbage.timer_inc; + if (dst_garbage.timer_expires > DST_GC_MAX) + dst_garbage.timer_expires = DST_GC_MAX; + dst_garbage.timer_inc += DST_GC_INC; + } else { + dst_garbage.timer_inc = DST_GC_INC; + dst_garbage.timer_expires = DST_GC_MIN; + } + expires = dst_garbage.timer_expires; + /* + * if the next desired timer is more than 4 seconds in the future + * then round the timer to whole seconds + */ + if (expires > 4*HZ) + expires = round_jiffies_relative(expires); + schedule_delayed_work(&dst_gc_work, expires); } + + spin_unlock_bh(&dst_garbage.lock); + mutex_unlock(&dst_gc_mutex); #if RT_CACHE_DEBUG >= 2 - printk("dst_total: %d/%d %ld\n", - atomic_read(&dst_total), delayed, dst_gc_timer_expires); + elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start)); + printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d" + " expires: %lu elapsed: %lu us\n", + atomic_read(&dst_total), delayed, work_performed, + expires, + elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC); #endif - /* if the next desired timer is more than 4 seconds in the future - * then round the timer to whole seconds - */ - if (dst_gc_timer_expires > 4*HZ) - mod_timer(&dst_gc_timer, - round_jiffies(jiffies + dst_gc_timer_expires)); - else - mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); - -out: - spin_unlock(&dst_lock); } static int dst_discard(struct sk_buff *skb) @@ -153,16 +196,16 @@ static void ___dst_free(struct dst_entry * dst) void __dst_free(struct dst_entry * dst) { - spin_lock_bh(&dst_lock); + spin_lock_bh(&dst_garbage.lock); ___dst_free(dst); - dst->next = dst_garbage_list; - dst_garbage_list = dst; - if (dst_gc_timer_inc > DST_GC_INC) { - dst_gc_timer_inc = DST_GC_INC; - dst_gc_timer_expires = DST_GC_MIN; - mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); + dst->next = dst_garbage.list; + dst_garbage.list = dst; + if (dst_garbage.timer_inc > DST_GC_INC) { + dst_garbage.timer_inc = DST_GC_INC; + dst_garbage.timer_expires = DST_GC_MIN; + schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires); } - spin_unlock_bh(&dst_lock); + spin_unlock_bh(&dst_garbage.lock); } struct dst_entry *dst_destroy(struct dst_entry * dst) @@ -236,13 +279,13 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst->output = dst_discard; } else { - dst->dev = &loopback_dev; - dev_hold(&loopback_dev); + dst->dev = init_net.loopback_dev; + dev_hold(dst->dev); dev_put(dev); if (dst->neighbour && dst->neighbour->dev == dev) { - dst->neighbour->dev = &loopback_dev; + dst->neighbour->dev = init_net.loopback_dev; dev_put(dev); - dev_hold(&loopback_dev); + dev_hold(dst->neighbour->dev); } } } @@ -250,16 +293,33 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; - struct dst_entry *dst; + struct dst_entry *dst, *last = NULL; + + if (dev->nd_net != &init_net) + return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: case NETDEV_DOWN: - spin_lock_bh(&dst_lock); - for (dst = dst_garbage_list; dst; dst = dst->next) { + mutex_lock(&dst_gc_mutex); + for (dst = dst_busy_list; dst; dst = dst->next) { + last = dst; + dst_ifdown(dst, dev, event != NETDEV_DOWN); + } + + spin_lock_bh(&dst_garbage.lock); + dst = dst_garbage.list; + dst_garbage.list = NULL; + spin_unlock_bh(&dst_garbage.lock); + + if (last) + last->next = dst; + else + dst_busy_list = dst; + for (; dst; dst = dst->next) { dst_ifdown(dst, dev, event != NETDEV_DOWN); } - spin_unlock_bh(&dst_lock); + mutex_unlock(&dst_gc_mutex); break; } return NOTIFY_DONE; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c5e059352d4..1163eb2256d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -109,6 +109,32 @@ int ethtool_op_set_ufo(struct net_device *dev, u32 data) return 0; } +/* the following list of flags are the same as their associated + * NETIF_F_xxx values in include/linux/netdevice.h + */ +static const u32 flags_dup_features = + ETH_FLAG_LRO; + +u32 ethtool_op_get_flags(struct net_device *dev) +{ + /* in the future, this function will probably contain additional + * handling for flags which are not so easily handled + * by a simple masking operation + */ + + return dev->features & flags_dup_features; +} + +int ethtool_op_set_flags(struct net_device *dev, u32 data) +{ + if (data & ETH_FLAG_LRO) + dev->features |= NETIF_F_LRO; + else + dev->features &= ~NETIF_F_LRO; + + return 0; +} + /* Handlers for each ethtool command */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) @@ -153,10 +179,26 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) info.cmd = ETHTOOL_GDRVINFO; ops->get_drvinfo(dev, &info); - if (ops->self_test_count) - info.testinfo_len = ops->self_test_count(dev); - if (ops->get_stats_count) - info.n_stats = ops->get_stats_count(dev); + if (ops->get_sset_count) { + int rc; + + rc = ops->get_sset_count(dev, ETH_SS_TEST); + if (rc >= 0) + info.testinfo_len = rc; + rc = ops->get_sset_count(dev, ETH_SS_STATS); + if (rc >= 0) + info.n_stats = rc; + rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); + if (rc >= 0) + info.n_priv_flags = rc; + } else { + /* code path for obsolete hooks */ + + if (ops->self_test_count) + info.testinfo_len = ops->self_test_count(dev); + if (ops->get_stats_count) + info.n_stats = ops->get_stats_count(dev); + } if (ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); if (ops->get_eeprom_len) @@ -230,34 +272,6 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_wol(dev, &wol); } -static int ethtool_get_msglevel(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GMSGLVL }; - - if (!dev->ethtool_ops->get_msglevel) - return -EOPNOTSUPP; - - edata.data = dev->ethtool_ops->get_msglevel(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_msglevel(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (!dev->ethtool_ops->set_msglevel) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - dev->ethtool_ops->set_msglevel(dev, edata.data); - return 0; -} - static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) @@ -266,20 +280,6 @@ static int ethtool_nway_reset(struct net_device *dev) return dev->ethtool_ops->nway_reset(dev); } -static int ethtool_get_link(struct net_device *dev, void __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GLINK }; - - if (!dev->ethtool_ops->get_link) - return -EOPNOTSUPP; - - edata.data = dev->ethtool_ops->get_link(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { struct ethtool_eeprom eeprom; @@ -447,48 +447,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); } -static int ethtool_get_rx_csum(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GRXCSUM }; - - if (!dev->ethtool_ops->get_rx_csum) - return -EOPNOTSUPP; - - edata.data = dev->ethtool_ops->get_rx_csum(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - -static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata; - - if (!dev->ethtool_ops->set_rx_csum) - return -EOPNOTSUPP; - - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - - dev->ethtool_ops->set_rx_csum(dev, edata.data); - return 0; -} - -static int ethtool_get_tx_csum(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GTXCSUM }; - - if (!dev->ethtool_ops->get_tx_csum) - return -EOPNOTSUPP; - - edata.data = dev->ethtool_ops->get_tx_csum(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - static int __ethtool_set_sg(struct net_device *dev, u32 data) { int err; @@ -527,20 +485,6 @@ static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_tx_csum(dev, edata.data); } -static int ethtool_get_sg(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GSG }; - - if (!dev->ethtool_ops->get_sg) - return -EOPNOTSUPP; - - edata.data = dev->ethtool_ops->get_sg(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata; @@ -558,20 +502,6 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) return __ethtool_set_sg(dev, edata.data); } -static int ethtool_get_tso(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GTSO }; - - if (!dev->ethtool_ops->get_tso) - return -EOPNOTSUPP; - - edata.data = dev->ethtool_ops->get_tso(dev); - - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata; @@ -588,18 +518,6 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_tso(dev, edata.data); } -static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) -{ - struct ethtool_value edata = { ETHTOOL_GUFO }; - - if (!dev->ethtool_ops->get_ufo) - return -EOPNOTSUPP; - edata.data = dev->ethtool_ops->get_ufo(dev); - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; -} - static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata; @@ -643,16 +561,27 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) struct ethtool_test test; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; - int ret; + int ret, test_len; - if (!ops->self_test || !ops->self_test_count) + if (!ops->self_test) return -EOPNOTSUPP; + if (!ops->get_sset_count && !ops->self_test_count) + return -EOPNOTSUPP; + + if (ops->get_sset_count) + test_len = ops->get_sset_count(dev, ETH_SS_TEST); + else + /* code path for obsolete hook */ + test_len = ops->self_test_count(dev); + if (test_len < 0) + return test_len; + WARN_ON(test_len == 0); if (copy_from_user(&test, useraddr, sizeof(test))) return -EFAULT; - test.len = ops->self_test_count(dev); - data = kmalloc(test.len * sizeof(u64), GFP_USER); + test.len = test_len; + data = kmalloc(test_len * sizeof(u64), GFP_USER); if (!data) return -ENOMEM; @@ -684,19 +613,29 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; - switch (gstrings.string_set) { - case ETH_SS_TEST: - if (!ops->self_test_count) - return -EOPNOTSUPP; - gstrings.len = ops->self_test_count(dev); - break; - case ETH_SS_STATS: - if (!ops->get_stats_count) - return -EOPNOTSUPP; - gstrings.len = ops->get_stats_count(dev); - break; - default: - return -EINVAL; + if (ops->get_sset_count) { + ret = ops->get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; + } else { + /* code path for obsolete hooks */ + + switch (gstrings.string_set) { + case ETH_SS_TEST: + if (!ops->self_test_count) + return -EOPNOTSUPP; + gstrings.len = ops->self_test_count(dev); + break; + case ETH_SS_STATS: + if (!ops->get_stats_count) + return -EOPNOTSUPP; + gstrings.len = ops->get_stats_count(dev); + break; + default: + return -EINVAL; + } } data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); @@ -736,16 +675,27 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) struct ethtool_stats stats; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; - int ret; + int ret, n_stats; - if (!ops->get_ethtool_stats || !ops->get_stats_count) + if (!ops->get_ethtool_stats) + return -EOPNOTSUPP; + if (!ops->get_sset_count && !ops->get_stats_count) return -EOPNOTSUPP; + if (ops->get_sset_count) + n_stats = ops->get_sset_count(dev, ETH_SS_STATS); + else + /* code path for obsolete hook */ + n_stats = ops->get_stats_count(dev); + if (n_stats < 0) + return n_stats; + WARN_ON(n_stats == 0); + if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; - stats.n_stats = ops->get_stats_count(dev); - data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER); + stats.n_stats = n_stats; + data = kmalloc(n_stats * sizeof(u64), GFP_USER); if (!data) return -ENOMEM; @@ -783,11 +733,55 @@ static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) return 0; } +static int ethtool_get_value(struct net_device *dev, char __user *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, + void (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + actor(dev, edata.data); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char __user *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return actor(dev, edata.data); +} + /* The main entry point in this file. Called from net/core/dev.c */ -int dev_ethtool(struct ifreq *ifr) +int dev_ethtool(struct net *net, struct ifreq *ifr) { - struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); void __user *useraddr = ifr->ifr_data; u32 ethcmd; int rc; @@ -817,6 +811,8 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: + case ETHTOOL_GFLAGS: + case ETHTOOL_GPFLAGS: break; default: if (!capable(CAP_NET_ADMIN)) @@ -849,16 +845,19 @@ int dev_ethtool(struct ifreq *ifr) rc = ethtool_set_wol(dev, useraddr); break; case ETHTOOL_GMSGLVL: - rc = ethtool_get_msglevel(dev, useraddr); + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_msglevel); break; case ETHTOOL_SMSGLVL: - rc = ethtool_set_msglevel(dev, useraddr); + rc = ethtool_set_value_void(dev, useraddr, + dev->ethtool_ops->set_msglevel); break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; case ETHTOOL_GLINK: - rc = ethtool_get_link(dev, useraddr); + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_link); break; case ETHTOOL_GEEPROM: rc = ethtool_get_eeprom(dev, useraddr); @@ -885,25 +884,36 @@ int dev_ethtool(struct ifreq *ifr) rc = ethtool_set_pauseparam(dev, useraddr); break; case ETHTOOL_GRXCSUM: - rc = ethtool_get_rx_csum(dev, useraddr); + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_rx_csum); break; case ETHTOOL_SRXCSUM: - rc = ethtool_set_rx_csum(dev, useraddr); + rc = ethtool_set_value(dev, useraddr, + dev->ethtool_ops->set_rx_csum); break; case ETHTOOL_GTXCSUM: - rc = ethtool_get_tx_csum(dev, useraddr); + rc = ethtool_get_value(dev, useraddr, ethcmd, + (dev->ethtool_ops->get_tx_csum ? + dev->ethtool_ops->get_tx_csum : + ethtool_op_get_tx_csum)); break; case ETHTOOL_STXCSUM: rc = ethtool_set_tx_csum(dev, useraddr); break; case ETHTOOL_GSG: - rc = ethtool_get_sg(dev, useraddr); + rc = ethtool_get_value(dev, useraddr, ethcmd, + (dev->ethtool_ops->get_sg ? + dev->ethtool_ops->get_sg : + ethtool_op_get_sg)); break; case ETHTOOL_SSG: rc = ethtool_set_sg(dev, useraddr); break; case ETHTOOL_GTSO: - rc = ethtool_get_tso(dev, useraddr); + rc = ethtool_get_value(dev, useraddr, ethcmd, + (dev->ethtool_ops->get_tso ? + dev->ethtool_ops->get_tso : + ethtool_op_get_tso)); break; case ETHTOOL_STSO: rc = ethtool_set_tso(dev, useraddr); @@ -924,7 +934,10 @@ int dev_ethtool(struct ifreq *ifr) rc = ethtool_get_perm_addr(dev, useraddr); break; case ETHTOOL_GUFO: - rc = ethtool_get_ufo(dev, useraddr); + rc = ethtool_get_value(dev, useraddr, ethcmd, + (dev->ethtool_ops->get_ufo ? + dev->ethtool_ops->get_ufo : + ethtool_op_get_ufo)); break; case ETHTOOL_SUFO: rc = ethtool_set_ufo(dev, useraddr); @@ -935,6 +948,22 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_SGSO: rc = ethtool_set_gso(dev, useraddr); break; + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_flags); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, + dev->ethtool_ops->set_flags); + break; + case ETHTOOL_GPFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_priv_flags); + break; + case ETHTOOL_SPFLAGS: + rc = ethtool_set_value(dev, useraddr, + dev->ethtool_ops->set_priv_flags); + break; default: rc = -EOPNOTSUPP; } @@ -959,3 +988,5 @@ EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); EXPORT_SYMBOL(ethtool_op_set_ufo); EXPORT_SYMBOL(ethtool_op_get_ufo); +EXPORT_SYMBOL(ethtool_op_set_flags); +EXPORT_SYMBOL(ethtool_op_get_flags); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 8c5474e1668..13de6f53f09 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -11,6 +11,8 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> +#include <net/net_namespace.h> +#include <net/sock.h> #include <net/fib_rules.h> static LIST_HEAD(rules_ops); @@ -82,7 +84,7 @@ static void cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; - list_for_each_entry_safe(rule, tmp, ops->rules_list, list) { + list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); fib_rule_put(rule); } @@ -137,7 +139,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, rcu_read_lock(); - list_for_each_entry_rcu(rule, ops->rules_list, list) { + list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags)) continue; @@ -197,6 +199,7 @@ errout: static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *r, *last = NULL; @@ -234,7 +237,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->ifindex = -1; nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ); - dev = __dev_get_by_name(rule->ifname); + dev = __dev_get_by_name(net, rule->ifname); if (dev) rule->ifindex = dev->ifindex; } @@ -268,7 +271,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (rule->target <= rule->pref) goto errout_free; - list_for_each_entry(r, ops->rules_list, list) { + list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { rule->ctarget = r; break; @@ -284,7 +287,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (err < 0) goto errout_free; - list_for_each_entry(r, ops->rules_list, list) { + list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; @@ -297,7 +300,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ - list_for_each_entry(r, ops->rules_list, list) { + list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref) { BUG_ON(r->ctarget != NULL); @@ -317,7 +320,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (last) list_add_rcu(&rule->list, &last->list); else - list_add_rcu(&rule->list, ops->rules_list); + list_add_rcu(&rule->list, &ops->rules_list); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); flush_route_cache(ops); @@ -356,7 +359,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (err < 0) goto errout; - list_for_each_entry(rule, ops->rules_list, list) { + list_for_each_entry(rule, &ops->rules_list, list) { if (frh->action && (frh->action != rule->action)) continue; @@ -399,7 +402,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) * actually been added. */ if (ops->nr_goto_rules > 0) { - list_for_each_entry(tmp, ops->rules_list, list) { + list_for_each_entry(tmp, &ops->rules_list, list) { if (tmp->ctarget == rule) { rcu_assign_pointer(tmp->ctarget, NULL); ops->unresolved_rules++; @@ -495,7 +498,7 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, int idx = 0; struct fib_rule *rule; - list_for_each_entry(rule, ops->rules_list, list) { + list_for_each_entry(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; @@ -596,18 +599,21 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct net_device *dev = ptr; struct fib_rules_ops *ops; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + ASSERT_RTNL(); rcu_read_lock(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &rules_ops, list) - attach_rules(ops->rules_list, dev); + attach_rules(&ops->rules_list, dev); break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &rules_ops, list) - detach_rules(ops->rules_list, dev); + detach_rules(&ops->rules_list, dev); break; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f7de8f24d8d..c52df858d0b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -25,6 +25,7 @@ #include <linux/sysctl.h> #endif #include <linux/times.h> +#include <net/net_namespace.h> #include <net/neighbour.h> #include <net/dst.h> #include <net/sock.h> @@ -55,9 +56,8 @@ #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(unsigned long arg); -#ifdef CONFIG_ARPD -static void neigh_app_notify(struct neighbour *n); -#endif +static void __neigh_notify(struct neighbour *n, int type, int flags); +static void neigh_update_notify(struct neighbour *neigh); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); @@ -105,6 +105,15 @@ static int neigh_blackhole(struct sk_buff *skb) return -ENETDOWN; } +static void neigh_cleanup_and_release(struct neighbour *neigh) +{ + if (neigh->parms->neigh_cleanup) + neigh->parms->neigh_cleanup(neigh); + + __neigh_notify(neigh, RTM_DELNEIGH, 0); + neigh_release(neigh); +} + /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, @@ -141,9 +150,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) n->dead = 1; shrunk = 1; write_unlock(&n->lock); - if (n->parms->neigh_cleanup) - n->parms->neigh_cleanup(n); - neigh_release(n); + neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); @@ -214,9 +221,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) NEIGH_PRINTK2("neigh %p is stray.\n", n); } write_unlock(&n->lock); - if (n->parms->neigh_cleanup) - n->parms->neigh_cleanup(n); - neigh_release(n); + neigh_cleanup_and_release(n); } } } @@ -677,9 +682,7 @@ static void neigh_periodic_timer(unsigned long arg) *np = n->next; n->dead = 1; write_unlock(&n->lock); - if (n->parms->neigh_cleanup) - n->parms->neigh_cleanup(n); - neigh_release(n); + neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); @@ -828,13 +831,10 @@ static void neigh_timer_handler(unsigned long arg) out: write_unlock(&neigh->lock); } + if (notify) - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + neigh_update_notify(neigh); -#ifdef CONFIG_ARPD - if (notify && neigh->parms->app_probes) - neigh_app_notify(neigh); -#endif neigh_release(neigh); } @@ -897,8 +897,8 @@ out_unlock_bh: static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; - void (*update)(struct hh_cache*, struct net_device*, unsigned char *) = - neigh->dev->header_cache_update; + void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) + = neigh->dev->header_ops->cache_update; if (update) { for (hh = neigh->hh; hh; hh = hh->hh_next) { @@ -1063,11 +1063,8 @@ out: write_unlock_bh(&neigh->lock); if (notify) - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); -#ifdef CONFIG_ARPD - if (notify && neigh->parms->app_probes) - neigh_app_notify(neigh); -#endif + neigh_update_notify(neigh); + return err; } @@ -1098,7 +1095,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, hh->hh_type = protocol; atomic_set(&hh->hh_refcnt, 0); hh->hh_next = NULL; - if (dev->hard_header_cache(n, hh)) { + + if (dev->header_ops->cache(n, hh)) { kfree(hh); hh = NULL; } else { @@ -1128,10 +1126,9 @@ int neigh_compat_output(struct sk_buff *skb) __skb_pull(skb, skb_network_offset(skb)); - if (dev->hard_header && - dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, - skb->len) < 0 && - dev->rebuild_header(skb)) + if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, + skb->len) < 0 && + dev->header_ops->rebuild(skb)) return 0; return dev_queue_xmit(skb); @@ -1153,17 +1150,17 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; - if (dev->hard_header_cache && !dst->hh) { + if (dev->header_ops->cache && !dst->hh) { write_lock_bh(&neigh->lock); if (!dst->hh) neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); write_unlock_bh(&neigh->lock); } else { read_lock_bh(&neigh->lock); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); read_unlock_bh(&neigh->lock); } if (err >= 0) @@ -1194,8 +1191,8 @@ int neigh_connected_output(struct sk_buff *skb) __skb_pull(skb, skb_network_offset(skb)); read_lock_bh(&neigh->lock); - err = dev->hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); read_unlock_bh(&neigh->lock); if (err >= 0) err = neigh->ops->queue_xmit(skb); @@ -1354,7 +1351,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS - tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat); + tbl->pde = create_proc_entry(tbl->id, 0, init_net.proc_net_stat); if (!tbl->pde) panic("cannot create neighbour proc dir entry"); tbl->pde->proc_fops = &neigh_stat_seq_fops; @@ -1444,6 +1441,7 @@ int neigh_table_clear(struct neigh_table *tbl) static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; @@ -1459,7 +1457,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(ndm->ndm_ifindex); + dev = dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; @@ -1509,6 +1507,7 @@ out: static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; @@ -1525,7 +1524,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(ndm->ndm_ifindex); + dev = dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; @@ -2000,6 +1999,11 @@ nla_put_failure: return -EMSGSIZE; } +static void neigh_update_notify(struct neighbour *neigh) +{ + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + __neigh_notify(neigh, RTM_NEWNEIGH, 0); +} static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) @@ -2095,11 +2099,8 @@ void __neigh_for_each_release(struct neigh_table *tbl, } else np = &n->next; write_unlock(&n->lock); - if (release) { - if (n->parms->neigh_cleanup) - n->parms->neigh_cleanup(n); - neigh_release(n); - } + if (release) + neigh_cleanup_and_release(n); } } } @@ -2422,7 +2423,6 @@ static const struct file_operations neigh_stat_seq_fops = { #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_ARPD static inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -2454,16 +2454,11 @@ errout: rtnl_set_sk_err(RTNLGRP_NEIGH, err); } +#ifdef CONFIG_ARPD void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); } - -static void neigh_app_notify(struct neighbour *n) -{ - __neigh_notify(n, RTM_NEWNEIGH, 0); -} - #endif /* CONFIG_ARPD */ #ifdef CONFIG_SYSCTL diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5c19b0646d7..6628e457ddc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -18,6 +18,7 @@ #include <linux/wireless.h> #include <net/iw_handler.h> +#ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; @@ -216,20 +217,6 @@ static ssize_t store_tx_queue_len(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } -NETDEVICE_SHOW(weight, fmt_dec); - -static int change_weight(struct net_device *net, unsigned long new_weight) -{ - net->weight = new_weight; - return 0; -} - -static ssize_t store_weight(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) -{ - return netdev_store(dev, attr, buf, len, change_weight); -} - static struct device_attribute net_class_attributes[] = { __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), __ATTR(iflink, S_IRUGO, show_iflink, NULL), @@ -246,7 +233,6 @@ static struct device_attribute net_class_attributes[] = { __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, store_tx_queue_len), - __ATTR(weight, S_IRUGO | S_IWUSR, show_weight, store_weight), {} }; @@ -407,29 +393,25 @@ static struct attribute_group wireless_group = { }; #endif +#endif /* CONFIG_SYSFS */ + #ifdef CONFIG_HOTPLUG -static int netdev_uevent(struct device *d, char **envp, - int num_envp, char *buf, int size) +static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) { struct net_device *dev = to_net_dev(d); - int retval, len = 0, i = 0; + int retval; /* pass interface to uevent. */ - retval = add_uevent_var(envp, num_envp, &i, - buf, size, &len, - "INTERFACE=%s", dev->name); + retval = add_uevent_var(env, "INTERFACE=%s", dev->name); if (retval) goto exit; /* pass ifindex to uevent. * ifindex is useful as it won't change (interface name may change) * and is what RtNetlink uses natively. */ - retval = add_uevent_var(envp, num_envp, &i, - buf, size, &len, - "IFINDEX=%d", dev->ifindex); + retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); exit: - envp[i] = NULL; return retval; } #endif @@ -450,7 +432,9 @@ static void netdev_release(struct device *d) static struct class net_class = { .name = "net", .dev_release = netdev_release, +#ifdef CONFIG_SYSFS .dev_attrs = net_class_attributes, +#endif /* CONFIG_SYSFS */ #ifdef CONFIG_HOTPLUG .dev_uevent = netdev_uevent, #endif @@ -459,7 +443,7 @@ static struct class net_class = { /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ -void netdev_unregister_sysfs(struct net_device * net) +void netdev_unregister_kobject(struct net_device * net) { struct device *dev = &(net->dev); @@ -468,7 +452,7 @@ void netdev_unregister_sysfs(struct net_device * net) } /* Create sysfs entries for network device. */ -int netdev_register_sysfs(struct net_device *net) +int netdev_register_kobject(struct net_device *net) { struct device *dev = &(net->dev); struct attribute_group **groups = net->sysfs_groups; @@ -481,6 +465,7 @@ int netdev_register_sysfs(struct net_device *net) BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ); strlcpy(dev->bus_id, net->name, BUS_ID_SIZE); +#ifdef CONFIG_SYSFS if (net->get_stats) *groups++ = &netstat_group; @@ -488,11 +473,12 @@ int netdev_register_sysfs(struct net_device *net) if (net->wireless_handlers && net->wireless_handlers->get_wireless_stats) *groups++ = &wireless_group; #endif +#endif /* CONFIG_SYSFS */ return device_add(dev); } -int netdev_sysfs_init(void) +int netdev_kobject_init(void) { return class_register(&net_class); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c new file mode 100644 index 00000000000..6f71db8c442 --- /dev/null +++ b/net/core/net_namespace.c @@ -0,0 +1,317 @@ +#include <linux/workqueue.h> +#include <linux/rtnetlink.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <net/net_namespace.h> + +/* + * Our network namespace constructor/destructor lists + */ + +static LIST_HEAD(pernet_list); +static struct list_head *first_device = &pernet_list; +static DEFINE_MUTEX(net_mutex); + +LIST_HEAD(net_namespace_list); + +static struct kmem_cache *net_cachep; + +struct net init_net; +EXPORT_SYMBOL_GPL(init_net); + +static struct net *net_alloc(void) +{ + return kmem_cache_zalloc(net_cachep, GFP_KERNEL); +} + +static void net_free(struct net *net) +{ + if (!net) + return; + + if (unlikely(atomic_read(&net->use_count) != 0)) { + printk(KERN_EMERG "network namespace not free! Usage: %d\n", + atomic_read(&net->use_count)); + return; + } + + kmem_cache_free(net_cachep, net); +} + +static void cleanup_net(struct work_struct *work) +{ + struct pernet_operations *ops; + struct net *net; + + net = container_of(work, struct net, work); + + mutex_lock(&net_mutex); + + /* Don't let anyone else find us. */ + rtnl_lock(); + list_del(&net->list); + rtnl_unlock(); + + /* Run all of the network namespace exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->exit) + ops->exit(net); + } + + mutex_unlock(&net_mutex); + + /* Ensure there are no outstanding rcu callbacks using this + * network namespace. + */ + rcu_barrier(); + + /* Finally it is safe to free my network namespace structure */ + net_free(net); +} + + +void __put_net(struct net *net) +{ + /* Cleanup the network namespace in process context */ + INIT_WORK(&net->work, cleanup_net); + schedule_work(&net->work); +} +EXPORT_SYMBOL_GPL(__put_net); + +/* + * setup_net runs the initializers for the network namespace object. + */ +static int setup_net(struct net *net) +{ + /* Must be called with net_mutex held */ + struct pernet_operations *ops; + int error; + + atomic_set(&net->count, 1); + atomic_set(&net->use_count, 0); + + error = 0; + list_for_each_entry(ops, &pernet_list, list) { + if (ops->init) { + error = ops->init(net); + if (error < 0) + goto out_undo; + } + } +out: + return error; + +out_undo: + /* Walk through the list backwards calling the exit functions + * for the pernet modules whose init functions did not fail. + */ + list_for_each_entry_continue_reverse(ops, &pernet_list, list) { + if (ops->exit) + ops->exit(net); + } + goto out; +} + +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + struct net *new_net = NULL; + int err; + + get_net(old_net); + + if (!(flags & CLONE_NEWNET)) + return old_net; + +#ifndef CONFIG_NET_NS + return ERR_PTR(-EINVAL); +#endif + + err = -ENOMEM; + new_net = net_alloc(); + if (!new_net) + goto out; + + mutex_lock(&net_mutex); + err = setup_net(new_net); + if (err) + goto out_unlock; + + rtnl_lock(); + list_add_tail(&new_net->list, &net_namespace_list); + rtnl_unlock(); + + +out_unlock: + mutex_unlock(&net_mutex); +out: + put_net(old_net); + if (err) { + net_free(new_net); + new_net = ERR_PTR(err); + } + return new_net; +} + +static int __init net_ns_init(void) +{ + int err; + + printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); + net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), + SMP_CACHE_BYTES, + SLAB_PANIC, NULL); + mutex_lock(&net_mutex); + err = setup_net(&init_net); + + rtnl_lock(); + list_add_tail(&init_net.list, &net_namespace_list); + rtnl_unlock(); + + mutex_unlock(&net_mutex); + if (err) + panic("Could not setup the initial network namespace"); + + return 0; +} + +pure_initcall(net_ns_init); + +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + struct net *net, *undo_net; + int error; + + error = 0; + list_add_tail(&ops->list, list); + for_each_net(net) { + if (ops->init) { + error = ops->init(net); + if (error) + goto out_undo; + } + } +out: + return error; + +out_undo: + /* If I have an error cleanup all namespaces I initialized */ + list_del(&ops->list); + for_each_net(undo_net) { + if (undo_net == net) + goto undone; + if (ops->exit) + ops->exit(undo_net); + } +undone: + goto out; +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + struct net *net; + + list_del(&ops->list); + for_each_net(net) + if (ops->exit) + ops->exit(net); +} + +/** + * register_pernet_subsys - register a network namespace subsystem + * @ops: pernet operations structure for the subsystem + * + * Register a subsystem which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_subsys(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(first_device, ops); + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_subsys); + +/** + * unregister_pernet_subsys - unregister a network namespace subsystem + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destoryed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_subsys(struct pernet_operations *module) +{ + mutex_lock(&net_mutex); + unregister_pernet_operations(module); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_subsys); + +/** + * register_pernet_device - register a network namespace device + * @ops: pernet operations structure for the subsystem + * + * Register a device which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_device(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(&pernet_list, ops); + if (!error && (first_device == &pernet_list)) + first_device = &ops->list; + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_device); + +/** + * unregister_pernet_device - unregister a network namespace netdevice + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destoryed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_device(struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + if (&ops->list == first_device) + first_device = first_device->next; + unregister_pernet_operations(ops); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_device); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index de1b26aa572..95daba62496 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -119,19 +119,22 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, static void poll_napi(struct netpoll *np) { struct netpoll_info *npinfo = np->dev->npinfo; + struct napi_struct *napi; int budget = 16; - if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && - npinfo->poll_owner != smp_processor_id() && - spin_trylock(&npinfo->poll_lock)) { - npinfo->rx_flags |= NETPOLL_RX_DROP; - atomic_inc(&trapped); + list_for_each_entry(napi, &np->dev->napi_list, dev_list) { + if (test_bit(NAPI_STATE_SCHED, &napi->state) && + napi->poll_owner != smp_processor_id() && + spin_trylock(&napi->poll_lock)) { + npinfo->rx_flags |= NETPOLL_RX_DROP; + atomic_inc(&trapped); - np->dev->poll(np->dev, &budget); + napi->poll(napi, budget); - atomic_dec(&trapped); - npinfo->rx_flags &= ~NETPOLL_RX_DROP; - spin_unlock(&npinfo->poll_lock); + atomic_dec(&trapped); + npinfo->rx_flags &= ~NETPOLL_RX_DROP; + spin_unlock(&napi->poll_lock); + } } } @@ -157,7 +160,7 @@ void netpoll_poll(struct netpoll *np) /* Process pending work on NIC */ np->dev->poll_controller(np->dev); - if (np->dev->poll) + if (!list_empty(&np->dev->napi_list)) poll_napi(np); service_arp_queue(np->dev->npinfo); @@ -233,6 +236,17 @@ repeat: return skb; } +static int netpoll_owner_active(struct net_device *dev) +{ + struct napi_struct *napi; + + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (napi->poll_owner == smp_processor_id()) + return 1; + } + return 0; +} + static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { int status = NETDEV_TX_BUSY; @@ -246,8 +260,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } /* don't get messages out of order, and no recursion */ - if (skb_queue_len(&npinfo->txq) == 0 && - npinfo->poll_owner != smp_processor_id()) { + if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { unsigned long flags; local_irq_save(flags); @@ -402,11 +415,9 @@ static void arp_reply(struct sk_buff *skb) send_skb->protocol = htons(ETH_P_ARP); /* Fill the device header for the ARP frame */ - - if (np->dev->hard_header && - np->dev->hard_header(send_skb, skb->dev, ptype, - sha, np->local_mac, - send_skb->len) < 0) { + if (dev_hard_header(send_skb, skb->dev, ptype, + sha, np->local_mac, + send_skb->len) < 0) { kfree_skb(send_skb); return; } @@ -519,6 +530,23 @@ out: return 0; } +void netpoll_print_options(struct netpoll *np) +{ + DECLARE_MAC_BUF(mac); + printk(KERN_INFO "%s: local port %d\n", + np->name, np->local_port); + printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->local_ip)); + printk(KERN_INFO "%s: interface %s\n", + np->name, np->dev_name); + printk(KERN_INFO "%s: remote port %d\n", + np->name, np->remote_port); + printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->remote_ip)); + printk(KERN_INFO "%s: remote ethernet address %s\n", + np->name, print_mac(mac, np->remote_mac)); +} + int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; @@ -531,7 +559,6 @@ int netpoll_parse_options(struct netpoll *np, char *opt) cur = delim; } cur++; - printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port); if (*cur != '/') { if ((delim = strchr(cur, '/')) == NULL) @@ -539,9 +566,6 @@ int netpoll_parse_options(struct netpoll *np, char *opt) *delim = 0; np->local_ip = ntohl(in_aton(cur)); cur = delim; - - printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", - np->name, HIPQUAD(np->local_ip)); } cur++; @@ -555,8 +579,6 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } cur++; - printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name); - if (*cur != '@') { /* dst port */ if ((delim = strchr(cur, '@')) == NULL) @@ -566,7 +588,6 @@ int netpoll_parse_options(struct netpoll *np, char *opt) cur = delim; } cur++; - printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port); /* dst ip */ if ((delim = strchr(cur, '/')) == NULL) @@ -575,9 +596,6 @@ int netpoll_parse_options(struct netpoll *np, char *opt) np->remote_ip = ntohl(in_aton(cur)); cur = delim + 1; - printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n", - np->name, HIPQUAD(np->remote_ip)); - if (*cur != 0) { /* MAC address */ if ((delim = strchr(cur, ':')) == NULL) @@ -608,15 +626,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) np->remote_mac[5] = simple_strtol(cur, NULL, 16); } - printk(KERN_INFO "%s: remote ethernet address " - "%02x:%02x:%02x:%02x:%02x:%02x\n", - np->name, - np->remote_mac[0], - np->remote_mac[1], - np->remote_mac[2], - np->remote_mac[3], - np->remote_mac[4], - np->remote_mac[5]); + netpoll_print_options(np); return 0; @@ -635,7 +645,7 @@ int netpoll_setup(struct netpoll *np) int err; if (np->dev_name) - ndev = dev_get_by_name(np->dev_name); + ndev = dev_get_by_name(&init_net, np->dev_name); if (!ndev) { printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", np->name, np->dev_name); @@ -652,8 +662,6 @@ int netpoll_setup(struct netpoll *np) npinfo->rx_flags = 0; npinfo->rx_np = NULL; - spin_lock_init(&npinfo->poll_lock); - npinfo->poll_owner = -1; spin_lock_init(&npinfo->rx_lock); skb_queue_head_init(&npinfo->arp_tx); @@ -820,6 +828,7 @@ void netpoll_set_trap(int trap) EXPORT_SYMBOL(netpoll_set_trap); EXPORT_SYMBOL(netpoll_trap); +EXPORT_SYMBOL(netpoll_print_options); EXPORT_SYMBOL(netpoll_parse_options); EXPORT_SYMBOL(netpoll_setup); EXPORT_SYMBOL(netpoll_cleanup); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 803d0c8826a..2100c734b10 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -152,6 +152,7 @@ #include <linux/wait.h> #include <linux/etherdevice.h> #include <linux/kthread.h> +#include <net/net_namespace.h> #include <net/checksum.h> #include <net/ipv6.h> #include <net/addrconf.h> @@ -167,7 +168,7 @@ #include <asm/div64.h> /* do_div */ #include <asm/timex.h> -#define VERSION "pktgen v2.68: Packet Generator for packet performance testing.\n" +#define VERSION "pktgen v2.69: Packet Generator for packet performance testing.\n" /* The buckets are exponential in 'width' */ #define LAT_BUCKETS_MAX 32 @@ -189,6 +190,7 @@ #define F_SVID_RND (1<<10) /* Random SVLAN ID */ #define F_FLOW_SEQ (1<<11) /* Sequential flows */ #define F_IPSEC_ON (1<<12) /* ipsec on for flows */ +#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ /* Thread control flag bits */ #define T_TERMINATE (1<<0) @@ -331,6 +333,7 @@ struct pktgen_dev { __be32 cur_daddr; __u16 cur_udp_dst; __u16 cur_udp_src; + __u16 cur_queue_map; __u32 cur_pkt_size; __u8 hh[14]; @@ -358,6 +361,10 @@ struct pktgen_dev { unsigned lflow; /* Flow length (config) */ unsigned nflows; /* accumulated flows (stats) */ unsigned curfl; /* current sequenced flow (state)*/ + + u16 queue_map_min; + u16 queue_map_max; + #ifdef CONFIG_XFRM __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ @@ -378,7 +385,6 @@ struct pktgen_thread { struct list_head th_list; struct task_struct *tsk; char result[512]; - u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */ /* Field for thread to receive "posted" events terminate, stop ifs etc. */ @@ -593,11 +599,11 @@ static const struct file_operations pktgen_fops = { static int pktgen_if_show(struct seq_file *seq, void *v) { - int i; struct pktgen_dev *pkt_dev = seq->private; __u64 sa; __u64 stopped; __u64 now = getCurUs(); + DECLARE_MAC_BUF(mac); seq_printf(seq, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", @@ -613,6 +619,11 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); + seq_printf(seq, + " queue_map_min: %u queue_map_max: %u\n", + pkt_dev->queue_map_min, + pkt_dev->queue_map_max); + if (pkt_dev->flags & F_IPV6) { char b1[128], b2[128], b3[128]; fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); @@ -637,19 +648,12 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_puts(seq, " src_mac: "); - if (is_zero_ether_addr(pkt_dev->src_mac)) - for (i = 0; i < 6; i++) - seq_printf(seq, "%02X%s", pkt_dev->odev->dev_addr[i], - i == 5 ? " " : ":"); - else - for (i = 0; i < 6; i++) - seq_printf(seq, "%02X%s", pkt_dev->src_mac[i], - i == 5 ? " " : ":"); + seq_printf(seq, "%s ", + print_mac(mac, is_zero_ether_addr(pkt_dev->src_mac) ? + pkt_dev->odev->dev_addr : pkt_dev->src_mac)); seq_printf(seq, "dst_mac: "); - for (i = 0; i < 6; i++) - seq_printf(seq, "%02X%s", pkt_dev->dst_mac[i], - i == 5 ? "\n" : ":"); + seq_printf(seq, "%s\n", print_mac(mac, pkt_dev->dst_mac)); seq_printf(seq, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", @@ -709,6 +713,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_MPLS_RND) seq_printf(seq, "MPLS_RND "); + if (pkt_dev->flags & F_QUEUE_MAP_RND) + seq_printf(seq, "QUEUE_MAP_RND "); + if (pkt_dev->cflows) { if (pkt_dev->flags & F_FLOW_SEQ) seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/ @@ -764,6 +771,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n", pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map); + seq_printf(seq, " flows: %u\n", pkt_dev->nflows); if (pkt_dev->result[0]) @@ -1215,6 +1224,11 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "FLOW_SEQ") == 0) pkt_dev->flags |= F_FLOW_SEQ; + else if (strcmp(f, "QUEUE_MAP_RND") == 0) + pkt_dev->flags |= F_QUEUE_MAP_RND; + + else if (strcmp(f, "!QUEUE_MAP_RND") == 0) + pkt_dev->flags &= ~F_QUEUE_MAP_RND; #ifdef CONFIG_XFRM else if (strcmp(f, "IPSEC") == 0) pkt_dev->flags |= F_IPSEC_ON; @@ -1526,16 +1540,40 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + if (!strcmp(name, "queue_map_min")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) { + return len; + } + i += len; + pkt_dev->queue_map_min = value; + sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); + return count; + } + + if (!strcmp(name, "queue_map_max")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) { + return len; + } + i += len; + pkt_dev->queue_map_max = value; + sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); + return count; + } + if (!strcmp(name, "mpls")) { - unsigned n, offset; + unsigned n, cnt; + len = get_labels(&user_buffer[i], pkt_dev); - if (len < 0) { return len; } + if (len < 0) + return len; i += len; - offset = sprintf(pg_result, "OK: mpls="); + cnt = sprintf(pg_result, "OK: mpls="); for (n = 0; n < pkt_dev->nr_labels; n++) - offset += sprintf(pg_result + offset, - "%08x%s", ntohl(pkt_dev->labels[n]), - n == pkt_dev->nr_labels-1 ? "" : ","); + cnt += sprintf(pg_result + cnt, + "%08x%s", ntohl(pkt_dev->labels[n]), + n == pkt_dev->nr_labels-1 ? "" : ","); if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) { pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ @@ -1718,9 +1756,6 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) BUG_ON(!t); - seq_printf(seq, "Name: %s max_before_softirq: %d\n", - t->tsk->comm, t->max_before_softirq); - seq_printf(seq, "Running: "); if_lock(t); @@ -1753,7 +1788,6 @@ static ssize_t pktgen_thread_write(struct file *file, int i = 0, max, len, ret; char name[40]; char *pg_result; - unsigned long value = 0; if (count < 1) { // sprintf(pg_result, "Wrong command format"); @@ -1827,12 +1861,8 @@ static ssize_t pktgen_thread_write(struct file *file, } if (!strcmp(name, "max_before_softirq")) { - len = num_arg(&user_buffer[i], 10, &value); - mutex_lock(&pktgen_thread_lock); - t->max_before_softirq = value; - mutex_unlock(&pktgen_thread_lock); + sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use"); ret = count; - sprintf(pg_result, "OK: max_before_softirq=%lu", value); goto out; } @@ -1940,6 +1970,9 @@ static int pktgen_device_event(struct notifier_block *unused, { struct net_device *dev = ptr; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ @@ -1970,7 +2003,7 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) pkt_dev->odev = NULL; } - odev = dev_get_by_name(ifname); + odev = dev_get_by_name(&init_net, ifname); if (!odev) { printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", ifname); return -ENODEV; @@ -2111,7 +2144,6 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) if (spin_until_us - now > jiffies_to_usecs(1) + 1) schedule_timeout_interruptible(1); else if (spin_until_us - now > 100) { - do_softirq(); if (!pkt_dev->running) return; if (need_resched()) @@ -2387,6 +2419,20 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->cur_pkt_size = t; } + if (pkt_dev->queue_map_min < pkt_dev->queue_map_max) { + __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { + t = random32() % + (pkt_dev->queue_map_max - pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; + } else { + t = pkt_dev->cur_queue_map + 1; + if (t > pkt_dev->queue_map_max) + t = pkt_dev->queue_map_min; + } + pkt_dev->cur_queue_map = t; + } + pkt_dev->flows[flow].count++; } @@ -2557,6 +2603,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->network_header = skb->tail; skb->transport_header = skb->network_header + sizeof(struct iphdr); skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + skb->queue_mapping = pkt_dev->cur_queue_map; iph = ip_hdr(skb); udph = udp_hdr(skb); @@ -2686,6 +2733,7 @@ static unsigned int scan_ip6(const char *s, char ip[16]) unsigned int prefixlen = 0; unsigned int suffixlen = 0; __be32 tmp; + char *pos; for (i = 0; i < 16; i++) ip[i] = 0; @@ -2700,12 +2748,9 @@ static unsigned int scan_ip6(const char *s, char ip[16]) } s++; } - { - char *tmp; - u = simple_strtoul(s, &tmp, 16); - i = tmp - s; - } + u = simple_strtoul(s, &pos, 16); + i = pos - s; if (!i) return 0; if (prefixlen == 12 && s[i] == '.') { @@ -2733,11 +2778,9 @@ static unsigned int scan_ip6(const char *s, char ip[16]) len++; } else if (suffixlen != 0) break; - { - char *tmp; - u = simple_strtol(s, &tmp, 16); - i = tmp - s; - } + + u = simple_strtol(s, &pos, 16); + i = pos - s; if (!i) { if (*s) len--; @@ -2898,6 +2941,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->network_header = skb->tail; skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + skb->queue_mapping = pkt_dev->cur_queue_map; iph = ipv6_hdr(skb); udph = udp_hdr(skb); @@ -3465,8 +3509,6 @@ static int pktgen_thread_worker(void *arg) struct pktgen_thread *t = arg; struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - u32 max_before_softirq; - u32 tx_since_softirq = 0; BUG_ON(smp_processor_id() != cpu); @@ -3474,8 +3516,6 @@ static int pktgen_thread_worker(void *arg) pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid); - max_before_softirq = t->max_before_softirq; - set_current_state(TASK_INTERRUPTIBLE); set_freezable(); @@ -3494,24 +3534,9 @@ static int pktgen_thread_worker(void *arg) __set_current_state(TASK_RUNNING); - if (pkt_dev) { - + if (pkt_dev) pktgen_xmit(pkt_dev); - /* - * We like to stay RUNNING but must also give - * others fair share. - */ - - tx_since_softirq += pkt_dev->last_ok; - - if (tx_since_softirq > max_before_softirq) { - if (local_softirq_pending()) - do_softirq(); - tx_since_softirq = 0; - } - } - if (t->control & T_STOP) { pktgen_stop(t); t->control &= ~(T_STOP); @@ -3778,7 +3803,7 @@ static int __init pg_init(void) printk(KERN_INFO "%s", version); - pg_proc_dir = proc_mkdir(PG_PROC_DIR, proc_net); + pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); if (!pg_proc_dir) return -ENODEV; pg_proc_dir->owner = THIS_MODULE; @@ -3787,7 +3812,7 @@ static int __init pg_init(void) if (pe == NULL) { printk(KERN_ERR "pktgen: ERROR: cannot create %s " "procfs entry.\n", PGCTRL); - proc_net_remove(PG_PROC_DIR); + proc_net_remove(&init_net, PG_PROC_DIR); return -EINVAL; } @@ -3811,7 +3836,7 @@ static int __init pg_init(void) "all threads\n"); unregister_netdevice_notifier(&pktgen_notifier_block); remove_proc_entry(PGCTRL, pg_proc_dir); - proc_net_remove(PG_PROC_DIR); + proc_net_remove(&init_net, PG_PROC_DIR); return -ENODEV; } @@ -3838,7 +3863,7 @@ static void __exit pg_cleanup(void) /* Clean up proc file system */ remove_proc_entry(PGCTRL, pg_proc_dir); - proc_net_remove(PG_PROC_DIR); + proc_net_remove(&init_net, PG_PROC_DIR); } module_init(pg_init); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4756d5857ab..1072d16696c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -35,6 +35,7 @@ #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> +#include <linux/nsproxy.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -74,8 +75,6 @@ void __rtnl_unlock(void) void rtnl_unlock(void) { mutex_unlock(&rtnl_mutex); - if (rtnl && rtnl->sk_receive_queue.qlen) - rtnl->sk_data_ready(rtnl, 0); netdev_run_todo(); } @@ -306,10 +305,13 @@ EXPORT_SYMBOL_GPL(rtnl_link_register); void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net_device *dev, *n; + struct net *net; - for_each_netdev_safe(dev, n) { - if (dev->rtnl_link_ops == ops) - ops->dellink(dev); + for_each_net(net) { + for_each_netdev_safe(net, dev, n) { + if (dev->rtnl_link_ops == ops) + ops->dellink(dev); + } } list_del(&ops->list); } @@ -634,7 +636,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len); - NLA_PUT_U32(skb, IFLA_WEIGHT, dev->weight); NLA_PUT_U8(skb, IFLA_OPERSTATE, netif_running(dev) ? dev->operstate : IF_OPER_DOWN); NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); @@ -694,12 +695,13 @@ nla_put_failure: static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int idx; int s_idx = cb->args[0]; struct net_device *dev; idx = 0; - for_each_netdev(dev) { + for_each_netdev(net, dev) { if (idx < s_idx) goto cont; if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -714,7 +716,7 @@ cont: return skb->len; } -static const struct nla_policy ifla_policy[IFLA_MAX+1] = { +const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, @@ -724,6 +726,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, + [IFLA_NET_NS_PID] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -731,12 +734,45 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_DATA] = { .type = NLA_NESTED }, }; +static struct net *get_net_ns_by_pid(pid_t pid) +{ + struct task_struct *tsk; + struct net *net; + + /* Lookup the network namespace */ + net = ERR_PTR(-ESRCH); + rcu_read_lock(); + tsk = find_task_by_pid(pid); + if (tsk) { + task_lock(tsk); + if (tsk->nsproxy) + net = get_net(tsk->nsproxy->net_ns); + task_unlock(tsk); + } + rcu_read_unlock(); + return net; +} + static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, struct nlattr **tb, char *ifname, int modified) { int send_addr_notify = 0; int err; + if (tb[IFLA_NET_NS_PID]) { + struct net *net; + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + if (IS_ERR(net)) { + err = PTR_ERR(net); + goto errout; + } + err = dev_change_net_namespace(dev, net, ifname); + put_net(net); + if (err) + goto errout; + modified = 1; + } + if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; @@ -834,9 +870,6 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); - if (tb[IFLA_WEIGHT]) - dev->weight = nla_get_u32(tb[IFLA_WEIGHT]); - if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -862,6 +895,7 @@ errout: static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct ifinfomsg *ifm; struct net_device *dev; int err; @@ -880,9 +914,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = dev_get_by_index(ifm->ifi_index); + dev = dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = dev_get_by_name(ifname); + dev = dev_get_by_name(net, ifname); else goto errout; @@ -908,6 +942,7 @@ errout: static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; const struct rtnl_link_ops *ops; struct net_device *dev; struct ifinfomsg *ifm; @@ -924,9 +959,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(ifm->ifi_index); + dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(ifname); + dev = __dev_get_by_name(net, ifname); else return -EINVAL; @@ -941,8 +976,52 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return 0; } +struct net_device *rtnl_create_link(struct net *net, char *ifname, + const struct rtnl_link_ops *ops, struct nlattr *tb[]) +{ + int err; + struct net_device *dev; + + err = -ENOMEM; + dev = alloc_netdev(ops->priv_size, ifname, ops->setup); + if (!dev) + goto err; + + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err_free; + } + + dev->nd_net = net; + dev->rtnl_link_ops = ops; + + if (tb[IFLA_MTU]) + dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_ADDRESS]) + memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), + nla_len(tb[IFLA_ADDRESS])); + if (tb[IFLA_BROADCAST]) + memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), + nla_len(tb[IFLA_BROADCAST])); + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + if (tb[IFLA_LINKMODE]) + dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + + return dev; + +err_free: + free_netdev(dev); +err: + return ERR_PTR(err); +} + static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; const struct rtnl_link_ops *ops; struct net_device *dev; struct ifinfomsg *ifm; @@ -966,9 +1045,9 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(ifm->ifi_index); + dev = __dev_get_by_index(net, ifm->ifi_index); else if (ifname[0]) - dev = __dev_get_by_name(ifname); + dev = __dev_get_by_name(net, ifname); else dev = NULL; @@ -1053,40 +1132,17 @@ replay: if (!ifname[0]) snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - dev = alloc_netdev(ops->priv_size, ifname, ops->setup); - if (!dev) - return -ENOMEM; - - if (strchr(dev->name, '%')) { - err = dev_alloc_name(dev, dev->name); - if (err < 0) - goto err_free; - } - dev->rtnl_link_ops = ops; - - if (tb[IFLA_MTU]) - dev->mtu = nla_get_u32(tb[IFLA_MTU]); - if (tb[IFLA_ADDRESS]) - memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), - nla_len(tb[IFLA_ADDRESS])); - if (tb[IFLA_BROADCAST]) - memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), - nla_len(tb[IFLA_BROADCAST])); - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); - if (tb[IFLA_WEIGHT]) - dev->weight = nla_get_u32(tb[IFLA_WEIGHT]); - if (tb[IFLA_OPERSTATE]) - set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); - if (tb[IFLA_LINKMODE]) - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); - - if (ops->newlink) + + dev = rtnl_create_link(net, ifname, ops, tb); + + if (IS_ERR(dev)) + err = PTR_ERR(dev); + else if (ops->newlink) err = ops->newlink(dev, tb, data); else err = register_netdevice(dev); -err_free: - if (err < 0) + + if (err < 0 && !IS_ERR(dev)) free_netdev(dev); return err; } @@ -1094,6 +1150,7 @@ err_free: static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; @@ -1106,7 +1163,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { - dev = dev_get_by_index(ifm->ifi_index); + dev = dev_get_by_index(net, ifm->ifi_index); if (dev == NULL) return -ENODEV; } else @@ -1255,22 +1312,20 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return doit(skb, nlh, (void *)&rta_buf[0]); } -static void rtnetlink_rcv(struct sock *sk, int len) +static void rtnetlink_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; - - do { - mutex_lock(&rtnl_mutex); - netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg); - mutex_unlock(&rtnl_mutex); - - netdev_run_todo(); - } while (qlen); + rtnl_lock(); + netlink_rcv_skb(skb, &rtnetlink_rcv_msg); + rtnl_unlock(); } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; + + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + switch (event) { case NETDEV_UNREGISTER: rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); @@ -1308,8 +1363,8 @@ void __init rtnetlink_init(void) if (!rta_buf) panic("rtnetlink_init: cannot allocate rta_buf\n"); - rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv, - &rtnl_mutex, THIS_MODULE); + rtnl = netlink_kernel_create(&init_net, NETLINK_ROUTE, RTNLGRP_MAX, + rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); @@ -1335,3 +1390,5 @@ EXPORT_SYMBOL(rtnl_unlock); EXPORT_SYMBOL(rtnl_unicast); EXPORT_SYMBOL(rtnl_notify); EXPORT_SYMBOL(rtnl_set_sk_err); +EXPORT_SYMBOL(rtnl_create_link); +EXPORT_SYMBOL(ifla_policy); diff --git a/net/core/scm.c b/net/core/scm.c index 44c4ec2c876..530bee8d9ed 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -167,7 +167,8 @@ error: int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm = (struct cmsghdr __user *)msg->msg_control; + struct cmsghdr __user *cm + = (__force struct cmsghdr __user *)msg->msg_control; struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); int err; @@ -202,7 +203,8 @@ out: void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { - struct cmsghdr __user *cm = (struct cmsghdr __user*)msg->msg_control; + struct cmsghdr __user *cm + = (__force struct cmsghdr __user*)msg->msg_control; int fdmax = 0; int fdnum = scm->fp->count; @@ -222,7 +224,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) if (fdnum < fdmax) fdmax = fdnum; - for (i=0, cmfptr=(int __user *)CMSG_DATA(cm); i<fdmax; i++, cmfptr++) + for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax; + i++, cmfptr++) { int new_fd; err = security_file_receive(fp[i]); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 35021eb3ed0..944189d9632 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -472,6 +472,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_INET new->sp = secpath_get(old->sp); #endif + new->csum_start = old->csum_start; + new->csum_offset = old->csum_offset; + new->ip_summed = old->ip_summed; new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; @@ -545,8 +548,6 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) skb_reserve(n, headerlen); /* Set the tail pointer and length */ skb_put(n, skb->len); - n->csum = skb->csum; - n->ip_summed = skb->ip_summed; if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) BUG(); @@ -589,8 +590,6 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) skb_put(n, skb_headlen(skb)); /* Copy the bytes */ skb_copy_from_linear_data(skb, n->data, n->len); - n->csum = skb->csum; - n->ip_summed = skb->ip_summed; n->truesize += skb->data_len; n->data_len = skb->data_len; @@ -686,6 +685,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->transport_header += off; skb->network_header += off; skb->mac_header += off; + skb->csum_start += off; skb->cloned = 0; skb->hdr_len = 0; skb->nohdr = 0; @@ -734,9 +734,6 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) * * You must pass %GFP_ATOMIC as the allocation priority if this function * is called from an interrupt. - * - * BUG ALERT: ip_summed is not copied. Why does this work? Is it used - * only by netfilter in the cases when checksum is recalculated? --ANK */ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, @@ -749,7 +746,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, gfp_mask); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; - int off = 0; + int off; if (!n) return NULL; @@ -773,12 +770,13 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, copy_skb_header(n, skb); -#ifdef NET_SKBUFF_DATA_USES_OFFSET off = newheadroom - oldheadroom; -#endif + n->csum_start += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET n->transport_header += off; n->network_header += off; n->mac_header += off; +#endif return n; } diff --git a/net/core/sock.c b/net/core/sock.c index 190de61cd64..4ed9b507c1e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -119,6 +119,7 @@ #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> #include <net/xfrm.h> @@ -366,6 +367,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES + struct net *net = sk->sk_net; char devname[IFNAMSIZ]; int index; @@ -394,7 +396,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) if (devname[0] == '\0') { index = 0; } else { - struct net_device *dev = dev_get_by_name(devname); + struct net_device *dev = dev_get_by_name(net, devname); ret = -ENODEV; if (!dev) @@ -872,7 +874,7 @@ static inline void sock_lock_init(struct sock *sk) * @prot: struct proto associated with this new sock instance * @zero_it: if we should zero the newly allocated sock */ -struct sock *sk_alloc(int family, gfp_t priority, +struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int zero_it) { struct sock *sk = NULL; @@ -893,6 +895,7 @@ struct sock *sk_alloc(int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); + sk->sk_net = get_net(net); } if (security_sk_alloc(sk, family, priority)) @@ -932,6 +935,7 @@ void sk_free(struct sock *sk) __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); security_sk_free(sk); + put_net(sk->sk_net); if (sk->sk_prot_creator->slab != NULL) kmem_cache_free(sk->sk_prot_creator->slab, sk); else @@ -941,7 +945,7 @@ void sk_free(struct sock *sk) struct sock *sk_clone(const struct sock *sk, const gfp_t priority) { - struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); + struct sock *newsk = sk_alloc(sk->sk_net, sk->sk_family, priority, sk->sk_prot, 0); if (newsk != NULL) { struct sk_filter *filter; @@ -1585,9 +1589,9 @@ void fastcall lock_sock_nested(struct sock *sk, int subclass) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (sk->sk_lock.owner) + if (sk->sk_lock.owned) __lock_sock(sk); - sk->sk_lock.owner = (void *)1; + sk->sk_lock.owned = 1; spin_unlock(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: @@ -1608,7 +1612,7 @@ void fastcall release_sock(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); - sk->sk_lock.owner = NULL; + sk->sk_lock.owned = 0; if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); spin_unlock_bh(&sk->sk_lock.slock); @@ -1973,7 +1977,7 @@ static const struct file_operations proto_seq_fops = { static int __init proto_init(void) { /* register /proc/net/protocols */ - return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0; + return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0; } subsys_initcall(proto_init); |