From 4e857c58efeb99393cba5a5d0d8ec7117183137c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Mar 2014 18:06:10 +0100 Subject: arch: Mass conversion of smp_mb__*() Mostly scripted conversion of the smp_mb__* barriers. Signed-off-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: http://lkml.kernel.org/n/tip-55dhyhocezdw1dg7u19hmh1u@git.kernel.org Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Signed-off-by: Ingo Molnar --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5b3042e69f8..e14f1cba591 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1326,7 +1326,7 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); @@ -3343,7 +3343,7 @@ static void net_tx_action(struct softirq_action *h) root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3353,7 +3353,7 @@ static void net_tx_action(struct softirq_action *h) &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -4244,7 +4244,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(n->gro_list); list_del(&n->poll_list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); -- cgit v1.2.3-70-g09d2 From a0265d28b3a5877b5b8edd14eb12a2ccb60ab1f3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 17 Apr 2014 13:45:03 +0800 Subject: net: Add __dev_forward_skb This patch adds the helper __dev_forward_skb which is identical to dev_forward_skb except that it doesn't actually inject the skb into the stack. This is useful where we wish to have finer control over how the packet is injected, e.g., via netif_rx_ni or netif_receive_skb. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 42 ++++++++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ed3a3aa660..a803d792df1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2633,6 +2633,7 @@ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_port_id *ppid); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq); +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index d2c8a06b3a9..11d70e3afef 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1661,6 +1661,29 @@ bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(is_skb_forwardable); +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->protocol = eth_type_trans(skb, dev); + + return 0; +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); + /** * dev_forward_skb - loopback an skb to another netif * @@ -1681,24 +1704,7 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - if (unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - - skb_scrub_packet(skb, true); - skb->protocol = eth_type_trans(skb, dev); - - return netif_rx_internal(skb); + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); -- cgit v1.2.3-70-g09d2 From 56bfa7ee7c8892f1aa61797e4b8fec84b31d29b3 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 1 May 2014 11:40:30 -0700 Subject: unregister_netdevice: move RTM_DELLINK until after ndo_uninit This patch fixes the ordering of rtnl notifications during unregister_netdevice by moving the RTM_DELLINK notification until after ndo_uninit. The problem was seen with unregistering bond netdevices. The bond ndo_uninit callback generates a few RTM_NEWLINK notifications for NETDEV_CHANGEADDR and NETDEV_FEAT_CHANGE. This is seen mostly when the bond is deleted with slaves still enslaved to the bond. During unregister_netdevice (rollback_registered_many to be specific), bond ndo_uninit is called after the RTM_DELLINK notification goes out. This results in userspace seeing RTM_DELLINK followed by a couple of RTM_NEWLINK's. In userspace the problem was seen with libnl. The libnl cache deletes the bond when it sees RTM_DELLINK and re-adds the bond with the following RTM_NEWLINK, resulting in a stale bond entry in the libnl cache when the kernel has already deleted the bond. This patch has been tested for bond, bridges and vlan devices. Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 11d70e3afef..fe0b9cd69cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5606,10 +5606,6 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); - /* * Flush the unicast and multicast chains */ @@ -5619,6 +5615,10 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); -- cgit v1.2.3-70-g09d2 From c1e756bfcbcac838a86a23f3e4501b556a961e3c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 5 May 2014 15:00:44 +0200 Subject: Revert "net: core: introduce netif_skb_dev_features" This reverts commit d206940319c41df4299db75ed56142177bb2e5f6; there are no more callers. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +------ net/core/dev.c | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ed3a3aa660..20e99efb1ca 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3180,12 +3180,7 @@ void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -netdev_features_t netif_skb_dev_features(struct sk_buff *skb, - const struct net_device *dev); -static inline netdev_features_t netif_skb_features(struct sk_buff *skb) -{ - return netif_skb_dev_features(skb, skb->dev); -} +netdev_features_t netif_skb_features(struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { diff --git a/net/core/dev.c b/net/core/dev.c index d2c8a06b3a9..c619b864133 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2418,7 +2418,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. 
*/ -static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2493,38 +2493,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) } static netdev_features_t harmonize_features(struct sk_buff *skb, - const struct net_device *dev, - netdev_features_t features) + netdev_features_t features) { int tmp; if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) { features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(dev, skb)) { + } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } return features; } -netdev_features_t netif_skb_dev_features(struct sk_buff *skb, - const struct net_device *dev) +netdev_features_t netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - netdev_features_t features = dev->features; + netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } - features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) @@ -2532,9 +2530,9 @@ netdev_features_t netif_skb_dev_features(struct sk_buff *skb, NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, dev, features); + return harmonize_features(skb, features); } -EXPORT_SYMBOL(netif_skb_dev_features); +EXPORT_SYMBOL(netif_skb_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) -- cgit v1.2.3-70-g09d2 From 200b916f3575bdf11609cb447661b8d5957b0bbf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 May 2014 15:11:20 -0700 Subject: rtnetlink: wait for unregistering devices in rtnl_link_unregister() From: Cong Wang commit 50624c934db18ab90 ("net: Delay default_device_exit_batch until no devices are unregistering") introduced rtnl_lock_unregistering() for default_device_exit_batch(). The same race could happen when we rmmod a driver which calls rtnl_link_unregister() as we call dev->destructor without rtnl lock. For the long term, I think we should clean up the mess of netdev_run_todo() and the net namespace exit code. Cc: Eric W. Biederman Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 5 +++++ net/core/dev.c | 2 +- net/core/net_namespace.c | 2 +- net/core/rtnetlink.c | 33 ++++++++++++++++++++++++++++++++- 4 files changed, 39 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 8e3e66ac0a5..953937ea523 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -4,6 +4,7 @@ #include #include +#include #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); @@ -22,6 +23,10 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); + +extern wait_queue_head_t netdev_unregistering_wq; +extern struct mutex net_mutex; + #ifdef CONFIG_PROVE_LOCKING extern int lockdep_rtnl_is_held(void); #else diff --git a/net/core/dev.c b/net/core/dev.c index c619b864133..6da649bde4f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5541,7 +5541,7 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); -static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 81d3a9a0845..7c8ffd97496 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -24,7 +24,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -static DEFINE_MUTEX(net_mutex); +DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9837bebf93c..2d8d8fcfa06 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -353,15 +353,46 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. + */ +static void rtnl_lock_unregistering_all(void) +{ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + for_each_net(net) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - rtnl_lock(); + /* Close the race with cleanup_net() */ + mutex_lock(&net_mutex); + rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); + mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); -- cgit v1.2.3-70-g09d2 From 29e98242783ed3ba569797846a606ba66f781625 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 May 2014 11:34:37 -0700 Subject: net: gro: make sure skb->cb[] initial content does not have to be zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting from linux-3.13, GRO attempts to build full size skbs. The problem is that the commit assumed one particular field in skb->cb[] was clean, but that is not the case on some stacked devices. Timo reported a crash when traffic is decrypted before reaching a GRE device. 
Fix this by initializing NAPI_GRO_CB(skb)->last at the right place; this also removes one conditional. Thanks a lot to Timo for providing full reports and bisecting this. Fixes: 8a29111c7ca6 ("net: gro: allow to build full sized skb") Bisected-by: Timo Teräs Signed-off-by: Eric Dumazet Tested-by: Timo Teräs Signed-off-by: David S. Miller --- net/core/dev.c | 1 + net/core/skbuff.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6da649bde4f..ed928e84655 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3951,6 +3951,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1b62343f583..8383b2bddeb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3076,7 +3076,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) if (unlikely(p->len + len >= 65536)) return -E2BIG; - lp = NAPI_GRO_CB(p)->last ?: p; + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); if (headlen <= offset) { @@ -3192,7 +3192,7 @@ merge: __skb_pull(skb, offset); - if (!NAPI_GRO_CB(p)->last) + if (NAPI_GRO_CB(p)->last == p) skb_shinfo(p)->frag_list = skb; else NAPI_GRO_CB(p)->last->next = skb; -- cgit v1.2.3-70-g09d2 From 4085ebe8c31face855fd01ee40372cb4aab1df3a Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:04:53 -0400 Subject: net: Find the nesting level of a given device by type. Multiple devices in the kernel can be stacked/nested and they need to know their nesting level for the purposes of lockdep. This patch provides a generic function that determines the nesting level of a particular device by its type (e.g., vlan, macvlan, etc.). We only care about nesting of the same type of devices. For example: eth0 <- vlan0.10 <- macvlan0 <- vlan1.20 The nesting level of vlan1.20 would be 1, since there is another vlan in the stack under it. Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 10 ++++++++++ net/core/dev.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20e99efb1ca..fb912e8e5c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3077,6 +3077,14 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, priv; \ priv = netdev_lower_get_next_private_rcu(dev, &(iter))) +void *netdev_lower_get_next(struct net_device *dev, + struct list_head **iter); +#define netdev_for_each_lower_dev(dev, ldev, iter) \ + for (iter = &(dev)->adj_list.lower, \ + ldev = netdev_lower_get_next(dev, &(iter)); \ + ldev; \ + ldev = netdev_lower_get_next(dev, &(iter))) + void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); @@ -3092,6 +3100,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); diff --git a/net/core/dev.c b/net/core/dev.c index ed928e84655..6ee3ac25ed7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4622,6 +4622,32 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU @@ -5072,6 +5098,30 @@ void *netdev_lower_dev_get_private(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_dev_get_private); + +int dev_get_nest_level(struct net_device *dev, + bool (*type_check)(struct net_device *dev)) +{ + struct net_device *lower = NULL; + struct list_head *iter; + int max_nest = -1; + int nest; + + ASSERT_RTNL(); + + netdev_for_each_lower_dev(dev, lower, iter) { + nest = dev_get_nest_level(lower, type_check); + if (max_nest < nest) + max_nest = nest; + } + + if (type_check(dev)) + max_nest++; + + return max_nest; +} +EXPORT_SYMBOL(dev_get_nest_level); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.2.3-70-g09d2 From d38569ab2bba6e6b3233acfc3a84cdbcfbd1f79f Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:04:55 -0400 Subject: vlan: Fix lockdep warning with stacked vlan devices. This reverts commit dc8eaaa006350d24030502a4521542e74b5cb39f. 
"vlan: Fix lockdep warning when vlan dev handle notification". Instead we use the new API to find the lock subclass of our vlan device. This way we can support configurations where vlans are interspersed with other devices: bond -> vlan -> macvlan -> vlan Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 3 ++- net/8021q/vlan.c | 1 + net/8021q/vlan_dev.c | 52 +++++++++---------------------------------------- net/core/dev.c | 1 - 4 files changed, 12 insertions(+), 45 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 13bbbde00e6..724bde8477b 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -73,7 +73,7 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); -static inline int is_vlan_dev(struct net_device *dev) +static inline bool is_vlan_dev(struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } @@ -159,6 +159,7 @@ struct vlan_dev_priv { #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif + unsigned int nest_level; }; static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 175273f38cb..44ebd5c2cd4 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -169,6 +169,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; + vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 733ec283ed1..019efb79708 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -493,48 +493,10 @@ static void vlan_dev_change_rx_flags(struct net_device *dev, int change) } } -static int vlan_calculate_locking_subclass(struct net_device *real_dev) -{ - int subclass = 0; - - while (is_vlan_dev(real_dev)) { - subclass++; - real_dev = vlan_dev_priv(real_dev)->real_dev; - } - - return subclass; -} - -static void vlan_dev_mc_sync(struct net_device *to, struct net_device *from) -{ - int err = 0, subclass; - - subclass = vlan_calculate_locking_subclass(to); - - spin_lock_nested(&to->addr_list_lock, subclass); - err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); - if (!err) - __dev_set_rx_mode(to); - spin_unlock(&to->addr_list_lock); -} - -static void vlan_dev_uc_sync(struct net_device *to, struct net_device *from) -{ - int err = 0, subclass; - - subclass = vlan_calculate_locking_subclass(to); - - spin_lock_nested(&to->addr_list_lock, subclass); - err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); - if (!err) - __dev_set_rx_mode(to); - spin_unlock(&to->addr_list_lock); -} - static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) { - vlan_dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); - vlan_dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); + dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); + dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } /* @@ -562,6 +524,11 @@ static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass); } +static int vlan_dev_get_lock_subclass(struct net_device *dev) +{ + return vlan_dev_priv(dev)->nest_level; +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .rebuild = vlan_dev_rebuild_header, @@ -597,7 +564,6 @@ 
static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - int subclass = 0; netif_carrier_off(dev); @@ -646,8 +612,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - subclass = vlan_calculate_locking_subclass(dev); - vlan_dev_set_lockdep_class(dev, subclass); + vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev)); vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan_dev_priv(dev)->vlan_pcpu_stats) @@ -819,6 +784,7 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, }; void vlan_setup(struct net_device *dev) diff --git a/net/core/dev.c b/net/core/dev.c index 6ee3ac25ed7..2b872bfbd17 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5287,7 +5287,6 @@ void __dev_set_rx_mode(struct net_device *dev) if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } -EXPORT_SYMBOL(__dev_set_rx_mode); void dev_set_rx_mode(struct net_device *dev) { -- cgit v1.2.3-70-g09d2 From 44a4085538c844e79d6ee6bcf46fabf7c57a9a38 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 16 May 2014 17:20:38 -0400 Subject: bonding: Fix stacked device detection in arp monitoring Prior to commit fbd929f2dce460456807a51e18d623db3db9f077 ("bonding: support QinQ for bond arp interval") the arp monitoring code allowed for proper detection of devices stacked on top of vlans. Since the above commit, the code can still detect a device stacked on top of a single vlan, but not a device stacked on top of a Q-in-Q configuration. The search will only set the inner vlan tag if the route device is the vlan device. However, this is not always the case, as it is possible to extend the stacked configuration. With this patch it is possible to provision devices on top of a Q-in-Q vlan configuration that should be used as a source of ARP monitoring information. For example: ip link add link bond0 vlan10 type vlan proto 802.1q id 10 ip link add link vlan10 vlan100 type vlan proto 802.1q id 100 ip link add link vlan100 type macvlan Note: This patch limits the number of stacked VLANs to 2, just like before. The original, however, had another issue in that if we had more than 2 levels of VLANs, we would end up generating incorrectly tagged traffic. This is no longer possible. Fixes: fbd929f2dce460456807a51e18d623db3db9f077 ("bonding: support QinQ for bond arp interval") CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: Ding Tianhong CC: Patrick McHardy Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 134 +++++++++++++++++++--------------------- drivers/net/bonding/bonding.h | 1 + include/linux/if_vlan.h | 6 ++ include/linux/netdevice.h | 9 +++ net/core/dev.c | 26 ++++++++ 5 files changed, 107 insertions(+), 69 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 69aff72c895..d3a67896d43 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2126,10 +2126,10 @@ static bool bond_has_this_ip(struct bonding *bond, __be32 ip) */ static void bond_arp_send(struct net_device *slave_dev, int arp_op, __be32 dest_ip, __be32 src_ip, - struct bond_vlan_tag *inner, - struct bond_vlan_tag *outer) + struct bond_vlan_tag *tags) { struct sk_buff *skb; + int i; pr_debug("arp %d on slave %s: dst %pI4 src %pI4\n", arp_op, slave_dev->name, &dest_ip, &src_ip); @@ -2141,21 +2141,26 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, net_err_ratelimited("ARP packet allocation failed\n"); return; } - if (outer->vlan_id) { - if (inner->vlan_id) { - pr_debug("inner tag: proto %X vid %X\n", - ntohs(inner->vlan_proto), inner->vlan_id); - skb = __vlan_put_tag(skb, inner->vlan_proto, - inner->vlan_id); - if (!skb) { - net_err_ratelimited("failed to insert inner VLAN tag\n"); - return; - } - } - pr_debug("outer reg: proto %X vid %X\n", - ntohs(outer->vlan_proto), outer->vlan_id); - skb = vlan_put_tag(skb, outer->vlan_proto, outer->vlan_id); + /* Go through all the tags backwards and add them to the packet */ + for (i = BOND_MAX_VLAN_ENCAP - 1; i > 0; i--) { + if (!tags[i].vlan_id) + continue; + + pr_debug("inner tag: proto %X vid %X\n", + ntohs(tags[i].vlan_proto), tags[i].vlan_id); + skb = __vlan_put_tag(skb, tags[i].vlan_proto, + tags[i].vlan_id); + if (!skb) { + net_err_ratelimited("failed to insert inner VLAN tag\n"); + return; + } + } + /* Set the outer tag */ + if (tags[0].vlan_id) { + pr_debug("outer tag: proto %X vid %X\n", + ntohs(tags[0].vlan_proto), tags[0].vlan_id); + skb = vlan_put_tag(skb, tags[0].vlan_proto, tags[0].vlan_id); if (!skb) { net_err_ratelimited("failed to insert outer VLAN tag\n"); return; @@ -2164,22 +2169,52 @@ static void bond_arp_send(struct net_device *slave_dev, int arp_op, arp_xmit(skb); } +/* Validate the device path between the @start_dev and the @end_dev. + * The path is valid if the @end_dev is reachable through device + * stacking. + * When the path is validated, collect any vlan information in the + * path. 
+ */ +static bool bond_verify_device_path(struct net_device *start_dev, + struct net_device *end_dev, + struct bond_vlan_tag *tags) +{ + struct net_device *upper; + struct list_head *iter; + int idx; + + if (start_dev == end_dev) + return true; + + netdev_for_each_upper_dev_rcu(start_dev, upper, iter) { + if (bond_verify_device_path(upper, end_dev, tags)) { + if (is_vlan_dev(upper)) { + idx = vlan_get_encap_level(upper); + if (idx >= BOND_MAX_VLAN_ENCAP) + return false; + + tags[idx].vlan_proto = + vlan_dev_vlan_proto(upper); + tags[idx].vlan_id = vlan_dev_vlan_id(upper); + } + return true; + } + } + + return false; +} static void bond_arp_send_all(struct bonding *bond, struct slave *slave) { - struct net_device *upper, *vlan_upper; - struct list_head *iter, *vlan_iter; struct rtable *rt; - struct bond_vlan_tag inner, outer; + struct bond_vlan_tag tags[BOND_MAX_VLAN_ENCAP]; __be32 *targets = bond->params.arp_targets, addr; int i; + bool ret; for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) { pr_debug("basa: target %pI4\n", &targets[i]); - inner.vlan_proto = 0; - inner.vlan_id = 0; - outer.vlan_proto = 0; - outer.vlan_id = 0; + memset(tags, 0, sizeof(tags)); /* Find out through which dev should the packet go */ rt = ip_route_output(dev_net(bond->dev), targets[i], 0, @@ -2192,7 +2227,8 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n", bond->dev->name, &targets[i]); - bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], 0, &inner, &outer); + bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], + 0, tags); continue; } @@ -2201,52 +2237,12 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) goto found; rcu_read_lock(); - /* first we search only for vlan devices. for every vlan - * found we verify its upper dev list, searching for the - * rt->dst.dev. If found we save the tag of the vlan and - * proceed to send the packet. - */ - netdev_for_each_all_upper_dev_rcu(bond->dev, vlan_upper, - vlan_iter) { - if (!is_vlan_dev(vlan_upper)) - continue; - - if (vlan_upper == rt->dst.dev) { - outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper); - outer.vlan_id = vlan_dev_vlan_id(vlan_upper); - rcu_read_unlock(); - goto found; - } - netdev_for_each_all_upper_dev_rcu(vlan_upper, upper, - iter) { - if (upper == rt->dst.dev) { - /* If the upper dev is a vlan dev too, - * set the vlan tag to inner tag. 
- */ - if (is_vlan_dev(upper)) { - inner.vlan_proto = vlan_dev_vlan_proto(upper); - inner.vlan_id = vlan_dev_vlan_id(upper); - } - outer.vlan_proto = vlan_dev_vlan_proto(vlan_upper); - outer.vlan_id = vlan_dev_vlan_id(vlan_upper); - rcu_read_unlock(); - goto found; - } - } - } - - /* if the device we're looking for is not on top of any of - * our upper vlans, then just search for any dev that - * matches, and in case it's a vlan - save the id - */ - netdev_for_each_all_upper_dev_rcu(bond->dev, upper, iter) { - if (upper == rt->dst.dev) { - rcu_read_unlock(); - goto found; - } - } + ret = bond_verify_device_path(bond->dev, rt->dst.dev, tags); rcu_read_unlock(); + if (ret) + goto found; + /* Not our device - skip */ pr_debug("%s: no path to arp_ip_target %pI4 via rt.dev %s\n", bond->dev->name, &targets[i], @@ -2259,7 +2255,7 @@ found: addr = bond_confirm_addr(rt->dst.dev, targets[i], 0); ip_rt_put(rt); bond_arp_send(slave->dev, ARPOP_REQUEST, targets[i], - addr, &inner, &outer); + addr, tags); } } diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index b8bdd0acc8f..00bea320e3b 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -36,6 +36,7 @@ #define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n" +#define BOND_MAX_VLAN_ENCAP 2 #define BOND_MAX_ARP_TARGETS 16 #define BOND_DEFAULT_MIIMON 100 diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 724bde8477b..c901b13b6f0 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -484,4 +484,10 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, */ skb->protocol = htons(ETH_P_802_2); } + +static inline int vlan_get_encap_level(struct net_device *dev) +{ + BUG_ON(!is_vlan_dev(dev)); + return vlan_dev_priv(dev)->nest_level; +} #endif /* !(_LINUX_IF_VLAN_H_) */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d4b1f1b6b7..b42d07b0390 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3056,9 +3056,18 @@ extern int weight_p; extern int bpf_jit_enable; bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter); struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); +/* iterate through upper list, must be called under RCU read lock */ +#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \ + for (iter = &(dev)->adj_list.upper, \ + updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \ + updev; \ + updev = netdev_upper_get_next_dev_rcu(dev, &(iter))) + /* iterate through upper list, must be called under RCU read lock */ #define netdev_for_each_all_upper_dev_rcu(dev, updev, iter) \ for (iter = &(dev)->all_adj_list.upper, \ diff --git a/net/core/dev.c b/net/core/dev.c index 2b872bfbd17..9abc503b19b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4541,6 +4541,32 @@ void *netdev_adjacent_get_private(struct list_head *adj_list) } EXPORT_SYMBOL(netdev_adjacent_get_private); +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. 
+ */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + /** * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device -- cgit v1.2.3-70-g09d2 From 4b9b1cdf83c4facba89e0646aeac8ead679851b8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 28 May 2014 18:03:48 +0200 Subject: net: fix wrong mac_len calculation for vlans After 1e785f48d29a ("net: Start with correct mac_len in skb_network_protocol") skb->mac_len is used as a start of the calculation in skb_network_protocol() but that is not always correct. If skb->protocol == 8021Q/AD, usually the vlan header is already inserted in the skb (i.e. vlan reorder hdr == 0). Usually when the packet enters dev_hard_xmit it has mac_len == 0 so we take 2 bytes from the destination mac address (skb->data + VLAN_HLEN) as a type in skb_network_protocol() and return vlan_depth == 4. In the case where TSO is off, the mac_len is set but it is == 18 (ETH_HLEN + VLAN_HLEN), so skb_network_protocol() returns a type from inside the packet and offset == 22. Also make vlan_depth unsigned as suggested before. As suggested by Eric Dumazet, move the while() loop into the if() so we can avoid additional testing in the fast path. Here are a few netperf tests + debug printk's to illustrate: cat netperf.tso-on.reorder-on.bugged - Vlan -> device (reorder on, default, this case is okay) MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.3.1 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 7111.54 [ 81.605435] skb->len 65226 skb->gso_size 1448 skb->proto 0x800 skb->mac_len 0 vlan_depth 0 type 0x800 - Vlan -> device (reorder off, bad) cat netperf.tso-on.reorder-off.bugged MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.3.1 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 241.35 [ 204.578332] skb->len 1518 skb->gso_size 0 skb->proto 0x8100 skb->mac_len 0 vlan_depth 4 type 0x5301 0x5301 are the last two bytes of the destination mac. And if we stop TSO, we may get even the following: [ 83.343156] skb->len 2966 skb->gso_size 1448 skb->proto 0x8100 skb->mac_len 18 vlan_depth 22 type 0xb84 Because mac_len already accounts for VLAN_HLEN. After the fix: cat netperf.tso-on.reorder-off.fixed MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.3.1 () port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.01 5001.46 [ 81.888489] skb->len 65230 skb->gso_size 1448 skb->proto 0x8100 skb->mac_len 0 vlan_depth 18 type 0x800 CC: Vlad Yasevich CC: Eric Dumazet CC: Daniel Borkmann CC: David S. Miller Fixes: 1e785f48d29a ("net: Start with correct mac_len in skb_network_protocol") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/core/dev.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9abc503b19b..fb8b0546485 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2283,8 +2283,8 @@ EXPORT_SYMBOL(skb_checksum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { + unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; - int vlan_depth = skb->mac_len; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { @@ -2297,15 +2297,30 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; + /* if skb->protocol is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } *depth = vlan_depth; -- cgit v1.2.3-70-g09d2 From 92ff71b8fe9cd9c673615fc6f3870af7376d7c84 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 3 Jun 2014 17:11:54 -0700 Subject: net: remove some useless frees on failure in alloc_netdev_mqs() When we jump to free_pcpu on failure in alloc_netdev_mqs(), rx and tx queues are not yet allocated, so there is no need to free them. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 0355ca5d292..1ba2cfe3f8e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6503,11 +6503,6 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kfree(dev->_rx); -#endif - free_dev: netdev_freemem(dev); return NULL; -- cgit v1.2.3-70-g09d2 From 4cb28970a23ff209199b0a4358d68efe82c8f493 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 2 Jun 2014 15:55:22 -0700 Subject: net: use the new API kvfree() It is available since v3.15-rc5. Cc: Pablo Neira Ayuso Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/core/dev.c | 10 ++-------- net/ipv4/tcp_metrics.c | 5 +---- net/netfilter/ipset/ip_set_core.c | 5 +---- net/netfilter/nft_hash.c | 5 +---- net/netfilter/xt_recent.c | 5 +---- net/sched/sch_choke.c | 7 +------ net/sched/sch_fq.c | 5 +---- net/sched/sch_fq_codel.c | 7 +------ net/sched/sch_hhf.c | 7 +------ net/sched/sch_netem.c | 7 +------ net/sched/sch_sfq.c | 7 +------ 11 files changed, 12 insertions(+), 58 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5367bfba094..a9a08e4a485 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5933,10 +5933,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netif_free_tx_queues(struct net_device *dev) { - if (is_vmalloc_addr(dev->_tx)) - vfree(dev->_tx); - else - kfree(dev->_tx); + kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) @@ -6410,10 +6407,7 @@ void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } /** diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index dcaf72f1021..4fe04180598 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1159,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net) tm = next; } } - if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) - vfree(net->ipv4.tcp_metrics_hash); - else - kfree(net->ipv4.tcp_metrics_hash); + kvfree(net->ipv4.tcp_metrics_hash); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 117208321f1..ec8114fae50 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -271,10 +271,7 @@ ip_set_free(void *members) { pr_debug("%p: free with %s\n", members, is_vmalloc_addr(members) ? 
"vfree" : "kfree"); - if (is_vmalloc_addr(members)) - vfree(members); - else - kfree(members); + kvfree(members); } EXPORT_SYMBOL_GPL(ip_set_free); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 1dfeb678683..4080ed6a072 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -76,10 +76,7 @@ static bool nft_hash_lookup(const struct nft_set *set, static void nft_hash_tbl_free(const struct nft_hash_table *tbl) { - if (is_vmalloc_addr(tbl)) - vfree(tbl); - else - kfree(tbl); + kvfree(tbl); } static unsigned int nft_hash_tbl_size(unsigned int nelem) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 1e657cf715c..a9faae89f95 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -313,10 +313,7 @@ out: static void recent_table_free(void *addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } static int recent_mt_check(const struct xt_mtchk_param *par, diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 2aee02802c2..ed30e436128 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -391,12 +391,7 @@ static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { static void choke_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static int choke_change(struct Qdisc *sch, struct nlattr *opt) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 23c682b42f9..ba32c2b005d 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -591,10 +591,7 @@ static void *fq_alloc_node(size_t sz, int node) static void fq_free(void *addr) { - if (addr && is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); + kvfree(addr); } static int fq_resize(struct Qdisc *sch, u32 log) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 0bf432c782c..063b726bf1f 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -365,12 +365,7 @@ static void *fq_codel_zalloc(size_t sz) static void fq_codel_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void fq_codel_destroy(struct Qdisc *sch) diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 6aab8619bbb..d85b6812a7d 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -494,12 +494,7 @@ static void *hhf_zalloc(size_t sz) static void hhf_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void hhf_destroy(struct Qdisc *sch) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f1669a00f57..111d70fddae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -648,12 +648,7 @@ static void netem_reset(struct Qdisc *sch) static void dist_free(struct disttable *d) { - if (d) { - if (is_vmalloc_addr(d)) - vfree(d); - else - kfree(d); - } + kvfree(d); } /* diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 87317ff0b4e..1af2f73906d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -716,12 +716,7 @@ static void *sfq_alloc(size_t sz) static void sfq_free(void *addr) { - if (addr) { - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); - } + kvfree(addr); } static void sfq_destroy(struct Qdisc *sch) -- cgit v1.2.3-70-g09d2 From 3b392ddba25a95dcf5fb30b33358961c49dd5cfc Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 4 Jun 2014 08:53:17 +0900 Subject: MPLS: Use mpls_features to activate software MPLS GSO 
segmentation If an MPLS packet requires segmentation then use mpls_features to determine if the software implementation should be used. As no driver advertises MPLS GSO segmentation this will always be the case. I had not noticed that this was necessary before as software MPLS GSO segmentation was already being used in my test environment. I believe that the reason for that is the skbs in question always had fragments and the driver I used does not advertise NETIF_F_FRAGLIST (which seems to be the case for most drivers). Thus software segmentation was activated by skb_gso_ok(). This introduces the overhead of an extra call to skb_network_protocol() in the case where CONFIG_NET_MPLS_GSO is set and skb->ip_summed == CHECKSUM_NONE. Thanks to Jesse Gross for prompting me to investigate this. Signed-off-by: Simon Horman Acked-by: YAMAMOTO Takashi Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/dev.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a9a08e4a485..ed8fe62d41a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2513,13 +2513,39 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. + */ +#ifdef CONFIG_NET_MPLS_GSO +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + features &= skb->dev->mpls_features; + + return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} +#endif + static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { int tmp; + __be16 type; + + type = skb_network_protocol(skb, &tmp); + features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) { + !can_checksum_protocol(features, type)) { features &= ~NETIF_F_ALL_CSUM; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; -- cgit v1.2.3-70-g09d2 From 87757a917b0b3c0787e0563c679762152be81312 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Jun 2014 06:44:03 -0700 Subject: net: force a list_del() in unregister_netdevice_many() The unregister_netdevice_many() API is error prone and we had too many bugs because of dangling LIST_HEAD on stacks. See commit f87e6f47933e3e ("net: dont leave active on stack LIST_HEAD"). In fact, instead of making sure no caller leaves an active list_head, just force a list_del() in the callee. No one seems to need to access the list after unregister_netdevice_many(). Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 1 - net/core/dev.c | 5 ++++- net/core/rtnetlink.c | 1 - net/mac80211/iface.c | 1 - 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d53e299ae1d..7eec598c5cb 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1036,7 +1036,6 @@ static int macvlan_device_event(struct notifier_block *unused, list_for_each_entry_safe(vlan, next, &port->vlans, list) vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill); unregister_netdevice_many(&list_kill); - list_del(&list_kill); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ diff --git a/net/core/dev.c b/net/core/dev.c index fb8b0546485..a30bef1882f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6613,6 +6613,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdevice_many - unregister many devices * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack wont be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@ -6622,6 +6625,7 @@ void unregister_netdevice_many(struct list_head *head) rollback_registered_many(head); list_for_each_entry(dev, head, unreg_list) net_set_todo(dev); + list_del(head); } } EXPORT_SYMBOL(unregister_netdevice_many); @@ -7077,7 +7081,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } } unregister_netdevice_many(&dev_kill_list); - list_del(&dev_kill_list); rtnl_unlock(); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f4e9037f9a0..fbdb1556b0d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1744,7 +1744,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); - list_del(&list_kill); return 0; } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b8d331e7d88..34799e06ee0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1758,7 +1758,6 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) } mutex_unlock(&local->iflist_mtx); unregister_netdevice_many(&unreg_list); - list_del(&unreg_list); list_for_each_entry_safe(sdata, tmp, &wdev_list, list) { list_del(&sdata->list); -- cgit v1.2.3-70-g09d2 From 11ef7a8996d5d433c9cd75d80651297eccbf6d42 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Jun 2014 09:50:40 -0700 Subject: net: Performance fix for process_backlog In process_backlog the input_pkt_queue is only checked once for new packets and quota is artificially reduced to reflect precisely the number of packets on the input_pkt_queue so that the loop exits appropriately. This patch changes the behavior to be more straightforward and less convoluted. Packets are processed until either the quota is met or there are no more packets to process. This patch seems to provide a small, but noticeable performance improvement. The performance improvement is a result of staying in the process_backlog loop longer, which can reduce the number of IPIs. Performance data using super_netperf TCP_RR with 200 flows: Before fix: 88.06% CPU utilization 125/190/309 90/95/99% latencies 1.46808e+06 tps 1145382 intrs./sec. With fix: 87.73% CPU utilization 122/183/296 90/95/99% latencies 1.4921e+06 tps 1021674.30 intrs./sec. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 30eedf67791..77c19c7bb49 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4227,9 +4227,8 @@ static int process_backlog(struct napi_struct *napi, int quota) #endif napi->weight = weight_p; local_irq_disable(); - while (work < quota) { + while (1) { struct sk_buff *skb; - unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); @@ -4243,24 +4242,24 @@ static int process_backlog(struct napi_struct *napi, int quota) } rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - - if (qlen < quota - work) { + if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ list_del(&napi->poll_list); napi->state = 0; + rps_unlock(sd); - quota = work + qlen; + break; } + + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); rps_unlock(sd); } local_irq_enable(); -- cgit v1.2.3-70-g09d2 From 54951194656e4853e441266fd095f880bc0398f3 Mon Sep 17 00:00:00 2001 From: Loic Prylli Date: Tue, 1 Jul 2014 21:39:43 -0700 Subject: net: Fix NETDEV_CHANGE notifier usage causing spurious arp flush A bug was introduced in the NETDEV_CHANGE notifier sequence causing the arp table to be sometimes spuriously cleared (including manual arp entries marked permanent), upon network link carrier changes. The changed argument for the notifier was applied only to a single caller of NETDEV_CHANGE, missing among others netdev_state_change(). So upon net_carrier events induced by the network, which trigger a call to netdev_state_change(), arp_netdev_event() would decide whether or not to clear the arp cache based on random/junk stack values (a kind of read buffer overflow). Fixes: be9efd365328 ("net: pass changed flags along with NETDEV_CHANGE event") Fixes: 6c8b4e3ff81b ("arp: flush arp cache on IFF_NOARP change") Signed-off-by: Loic Prylli Signed-off-by: David S. Miller --- net/core/dev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 77c19c7bb49..7990984ca36 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -148,6 +148,9 @@ struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -1207,7 +1210,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } -- cgit v1.2.3-70-g09d2
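A note on the last fix: consumers of NETDEV_CHANGE, such as arp_netdev_event(), cast the generic notifier payload to struct netdev_notifier_change_info and read flags_changed, which is why every emitter of NETDEV_CHANGE must send a change_info rather than a bare netdev_notifier_info. Below is a minimal sketch of such a consumer, written as a hypothetical out-of-tree module against the post-fix API; the module name and printout are illustrative assumptions, not part of any commit above. Before the fix, the flags_changed read here would have seen uninitialized stack data when the event came from netdev_state_change(); after it, a pure state change reliably reports flags_changed == 0.

/*
 * Sketch of a NETDEV_CHANGE consumer in the style of arp_netdev_event().
 * The names demo_* are hypothetical; only the notifier API calls are real.
 */
#include <linux/module.h>
#include <linux/netdevice.h>

static int demo_netdev_event(struct notifier_block *nb,
			     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct netdev_notifier_change_info *change_info;

	if (event != NETDEV_CHANGE)
		return NOTIFY_DONE;

	/* Valid cast: NETDEV_CHANGE now always carries a change_info,
	 * whose first member is the generic netdev_notifier_info. */
	change_info = ptr;
	if (change_info->flags_changed & IFF_NOARP)
		pr_info("%s: IFF_NOARP toggled\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_netdev_event,
};

static int __init demo_init(void)
{
	return register_netdevice_notifier(&demo_nb);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");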