diff options
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 337 |
1 files changed, 216 insertions, 121 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index b793e3521a3..683d493aa1b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> +#include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> @@ -133,6 +134,7 @@ #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> +#include <linux/hrtimer.h> #include "net-sysfs.h" @@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - - /* the same for macvlan devices */ - if (netif_is_macvlan(dev)) - dev = macvlan_dev_real_dev(dev); + struct net_device *lower_dev; + struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); + + netdev_for_each_lower_dev(dev, lower_dev, iter) + dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); @@ -1697,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_scrub_packet(skb, true); skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); return 0; } @@ -2525,12 +2523,12 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ -#ifdef CONFIG_NET_MPLS_GSO +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; @@ -2565,7 +2563,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { - const struct net_device *dev = skb->dev; + struct net_device *dev = skb->dev; netdev_features_t features = dev->features; u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; @@ -2573,11 +2571,21 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + + if (!vlan_tx_tag_present(skb)) { + if (unlikely(protocol == htons(ETH_P_8021Q) || + protocol == htons(ETH_P_8021AD))) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else { + goto finalize; + } } features = netdev_intersect_features(features, @@ -2594,6 +2602,11 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); +finalize: + if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); @@ -2647,12 +2660,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (skb) - skb->vlan_tci = 0; - } + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); return skb; } @@ -2668,19 +2677,12 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (unlikely(!skb)) goto out_null; - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; - if (netif_needs_gso(dev, skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { - segs = NULL; + goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; @@ -3304,7 +3306,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, rps_lock(sd); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { - if (skb_queue_len(&sd->input_pkt_queue)) { + if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); @@ -4157,6 +4159,10 @@ EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); @@ -4175,7 +4181,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; } return skb; @@ -4312,20 +4318,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) local_irq_enable(); } +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); -#ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ - if (sd->rps_ipi_list) { + if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } -#endif + napi->weight = weight_p; local_irq_disable(); while (1) { @@ -4352,7 +4366,6 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - list_del(&napi->poll_list); napi->state = 0; rps_unlock(sd); @@ -4372,7 +4385,8 @@ static int process_backlog(struct napi_struct *napi, int quota) * __napi_schedule - schedule for receive * @n: entry to schedule * - * The entry's receive function will be scheduled to run + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { @@ -4384,18 +4398,29 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); - list_del(&n->poll_list); + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +void napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4406,12 +4431,28 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); + if (n->gro_list) { + unsigned long timeout = 0; + + if (work_done) + timeout = n->dev->gro_flush_timeout; + + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + else + napi_gro_flush(n, false); + } + if (likely(list_empty(&n->poll_list))) { + WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + } else { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); + } } -EXPORT_SYMBOL(napi_complete); +EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ struct napi_struct *napi_by_id(unsigned int napi_id) @@ -4465,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi) } EXPORT_SYMBOL_GPL(napi_hash_del); +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + if (napi->gro_list) + napi_schedule(napi); + + return HRTIMER_NORESTART; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; @@ -4487,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, } EXPORT_SYMBOL(netif_napi_add); +void napi_disable(struct napi_struct *n) +{ + might_sleep(); + set_bit(NAPI_STATE_DISABLE, &n->state); + + while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + + hrtimer_cancel(&n->timer); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable); + void netif_napi_del(struct napi_struct *napi) { list_del_init(&napi->dev_list); @@ -4498,91 +4566,112 @@ void netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(netif_napi_del); -static void net_rx_action(struct softirq_action *h) +static int napi_poll(struct napi_struct *n, struct list_head *repoll) { - struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + 2; - int budget = netdev_budget; void *have; + int work, weight; - local_irq_disable(); + list_del_init(&n->poll_list); - while (!list_empty(&sd->poll_list)) { - struct napi_struct *n; - int work, weight; + have = netpoll_poll_lock(n); - /* If softirq window is exhuasted then punt. - * Allow this to run for 2 jiffies since which will allow - * an average latency of 1.5/HZ. - */ - if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) - goto softnet_break; + weight = n->weight; - local_irq_enable(); + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); + } - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + WARN_ON_ONCE(work > weight); - have = netpoll_poll_lock(n); + if (likely(work < weight)) + goto out_unlock; - weight = n->weight; + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + goto out_unlock; + } - /* This NAPI_STATE_SCHED test is for avoiding a race - * with netpoll's poll_napi(). Only the entity which - * obtains the lock and sees NAPI_STATE_SCHED set will - * actually make the ->poll() call. Therefore we avoid - * accidentally calling ->poll() when NAPI is not scheduled. + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. */ - work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) { - work = n->poll(n, weight); - trace_napi_poll(n); - } + napi_gro_flush(n, HZ >= 1000); + } - WARN_ON_ONCE(work > weight); + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + goto out_unlock; + } - budget -= work; + list_add_tail(&n->poll_list, repoll); - local_irq_disable(); +out_unlock: + netpoll_poll_unlock(have); - /* Drivers must not modify the NAPI state if they - * consume the entire weight. In such cases this code - * still "owns" the NAPI instance and therefore can - * move the instance around on the list at-will. - */ - if (unlikely(work == weight)) { - if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); - napi_complete(n); - local_irq_disable(); - } else { - if (n->gro_list) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - local_irq_enable(); - napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); - } - list_move_tail(&n->poll_list, &sd->poll_list); - } + return work; +} + +static void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + 2; + int budget = netdev_budget; + LIST_HEAD(list); + LIST_HEAD(repoll); + + local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); + + for (;;) { + struct napi_struct *n; + + if (list_empty(&list)) { + if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) + return; + break; } - netpoll_poll_unlock(have); + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + + /* If softirq window is exhausted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + sd->time_squeeze++; + break; + } } -out: - net_rps_action_and_irq_enable(sd); - return; + local_irq_disable(); -softnet_break: - sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto out; + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + + net_rps_action_and_irq_enable(sd); } struct netdev_adjacent { @@ -5782,7 +5871,7 @@ EXPORT_SYMBOL(dev_change_carrier); * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; @@ -5861,6 +5950,8 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + /* Shutdown queueing discipline. */ dev_shutdown(dev); @@ -5870,6 +5961,11 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + GFP_KERNEL); + /* * Flush the unicast and multicast chains */ @@ -5879,9 +5975,8 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -7196,11 +7291,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) */ struct net *net; bool unregistering; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - prepare_to_wait(&netdev_unregistering_wq, &wait, - TASK_UNINTERRUPTIBLE); unregistering = false; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { @@ -7212,9 +7306,10 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) if (!unregistering) break; __rtnl_unlock(); - schedule(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - finish_wait(&netdev_unregistering_wq, &wait); + remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) |