diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/dev.c | 295 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 7 | ||||
-rw-r--r-- | net/core/ethtool.c | 81 | ||||
-rw-r--r-- | net/core/filter.c | 8 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 115 | ||||
-rw-r--r-- | net/core/netpoll.c | 7 | ||||
-rw-r--r-- | net/core/pktgen.c | 28 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 66 | ||||
-rw-r--r-- | net/core/secure_seq.c | 6 | ||||
-rw-r--r-- | net/core/skbuff.c | 99 | ||||
-rw-r--r-- | net/core/sock.c | 32 | ||||
-rw-r--r-- | net/core/timestamping.c | 43 |
12 files changed, 472 insertions, 315 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index ab9a16530c3..3c6a967e583 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2485,52 +2485,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - kfree_skb_list(skb->next); - skb->next = NULL; - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ @@ -2605,119 +2559,125 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) } EXPORT_SYMBOL(netif_skb_features); -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, bool more) { - const struct net_device_ops *ops = dev->netdev_ops; + unsigned int len; + int rc; + + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq, more); + trace_net_dev_xmit(skb, rc, dev, len); + + return rc; +} + +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) +{ + struct sk_buff *skb = first; int rc = NETDEV_TX_OK; - unsigned int skb_len; - if (likely(!skb->next)) { - netdev_features_t features; + while (skb) { + struct sk_buff *next = skb->next; - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); + skb->next = NULL; + rc = xmit_one(skb, dev, txq, next != NULL); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } - features = netif_skb_features(skb); + skb = next; + if (netif_xmit_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; + } + } - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; +out: + *ret = rc; + return skb; +} +struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) +{ + if (vlan_tx_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (skb) skb->vlan_tci = 0; - } + } + return skb; +} - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; +struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +{ + netdev_features_t features; - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) - goto out_kfree_skb; + if (skb->next) + return skb; - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } - } + /* If device doesn't need skb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; - skb_len = skb->len; - trace_net_dev_start_xmit(skb, dev); - rc = ops->ndo_start_xmit(skb, dev); - trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); - return rc; - } + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; -gso: - do { - struct sk_buff *nskb = skb->next; + if (netif_needs_gso(skb, features)) { + struct sk_buff *segs; - skb->next = nskb->next; - nskb->next = NULL; + segs = skb_gso_segment(skb, features); + kfree_skb(skb); + if (IS_ERR(segs)) + segs = NULL; + skb = segs; + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); - - skb_len = nskb->len; - trace_net_dev_start_xmit(nskb, dev); - rc = ops->ndo_start_xmit(nskb, dev); - trace_net_dev_xmit(nskb, rc, dev, skb_len); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) + goto out_kfree_skb; } - txq_trans_update(txq); - if (unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); - -out_kfree_gso_skb: - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; } + + return skb; + out_kfree_skb: kfree_skb(skb); -out: - return rc; +out_null: + return NULL; } -EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2785,7 +2745,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + skb = validate_xmit_skb(skb, dev); + if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; @@ -2925,11 +2886,15 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) goto recursion_alert; + skb = validate_xmit_skb(skb, dev); + if (!skb) + goto drop; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); @@ -2950,10 +2915,11 @@ recursion_alert: } rc = -ENETDOWN; +drop: rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); - kfree_skb(skb); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); @@ -3130,8 +3096,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) hash * map->len) >> 32]; - + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; @@ -3965,11 +3930,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto normal; gro_list_prepare(napi, skb); - NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3983,6 +3947,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } + pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; } @@ -4212,6 +4192,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index cf999e09bcd..72e899a3efd 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -365,11 +365,8 @@ void dev_load(struct net *net, const char *name) no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } + if (no_module && capable(CAP_SYS_MODULE)) + request_module("%s", name); } EXPORT_SYMBOL(dev_load); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17cb912793f..27e61b88652 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1621,6 +1621,80 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + ret = ops->set_tunable(dev, &tuna, data); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1670,6 +1744,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1857,6 +1932,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index d814b8a89d0..fa5b7d0f77a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -113,7 +113,7 @@ static unsigned int pkt_type_offset(void) static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { - return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff((struct sk_buff *)(unsigned long) ctx); } static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) @@ -933,7 +933,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. @@ -1013,7 +1013,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; @@ -1069,7 +1069,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - prog = kmalloc(bpf_fsize, GFP_KERNEL); + prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) return -ENOMEM; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5f362c1d033..8560dea5880 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <net/flow_keys.h> +#include <scsi/fc/fc_fcoe.h> /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; @@ -26,36 +27,61 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i } /** - * skb_flow_get_ports - extract the upper layer ports and return them - * @skb: buffer to extract the ports from + * __skb_flow_get_ports - extract the upper layer ports and return them + * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen) { int poff = proto_ports_offset(ip_proto); + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + if (poff >= 0) { __be32 *ports, _ports; - ports = skb_header_pointer(skb, thoff + poff, - sizeof(_ports), &_ports); + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } -EXPORT_SYMBOL(skb_flow_get_ports); +EXPORT_SYMBOL(__skb_flow_get_ports); -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the struct flow_keys from either the skbuff + * or a raw buffer specified by the rest parameters + */ +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, __be16 proto, int nhoff, int hlen) { - int nhoff = skb_network_offset(skb); u8 ip_proto; - __be16 proto = skb->protocol; + + if (!data) { + data = skb->data; + proto = skb->protocol; + nhoff = skb_network_offset(skb); + hlen = skb_headlen(skb); + } memset(flow, 0, sizeof(*flow)); @@ -65,7 +91,7 @@ again: const struct iphdr *iph; struct iphdr _iph; ip: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) return false; nhoff += iph->ihl * 4; @@ -83,7 +109,7 @@ ip: __be32 flow_label; ipv6: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) return false; @@ -92,6 +118,13 @@ ipv6: flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + /* skip the flow label processing if skb is NULL. The + * assumption here is that if there is no skb we are not + * looking for flow info as much as we are length. + */ + if (!skb) + break; + flow_label = ip6_flowlabel(iph); if (flow_label) { /* Awesome, IPv6 packet has a flow label so we can @@ -113,7 +146,7 @@ ipv6: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; - vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) return false; @@ -126,7 +159,7 @@ ipv6: struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; proto = hdr->proto; @@ -140,6 +173,9 @@ ipv6: return false; } } + case htons(ETH_P_FCOE): + flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + /* fall through */ default: return false; } @@ -151,7 +187,7 @@ ipv6: __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; /* @@ -171,8 +207,9 @@ ipv6: const struct ethhdr *eth; struct ethhdr _eth; - eth = skb_header_pointer(skb, nhoff, - sizeof(_eth), &_eth); + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); if (!eth) return false; proto = eth->h_proto; @@ -194,12 +231,12 @@ ipv6: flow->n_proto = proto; flow->ip_proto = ip_proto; - flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); + flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); flow->thoff = (u16) nhoff; return true; } -EXPORT_SYMBOL(skb_flow_dissect); +EXPORT_SYMBOL(__skb_flow_dissect); static u32 hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) @@ -286,30 +323,22 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); -/* __skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically - * truncate packets without needing to push actual payload to the user - * space and can analyze headers only, instead. - */ -u32 __skb_get_poff(const struct sk_buff *skb) +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen) { - struct flow_keys keys; - u32 poff = 0; - - if (!skb_flow_dissect(skb, &keys)) - return 0; + u32 poff = keys->thoff; - poff += keys.thoff; - switch (keys.ip_proto) { + switch (keys->ip_proto) { case IPPROTO_TCP: { const struct tcphdr *tcph; struct tcphdr _tcph; - tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); + tcph = __skb_header_pointer(skb, poff, sizeof(_tcph), + data, hlen, &_tcph); if (!tcph) return poff; @@ -343,6 +372,21 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } +/* skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -359,9 +403,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map->len == 1) queue_index = map->queues[0]; else - queue_index = map->queues[ - ((u64)skb_get_hash(skb) * map->len) >> 32]; - + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 907fb5e36c0..e6645b4f330 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644); static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int status = NETDEV_TX_OK; netdev_features_t features; @@ -92,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = ops->ndo_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); + status = netdev_start_xmit(skb, dev, txq, false); out: return status; @@ -116,7 +113,7 @@ static void queue_process(struct work_struct *work) continue; } - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); local_irq_save(flags); HARD_TX_LOCK(dev, txq, smp_processor_id()); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8b849ddfef2..21cb4839bc9 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -202,6 +202,7 @@ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ #define F_UDPCSUM (1<<16) /* Include UDP checksum */ +#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -638,6 +639,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_UDPCSUM) seq_puts(seq, "UDPCSUM "); + if (pkt_dev->flags & F_NO_TIMESTAMP) + seq_puts(seq, "NO_TIMESTAMP "); + if (pkt_dev->flags & F_MPLS_RND) seq_puts(seq, "MPLS_RND "); @@ -1243,6 +1247,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!UDPCSUM") == 0) pkt_dev->flags &= ~F_UDPCSUM; + else if (strcmp(f, "NO_TIMESTAMP") == 0) + pkt_dev->flags |= F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", @@ -1251,6 +1258,7 @@ static ssize_t pktgen_if_write(struct file *file, "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " + "NO_TIMESTAMP, " #ifdef CONFIG_XFRM "IPSEC, " #endif @@ -2685,9 +2693,14 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->pgh_magic = htonl(PKTGEN_MAGIC); pgh->seq_num = htonl(pkt_dev->seq_num); - do_gettimeofday(×tamp); - pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); + if (pkt_dev->flags & F_NO_TIMESTAMP) { + pgh->tv_sec = 0; + pgh->tv_usec = 0; + } else { + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, @@ -3285,10 +3298,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; - u16 queue_map; int ret; /* If device is offline, then don't send */ @@ -3326,8 +3336,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); - queue_map = skb_get_queue_mapping(pkt_dev->skb); - txq = netdev_get_tx_queue(odev, queue_map); + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); @@ -3339,11 +3348,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) goto unlock; } atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, false); switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0493e3b747..a6882686ca3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1481,9 +1481,12 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } +#define DO_SETLINK_MODIFIED 0x01 +/* notify flag means notify + modified. */ +#define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, - struct nlattr **tb, char *ifname, int modified) + struct nlattr **tb, char *ifname, int status) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -1502,7 +1505,7 @@ static int do_setlink(const struct sk_buff *skb, put_net(net); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { @@ -1531,7 +1534,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { @@ -1551,19 +1554,19 @@ static int do_setlink(const struct sk_buff *skb, kfree(sa); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); - modified = 1; + status |= DO_SETLINK_NOTIFY; } /* @@ -1575,7 +1578,7 @@ static int do_setlink(const struct sk_buff *skb, err = dev_change_name(dev, ifname); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { @@ -1583,7 +1586,7 @@ static int do_setlink(const struct sk_buff *skb, nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { @@ -1601,25 +1604,35 @@ static int do_setlink(const struct sk_buff *skb, err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_TXQLEN]) { + unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); + + if (dev->tx_queue_len ^ value) + status |= DO_SETLINK_NOTIFY; + + dev->tx_queue_len = value; + } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { + unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); + write_lock_bh(&dev_base_lock); - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (dev->link_mode ^ value) + status |= DO_SETLINK_NOTIFY; + dev->link_mode = value; write_unlock_bh(&dev_base_lock); } @@ -1634,7 +1647,7 @@ static int do_setlink(const struct sk_buff *skb, err = do_setvfinfo(dev, attr); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1664,7 +1677,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1682,7 +1695,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { @@ -1699,15 +1712,20 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; errout: - if (err < 0 && modified) - net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", - dev->name); + if (status & DO_SETLINK_MODIFIED) { + if (status & DO_SETLINK_NOTIFY) + netdev_state_change(dev); + + if (err < 0) + net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", + dev->name); + } return err; } @@ -1989,7 +2007,7 @@ replay: } if (dev) { - int modified = 0; + int status = 0; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; @@ -2004,7 +2022,7 @@ replay: err = ops->changelink(dev, tb, data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { @@ -2015,10 +2033,10 @@ replay: tb, slave_data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, ifm, tb, ifname, modified); + return do_setlink(skb, dev, ifm, tb, ifname, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index ba71212f025..51dd3193a33 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq) * overlaps less than one time per MSL (2 minutes). * Choosing a clock of 64 ns period is OK. (period of 274 s) */ - return seq + (ktime_to_ns(ktime_get_real()) >> 6); + return seq + (ktime_get_real_ns() >> 6); } #endif @@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; @@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, md5_transform(hash, secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index da1378a3e2c..a18dfb02d94 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3491,32 +3491,53 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); -void __skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps, - struct sock *sk, int tstype) +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { - struct sock_exterr_skb *serr; - struct sk_buff *skb; - int err; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next; + int err = 0; - if (!sk) - return; + spin_lock_bh(&q->lock); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) + err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + spin_unlock_bh(&q->lock); - if (hwtstamps) { - *skb_hwtstamps(orig_skb) = - *hwtstamps; - } else { - /* - * no hardware time stamps available, - * so keep the shared tx_flags and only - * store software time stamp - */ - orig_skb->tstamp = ktime_get_real(); + sk->sk_err = err; + if (err) + sk->sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; } - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + +static void __skb_complete_tx_timestamp(struct sk_buff *skb, + struct sock *sk, + int tstype) +{ + struct sock_exterr_skb *serr; + int err; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); @@ -3534,6 +3555,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (err) kfree_skb(skb); } + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); + + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + + sock_put(sk); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) +{ + struct sk_buff *skb; + + if (!sk) + return; + + if (hwtstamps) + *skb_hwtstamps(orig_skb) = *hwtstamps; + else + orig_skb->tstamp = ktime_get_real(); + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + __skb_complete_tx_timestamp(skb, sk, tstype); +} EXPORT_SYMBOL_GPL(__skb_tstamp_tx); void skb_tstamp_tx(struct sk_buff *orig_skb, diff --git a/net/core/sock.c b/net/core/sock.c index d372b4bd3f9..6f436b5e496 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -437,7 +437,6 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags) int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -459,13 +458,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. - */ - skb_len = skb->len; - /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ @@ -1645,18 +1637,24 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_efree); + +#ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; -#ifdef CONFIG_INET if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_put(inet_twsk(sk)); else -#endif sock_put(sk); } EXPORT_SYMBOL(sock_edemux); +#endif kuid_t sock_i_uid(struct sock *sk) { @@ -2498,11 +2496,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; int copied, err; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -2523,16 +2521,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: diff --git a/net/core/timestamping.c b/net/core/timestamping.c index a8770391ea5..43d3dd62fcc 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; struct sk_buff *clone; - struct sock *sk = skb->sk; unsigned int type; - if (!sk) + if (!skb->sk) return; type = classify(skb); @@ -48,50 +47,14 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) phydev = skb->dev->phydev; if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) + clone = skb_clone_sk(skb); + if (!clone) return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); -void skb_complete_tx_timestamp(struct sk_buff *skb, - struct skb_shared_hwtstamps *hwtstamps) -{ - struct sock *sk = skb->sk; - struct sock_exterr_skb *serr; - int err; - - if (!hwtstamps) { - sock_put(sk); - kfree_skb(skb); - return; - } - - *skb_hwtstamps(skb) = *hwtstamps; - - serr = SKB_EXT_ERR(skb); - memset(serr, 0, sizeof(*serr)); - serr->ee.ee_errno = ENOMSG; - serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - skb->sk = NULL; - - err = sock_queue_err_skb(sk, skb); - - sock_put(sk); - if (err) - kfree_skb(skb); -} -EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); - bool skb_defer_rx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; |