Diffstat (limited to 'net/core')
-rw-r--r--  net/core/dev.c            | 346
-rw-r--r--  net/core/net_namespace.c  |   2
-rw-r--r--  net/core/skbuff.c         | 106
3 files changed, 253 insertions, 201 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 09c66a449da..5379b0c1190 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,9 @@ /* Instead of increasing this, you should create a hash table. */ #define MAX_GRO_SKBS 8 +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -167,25 +170,6 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static struct list_head ptype_all __read_mostly; /* Taps */ -#ifdef CONFIG_NET_DMA -struct net_dma { - struct dma_client client; - spinlock_t lock; - cpumask_t channel_mask; - struct dma_chan **channels; -}; - -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state); - -static struct net_dma net_dma = { - .client = { - .event_callback = netdev_dma_event, - }, -}; -#endif - /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. @@ -1104,6 +1088,11 @@ int dev_open(struct net_device *dev) dev->flags |= IFF_UP; /* + * Enable NET_DMA + */ + dmaengine_get(); + + /* * Initialize multicasting status */ dev_set_rx_mode(dev); @@ -1180,6 +1169,11 @@ int dev_close(struct net_device *dev) */ call_netdevice_notifiers(NETDEV_DOWN, dev); + /* + * Shutdown NET_DMA + */ + dmaengine_put(); + return 0; } @@ -1540,7 +1534,19 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) skb->mac_len = skb->network_header - skb->mac_header; __skb_pull(skb, skb->mac_len); - if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) { + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + struct net_device *dev = skb->dev; + struct ethtool_drvinfo info = {}; + + if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) + dev->ethtool_ops->get_drvinfo(dev, &info); + + WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " + "ip_summed=%d", + info.driver, dev ? dev->features : 0L, + skb->sk ? 
skb->sk->sk_route_caps : 0L, + skb->len, skb->data_len, skb->ip_summed); + if (skb_header_cloned(skb) && (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) return ERR_PTR(err); @@ -2345,7 +2351,7 @@ static int napi_gro_complete(struct sk_buff *skb) struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; int err = -ENOENT; - if (!skb_shinfo(skb)->frag_list) + if (NAPI_GRO_CB(skb)->count == 1) goto out; rcu_read_lock(); @@ -2365,6 +2371,7 @@ static int napi_gro_complete(struct sk_buff *skb) } out: + skb_shinfo(skb)->gso_size = 0; __skb_push(skb, -skb_network_offset(skb)); return netif_receive_skb(skb); } @@ -2383,7 +2390,7 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_type *ptype; @@ -2392,10 +2399,14 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) int count = 0; int same_flow; int mac_len; + int free; if (!(skb->dev->features & NETIF_F_GRO)) goto normal; + if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list) + goto normal; + rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { struct sk_buff *p; @@ -2408,14 +2419,18 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb->mac_len = mac_len; NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->free = 0; for (p = napi->gro_list; p; p = p->next) { count++; - NAPI_GRO_CB(p)->same_flow = - p->mac_len == mac_len && - !memcmp(skb_mac_header(p), skb_mac_header(skb), - mac_len); - NAPI_GRO_CB(p)->flush = 0; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + if (p->mac_len != mac_len || + memcmp(skb_mac_header(p), skb_mac_header(skb), + mac_len)) + NAPI_GRO_CB(p)->same_flow = 0; } pp = ptype->gro_receive(&napi->gro_list, skb); @@ -2427,6 +2442,7 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) goto normal; same_flow = NAPI_GRO_CB(skb)->same_flow; + free = NAPI_GRO_CB(skb)->free; if (pp) { struct sk_buff *nskb = *pp; @@ -2446,17 +2462,119 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) } NAPI_GRO_CB(skb)->count = 1; + skb_shinfo(skb)->gso_size = skb->len; skb->next = napi->gro_list; napi->gro_list = skb; ok: - return NET_RX_SUCCESS; + return free; normal: - return netif_receive_skb(skb); + return -1; +} +EXPORT_SYMBOL(dev_gro_receive); + +static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + + for (p = napi->gro_list; p; p = p->next) { + NAPI_GRO_CB(p)->same_flow = 1; + NAPI_GRO_CB(p)->flush = 0; + } + + return dev_gro_receive(napi, skb); +} + +int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + switch (__napi_gro_receive(napi, skb)) { + case -1: + return netif_receive_skb(skb); + + case 1: + kfree_skb(skb); + break; + } + + return NET_RX_SUCCESS; } EXPORT_SYMBOL(napi_gro_receive); +void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + __skb_pull(skb, skb_headlen(skb)); + skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + + napi->skb = skb; +} +EXPORT_SYMBOL(napi_reuse_skb); + +struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, + struct napi_gro_fraginfo *info) +{ + struct net_device *dev = napi->dev; + struct sk_buff *skb = napi->skb; + + napi->skb = NULL; + + if (!skb) { + skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); + if (!skb) + goto out; + + skb_reserve(skb, NET_IP_ALIGN); + } + + 
BUG_ON(info->nr_frags > MAX_SKB_FRAGS); + skb_shinfo(skb)->nr_frags = info->nr_frags; + memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags)); + + skb->data_len = info->len; + skb->len += info->len; + skb->truesize += info->len; + + if (!pskb_may_pull(skb, ETH_HLEN)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + + skb->protocol = eth_type_trans(skb, dev); + + skb->ip_summed = info->ip_summed; + skb->csum = info->csum; + +out: + return skb; +} +EXPORT_SYMBOL(napi_fraginfo_skb); + +int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +{ + struct sk_buff *skb = napi_fraginfo_skb(napi, info); + int err = NET_RX_DROP; + + if (!skb) + goto out; + + err = NET_RX_SUCCESS; + + switch (__napi_gro_receive(napi, skb)) { + case -1: + return netif_receive_skb(skb); + + case 0: + goto out; + } + + napi_reuse_skb(napi, skb); + +out: + return err; +} +EXPORT_SYMBOL(napi_gro_frags); + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; @@ -2535,11 +2653,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, { INIT_LIST_HEAD(&napi->poll_list); napi->gro_list = NULL; + napi->skb = NULL; napi->poll = poll; napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); -#ifdef CONFIG_NETPOLL napi->dev = dev; +#ifdef CONFIG_NETPOLL spin_lock_init(&napi->poll_lock); napi->poll_owner = -1; #endif @@ -2552,6 +2671,7 @@ void netif_napi_del(struct napi_struct *napi) struct sk_buff *skb, *next; list_del_init(&napi->dev_list); + kfree(napi->skb); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; @@ -2635,14 +2755,7 @@ out: * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */ - if (!cpus_empty(net_dma.channel_mask)) { - int chan_idx; - for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) { - struct dma_chan *chan = net_dma.channels[chan_idx]; - if (chan) - dma_async_memcpy_issue_pending(chan); - } - } + dma_issue_pending_all(); #endif return; @@ -4331,6 +4444,45 @@ err_uninit: } /** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. 
Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* initialize the ref count */ + atomic_set(&dev->refcnt, 1); + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + + +/** * register_netdev - register a network device * @dev: device to register * @@ -4833,122 +4985,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#ifdef CONFIG_NET_DMA -/** - * net_dma_rebalance - try to maintain one DMA channel per CPU - * @net_dma: DMA client and associated data (lock, channels, channel_mask) - * - * This is called when the number of channels allocated to the net_dma client - * changes. The net_dma client tries to have one DMA channel per CPU. - */ - -static void net_dma_rebalance(struct net_dma *net_dma) -{ - unsigned int cpu, i, n, chan_idx; - struct dma_chan *chan; - - if (cpus_empty(net_dma->channel_mask)) { - for_each_online_cpu(cpu) - rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); - return; - } - - i = 0; - cpu = first_cpu(cpu_online_map); - - for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) { - chan = net_dma->channels[chan_idx]; - - n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask)) - + (i < (num_online_cpus() % - cpus_weight(net_dma->channel_mask)) ? 1 : 0)); - - while(n) { - per_cpu(softnet_data, cpu).net_dma = chan; - cpu = next_cpu(cpu, cpu_online_map); - n--; - } - i++; - } -} - -/** - * netdev_dma_event - event callback for the net_dma_client - * @client: should always be net_dma_client - * @chan: DMA channel for the event - * @state: DMA state to be handled - */ -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state) -{ - int i, found = 0, pos = -1; - struct net_dma *net_dma = - container_of(client, struct net_dma, client); - enum dma_state_client ack = DMA_DUP; /* default: take no action */ - - spin_lock(&net_dma->lock); - switch (state) { - case DMA_RESOURCE_AVAILABLE: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - break; - } else if (net_dma->channels[i] == NULL && pos < 0) - pos = i; - - if (!found && pos >= 0) { - ack = DMA_ACK; - net_dma->channels[pos] = chan; - cpu_set(pos, net_dma->channel_mask); - net_dma_rebalance(net_dma); - } - break; - case DMA_RESOURCE_REMOVED: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - pos = i; - break; - } - - if (found) { - ack = DMA_ACK; - cpu_clear(pos, net_dma->channel_mask); - net_dma->channels[i] = NULL; - net_dma_rebalance(net_dma); - } - break; - default: - break; - } - spin_unlock(&net_dma->lock); - - return ack; -} - -/** - * netdev_dma_register - register the networking subsystem as a DMA client - */ -static int __init netdev_dma_register(void) -{ - net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma), - GFP_KERNEL); - if (unlikely(!net_dma.channels)) { - printk(KERN_NOTICE - "netdev_dma: no memory for net_dma.channels\n"); - return -ENOMEM; - } - spin_lock_init(&net_dma.lock); - dma_cap_set(DMA_MEMCPY, 
net_dma.client.cap_mask); - dma_async_client_register(&net_dma.client); - dma_async_client_chan_request(&net_dma.client); - return 0; -} - -#else -static int __init netdev_dma_register(void) { return -ENODEV; } -#endif /* CONFIG_NET_DMA */ /** * netdev_increment_features - increment feature set by one @@ -5168,8 +5204,6 @@ static int __init net_dev_init(void) if (register_pernet_device(&default_device_ops)) goto out; - netdev_dma_register(); - open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 55cffad2f32..55151faaf90 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -341,8 +341,8 @@ again: rv = register_pernet_operations(first_device, ops); if (rv < 0) ida_remove(&net_generic_ids, *id); - mutex_unlock(&net_mutex); out: + mutex_unlock(&net_mutex); return rv; } EXPORT_SYMBOL_GPL(register_pernet_gen_subsys); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8d0abb2643..da74b844f4e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -73,17 +73,13 @@ static struct kmem_cache *skbuff_fclone_cache __read_mostly; static void sock_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct sk_buff *skb = (struct sk_buff *) buf->private; - - kfree_skb(skb); + put_page(buf->page); } static void sock_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct sk_buff *skb = (struct sk_buff *) buf->private; - - skb_get(skb); + get_page(buf->page); } static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, @@ -1334,9 +1330,19 @@ fault: */ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) { - struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private; + put_page(spd->pages[i]); +} - kfree_skb(skb); +static inline struct page *linear_to_page(struct page *page, unsigned int len, + unsigned int offset) +{ + struct page *p = alloc_pages(GFP_KERNEL, 0); + + if (!p) + return NULL; + memcpy(page_address(p) + offset, page_address(page) + offset, len); + + return p; } /* @@ -1344,16 +1350,23 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) */ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, unsigned int len, unsigned int offset, - struct sk_buff *skb) + struct sk_buff *skb, int linear) { if (unlikely(spd->nr_pages == PIPE_BUFFERS)) return 1; + if (linear) { + page = linear_to_page(page, len, offset); + if (!page) + return 1; + } else + get_page(page); + spd->pages[spd->nr_pages] = page; spd->partial[spd->nr_pages].len = len; spd->partial[spd->nr_pages].offset = offset; - spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb); spd->nr_pages++; + return 0; } @@ -1369,7 +1382,7 @@ static inline void __segment_seek(struct page **page, unsigned int *poff, static inline int __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct sk_buff *skb, - struct splice_pipe_desc *spd) + struct splice_pipe_desc *spd, int linear) { if (!*len) return 1; @@ -1392,7 +1405,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, flen, poff, skb)) + if (spd_fill_page(spd, page, flen, poff, skb, linear)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1419,7 +1432,7 @@ static int __skb_splice_bits(struct sk_buff *skb, 
unsigned int *offset, if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd)) + offset, len, skb, spd, 1)) return 1; /* @@ -1429,7 +1442,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd)) + offset, len, skb, spd, 0)) return 1; } @@ -1442,7 +1455,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, * the frag list, if such a thing exists. We'd probably need to recurse to * handle that cleanly. */ -int skb_splice_bits(struct sk_buff *__skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { @@ -1455,16 +1468,6 @@ int skb_splice_bits(struct sk_buff *__skb, unsigned int offset, .ops = &sock_pipe_buf_ops, .spd_release = sock_spd_release, }; - struct sk_buff *skb; - - /* - * I'd love to avoid the clone here, but tcp_read_sock() - * ignores reference counts and unconditonally kills the sk_buff - * on return from the actor. - */ - skb = skb_clone(__skb, GFP_KERNEL); - if (unlikely(!skb)) - return -ENOMEM; /* * __skb_splice_bits() only fails if the output has no room left, @@ -1488,15 +1491,9 @@ int skb_splice_bits(struct sk_buff *__skb, unsigned int offset, } done: - /* - * drop our reference to the clone, the pipe consumption will - * drop the rest. - */ - kfree_skb(skb); - if (spd.nr_pages) { + struct sock *sk = skb->sk; int ret; - struct sock *sk = __skb->sk; /* * Drop the socket lock, otherwise we have reverse @@ -2215,10 +2212,10 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, return 0; next_skb: - block_limit = skb_headlen(st->cur_skb); + block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; if (abs_offset < block_limit) { - *data = st->cur_skb->data + abs_offset; + *data = st->cur_skb->data + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } @@ -2253,13 +2250,14 @@ next_skb: st->frag_data = NULL; } - if (st->cur_skb->next) { - st->cur_skb = st->cur_skb->next; + if (st->root_skb == st->cur_skb && + skb_shinfo(st->root_skb)->frag_list) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; - } else if (st->root_skb == st->cur_skb && - skb_shinfo(st->root_skb)->frag_list) { - st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + } else if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; goto next_skb; } @@ -2588,12 +2586,30 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct sk_buff *nskb; unsigned int headroom; unsigned int hlen = p->data - skb_mac_header(p); + unsigned int len = skb->len; - if (hlen + p->len + skb->len >= 65536) + if (hlen + p->len + len >= 65536) return -E2BIG; if (skb_shinfo(p)->frag_list) goto merge; + else if (!skb_headlen(p) && !skb_headlen(skb) && + skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < + MAX_SKB_FRAGS) { + memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, + skb_shinfo(skb)->frags, + skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + + skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + + skb->truesize -= skb->data_len; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = 1; + goto done; + } headroom = skb_headroom(p); nskb = netdev_alloc_skb(p->dev, headroom); @@ -2613,6 +2629,7 @@ int 
skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; + skb_shinfo(nskb)->gso_size = skb_shinfo(p)->gso_size; skb_header_release(p); nskb->prev = p; @@ -2627,14 +2644,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) p = nskb; merge: - NAPI_GRO_CB(p)->count++; p->prev->next = skb; p->prev = skb; skb_header_release(skb); - p->data_len += skb->len; - p->truesize += skb->len; - p->len += skb->len; +done: + NAPI_GRO_CB(p)->count++; + p->data_len += len; + p->truesize += len; + p->len += len; NAPI_GRO_CB(skb)->same_flow = 1; return 0; |
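The hunks above add init_dummy_netdev() plus the napi_gro_receive()/napi_gro_frags() entry points. Below is a minimal, hypothetical driver-side sketch of how these are intended to fit together; my_adapter, my_rx_ring_next, my_poll and my_probe are illustrative names only and are not part of this commit.

	/* Usage sketch only -- not from this commit. Assumes a driver whose
	 * hardware ties several receive rings to a single interrupt, so it
	 * hangs one NAPI context off a dummy netdev as described in the
	 * init_dummy_netdev() kerneldoc above. */
	struct my_adapter {
		struct net_device	*netdev;	/* real, registered interface */
		struct net_device	napi_dev;	/* dummy netdev, never registered */
		struct napi_struct	napi;
	};

	static int my_poll(struct napi_struct *napi, int budget)
	{
		struct my_adapter *adapter =
			container_of(napi, struct my_adapter, napi);
		int work = 0;

		while (work < budget) {
			/* my_rx_ring_next() stands in for the driver's own
			 * descriptor-ring processing. */
			struct sk_buff *skb = my_rx_ring_next(adapter);

			if (!skb)
				break;

			skb->protocol = eth_type_trans(skb, adapter->netdev);

			/* Hand the packet to GRO: it is either merged into an
			 * skb already queued on napi->gro_list or passed up
			 * via netif_receive_skb(). */
			napi_gro_receive(napi, skb);
			work++;
		}

		if (work < budget)
			napi_complete(napi);

		return work;
	}

	static int my_probe(struct my_adapter *adapter)
	{
		/* The dummy netdev gives the NAPI code a valid device to hang
		 * off without going through register_netdev(). */
		init_dummy_netdev(&adapter->napi_dev);
		netif_napi_add(&adapter->napi_dev, &adapter->napi, my_poll, 64);
		napi_enable(&adapter->napi);
		return 0;
	}

The design point of init_dummy_netdev() is visible in the sketch: the dummy structure only carries the reference count, state bits and napi_list that the NAPI machinery touches, so it can back a shared poll context without ever entering the register/unregister paths (which BUG on NETREG_DUMMY by design).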