From 86911732d3996a9da07914b280621450111bb6da Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:50 +0000 Subject: gro: Avoid copying headers of unmerged packets Unfortunately simplicity isn't always the best. The fraginfo interface turned out to be suboptimal. The problem was quite obvious. For every packet, we have to copy the headers from the frags structure into skb->head, even though for 99% of the packets this part is immediately thrown away after the merge. LRO didn't have this problem because it directly read the headers from the frags structure. This patch attempts to address this by creating an interface that allows GRO to access the headers in the first frag without having to copy it. Because all drivers that use frags place the headers in the first frag this optimisation should be enough. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2e5f2ca3bdc..f9f4065a7e9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2584,17 +2584,21 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct sk_buff *p = *head; struct sk_buff *nskb; unsigned int headroom; - unsigned int hlen = p->data - skb_mac_header(p); - unsigned int len = skb->len; + unsigned int len = skb_gro_len(skb); - if (hlen + p->len + len >= 65536) + if (p->len + len >= 65536) return -E2BIG; if (skb_shinfo(p)->frag_list) goto merge; - else if (!skb_headlen(p) && !skb_headlen(skb) && - skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < + else if (skb_headlen(skb) <= skb_gro_offset(skb) && + skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags <= MAX_SKB_FRAGS) { + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); @@ -2611,7 +2615,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom); + nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); if (unlikely(!nskb)) return -ENOMEM; @@ -2619,12 +2623,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) nskb->mac_len = p->mac_len; skb_reserve(nskb, headroom); + __skb_put(nskb, skb_gro_offset(p)); - skb_set_mac_header(nskb, -hlen); + skb_set_mac_header(nskb, skb_mac_header(p) - p->data); skb_set_network_header(nskb, skb_network_offset(p)); skb_set_transport_header(nskb, skb_transport_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + __skb_pull(p, skb_gro_offset(p)); + memcpy(skb_mac_header(nskb), skb_mac_header(p), + p->data - skb_mac_header(p)); *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; -- cgit v1.2.3-70-g09d2 From 81705ad1b2f926d2ef15ed95074a9c1fa9fb4dc4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:51 +0000 Subject: gro: Do not merge paged packets into frag_list gro: Do not merge paged packets into frag_list Bigger is not always better :) It was easy to continue to merged packets into frag_list after the page array is full. However, this turns out to be worse than LRO because frag_list is a much less efficient form of storage than the page array. So we're better off stopping the merge and starting a new entry with an empty page array. In future we can optimise this further by doing frag_list merging but making sure that we continue to fill in the page array. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f9f4065a7e9..d386f1082eb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2591,9 +2591,11 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) if (skb_shinfo(p)->frag_list) goto merge; - else if (skb_headlen(skb) <= skb_gro_offset(skb) && - skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags <= - MAX_SKB_FRAGS) { + else if (skb_headlen(skb) <= skb_gro_offset(skb)) { + if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags > + MAX_SKB_FRAGS) + return -E2BIG; + skb_shinfo(skb)->frags[0].page_offset += skb_gro_offset(skb) - skb_headlen(skb); skb_shinfo(skb)->frags[0].size -= -- cgit v1.2.3-70-g09d2 From 4fb669948116d928ae44262ab7743732c574630d Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sun, 1 Feb 2009 00:41:42 -0800 Subject: net: Optimize memory usage when splicing from sockets. The recent fix of data corruption when splicing from sockets uses memory very inefficiently allocating a new page to copy each chunk of linear part of skb. This patch uses the same page until it's full (almost) by caching the page in sk_sndmsg_page field. With changes from David S. Miller Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/skbuff.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f20e758fe46..e55d1ef5690 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1333,14 +1333,39 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) put_page(spd->pages[i]); } -static inline struct page *linear_to_page(struct page *page, unsigned int len, - unsigned int offset) -{ - struct page *p = alloc_pages(GFP_KERNEL, 0); +static inline struct page *linear_to_page(struct page *page, unsigned int *len, + unsigned int *offset, + struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct page *p = sk->sk_sndmsg_page; + unsigned int off; + + if (!p) { +new_page: + p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); + if (!p) + return NULL; - if (!p) - return NULL; - memcpy(page_address(p) + offset, page_address(page) + offset, len); + off = sk->sk_sndmsg_off = 0; + /* hold one ref to this page until it's full */ + } else { + unsigned int mlen; + + off = sk->sk_sndmsg_off; + mlen = PAGE_SIZE - off; + if (mlen < 64 && mlen < *len) { + put_page(p); + goto new_page; + } + + *len = min_t(unsigned int, *len, mlen); + } + + memcpy(page_address(p) + off, page_address(page) + *offset, *len); + sk->sk_sndmsg_off += *len; + *offset = off; + get_page(p); return p; } @@ -1349,21 +1374,21 @@ static inline struct page *linear_to_page(struct page *page, unsigned int len, * Fill page/offset/length into spd, if it can hold more pages. */ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, - unsigned int len, unsigned int offset, + unsigned int *len, unsigned int offset, struct sk_buff *skb, int linear) { if (unlikely(spd->nr_pages == PIPE_BUFFERS)) return 1; if (linear) { - page = linear_to_page(page, len, offset); + page = linear_to_page(page, len, &offset, skb); if (!page) return 1; } else get_page(page); spd->pages[spd->nr_pages] = page; - spd->partial[spd->nr_pages].len = len; + spd->partial[spd->nr_pages].len = *len; spd->partial[spd->nr_pages].offset = offset; spd->nr_pages++; @@ -1405,7 +1430,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, flen, poff, skb, linear)) + if (spd_fill_page(spd, page, &flen, poff, skb, linear)) return 1; __segment_seek(&page, &poff, &plen, flen); -- cgit v1.2.3-70-g09d2 From 56035022d86fff45299288cb372a42f752ba23fa Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 5 Feb 2009 21:26:52 -0800 Subject: gro: Fix frag_list merging on imprecisely split packets The previous fix ad0f9904444de1309dedd2b9e365cae8af77d9b1 (gro: Fix handling of imprecisely split packets) only fixed the case of frags merging, frag_list merging in the same circumstances were still broken. In particular, the packet headers end up in the data stream. This patch fixes this plus another issue where an imprecisely split packet header may be read incorrectly (this is mostly harmless since it'll simply cause the packet to not match and be rejected for GRO). Thanks to Emil Tantilov and Jeff Kirsher for helping to track this down. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- net/core/skbuff.c | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3337cf98f23..247f1614a79 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2391,7 +2391,8 @@ void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + offset; + skb_shinfo(skb)->frags[0].page_offset + + offset - skb_headlen(skb); } EXPORT_SYMBOL(skb_gro_header); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e55d1ef5690..67f2a2f8582 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2678,6 +2678,17 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) p = nskb; merge: + if (skb_gro_offset(skb) > skb_headlen(skb)) { + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + skb_gro_reset_offset(skb); + skb_gro_pull(skb, skb_headlen(skb)); + } + + __skb_pull(skb, skb_gro_offset(skb)); + p->prev->next = skb; p->prev = skb; skb_header_release(skb); -- cgit v1.2.3-70-g09d2 From b4ac530fc3af02a004729043dacf6b6330b46892 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Feb 2009 02:09:24 -0800 Subject: net: Move skbuff symbol exports after each symbol's definition. net/core/skbuff.c is a hodge-podge of symbol export placement. Some of the exports are right after the definition of the symbol being exported, others are clumped together into a big group at the end of the file. Make things consistent. Signed-off-by: David S. Miller --- net/core/skbuff.c | 79 +++++++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 43 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 67f2a2f8582..7657cec5973 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -123,6 +123,7 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : ""); BUG(); } +EXPORT_SYMBOL(skb_over_panic); /** * skb_under_panic - private function @@ -142,6 +143,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) skb->dev ? skb->dev->name : ""); BUG(); } +EXPORT_SYMBOL(skb_under_panic); void skb_truesize_bug(struct sk_buff *skb) { @@ -231,6 +233,7 @@ nodata: skb = NULL; goto out; } +EXPORT_SYMBOL(__alloc_skb); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device @@ -258,6 +261,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, } return skb; } +EXPORT_SYMBOL(__netdev_alloc_skb); struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) { @@ -426,6 +430,7 @@ void __kfree_skb(struct sk_buff *skb) skb_release_all(skb); kfree_skbmem(skb); } +EXPORT_SYMBOL(__kfree_skb); /** * kfree_skb - free an sk_buff @@ -444,6 +449,7 @@ void kfree_skb(struct sk_buff *skb) return; __kfree_skb(skb); } +EXPORT_SYMBOL(kfree_skb); /** * skb_recycle_check - check if skb can be reused for receive @@ -613,6 +619,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return __skb_clone(n, skb); } +EXPORT_SYMBOL(skb_clone); static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { @@ -679,7 +686,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) copy_skb_header(n, skb); return n; } - +EXPORT_SYMBOL(skb_copy); /** * pskb_copy - create copy of an sk_buff with private head. @@ -738,6 +745,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) out: return n; } +EXPORT_SYMBOL(pskb_copy); /** * pskb_expand_head - reallocate header of &sk_buff @@ -821,6 +829,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, nodata: return -ENOMEM; } +EXPORT_SYMBOL(pskb_expand_head); /* Make private copy of skb with writable head and some headroom */ @@ -841,7 +850,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } return skb2; } - +EXPORT_SYMBOL(skb_realloc_headroom); /** * skb_copy_expand - copy and expand sk_buff @@ -906,6 +915,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, return n; } +EXPORT_SYMBOL(skb_copy_expand); /** * skb_pad - zero pad the tail of an skb @@ -951,6 +961,7 @@ free_skb: kfree_skb(skb); return err; } +EXPORT_SYMBOL(skb_pad); /** * skb_put - add data to a buffer @@ -1108,6 +1119,7 @@ done: return 0; } +EXPORT_SYMBOL(___pskb_trim); /** * __pskb_pull_tail - advance tail of skb header @@ -1246,6 +1258,7 @@ pull_pages: return skb_tail_pointer(skb); } +EXPORT_SYMBOL(__pskb_pull_tail); /* Copy some data bits from skb to kernel buffer. */ @@ -1323,6 +1336,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) fault: return -EFAULT; } +EXPORT_SYMBOL(skb_copy_bits); /* * Callback from splice_to_pipe(), if we need to release some pages @@ -1623,7 +1637,6 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) fault: return -EFAULT; } - EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ @@ -1700,6 +1713,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, return csum; } +EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ @@ -1781,6 +1795,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, BUG_ON(len); return csum; } +EXPORT_SYMBOL(skb_copy_and_csum_bits); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) { @@ -1807,6 +1822,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) *((__sum16 *)(to + csstuff)) = csum_fold(csum); } } +EXPORT_SYMBOL(skb_copy_and_csum_dev); /** * skb_dequeue - remove from the head of the queue @@ -1827,6 +1843,7 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list) spin_unlock_irqrestore(&list->lock, flags); return result; } +EXPORT_SYMBOL(skb_dequeue); /** * skb_dequeue_tail - remove from the tail of the queue @@ -1846,6 +1863,7 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) spin_unlock_irqrestore(&list->lock, flags); return result; } +EXPORT_SYMBOL(skb_dequeue_tail); /** * skb_queue_purge - empty a list @@ -1861,6 +1879,7 @@ void skb_queue_purge(struct sk_buff_head *list) while ((skb = skb_dequeue(list)) != NULL) kfree_skb(skb); } +EXPORT_SYMBOL(skb_queue_purge); /** * skb_queue_head - queue a buffer at the list head @@ -1881,6 +1900,7 @@ void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) __skb_queue_head(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_queue_head); /** * skb_queue_tail - queue a buffer at the list tail @@ -1901,6 +1921,7 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_queue_tail); /** * skb_unlink - remove a buffer from a list @@ -1920,6 +1941,7 @@ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) __skb_unlink(skb, list); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_unlink); /** * skb_append - append a buffer @@ -1939,7 +1961,7 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head __skb_queue_after(list, old, newsk); spin_unlock_irqrestore(&list->lock, flags); } - +EXPORT_SYMBOL(skb_append); /** * skb_insert - insert a buffer @@ -1961,6 +1983,7 @@ void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head __skb_insert(newsk, old->prev, old, list); spin_unlock_irqrestore(&list->lock, flags); } +EXPORT_SYMBOL(skb_insert); static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, @@ -2039,6 +2062,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) else /* Second chunk has no header, nothing to copy. */ skb_split_no_header(skb, skb1, len, pos); } +EXPORT_SYMBOL(skb_split); /* Shifting from/to a cloned skb is a no-go. * @@ -2201,6 +2225,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; } +EXPORT_SYMBOL(skb_prepare_seq_read); /** * skb_seq_read - Sequentially read skb data @@ -2288,6 +2313,7 @@ next_skb: return 0; } +EXPORT_SYMBOL(skb_seq_read); /** * skb_abort_seq_read - Abort a sequential read of skb data @@ -2301,6 +2327,7 @@ void skb_abort_seq_read(struct skb_seq_state *st) if (st->frag_data) kunmap_skb_frag(st->frag_data); } +EXPORT_SYMBOL(skb_abort_seq_read); #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) @@ -2343,6 +2370,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, ret = textsearch_find(config, state); return (ret <= to - from ? ret : UINT_MAX); } +EXPORT_SYMBOL(skb_find_text); /** * skb_append_datato_frags: - append the user data to a skb @@ -2415,6 +2443,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, return 0; } +EXPORT_SYMBOL(skb_append_datato_frags); /** * skb_pull_rcsum - pull skb and update receive checksum @@ -2602,7 +2631,6 @@ err: } return ERR_PTR(err); } - EXPORT_SYMBOL_GPL(skb_segment); int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) @@ -2800,6 +2828,7 @@ int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int le return nsg; } +EXPORT_SYMBOL_GPL(skb_to_sgvec); /** * skb_cow_data - Check that a socket buffer's data buffers are writable @@ -2909,6 +2938,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return elt; } +EXPORT_SYMBOL_GPL(skb_cow_data); /** * skb_partial_csum_set - set up and verify partial csum values for packet @@ -2937,6 +2967,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) skb->csum_offset = off; return true; } +EXPORT_SYMBOL_GPL(skb_partial_csum_set); void __skb_warn_lro_forwarding(const struct sk_buff *skb) { @@ -2944,42 +2975,4 @@ void __skb_warn_lro_forwarding(const struct sk_buff *skb) pr_warning("%s: received packets cannot be forwarded" " while LRO is enabled\n", skb->dev->name); } - -EXPORT_SYMBOL(___pskb_trim); -EXPORT_SYMBOL(__kfree_skb); -EXPORT_SYMBOL(kfree_skb); -EXPORT_SYMBOL(__pskb_pull_tail); -EXPORT_SYMBOL(__alloc_skb); -EXPORT_SYMBOL(__netdev_alloc_skb); -EXPORT_SYMBOL(pskb_copy); -EXPORT_SYMBOL(pskb_expand_head); -EXPORT_SYMBOL(skb_checksum); -EXPORT_SYMBOL(skb_clone); -EXPORT_SYMBOL(skb_copy); -EXPORT_SYMBOL(skb_copy_and_csum_bits); -EXPORT_SYMBOL(skb_copy_and_csum_dev); -EXPORT_SYMBOL(skb_copy_bits); -EXPORT_SYMBOL(skb_copy_expand); -EXPORT_SYMBOL(skb_over_panic); -EXPORT_SYMBOL(skb_pad); -EXPORT_SYMBOL(skb_realloc_headroom); -EXPORT_SYMBOL(skb_under_panic); -EXPORT_SYMBOL(skb_dequeue); -EXPORT_SYMBOL(skb_dequeue_tail); -EXPORT_SYMBOL(skb_insert); -EXPORT_SYMBOL(skb_queue_purge); -EXPORT_SYMBOL(skb_queue_head); -EXPORT_SYMBOL(skb_queue_tail); -EXPORT_SYMBOL(skb_unlink); -EXPORT_SYMBOL(skb_append); -EXPORT_SYMBOL(skb_split); -EXPORT_SYMBOL(skb_prepare_seq_read); -EXPORT_SYMBOL(skb_seq_read); -EXPORT_SYMBOL(skb_abort_seq_read); -EXPORT_SYMBOL(skb_find_text); -EXPORT_SYMBOL(skb_append_datato_frags); EXPORT_SYMBOL(__skb_warn_lro_forwarding); - -EXPORT_SYMBOL_GPL(skb_to_sgvec); -EXPORT_SYMBOL_GPL(skb_cow_data); -EXPORT_SYMBOL_GPL(skb_partial_csum_set); -- cgit v1.2.3-70-g09d2 From ce3dd39595d9d64f4ba6ee8dd24c6269a3b56b6a Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 12 Feb 2009 16:51:43 -0800 Subject: net: Fix page seeking for skb_splice_bits(). struct page walking should be done with proper accessor functions, not directly. With doubts from David S. Miller and Herbert Xu. Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7657cec5973..ab7d2e9f02f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1412,8 +1412,13 @@ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, static inline void __segment_seek(struct page **page, unsigned int *poff, unsigned int *plen, unsigned int off) { + unsigned long n; + *poff += off; - *page += *poff / PAGE_SIZE; + n = *poff / PAGE_SIZE; + if (n) + *page = nth_page(*page, n); + *poff = *poff % PAGE_SIZE; *plen -= off; } -- cgit v1.2.3-70-g09d2 From ac45f602ee3d1b6f326f68bc0c2591ceebf05ba4 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:37 +0000 Subject: net: infrastructure for hardware time stamping The additional per-packet information (16 bytes for time stamps, 1 byte for flags) is stored for all packets in the skb_shared_info struct. This implementation detail is hidden from users of that information via skb_* accessor functions. A separate struct resp. union is used for the additional information so that it can be stored/copied easily outside of skb_shared_info. Compared to previous implementations (reusing the tstamp field depending on the context, optional additional structures) this is the simplest solution. It does not extend sk_buff itself. TX time stamping is implemented in software if the device driver doesn't support hardware time stamping. The new semantic for hardware/software time stamping around ndo_start_xmit() is based on two assumptions about existing network device drivers which don't support hardware time stamping and know nothing about it: - they leave the new skb_shared_tx unmodified - the keep the connection to the originating socket in skb->sk alive, i.e., don't call skb_orphan() Given that skb_shared_tx is new, the first assumption is safe. The second is only true for some drivers. As a result, software TX time stamping currently works with the bnx2 driver, but not with the unmodified igb driver (the two drivers this patch series was tested with). Signed-off-by: Patrick Ohly Signed-off-by: David S. Miller --- include/linux/skbuff.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++- net/core/dev.c | 32 ++++++++++++++++-- net/core/skbuff.c | 41 +++++++++++++++++++++++ 3 files changed, 161 insertions(+), 3 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 92470084458..f96bc91bf0a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -132,6 +132,57 @@ struct skb_frag_struct { __u32 size; }; +#define HAVE_HW_TIME_STAMP + +/** + * skb_shared_hwtstamps - hardware time stamps + * + * @hwtstamp: hardware time stamp transformed into duration + * since arbitrary point in time + * @syststamp: hwtstamp transformed to system time base + * + * Software time stamps generated by ktime_get_real() are stored in + * skb->tstamp. The relation between the different kinds of time + * stamps is as follows: + * + * syststamp and tstamp can be compared against each other in + * arbitrary combinations. The accuracy of a + * syststamp/tstamp/"syststamp from other device" comparison is + * limited by the accuracy of the transformation into system time + * base. This depends on the device driver and its underlying + * hardware. + * + * hwtstamps can only be compared against other hwtstamps from + * the same device. + * + * This structure is attached to packets as part of the + * &skb_shared_info. Use skb_hwtstamps() to get a pointer. + */ +struct skb_shared_hwtstamps { + ktime_t hwtstamp; + ktime_t syststamp; +}; + +/** + * skb_shared_tx - instructions for time stamping of outgoing packets + * + * @hardware: generate hardware time stamp + * @software: generate software time stamp + * @in_progress: device driver is going to provide + * hardware time stamp + * + * These flags are attached to packets as part of the + * &skb_shared_info. Use skb_tx() to get a pointer. + */ +union skb_shared_tx { + struct { + __u8 hardware:1, + software:1, + in_progress:1; + }; + __u8 flags; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -143,10 +194,12 @@ struct skb_shared_info { unsigned short gso_segs; unsigned short gso_type; __be32 ip6_frag_id; + union skb_shared_tx tx_flags; #ifdef CONFIG_HAS_DMA unsigned int num_dma_maps; #endif struct sk_buff *frag_list; + struct skb_shared_hwtstamps hwtstamps; skb_frag_t frags[MAX_SKB_FRAGS]; #ifdef CONFIG_HAS_DMA dma_addr_t dma_maps[MAX_SKB_FRAGS + 1]; @@ -465,6 +518,16 @@ static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) +static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->hwtstamps; +} + +static inline union skb_shared_tx *skb_tx(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->tx_flags; +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -1730,6 +1793,11 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, extern void skb_init(void); +static inline ktime_t skb_get_ktime(const struct sk_buff *skb) +{ + return skb->tstamp; +} + /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from @@ -1739,11 +1807,18 @@ extern void skb_init(void); * This function converts the offset back to a struct timeval and stores * it in stamp. */ -static inline void skb_get_timestamp(const struct sk_buff *skb, struct timeval *stamp) +static inline void skb_get_timestamp(const struct sk_buff *skb, + struct timeval *stamp) { *stamp = ktime_to_timeval(skb->tstamp); } +static inline void skb_get_timestampns(const struct sk_buff *skb, + struct timespec *stamp) +{ + *stamp = ktime_to_timespec(skb->tstamp); +} + static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); @@ -1759,6 +1834,20 @@ static inline ktime_t net_invalid_timestamp(void) return ktime_set(0, 0); } +/** + * skb_tstamp_tx - queue clone of skb with send time stamps + * @orig_skb: the original outgoing packet + * @hwtstamps: hardware time stamps, may be NULL if not available + * + * If the skb has a socket associated, then this function clones the + * skb (thus sharing the actual data and optional structures), stores + * the optional hardware time stamping information (if non NULL) or + * generates a software time stamp (otherwise), then queues the clone + * to the error queue of the socket. Errors are silently ignored. + */ +extern void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps); + extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1e27a67df24..d20c28e839d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1672,10 +1672,21 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +static void tstamp_tx(struct sk_buff *skb) +{ + union skb_shared_tx *shtx = + skb_tx(skb); + if (unlikely(shtx->software && + !shtx->in_progress)) { + skb_tstamp_tx(skb, NULL); + } +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { @@ -1689,13 +1700,29 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } - return ops->ndo_start_xmit(skb, dev); + rc = ops->ndo_start_xmit(skb, dev); + /* + * TODO: if skb_orphan() was called by + * dev->hard_start_xmit() (for example, the unmodified + * igb driver does that; bnx2 doesn't), then + * skb_tx_software_timestamp() will be unable to send + * back the time stamp. + * + * How can this be prevented? Always create another + * reference to the socket before calling + * dev->hard_start_xmit()? Prevent that skb_orphan() + * does anything in dev->hard_start_xmit() by clearing + * the skb destructor before the call and restoring it + * afterwards, then doing the skb_orphan() ourselves? + */ + if (likely(!rc)) + tstamp_tx(skb); + return rc; } gso: do { struct sk_buff *nskb = skb->next; - int rc; skb->next = nskb->next; nskb->next = NULL; @@ -1705,6 +1732,7 @@ gso: skb->next = nskb; return rc; } + tstamp_tx(skb); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ab7d2e9f02f..e5a8351ff12 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo->gso_segs = 0; shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; + shinfo->tx_flags.flags = 0; shinfo->frag_list = NULL; + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { struct sk_buff *child = skb + 1; @@ -2945,6 +2948,44 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data); +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = orig_skb->sk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int err; + + if (!sk) + return; + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (hwtstamps) { + *skb_hwtstamps(skb) = + *hwtstamps; + } else { + /* + * no hardware time stamps available, + * so keep the skb_shared_tx and only + * store software time stamp + */ + skb->tstamp = ktime_get_real(); + } + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + + /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set -- cgit v1.2.3-70-g09d2 From f3fbbe0f6f6cbac4c2aa3d71d95e49cf148286d6 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 25 Feb 2009 00:37:32 +0000 Subject: core: remove some pointless conditionals before kfree_skb() Remove some pointless conditionals before kfree_skb(). Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 ++---- net/core/skbuff.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e1144cb94b9..417b6d739fb 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -871,8 +871,7 @@ static void neigh_timer_handler(unsigned long arg) write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); - if (skb) - kfree_skb(skb); + kfree_skb(skb); } else { out: write_unlock(&neigh->lock); @@ -908,8 +907,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - if (skb) - kfree_skb(skb); + kfree_skb(skb); return 1; } } else if (neigh->nud_state & NUD_STALE) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 33640d99c8e..e5e2111a397 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1208,8 +1208,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) insp = list; } if (!pskb_pull(list, eat)) { - if (clone) - kfree_skb(clone); + kfree_skb(clone); return NULL; } break; -- cgit v1.2.3-70-g09d2 From ead2ceb0ec9f85cff19c43b5cdb2f8a054484431 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 11 Mar 2009 09:49:55 +0000 Subject: Network Drop Monitor: Adding kfree_skb_clean for non-drops and modifying end-of-line points for skbs Signed-off-by: Neil Horman include/linux/skbuff.h | 4 +++- net/core/datagram.c | 2 +- net/core/skbuff.c | 22 ++++++++++++++++++++++ net/ipv4/arp.c | 2 +- net/ipv4/udp.c | 2 +- net/packet/af_packet.c | 2 +- 6 files changed, 29 insertions(+), 5 deletions(-) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 +++- net/core/datagram.c | 2 +- net/core/skbuff.c | 22 ++++++++++++++++++++++ net/ipv4/arp.c | 2 +- net/ipv4/udp.c | 2 +- net/packet/af_packet.c | 2 +- 6 files changed, 29 insertions(+), 5 deletions(-) (limited to 'net/core/skbuff.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1f659e8c2b8..1fbab2ae613 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -421,6 +421,7 @@ extern void skb_dma_unmap(struct device *dev, struct sk_buff *skb, #endif extern void kfree_skb(struct sk_buff *skb); +extern void consume_skb(struct sk_buff *skb); extern void __kfree_skb(struct sk_buff *skb); extern struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int fclone, int node); @@ -459,7 +460,8 @@ extern int skb_to_sgvec(struct sk_buff *skb, extern int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); extern int skb_pad(struct sk_buff *skb, int pad); -#define dev_kfree_skb(a) kfree_skb(a) +#define dev_kfree_skb(a) consume_skb(a) +#define dev_consume_skb(a) kfree_skb_clean(a) extern void skb_over_panic(struct sk_buff *skb, int len, void *here); extern void skb_under_panic(struct sk_buff *skb, int len, diff --git a/net/core/datagram.c b/net/core/datagram.c index 5e2ac0c4b07..d0de644b378 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { - kfree_skb(skb); + consume_skb(skb); sk_mem_reclaim_partial(sk); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e5e2111a397..6acbf9e79eb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,6 +65,7 @@ #include #include +#include #include "kmap_skb.h" @@ -442,10 +443,31 @@ void kfree_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_kfree_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(kfree_skb); +/** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + __kfree_skb(skb); +} +EXPORT_SYMBOL(consume_skb); + /** * skb_recycle_check - check if skb can be reused for receive * @skb: buffer diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 3d67d1ffed7..9c220323f35 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -892,7 +892,7 @@ static int arp_process(struct sk_buff *skb) out: if (in_dev) in_dev_put(in_dev); - kfree_skb(skb); + consume_skb(skb); return 0; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4bd178a111d..05b7abb99f6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk = sknext; } while (sknext); } else - kfree_skb(skb); + consume_skb(skb); spin_unlock(&hslot->lock); return 0; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d8cc006fac4..74776de523e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -584,7 +584,7 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb(skb); + consume_skb(skb); return 0; } -- cgit v1.2.3-70-g09d2