From 5b057c6b1a25d57edf2b4d1e956e50936480a9ff Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 23 Jun 2006 02:06:41 -0700 Subject: [NET]: Avoid allocating skb in skb_pad First of all it is unnecessary to allocate a new skb in skb_pad since the existing one is not shared. More importantly, our hard_start_xmit interface does not allow a new skb to be allocated since that breaks requeueing. This patch uses pskb_expand_head to expand the existing skb and linearize it if needed. Actually, someone should sift through every instance of skb_pad on a non-linear skb as they do not fit the reasons why this was originally created. Incidentally, this fixes a minor bug when the skb is cloned (tcpdump, TCP, etc.). As it is skb_pad will simply write over a cloned skb. Because of the position of the write it is unlikely to cause problems but still it's best if we don't do it. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/3c527.c | 3 +-- drivers/net/82596.c | 3 +-- drivers/net/a2065.c | 3 +-- drivers/net/ariadne.c | 3 +-- drivers/net/arm/ether1.c | 3 +-- drivers/net/arm/ether3.c | 3 +-- drivers/net/atarilance.c | 3 +-- drivers/net/cassini.c | 3 +-- drivers/net/declance.c | 3 +-- drivers/net/depca.c | 7 ++----- drivers/net/eepro.c | 3 +-- drivers/net/eexpress.c | 3 +-- drivers/net/epic100.c | 7 ++----- drivers/net/eth16i.c | 3 +-- drivers/net/hp100.c | 7 ++----- drivers/net/lance.c | 3 +-- drivers/net/lasi_82596.c | 3 +-- drivers/net/lp486e.c | 3 +-- drivers/net/myri10ge/myri10ge.c | 3 +-- drivers/net/pcmcia/fmvj18x_cs.c | 3 +-- drivers/net/pcmcia/xirc2ps_cs.c | 3 +-- drivers/net/r8169.c | 3 +-- drivers/net/seeq8005.c | 3 +-- drivers/net/sis190.c | 3 +-- drivers/net/sk98lin/skge.c | 2 +- drivers/net/skge.c | 3 +-- drivers/net/smc9194.c | 3 +-- drivers/net/sonic.c | 3 +-- drivers/net/starfire.c | 3 +-- drivers/net/via-rhine.c | 7 ++----- drivers/net/wireless/ray_cs.c | 3 +-- drivers/net/wireless/wavelan_cs.c | 7 ++----- drivers/net/yellowfin.c | 12 +++++------- drivers/net/znet.c | 3 +-- include/linux/skbuff.h | 11 +++++------ net/core/skbuff.c | 36 ++++++++++++++++++++++++++---------- 36 files changed, 74 insertions(+), 103 deletions(-) diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index 1b1cb002607..157eda57392 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -1031,8 +1031,7 @@ static int mc32_send_packet(struct sk_buff *skb, struct net_device *dev) return 1; } - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { netif_wake_queue(dev); return 0; } diff --git a/drivers/net/82596.c b/drivers/net/82596.c index da0c878dcba..8a9f7d61b9b 100644 --- a/drivers/net/82596.c +++ b/drivers/net/82596.c @@ -1070,8 +1070,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->len, (unsigned int)skb->data)); if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c index 79bb56b8dce..71165ac0257 100644 --- a/drivers/net/a2065.c +++ b/drivers/net/a2065.c @@ -573,8 +573,7 @@ static int lance_start_xmit (struct sk_buff *skb, struct net_device *dev) if (len < ETH_ZLEN) { len = ETH_ZLEN; - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; } diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c index d1b6b1f794e..a9bb7a4aff9 100644 --- a/drivers/net/ariadne.c +++ b/drivers/net/ariadne.c @@ -607,8 +607,7 @@ static int 
ariadne_start_xmit(struct sk_buff *skb, struct net_device *dev) /* FIXME: is the 79C960 new enough to do its own padding right ? */ if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; len = ETH_ZLEN; } diff --git a/drivers/net/arm/ether1.c b/drivers/net/arm/ether1.c index 36475eb2727..312955d07b2 100644 --- a/drivers/net/arm/ether1.c +++ b/drivers/net/arm/ether1.c @@ -700,8 +700,7 @@ ether1_sendpacket (struct sk_buff *skb, struct net_device *dev) } if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) goto out; } diff --git a/drivers/net/arm/ether3.c b/drivers/net/arm/ether3.c index f1d5b1027ff..081074180e6 100644 --- a/drivers/net/arm/ether3.c +++ b/drivers/net/arm/ether3.c @@ -518,8 +518,7 @@ ether3_sendpacket(struct sk_buff *skb, struct net_device *dev) length = (length + 1) & ~1; if (length != skb->len) { - skb = skb_padto(skb, length); - if (skb == NULL) + if (skb_padto(skb, length)) goto out; } diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c index 442b2cbeb58..91783a8008b 100644 --- a/drivers/net/atarilance.c +++ b/drivers/net/atarilance.c @@ -804,8 +804,7 @@ static int lance_start_xmit( struct sk_buff *skb, struct net_device *dev ) ++len; if (len > skb->len) { - skb = skb_padto(skb, len); - if (skb == NULL) + if (skb_padto(skb, len)) return 0; } diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index 39f36aa05aa..565a54f1d06 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -2915,8 +2915,7 @@ static int cas_start_xmit(struct sk_buff *skb, struct net_device *dev) */ static int ring; - skb = skb_padto(skb, cp->min_frame_size); - if (!skb) + if (skb_padto(skb, cp->min_frame_size)) return 0; /* XXX: we need some higher-level QoS hooks to steer packets to diff --git a/drivers/net/declance.c b/drivers/net/declance.c index f130bdab3fd..d3d958e7ac5 100644 --- a/drivers/net/declance.c +++ b/drivers/net/declance.c @@ -885,8 +885,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev) len = skblen; if (len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; len = ETH_ZLEN; } diff --git a/drivers/net/depca.c b/drivers/net/depca.c index 0941d40f046..e946c43d3b1 100644 --- a/drivers/net/depca.c +++ b/drivers/net/depca.c @@ -938,11 +938,8 @@ static int depca_start_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->len < 1) goto out; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - goto out; - } + if (skb_padto(skb, ETH_ZLEN)) + goto out; netif_stop_queue(dev); diff --git a/drivers/net/eepro.c b/drivers/net/eepro.c index a806dfe54d2..e70f172699d 100644 --- a/drivers/net/eepro.c +++ b/drivers/net/eepro.c @@ -1154,8 +1154,7 @@ static int eepro_send_packet(struct sk_buff *skb, struct net_device *dev) printk(KERN_DEBUG "%s: entering eepro_send_packet routine.\n", dev->name); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c index 82bd356e4f3..a74b2071575 100644 --- a/drivers/net/eexpress.c +++ b/drivers/net/eexpress.c @@ -677,8 +677,7 @@ static int eexp_xmit(struct sk_buff *buf, struct net_device *dev) #endif if (buf->len < ETH_ZLEN) { - buf = skb_padto(buf, ETH_ZLEN); - if (buf == NULL) + if (skb_padto(buf, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git 
a/drivers/net/epic100.c b/drivers/net/epic100.c index 8d680ce600d..724d7dc35fa 100644 --- a/drivers/net/epic100.c +++ b/drivers/net/epic100.c @@ -1027,11 +1027,8 @@ static int epic_start_xmit(struct sk_buff *skb, struct net_device *dev) u32 ctrl_word; unsigned long flags; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; /* Caution: the write order is important here, set the field with the "ownership" bit last. */ diff --git a/drivers/net/eth16i.c b/drivers/net/eth16i.c index b67545be2ca..4bf76f86d8e 100644 --- a/drivers/net/eth16i.c +++ b/drivers/net/eth16i.c @@ -1064,8 +1064,7 @@ static int eth16i_tx(struct sk_buff *skb, struct net_device *dev) unsigned long flags; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index 247c8ca8603..dd1dc32dc98 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -1487,11 +1487,8 @@ static int hp100_start_xmit_bm(struct sk_buff *skb, struct net_device *dev) if (skb->len <= 0) return 0; - if (skb->len < ETH_ZLEN && lp->chip == HP100_CHIPID_SHASTA) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (lp->chip == HP100_CHIPID_SHASTA && skb_padto(skb, ETH_ZLEN)) + return 0; /* Get Tx ring tail pointer */ if (lp->txrtail->next == lp->txrhead) { diff --git a/drivers/net/lance.c b/drivers/net/lance.c index bb5ad479210..c1c3452c90c 100644 --- a/drivers/net/lance.c +++ b/drivers/net/lance.c @@ -968,8 +968,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev) /* The old LANCE chips doesn't automatically pad buffers to min. size. */ if (chip_table[lp->chip_version].flags & LANCE_MUST_PAD) { if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) goto out; lp->tx_ring[entry].length = -ETH_ZLEN; } diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c index 957888de3d7..1ab09447baa 100644 --- a/drivers/net/lasi_82596.c +++ b/drivers/net/lasi_82596.c @@ -1083,8 +1083,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->len, skb->data)); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/lp486e.c b/drivers/net/lp486e.c index 94d5ea1ce8b..bf3f343ae71 100644 --- a/drivers/net/lp486e.c +++ b/drivers/net/lp486e.c @@ -877,8 +877,7 @@ static int i596_start_xmit (struct sk_buff *skb, struct net_device *dev) { length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 5a74f63618b..b983e1e0434 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1939,8 +1939,7 @@ again: /* pad frames to at least ETH_ZLEN bytes */ if (unlikely(skb->len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { /* The packet is gone, so we must * return 0 */ mgp->stats.tx_dropped += 1; diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c index 09b11761cdf..ea93b8f1860 100644 --- a/drivers/net/pcmcia/fmvj18x_cs.c +++ b/drivers/net/pcmcia/fmvj18x_cs.c @@ -831,8 +831,7 @@ static int fjn_start_xmit(struct sk_buff *skb, 
struct net_device *dev) if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c index e80d1e3aec6..9bae77ce131 100644 --- a/drivers/net/pcmcia/xirc2ps_cs.c +++ b/drivers/net/pcmcia/xirc2ps_cs.c @@ -1374,8 +1374,7 @@ do_start_xmit(struct sk_buff *skb, struct net_device *dev) */ if (pktlen < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; pktlen = ETH_ZLEN; } diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 9945cc6b8d9..985afe0e627 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -2222,8 +2222,7 @@ static int rtl8169_start_xmit(struct sk_buff *skb, struct net_device *dev) len = skb->len; if (unlikely(len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) + if (skb_padto(skb, ETH_ZLEN)) goto err_update_stats; len = ETH_ZLEN; } diff --git a/drivers/net/seeq8005.c b/drivers/net/seeq8005.c index bcef03feb2f..efd0f235020 100644 --- a/drivers/net/seeq8005.c +++ b/drivers/net/seeq8005.c @@ -396,8 +396,7 @@ static int seeq8005_send_packet(struct sk_buff *skb, struct net_device *dev) unsigned char *buf; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c index 31dd3f036fa..df39f344765 100644 --- a/drivers/net/sis190.c +++ b/drivers/net/sis190.c @@ -1156,8 +1156,7 @@ static int sis190_start_xmit(struct sk_buff *skb, struct net_device *dev) dma_addr_t mapping; if (unlikely(skb->len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) { + if (skb_padto(skb, ETH_ZLEN)) { tp->stats.tx_dropped++; goto out; } diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c index 38a26df4095..f3efbd177ae 100644 --- a/drivers/net/sk98lin/skge.c +++ b/drivers/net/sk98lin/skge.c @@ -1525,7 +1525,7 @@ struct sk_buff *pMessage) /* pointer to send-message */ ** This is to resolve faulty padding by the HW with 0xaa bytes. 
*/ if (BytesSend < C_LEN_ETHERNET_MINSIZE) { - if ((pMessage = skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) == NULL) { + if (skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) { spin_unlock_irqrestore(&pTxPort->TxDesRingLock, Flags); return 0; } diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 536dd1cf7f7..19a4a16055d 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -2310,8 +2310,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) u64 map; unsigned long flags; - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) + if (skb_padto(skb, ETH_ZLEN)) return NETDEV_TX_OK; if (!spin_trylock_irqsave(&skge->tx_lock, flags)) diff --git a/drivers/net/smc9194.c b/drivers/net/smc9194.c index 6cf16f322ad..8b0321f1976 100644 --- a/drivers/net/smc9194.c +++ b/drivers/net/smc9194.c @@ -523,8 +523,7 @@ static int smc_wait_to_send_packet( struct sk_buff * skb, struct net_device * de length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { netif_wake_queue(dev); return 0; } diff --git a/drivers/net/sonic.c b/drivers/net/sonic.c index 90b818a8de6..cab0dd95849 100644 --- a/drivers/net/sonic.c +++ b/drivers/net/sonic.c @@ -231,8 +231,7 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c index 9b7805be21d..c158eedc781 100644 --- a/drivers/net/starfire.c +++ b/drivers/net/starfire.c @@ -1349,8 +1349,7 @@ static int start_tx(struct sk_buff *skb, struct net_device *dev) #if defined(ZEROCOPY) && defined(HAS_BROKEN_FIRMWARE) if (skb->ip_summed == CHECKSUM_HW) { - skb = skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK); - if (skb == NULL) + if (skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK)) return NETDEV_TX_OK; } #endif /* ZEROCOPY && HAS_BROKEN_FIRMWARE */ diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c index fdc21037f6d..c80a4f1d5f7 100644 --- a/drivers/net/via-rhine.c +++ b/drivers/net/via-rhine.c @@ -1284,11 +1284,8 @@ static int rhine_start_tx(struct sk_buff *skb, struct net_device *dev) /* Calculate the next Tx descriptor entry. */ entry = rp->cur_tx % TX_RING_SIZE; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; rp->tx_skbuff[entry] = skb; diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 879eb427607..a915fe6c6aa 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -924,8 +924,7 @@ static int ray_dev_start_xmit(struct sk_buff *skb, struct net_device *dev) if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c index f7724eb2fa7..561250f73fd 100644 --- a/drivers/net/wireless/wavelan_cs.c +++ b/drivers/net/wireless/wavelan_cs.c @@ -3194,11 +3194,8 @@ wavelan_packet_xmit(struct sk_buff * skb, * and we don't have the Ethernet specific requirement of beeing * able to detect collisions, therefore in theory we don't really * need to pad. 
Jean II */ - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; wv_packet_write(dev, skb->data, skb->len); diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c index fd0f43b7db5..ecec8e5db78 100644 --- a/drivers/net/yellowfin.c +++ b/drivers/net/yellowfin.c @@ -862,13 +862,11 @@ static int yellowfin_start_xmit(struct sk_buff *skb, struct net_device *dev) /* Fix GX chipset errata. */ if (cacheline_end > 24 || cacheline_end == 0) { len = skb->len + 32 - cacheline_end + 1; - if (len != skb->len) - skb = skb_padto(skb, len); - } - if (skb == NULL) { - yp->tx_skbuff[entry] = NULL; - netif_wake_queue(dev); - return 0; + if (skb_padto(skb, len)) { + yp->tx_skbuff[entry] = NULL; + netif_wake_queue(dev); + return 0; + } } } yp->tx_skbuff[entry] = skb; diff --git a/drivers/net/znet.c b/drivers/net/znet.c index 3ac047bc727..a7c089df66e 100644 --- a/drivers/net/znet.c +++ b/drivers/net/znet.c @@ -544,8 +544,7 @@ static int znet_send_packet(struct sk_buff *skb, struct net_device *dev) printk(KERN_DEBUG "%s: ZNet_send_packet.\n", dev->name); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66f8819f956..f8c7eb79a27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -345,7 +345,7 @@ extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); -extern struct sk_buff * skb_pad(struct sk_buff *skb, int pad); +extern int skb_pad(struct sk_buff *skb, int pad); #define dev_kfree_skb(a) kfree_skb(a) extern void skb_over_panic(struct sk_buff *skb, int len, void *here); @@ -1122,16 +1122,15 @@ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it - * is untouched. Returns the buffer, which may be a replacement - * for the original, or NULL for out of memory - in which case - * the original buffer is still freed. + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. */ -static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len) +static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; if (likely(size >= len)) - return skb; + return 0; return skb_pad(skb, len-size); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb7210f4005..fe63d4efbd4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -781,24 +781,40 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return NULL in out of memory cases. + * May return error in out of memory cases. The skb is freed on error. */ -struct sk_buff *skb_pad(struct sk_buff *skb, int pad) +int skb_pad(struct sk_buff *skb, int pad) { - struct sk_buff *nskb; + int err; + int ntail; /* If the skbuff is non linear tailroom is always zero.. 
 */
-	if (skb_tailroom(skb) >= pad) {
+	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
 		memset(skb->data+skb->len, 0, pad);
-		return skb;
+		return 0;
 	}
-
-	nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
+
+	ntail = skb->data_len + pad - (skb->end - skb->tail);
+	if (likely(skb_cloned(skb) || ntail > 0)) {
+		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+		if (unlikely(err))
+			goto free_skb;
+	}
+
+	/* FIXME: The use of this function with non-linear skb's really needs
+	 * to be audited.
+	 */
+	err = skb_linearize(skb);
+	if (unlikely(err))
+		goto free_skb;
+
+	memset(skb->data + skb->len, 0, pad);
+	return 0;
+
+free_skb:
 	kfree_skb(skb);
-	if (nskb)
-		memset(nskb->data+nskb->len, 0, pad);
-	return nskb;
+	return err;
 }
 
 /* Trims skb to length len. It can change skb pointers.
--
cgit v1.2.3-70-g09d2


From 102128e3a27821bdcbacb10f4f2bba253f587ba4 Mon Sep 17 00:00:00 2001
From: Łukasz Stelmach
Date: Thu, 22 Jun 2006 01:37:19 -0700
Subject: [IPV6]: Fix source address selection.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two additional labels (RFC 3484, sec. 10.3) for IPv6 addresses are
defined to distinguish global unicast addresses from Unique Local
Addresses (fc00::/7, RFC 4193) and Teredo (2001::/32, RFC 4380). This
is necessary to avoid connection attempts that would either fail
(e.g. fec0:: to 2001:feed::) or be sub-optimal (2001:0:: to
2001:feed::).

Signed-off-by: Łukasz Stelmach
Signed-off-by: YOSHIFUJI Hideaki
Signed-off-by: David S. Miller
---
 net/ipv6/addrconf.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c2c26fa0943..6b361fc3e14 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -862,6 +862,8 @@ static int inline ipv6_saddr_label(const struct in6_addr *addr, int type)
  *	2002::/16	2
  *	::/96		3
  *	::ffff:0:0/96	4
+ *	fc00::/7	5
+ *	2001::/32	6
  */
 	if (type & IPV6_ADDR_LOOPBACK)
 		return 0;
@@ -869,8 +871,12 @@ static int inline ipv6_saddr_label(const struct in6_addr *addr, int type)
 		return 3;
 	else if (type & IPV6_ADDR_MAPPED)
 		return 4;
+	else if (addr->s6_addr32[0] == htonl(0x20010000))
+		return 6;
 	else if (addr->s6_addr16[0] == htons(0x2002))
 		return 2;
+	else if ((addr->s6_addr[0] & 0xfe) == 0xfc)
+		return 5;
 	return 1;
 }
--
cgit v1.2.3-70-g09d2


From 5e2707fa3aed8c24075087cbaea2628725adbe55 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki
Date: Thu, 22 Jun 2006 01:41:18 -0700
Subject: [IPV6] ADDRCONF: Fix default source address selection without CONFIG_IPV6_PRIVACY

We need to update hiscore.rule even if we don't enable
CONFIG_IPV6_PRIVACY, because a less significant rule still follows:
longest matching prefix.

Signed-off-by: YOSHIFUJI Hideaki
Signed-off-by: David S. Miller
---
 net/ipv6/addrconf.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6b361fc3e14..4da664538f5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1075,6 +1075,9 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
 			if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)
 				continue;
 		}
+#else
+		if (hiscore.rule < 7)
+			hiscore.rule++;
 #endif
 		/* Rule 8: Use longest matching prefix */
 		if (hiscore.rule < 8) {
--
cgit v1.2.3-70-g09d2


From d4828d85d188dc70ed172802e798d3978bb6e29e Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 22 Jun 2006 02:28:18 -0700
Subject: [NET]: Prevent transmission after dev_deactivate

The dev_deactivate function has bit-rotted since the introduction of
lockless drivers.
In particular, the spin_unlock_wait call at the end has no effect on
the xmit routine of lockless drivers.

With a little bit of work, we can make it much more useful by
providing the guarantee that when it returns, no more calls to the
xmit routine of the underlying driver will be made.

The idea is simple. There are two entry points into the xmit routine.
The first comes from dev_queue_xmit. That one is easily stopped by
using synchronize_rcu. This works because we set the qdisc to
noop_qdisc before the synchronize_rcu call. That in turn causes all
subsequent packets sent to dev_queue_xmit to be dropped. The
synchronize_rcu call also ensures all outstanding calls leave their
critical section.

The other entry point is from qdisc_run. Since we now have a bit that
indicates whether it's running, all we have to do is to wait until the
bit is off.

I've removed the loop to wait for __LINK_STATE_SCHED to clear. This is
useless because netif_wake_queue can cause it to be set again. It is
also harmless because we've disarmed qdisc_run.

I've also removed the spin_unlock_wait on xmit_lock: its only purpose,
making sure that all outstanding xmit_lock holders have exited, is
already served by dev_watchdog_down.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/dev.c          |  6 +++---
 net/sched/sch_generic.c | 12 +++++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ab39fe17cb5..29e3888102b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1295,7 +1295,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	/* Disable soft irqs for various locks below. Also
 	 * stops preemption for RCU.
 	 */
-	local_bh_disable();
+	rcu_read_lock_bh();
 
 	/* Updates of qdisc are serialized by queue_lock.
 	 * The struct Qdisc which is pointed to by qdisc is now a
@@ -1369,13 +1369,13 @@ int dev_queue_xmit(struct sk_buff *skb)
 	}
 
 	rc = -ENETDOWN;
-	local_bh_enable();
+	rcu_read_unlock_bh();
 
 out_kfree_skb:
 	kfree_skb(skb);
 	return rc;
 out:
-	local_bh_enable();
+	rcu_read_unlock_bh();
 	return rc;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index d7aca8ef524..7aad0121232 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -181,9 +181,13 @@ requeue:
 
 void __qdisc_run(struct net_device *dev)
 {
+	if (unlikely(dev->qdisc == &noop_qdisc))
+		goto out;
+
 	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
 		/* NOTHING */;
 
+out:
 	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
 }
 
@@ -583,10 +587,12 @@ void dev_deactivate(struct net_device *dev)
 
 	dev_watchdog_down(dev);
 
-	while (test_bit(__LINK_STATE_SCHED, &dev->state))
-		yield();
+	/* Wait for outstanding dev_queue_xmit calls. */
+	synchronize_rcu();
 
-	spin_unlock_wait(&dev->_xmit_lock);
+	/* Wait for outstanding qdisc_run calls. */
+	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
+		yield();
 }
 
 void dev_init_scheduler(struct net_device *dev)
--
cgit v1.2.3-70-g09d2


From 7967168cefdbc63bf332d6b1548eca7cd65ebbcc Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 22 Jun 2006 02:40:14 -0700
Subject: [NET]: Merge TSO/UFO fields in sk_buff

Having separate fields in sk_buff for TSO/UFO (tso_size/ufo_size) is
not going to scale if we add any more segmentation methods (e.g.,
DCCP). So let's merge them.

These fields were also used to tell the protocol of a packet; that job
has been subsumed by the new gso_type field. This is essentially a set
of netdev feature bits (shifted by 16 bits) that are required to
process a specific skb.
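For illustration, a minimal sketch of the resulting capability check
(this mirrors the netif_needs_gso() helper that the diff below adds to
include/linux/netdevice.h; device_can_gso is a made-up name for this
sketch):

	/* A device can handle this skb in hardware iff every feature
	 * bit implied by the skb's gso_type is advertised in the
	 * device's features word.
	 */
	static inline int device_can_gso(struct net_device *dev,
					 struct sk_buff *skb)
	{
		int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT;

		return (dev->features & feature) == feature;
	}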
As such it's easy to tell whether a given device can process a GSO
skb: you just AND the (shifted) gso_type field with the netdev's
features field.

I've made gso_type a conjunction. The idea is that you have a base
type (e.g., SKB_GSO_TCPV4) that can be modified further to support new
features. For example, if we add a hardware TSO type that supports
ECN, the device would declare NETIF_F_TSO | NETIF_F_TSO_ECN. All TSO
packets with CWR set would have a gso_type of SKB_GSO_TCPV4 |
SKB_GSO_TCPV4_ECN while all other TSO packets would be SKB_GSO_TCPV4.
This means that only the CWR packets need to be emulated in software.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 drivers/net/8139cp.c | 2 +-
 drivers/net/bnx2.c | 4 ++--
 drivers/net/chelsio/sge.c | 4 ++--
 drivers/net/e1000/e1000_main.c | 10 ++++-----
 drivers/net/forcedeth.c | 4 ++--
 drivers/net/ixgb/ixgb_main.c | 4 ++--
 drivers/net/loopback.c | 4 ++--
 drivers/net/myri10ge/myri10ge.c | 4 ++--
 drivers/net/r8169.c | 2 +-
 drivers/net/s2io.c | 16 +++++++-------
 drivers/net/sky2.c | 4 ++--
 drivers/net/tg3.c | 4 ++--
 drivers/net/typhoon.c | 2 +-
 drivers/s390/net/qeth_eddp.c | 12 +++++------
 drivers/s390/net/qeth_main.c | 4 ++--
 drivers/s390/net/qeth_tso.h | 2 +-
 include/linux/netdevice.h | 14 ++++++++++--
 include/linux/skbuff.h | 12 ++++++++---
 include/net/tcp.h | 4 ++--
 net/bridge/br_forward.c | 4 ++--
 net/bridge/br_netfilter.c | 2 +-
 net/core/skbuff.c | 16 ++++++++------
 net/ipv4/ip_output.c | 16 ++++++++------
 net/ipv4/tcp.c | 4 ++--
 net/ipv4/tcp_input.c | 2 +-
 net/ipv4/tcp_output.c | 47 ++++++++++++++++++++++++-----------------
 net/ipv6/ip6_output.c | 7 +++---
 27 files changed, 120 insertions(+), 90 deletions(-)

diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index a26077a175a..0cdc830449d 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -797,7 +797,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev)
 	entry = cp->tx_head;
 	eor = (entry == (CP_TX_RING_SIZE - 1)) ?
RingEnd : 0; if (dev->features & NETIF_F_TSO) - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb_shinfo(skb)->nr_frags == 0) { struct cp_desc *txd = &cp->tx_ring[entry]; diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 702d546567a..7635736cc79 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -1640,7 +1640,7 @@ bnx2_tx_int(struct bnx2 *bp) skb = tx_buf->skb; #ifdef BCM_TSO /* partial BD completions possible with TSO packets */ - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { u16 last_idx, last_ring_idx; last_idx = sw_cons + @@ -4428,7 +4428,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16)); } #ifdef BCM_TSO - if ((mss = skb_shinfo(skb)->tso_size) && + if ((mss = skb_shinfo(skb)->gso_size) && (skb->len > (bp->dev->mtu + ETH_HLEN))) { u32 tcp_opt_len, ip_tcp_len; diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index 4391bf4bf57..53efff6da78 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1418,7 +1418,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) struct cpl_tx_pkt *cpl; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { int eth_type; struct cpl_tx_pkt_lso *hdr; @@ -1433,7 +1433,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) hdr->ip_hdr_words = skb->nh.iph->ihl; hdr->tcp_hdr_words = skb->h.th->doff; hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type, - skb_shinfo(skb)->tso_size)); + skb_shinfo(skb)->gso_size)); hdr->len = htonl(skb->len - sizeof(*hdr)); cpl = (struct cpl_tx_pkt *)hdr; sge->stats.tx_lso_pkts++; diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index a373ccb308d..32b7d444b37 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, uint8_t ipcss, ipcso, tucss, tucso, hdr_len; int err; - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -2402,7 +2402,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb->protocol == htons(ETH_P_IP)) { skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; @@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, * tso gets written back prematurely before the data is fully * DMA'd to the controller */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) { + !skb_shinfo(skb)->gso_size) { tx_ring->last_tx_tso = 0; size -= 4; } @@ -2757,7 +2757,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; /* The controller does a simple calculation to * make sure there is enough room in the FIFO before * initiating the DMA for each buffer. 
The calc is: @@ -2807,7 +2807,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) #ifdef NETIF_F_TSO /* Controller Erratum workaround */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) + !skb_shinfo(skb)->gso_size) count++; #endif diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 191383d461d..21be4fa071b 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1495,8 +1495,8 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev) np->tx_skbuff[nr] = skb; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) - tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT); + if (skb_shinfo(skb)->gso_size) + tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT); else #endif tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0); diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index 57006fb8840..8bb32f94699 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) uint16_t ipcse, tucse, mss; int err; - if(likely(skb_shinfo(skb)->tso_size)) { + if(likely(skb_shinfo(skb)->gso_size)) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -1181,7 +1181,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr, diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b79d6e8d304..43fef7de8cb 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -74,7 +74,7 @@ static void emulate_large_send_offload(struct sk_buff *skb) struct iphdr *iph = skb->nh.iph; struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4)); unsigned int doffset = (iph->ihl + th->doff) * 4; - unsigned int mtu = skb_shinfo(skb)->tso_size + doffset; + unsigned int mtu = skb_shinfo(skb)->gso_size + doffset; unsigned int offset = 0; u32 seq = ntohl(th->seq); u16 id = ntohs(iph->id); @@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev) #endif #ifdef LOOPBACK_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { BUG_ON(skb->protocol != htons(ETH_P_IP)); BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP); diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index b983e1e0434..dbdf189436f 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1879,7 +1879,7 @@ again: #ifdef NETIF_F_TSO if (skb->len > (dev->mtu + ETH_HLEN)) { - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) max_segments = MYRI10GE_MAX_SEND_DESC_TSO; } @@ -2112,7 +2112,7 @@ abort_linearize: } idx = (idx + 1) & tx->mask; } while (idx != last_idx); - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { printk(KERN_ERR "myri10ge: %s: TSO but wanted to linearize?!?!?\n", mgp->dev->name); diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 985afe0e627..12d1cb289bb 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -2172,7 +2172,7 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb, static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev) { if 
(dev->features & NETIF_F_TSO) { - u32 mss = skb_shinfo(skb)->tso_size; + u32 mss = skb_shinfo(skb)->gso_size; if (mss) return LargeSend | ((mss & MSSMask) << MSSShift); diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 11daed495b9..3defe5d4f7d 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -3959,8 +3959,8 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Control_1 = 0; txdp->Control_2 = 0; #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; - if (mss) { + mss = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) { txdp->Control_1 |= TXD_TCP_LSO_EN; txdp->Control_1 |= TXD_TCP_LSO_MSS(mss); } @@ -3980,10 +3980,10 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) } frg_len = skb->len - skb->data_len; - if (skb_shinfo(skb)->ufo_size) { + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) { int ufo_size; - ufo_size = skb_shinfo(skb)->ufo_size; + ufo_size = skb_shinfo(skb)->gso_size; ufo_size &= ~7; txdp->Control_1 |= TXD_UFO_EN; txdp->Control_1 |= TXD_UFO_MSS(ufo_size); @@ -4009,7 +4009,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Host_Control = (unsigned long) skb; txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; frg_cnt = skb_shinfo(skb)->nr_frags; @@ -4024,12 +4024,12 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) (sp->pdev, frag->page, frag->page_offset, frag->size, PCI_DMA_TODEVICE); txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; } txdp->Control_1 |= TXD_GATHER_CODE_LAST; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) frg_cnt++; /* as Txd0 was used for inband header */ tx_fifo = mac_control->tx_FIFO_start[queue]; @@ -4043,7 +4043,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) if (mss) val64 |= TX_FIFO_SPECIAL_FUNC; #endif - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) val64 |= TX_FIFO_SPECIAL_FUNC; writeq(val64, &tx_fifo->List_Control); diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index fba1e4d4d83..d3577871be2 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1160,7 +1160,7 @@ static unsigned tx_le_req(const struct sk_buff *skb) count = sizeof(dma_addr_t) / sizeof(u32); count += skb_shinfo(skb)->nr_frags * count; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) ++count; if (skb->ip_summed == CHECKSUM_HW) @@ -1232,7 +1232,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev) } /* Check for TCP Segmentation Offload */ - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) { /* just drop the packet if non-linear expansion fails */ if (skb_header_cloned(skb) && diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index b2ddd4522a8..e3e380f90f8 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3780,7 +3780,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 0; if (skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && @@ -3905,7 +3905,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 0; if 
(skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index d9258d42090..e49e8b520c2 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -340,7 +340,7 @@ enum state_values { #endif #if defined(NETIF_F_TSO) -#define skb_tso_size(x) (skb_shinfo(x)->tso_size) +#define skb_tso_size(x) (skb_shinfo(x)->gso_size) #define TSO_NUM_DESCRIPTORS 2 #define TSO_OFFLOAD_ON TYPHOON_OFFLOAD_TCP_SEGMENT #else diff --git a/drivers/s390/net/qeth_eddp.c b/drivers/s390/net/qeth_eddp.c index 0bab60a2030..38aad832145 100644 --- a/drivers/s390/net/qeth_eddp.c +++ b/drivers/s390/net/qeth_eddp.c @@ -420,7 +420,7 @@ __qeth_eddp_fill_context_tcp(struct qeth_eddp_context *ctx, } tcph = eddp->skb->h.th; while (eddp->skb_offset < eddp->skb->len) { - data_len = min((int)skb_shinfo(eddp->skb)->tso_size, + data_len = min((int)skb_shinfo(eddp->skb)->gso_size, (int)(eddp->skb->len - eddp->skb_offset)); /* prepare qdio hdr */ if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){ @@ -515,20 +515,20 @@ qeth_eddp_calc_num_pages(struct qeth_eddp_context *ctx, struct sk_buff *skb, QETH_DBF_TEXT(trace, 5, "eddpcanp"); /* can we put multiple skbs in one page? */ - skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len); + skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len); if (skbs_per_page > 1){ - ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) / + ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) / skbs_per_page + 1; ctx->elements_per_skb = 1; } else { /* no -> how many elements per skb? */ - ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len + + ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len + PAGE_SIZE) >> PAGE_SHIFT; ctx->num_pages = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } ctx->num_elements = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } static inline struct qeth_eddp_context * diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c index 9e671a48cd2..56009d76832 100644 --- a/drivers/s390/net/qeth_main.c +++ b/drivers/s390/net/qeth_main.c @@ -4417,7 +4417,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) struct qeth_eddp_context *ctx = NULL; int tx_bytes = skb->len; unsigned short nr_frags = skb_shinfo(skb)->nr_frags; - unsigned short tso_size = skb_shinfo(skb)->tso_size; + unsigned short tso_size = skb_shinfo(skb)->gso_size; int rc; QETH_DBF_TEXT(trace, 6, "sendpkt"); @@ -4453,7 +4453,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) queue = card->qdio.out_qs [qeth_get_priority_queue(card, skb, ipv, cast_type)]; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) large_send = card->options.large_send; /*are we able to do TSO ? 
If so ,prepare and send it from here */ diff --git a/drivers/s390/net/qeth_tso.h b/drivers/s390/net/qeth_tso.h index 24ef40ca956..593f298142c 100644 --- a/drivers/s390/net/qeth_tso.h +++ b/drivers/s390/net/qeth_tso.h @@ -51,7 +51,7 @@ qeth_tso_fill_header(struct qeth_card *card, struct sk_buff *skb) hdr->ext.hdr_version = 1; hdr->ext.hdr_len = 28; /*insert non-fix values */ - hdr->ext.mss = skb_shinfo(skb)->tso_size; + hdr->ext.mss = skb_shinfo(skb)->gso_size; hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4); hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len - sizeof(struct qeth_hdr_tso)); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cead6be467e..fa5671307b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,9 +308,12 @@ struct net_device #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */ #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */ #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ -#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ #define NETIF_F_LLTX 4096 /* LockLess TX */ -#define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ + + /* Segmentation offload features */ +#define NETIF_F_GSO_SHIFT 16 +#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT) +#define NETIF_F_UFO (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT) #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) #define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) @@ -979,6 +982,13 @@ extern void dev_seq_stop(struct seq_file *seq, void *v); extern void linkwatch_run_queue(void); +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) +{ + int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT; + return skb_shinfo(skb)->gso_size && + (dev->features & feature) != feature; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8c7eb79a27..97b0d2d1a6b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -134,9 +134,10 @@ struct skb_frag_struct { struct skb_shared_info { atomic_t dataref; unsigned short nr_frags; - unsigned short tso_size; - unsigned short tso_segs; - unsigned short ufo_size; + unsigned short gso_size; + /* Warning: this field is not always filled in (UFO)! */ + unsigned short gso_segs; + unsigned short gso_type; unsigned int ip6_frag_id; struct sk_buff *frag_list; skb_frag_t frags[MAX_SKB_FRAGS]; @@ -168,6 +169,11 @@ enum { SKB_FCLONE_CLONE, }; +enum { + SKB_GSO_TCPV4 = 1 << 0, + SKB_GSO_UDPV4 = 1 << 1, +}; + /** * struct sk_buff - socket buffer * @next: Next buffer in list diff --git a/include/net/tcp.h b/include/net/tcp.h index 5f4eb5c7968..b197a9e615c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -569,13 +569,13 @@ struct tcp_skb_cb { */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_segs; + return skb_shinfo(skb)->gso_segs; } /* This is valid iff tcp_skb_pcount() > 1. 
*/ static inline int tcp_skb_mss(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_size; + return skb_shinfo(skb)->gso_size; } static inline void tcp_dec_pcount_approx(__u32 *count, diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 0dca027ceb8..8be9f2123e5 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -34,8 +34,8 @@ static inline unsigned packet_length(const struct sk_buff *skb) int br_dev_queue_push_xmit(struct sk_buff *skb) { - /* drop mtu oversized packets except tso */ - if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size) + /* drop mtu oversized packets except gso */ + if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->gso_size) kfree_skb(skb); else { #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3e41f9d6d51..8298a5179ae 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -761,7 +761,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP) && skb->len > skb->dev->mtu && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + !skb_shinfo(skb)->gso_size) return ip_fragment(skb, br_dev_queue_push_xmit); else return br_dev_queue_push_xmit(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe63d4efbd4..368d98578c1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -172,9 +172,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); shinfo->nr_frags = 0; - shinfo->tso_size = 0; - shinfo->tso_segs = 0; - shinfo->ufo_size = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; shinfo->frag_list = NULL; @@ -238,8 +238,9 @@ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->frag_list = NULL; out: return skb; @@ -528,8 +529,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif skb_copy_secmark(new, old); atomic_set(&new->users, 1); - skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; - skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } /** diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8538aac3d14..7624fd1d8f9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -210,8 +210,7 @@ static inline int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -362,7 +361,7 @@ packet_routed: } ip_select_ident_more(iph, &rt->u.dst, sk, - (skb_shinfo(skb)->tso_segs ?: 1) - 1); + (skb_shinfo(skb)->gso_segs ?: 1) - 1); /* Add an IP checksum. 
*/ ip_send_check(iph); @@ -744,7 +743,8 @@ static inline int ip_ufo_append_data(struct sock *sk, (length - transhdrlen)); if (!err) { /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; __skb_queue_tail(&sk->sk_write_queue, skb); return 0; @@ -1087,14 +1087,16 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + (rt->u.dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; + } while (size > 0) { int i; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_size) len = size; else { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 74998f25007..062dd1a0d8a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -571,7 +571,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; if (!copied) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; @@ -818,7 +818,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; from += copy; copied += copy; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e08245bdda3..94fe5b1f9dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1073,7 +1073,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ else pkt_len = (end_seq - TCP_SKB_CB(skb)->seq); - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size)) + if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) break; pcount = tcp_skb_pcount(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 07bb5a2b375..bdd71db8bf9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -515,15 +515,17 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned /* Avoid the costly divide in the normal * non-TSO case. 
*/ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; } else { unsigned int factor; factor = skb->len + (mss_now - 1); factor /= mss_now; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = mss_now; + skb_shinfo(skb)->gso_segs = factor; + skb_shinfo(skb)->gso_size = mss_now; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; } } @@ -914,7 +916,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int if (!tso_segs || (tso_segs > 1 && - skb_shinfo(skb)->tso_size != mss_now)) { + tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1724,8 +1726,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; } @@ -1930,8 +1933,9 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1963,8 +1967,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); @@ -2047,8 +2052,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -2152,8 +2158,9 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; @@ -2257,8 +2264,9 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; /* Send it off, this clears delayed acks for us. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -2293,8 +2301,9 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d29620f4910..abb94de3376 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -148,7 +148,7 @@ static int ip6_output2(struct sk_buff *skb) int ip6_output(struct sk_buff *skb) { - if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) || + if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) || dst_allfrag(skb->dst)) return ip6_fragment(skb, ip6_output2); else @@ -833,8 +833,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, struct frag_hdr fhdr; /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - - sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen - + sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; ipv6_select_ident(skb, &fhdr); skb_shinfo(skb)->ip6_frag_id = fhdr.identification; __skb_queue_tail(&sk->sk_write_queue, skb); -- cgit v1.2.3-70-g09d2 From f6a78bfcb141f963187464bac838d46a81c3882a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:57:17 -0700 Subject: [NET]: Add generic segmentation offload This patch adds the infrastructure for generic segmentation offload. The idea is to tap into the potential savings of TSO without hardware support by postponing the allocation of segmented skb's until just before the entry point into the NIC driver. The same structure can be used to support software IPv6 TSO, as well as UFO and segmentation offload for other relevant protocols, e.g., DCCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++- net/core/dev.c | 127 ++++++++++++++++++++++++++++++++++++++++++++-- net/sched/sch_generic.c | 19 +++++-- 3 files changed, 143 insertions(+), 11 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa5671307b9..b4eae18390c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -405,6 +405,9 @@ struct net_device struct list_head qdisc_list; unsigned long tx_queue_len; /* Max frames per queue allowed */ + /* Partially transmitted GSO packet. 
*/ + struct sk_buff *gso_skb; + /* ingress path synchronizer */ spinlock_t ingress_lock; struct Qdisc *qdisc_ingress; @@ -539,6 +542,7 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); void *af_packet_priv; struct list_head list; }; @@ -689,7 +693,8 @@ extern int dev_change_name(struct net_device *, char *); extern int dev_set_mtu(struct net_device *, int); extern int dev_set_mac_address(struct net_device *, struct sockaddr *); -extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); +extern int dev_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev); extern void dev_init(void); @@ -963,6 +968,7 @@ extern int netdev_max_backlog; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); extern int skb_checksum_help(struct sk_buff *skb, int inward); +extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg); #ifdef CONFIG_BUG extern void netdev_rx_csum_fault(struct net_device *dev); #else diff --git a/net/core/dev.c b/net/core/dev.c index 29e3888102b..d293e0f90a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -116,6 +116,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct sk_buff *skb) * taps currently in use. */ -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; @@ -1186,6 +1187,40 @@ out: return ret; } +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather is supported on the target. + * + * This function segments the given skb and returns a list of segments. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + int type = skb->protocol; + + BUG_ON(skb_shinfo(skb)->frag_list); + BUG_ON(skb->ip_summed != CHECKSUM_HW); + + skb->mac.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->data; + __skb_pull(skb, skb->mac_len); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + segs = ptype->gso_segment(skb, sg); + break; + } + } + rcu_read_unlock(); + + return segs; +} + +EXPORT_SYMBOL(skb_gso_segment); + /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev) @@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * + * This function segments the given skb and stores the list of segments + * in skb->next. 
+ */ +static int dev_gso_segment(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, dev->features & NETIF_F_SG && + !illegal_highdma(dev, skb)); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; + + return 0; +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + if (likely(!skb->next)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + if (!netif_needs_gso(dev, skb)) + return dev->hard_start_xmit(skb, dev); + + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + } + + do { + struct sk_buff *nskb = skb->next; + int rc; + + skb->next = nskb->next; + nskb->next = NULL; + rc = dev->hard_start_xmit(nskb, dev); + if (unlikely(rc)) { + skb->next = nskb; + return rc; + } + } while (skb->next); + + skb->destructor = DEV_GSO_CB(skb)->destructor; + +out_kfree_skb: + kfree_skb(skb); + return 0; +} + #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ @@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + /* GSO will handle the following emulations directly. */ + if (netif_needs_gso(dev, skb)) + goto gso; + if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && __skb_linearize(skb)) @@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_checksum_help(skb, 0)) goto out_kfree_skb; +gso: spin_lock_prefetch(&dev->queue_lock); /* Disable soft irqs for various locks below. Also @@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - rc = 0; - if (!dev->hard_start_xmit(skb, dev)) { + if (!dev_hard_start_xmit(skb, dev)) { HARD_TX_UNLOCK(dev); goto out; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7aad0121232..74d4a1dceee 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -96,8 +96,11 @@ static inline int qdisc_restart(struct net_device *dev) struct sk_buff *skb; /* Dequeue packet */ - if ((skb = q->dequeue(q)) != NULL) { + if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) { unsigned nolock = (dev->features & NETIF_F_LLTX); + + dev->gso_skb = NULL; + /* * When the driver has LLTX set it does its own locking * in start_xmit. No need to add additional overhead by @@ -134,10 +137,8 @@ static inline int qdisc_restart(struct net_device *dev) if (!netif_queue_stopped(dev)) { int ret; - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - ret = dev->hard_start_xmit(skb, dev); + ret = dev_hard_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) { if (!nolock) { netif_tx_unlock(dev); @@ -171,7 +172,10 @@ static inline int qdisc_restart(struct net_device *dev) */ requeue: - q->ops->requeue(skb, q); + if (skb->next) + dev->gso_skb = skb; + else + q->ops->requeue(skb, q); netif_schedule(dev); return 1; } @@ -593,6 +597,11 @@ void dev_deactivate(struct net_device *dev) /* Wait for outstanding qdisc_run calls. 
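/*
 * Both dev_queue_xmit() and dev_hard_start_xmit() above gate on
 * netif_needs_gso(), whose definition is not visible in this excerpt.
 * What follows is a minimal sketch consistent with how the helper is
 * used here, not the patch's actual code: a packet takes the software
 * path when it carries segmentation state (gso_size != 0) but the
 * device lacks the matching hardware offload, which at this point in
 * the series means TCPv4 TSO only.
 */
static inline int netif_needs_gso(struct net_device *dev,
				  struct sk_buff *skb)
{
	return skb_shinfo(skb)->gso_size &&
	       !(dev->features & NETIF_F_TSO);
}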
*/ while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state)) yield(); + + if (dev->gso_skb) { + kfree_skb(dev->gso_skb); + dev->gso_skb = NULL; + } } void dev_init_scheduler(struct net_device *dev) -- cgit v1.2.3-70-g09d2 From f4c50d990dcf11a296679dc05de3873783236711 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:02:40 -0700 Subject: [NET]: Add software TSOv4 This patch adds the GSO implementation for IPv4 TCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/protocol.h | 1 + include/net/tcp.h | 2 + net/core/skbuff.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 51 ++++++++++++++++++++ net/ipv4/tcp.c | 62 ++++++++++++++++++++++++ 6 files changed, 243 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 97b0d2d1a6b..a45bba9b8cb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1297,6 +1297,7 @@ extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern void skb_release_data(struct sk_buff *skb); +extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/include/net/protocol.h b/include/net/protocol.h index bcaee39bd2f..3b6dc15c68a 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -36,6 +36,7 @@ struct net_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); int no_policy; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index b197a9e615c..ca3d38dfc00 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1086,6 +1086,8 @@ extern struct request_sock_ops tcp_request_sock_ops; extern int tcp_v4_destroy_sock(struct sock *sk); +extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg); + #ifdef CONFIG_PROC_FS extern int tcp4_proc_init(void); extern void tcp4_proc_exit(void); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 368d98578c1..8e5044ba3ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1842,6 +1842,132 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) EXPORT_SYMBOL_GPL(skb_pull_rcsum);
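The contract of the function added below is worth stating plainly before reading it: the caller pulls the headers it wants replicated, skb_segment() copies those doffset header bytes onto the front of every segment, and each segment carries at most gso_size bytes of payload. A sketch of the resulting arithmetic, with a helper name invented for illustration:

	/* Illustration only, not part of the patch: how many segments a
	 * GSO skb yields once it has been pushed back to the mac header.
	 * doffset is the header span replicated onto each segment.
	 */
	static inline unsigned int gso_count_segs(const struct sk_buff *skb,
						  unsigned int doffset)
	{
		unsigned int mss = skb_shinfo(skb)->gso_size;

		return (skb->len - doffset + mss - 1) / mss;
	}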
+/** + * skb_segment - Perform protocol segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather can be used for generated segments + * + * This function performs segmentation on the given skb. It returns + * the list of segments it generates, linked together through + * skb->next, or an ERR_PTR-encoded error if segmentation fails. + */ +struct sk_buff *skb_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int doffset = skb->data - skb->mac.raw; + unsigned int offset = doffset; + unsigned int headroom; + unsigned int len; + int nfrags = skb_shinfo(skb)->nr_frags; + int err = -ENOMEM; + int i = 0; + int pos; + + __skb_push(skb, doffset); + headroom = skb_headroom(skb); + pos = skb_headlen(skb); + + do { + struct sk_buff *nskb; + skb_frag_t *frag; + int hsize, nsize; + int k; + int size; + + len = skb->len - offset; + if (len > mss) + len = mss; + + hsize = skb_headlen(skb) - offset; + if (hsize < 0) + hsize = 0; + nsize = hsize + doffset; + if (nsize > len + doffset || !sg) + nsize = len + doffset; + + nskb = alloc_skb(nsize + headroom, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + nskb->dev = skb->dev; + nskb->priority = skb->priority; + nskb->protocol = skb->protocol; + nskb->dst = dst_clone(skb->dst); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + nskb->pkt_type = skb->pkt_type; + nskb->mac_len = skb->mac_len; + + skb_reserve(nskb, headroom); + nskb->mac.raw = nskb->data; + nskb->nh.raw = nskb->data + skb->mac_len; + nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw); + memcpy(skb_put(nskb, doffset), skb->data, doffset); + + if (!sg) { + nskb->csum = skb_copy_and_csum_bits(skb, offset, + skb_put(nskb, len), + len, 0); + continue; + } + + frag = skb_shinfo(nskb)->frags; + k = 0; + + nskb->ip_summed = CHECKSUM_HW; + nskb->csum = skb->csum; + memcpy(skb_put(nskb, hsize), skb->data + offset, hsize); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + *frag = skb_shinfo(skb)->frags[i]; + get_page(frag->page); + size = frag->size; + + if (pos < offset) { + frag->page_offset += offset - pos; + frag->size -= offset - pos; + } + + k++; + + if (pos + size <= offset + len) { + i++; + pos += size; + } else { + frag->size -= pos + size - (offset + len); + break; + } + + frag++; + } + + skb_shinfo(nskb)->nr_frags = k; + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + } while ((offset += len) < skb->len); + + return segs; + +err: + while ((skb = segs)) { + segs = skb->next; + kfree_skb(skb); + } + return ERR_PTR(err); +} + +EXPORT_SYMBOL_GPL(skb_segment); + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0a277453526..461216b4794 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -68,6 +68,7 @@ */ #include +#include #include #include #include @@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *sk) EXPORT_SYMBOL(inet_sk_rebuild_header); +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct iphdr *iph; + struct net_protocol *ops; + int proto; + int ihl; + int id; + + if (!pskb_may_pull(skb, sizeof(*iph))) + goto out; + + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (ihl < sizeof(*iph)) + goto out; + + if (!pskb_may_pull(skb, ihl)) + goto out; + + skb->h.raw = __skb_pull(skb, ihl); + iph = skb->nh.iph; + id = ntohs(iph->id); + proto = iph->protocol & (MAX_INET_PROTOS - 1); + segs = ERR_PTR(-EPROTONOSUPPORT); + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (ops && ops->gso_segment) + segs = ops->gso_segment(skb, sg); + rcu_read_unlock(); + + if (IS_ERR(segs)) + goto out; +
+ skb = segs; + do { + iph = skb->nh.iph; + iph->id = htons(id++); + iph->tot_len = htons(skb->len - skb->mac_len); + iph->check = 0; + iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + } while ((skb = skb->next)); + +out: + return segs; +} + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1105,6 +1154,7 @@ static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, + .gso_segment = tcp_tso_segment, .no_policy = 1, }; @@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void); static struct packet_type ip_packet_type = { .type = __constant_htons(ETH_P_IP), .func = ip_rcv, + .gso_segment = inet_gso_segment, }; static int __init inet_init(void) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 062dd1a0d8a..0e029c4e290 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -258,6 +258,7 @@ #include #include #include +#include #include #include @@ -2144,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct tcphdr *th; + unsigned thlen; + unsigned int seq; + unsigned int delta; + unsigned int oldlen; + unsigned int len; + + if (!pskb_may_pull(skb, sizeof(*th))) + goto out; + + th = skb->h.th; + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + if (!pskb_may_pull(skb, thlen)) + goto out; + + oldlen = ~htonl(skb->len); + __skb_pull(skb, thlen); + + segs = skb_segment(skb, sg); + if (IS_ERR(segs)) + goto out; + + len = skb_shinfo(skb)->gso_size; + delta = csum_add(oldlen, htonl(thlen + len)); + + skb = segs; + th = skb->h.th; + seq = ntohl(th->seq); + + do { + th->fin = th->psh = 0; + + if (skb->ip_summed == CHECKSUM_NONE) { + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + + seq += len; + skb = skb->next; + th = skb->h.th; + + th->seq = htonl(seq); + th->cwr = 0; + } while (skb->next); + + if (skb->ip_summed == CHECKSUM_NONE) { + delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw)); + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + +out: + return segs; +} + extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; -- cgit v1.2.3-70-g09d2 From 37c3185a02d4b85fbe134bf5204535405dd2c957 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:07:29 -0700 Subject: [NET]: Added GSO toggle This patch adds a generic segmentation offload toggle that can be turned on/off for each net device. For now it only supports TCPv4 (see the userspace sketch at the end of this document for driving the toggle). Signed-off-by: Herbert Xu Signed-off-by: David S.
Miller --- include/linux/ethtool.h | 2 ++ include/linux/netdevice.h | 1 + include/net/sock.h | 4 ++++ net/bridge/br_if.c | 17 +++++++++++------ net/core/ethtool.c | 29 +++++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index cf2abeca92a..c6310aef5ab 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -411,6 +411,8 @@ struct ethtool_ops { #define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ #define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ #define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4eae18390c..bc747e5d713 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,6 +308,7 @@ struct net_device #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */ #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */ #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ +#define NETIF_F_GSO 2048 /* Enable software GSO. */ #define NETIF_F_LLTX 4096 /* LockLess TX */ /* Segmentation offload features */ diff --git a/include/net/sock.h b/include/net/sock.h index d10dfecb6cb..a897f05de3b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1030,9 +1030,13 @@ static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_TSO; if (sk->sk_route_caps & NETIF_F_TSO) { if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) sk->sk_route_caps &= ~NETIF_F_TSO; + else + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; } } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index fdec773f5b5..07956ecf545 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -376,15 +376,20 @@ void br_features_recompute(struct net_bridge *br) features = br->feature_mask & ~NETIF_F_ALL_CSUM; list_for_each_entry(p, &br->port_list, list) { - if (checksum & NETIF_F_NO_CSUM && - !(p->dev->features & NETIF_F_NO_CSUM)) + unsigned long feature = p->dev->features; + + if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM)) checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM; - if (checksum & NETIF_F_HW_CSUM && - !(p->dev->features & NETIF_F_HW_CSUM)) + if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM)) checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM; - if (!(p->dev->features & NETIF_F_IP_CSUM)) + if (!(feature & NETIF_F_IP_CSUM)) checksum = 0; - features &= p->dev->features; + + if (feature & NETIF_F_GSO) + feature |= NETIF_F_TSO; + feature |= NETIF_F_GSO; + + features &= feature; } br->dev->features = features | checksum | NETIF_F_LLTX; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 33ce7ed6afc..27ce1683caf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -614,6 +614,29 @@ static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_ufo(dev, edata.data); } +static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GGSO }; + + edata.data = dev->features & NETIF_F_GSO; + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return 
-EFAULT; + return 0; +} + +static int ethtool_set_gso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + if (edata.data) + dev->features |= NETIF_F_GSO; + else + dev->features &= ~NETIF_F_GSO; + return 0; +} + static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; @@ -905,6 +928,12 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_SUFO: rc = ethtool_set_ufo(dev, useraddr); break; + case ETHTOOL_GGSO: + rc = ethtool_get_gso(dev, useraddr); + break; + case ETHTOOL_SGSO: + rc = ethtool_set_gso(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3-70-g09d2 From 09b8f7a93efd4b2c4ef391e2fbf076f28c6d36d6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:08:03 -0700 Subject: [IPSEC]: Handle GSO packets This patch segments GSO packets received by the IPsec stack. This can happen when a NIC driver injects GSO packets into the stack which are then forwarded to another host. The primary application of this is going to be Xen where its backend driver may inject GSO packets into dom0. Of course this also can be used by other virtualisation schemes such as VMWare or UML since the tap device could be modified to inject GSO packets received through splice. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/xfrm4_output.c | 54 +++++++++++++++++++++++++++++++++++++++++-------- net/ipv6/xfrm6_output.c | 39 +++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 10 deletions(-) diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index ac9d91d4bb0..193363e2293 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -9,6 +9,8 @@ */ #include +#include +#include #include #include #include @@ -97,16 +99,10 @@ error_nolock: goto out_exit; } -static int xfrm4_output_finish(struct sk_buff *skb) +static int xfrm4_output_finish2(struct sk_buff *skb) { int err; -#ifdef CONFIG_NETFILTER - if (!skb->dst->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } -#endif while (likely((err = xfrm4_output_one(skb)) == 0)) { nf_reset(skb); @@ -119,7 +115,7 @@ static int xfrm4_output_finish(struct sk_buff *skb) return dst_output(skb); err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL, - skb->dst->dev, xfrm4_output_finish); + skb->dst->dev, xfrm4_output_finish2); if (unlikely(err != 1)) break; } @@ -127,6 +123,48 @@ static int xfrm4_output_finish(struct sk_buff *skb) return err; } +static int xfrm4_output_finish(struct sk_buff *skb) +{ + struct sk_buff *segs; + +#ifdef CONFIG_NETFILTER + if (!skb->dst->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + + if (!skb_shinfo(skb)->gso_size) + return xfrm4_output_finish2(skb); + + skb->protocol = htons(ETH_P_IP); + segs = skb_gso_segment(skb, 0); + kfree_skb(skb); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = xfrm4_output_finish2(segs); + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return err; + } + + segs = nskb; + } while (segs); + + return 0; +} + int xfrm4_output(struct sk_buff *skb) { return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 16e84254a25..48fccb1eca0 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c 
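The IPv4 segment walk above and the IPv6 one below are line-for-line duplicates. A shared helper would be a plausible refactoring; the sketch here is hypothetical and not part of this patch. The caller would still set skb->protocol first and pass its own finish routine:

	/* Hypothetical helper: segment a GSO skb, feed each segment to
	 * the per-family finish routine, and release the remaining
	 * segments as soon as one of them fails.
	 */
	static int xfrm_output_gso(struct sk_buff *skb,
				   int (*finish)(struct sk_buff *))
	{
		struct sk_buff *segs = skb_gso_segment(skb, 0);

		kfree_skb(skb);		/* the segments hold their own references */
		if (unlikely(IS_ERR(segs)))
			return PTR_ERR(segs);

		do {
			struct sk_buff *nskb = segs->next;
			int err;

			segs->next = NULL;
			err = finish(segs);
			if (unlikely(err)) {
				while ((segs = nskb) != NULL) {
					nskb = segs->next;
					kfree_skb(segs);
				}
				return err;
			}

			segs = nskb;
		} while (segs);

		return 0;
	}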
@@ -94,7 +94,7 @@ error_nolock: goto out_exit; } -static int xfrm6_output_finish(struct sk_buff *skb) +static int xfrm6_output_finish2(struct sk_buff *skb) { int err; @@ -110,7 +110,7 @@ static int xfrm6_output_finish(struct sk_buff *skb) return dst_output(skb); err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL, - skb->dst->dev, xfrm6_output_finish); + skb->dst->dev, xfrm6_output_finish2); if (unlikely(err != 1)) break; } @@ -118,6 +118,41 @@ static int xfrm6_output_finish(struct sk_buff *skb) return err; } +static int xfrm6_output_finish(struct sk_buff *skb) +{ + struct sk_buff *segs; + + if (!skb_shinfo(skb)->gso_size) + return xfrm6_output_finish2(skb); + + skb->protocol = htons(ETH_P_IPV6); + segs = skb_gso_segment(skb, 0); + kfree_skb(skb); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = xfrm6_output_finish2(segs); + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return err; + } + + segs = nskb; + } while (segs); + + return 0; +} + int xfrm6_output(struct sk_buff *skb) { return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev, -- cgit v1.2.3-70-g09d2 From c8a553ad7f0bf943047943a758cf07017819cb3c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 22 Jun 2006 14:28:09 -0700 Subject: [TCP]: Move inclusion of <linux/dmaengine.h> to correct place in <linux/tcp.h> The new <linux/dmaengine.h> header shouldn't be included from the !__KERNEL__ portion of tcp.h Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 420a689c3fb..8ebf497907f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -18,7 +18,6 @@ #define _LINUX_TCP_H #include -#include <linux/dmaengine.h> #include struct tcphdr { @@ -161,6 +160,7 @@ struct tcp_info #ifdef __KERNEL__ #include +#include <linux/dmaengine.h> #include #include #include -- cgit v1.2.3-70-g09d2 From f4b8ea7849544114e9d3d682df4d400180854677 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 22 Jun 2006 16:00:11 -0700 Subject: [NET]: fix net-core kernel-doc Warning(/var/linsrc/linux-2617-g4//include/linux/skbuff.h:304): No description found for parameter 'dma_cookie' Warning(/var/linsrc/linux-2617-g4//include/net/sock.h:1274): No description found for parameter 'copied_early' Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'chan' Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'event' Signed-off-by: Randy Dunlap Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 2 ++ include/net/sock.h | 1 + net/core/dev.c | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a45bba9b8cb..16eef03ce0e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -215,6 +215,8 @@ enum { * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @tc_index: Traffic control index * @tc_verd: traffic control verdict + * @dma_cookie: a cookie to one of several possible DMA operations + * done by skb DMA functions * @secmark: security marking */ diff --git a/include/net/sock.h b/include/net/sock.h index a897f05de3b..2d8d6adf161 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1269,6 +1269,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat + * @copied_early: flag indicating whether DMA operations copied this data early * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. diff --git a/net/core/dev.c b/net/core/dev.c index d293e0f90a0..9b8f0f22c81 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3418,8 +3418,8 @@ static void net_dma_rebalance(void) /** * netdev_dma_event - event callback for the net_dma_client * @client: should always be net_dma_client - * @chan: - * @event: + * @chan: DMA channel for the event + * @event: event type */ static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, enum dma_event event) -- cgit v1.2.3-70-g09d2 From ca6bb5d7ab22ac79f608fe6cbc6b12de6a5a19f0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 22 Jun 2006 16:07:52 -0700 Subject: [NET]: Require CAP_NET_ADMIN to create tuntap devices. The tuntap driver allows an admin to create persistent devices and assign ownership of them to individual users. Unfortunately, relaxing the permissions on the /dev/net/tun device node so that they can actually use those devices will _also_ allow those users to create arbitrary new devices of their own. This patch corrects that, and adjusts the recommended permissions for the device node accordingly. Signed-off-By: David Woodhouse Signed-off-by: David S. Miller --- Documentation/networking/tuntap.txt | 11 +++++++---- drivers/net/tun.c | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/tuntap.txt b/Documentation/networking/tuntap.txt index 76750fb9151..839cbb71388 100644 --- a/Documentation/networking/tuntap.txt +++ b/Documentation/networking/tuntap.txt @@ -39,10 +39,13 @@ Copyright (C) 1999-2000 Maxim Krasnyansky mknod /dev/net/tun c 10 200 Set permissions: - e.g. chmod 0700 /dev/net/tun - if you want the device only accessible by root. Giving regular users the - right to assign network devices is NOT a good idea. Users could assign - bogus network interfaces to trick firewalls or administrators. + e.g. chmod 0666 /dev/net/tun + There's no harm in allowing the device to be accessible by non-root users, + since CAP_NET_ADMIN is required for creating network devices or for + connecting to network devices which aren't owned by the user in question. + If you want to create persistent devices and give ownership of them to + unprivileged users, then you need the /dev/net/tun device to be usable by + those users. 
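For example, a helper run once by root can create a persistent device and hand it to an unprivileged account. The sketch below uses placeholder values ("tap0", uid 1000):

	/* Sketch: create a persistent tap device owned by uid 1000.
	 * TUNSETIFF requires CAP_NET_ADMIN, so run this as root; the
	 * owner can then attach to tap0 without any capabilities.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	int main(void)
	{
		struct ifreq ifr;
		int fd = open("/dev/net/tun", O_RDWR);

		if (fd < 0) {
			perror("/dev/net/tun");
			return 1;
		}

		memset(&ifr, 0, sizeof(ifr));
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
		strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);

		if (ioctl(fd, TUNSETIFF, &ifr) < 0 ||	/* create the device */
		    ioctl(fd, TUNSETOWNER, 1000) < 0 ||	/* uid allowed to attach */
		    ioctl(fd, TUNSETPERSIST, 1) < 0) {	/* keep it after close */
			perror("ioctl");
			return 1;
		}

		return 0;
	}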
Driver module autoloading diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a1ed2d98374..6c62d5c8826 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -490,6 +490,9 @@ static int tun_set_iff(struct file *file, struct ifreq *ifr) err = -EINVAL; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ -- cgit v1.2.3-70-g09d2
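Returning to the GSO toggle added earlier in this series: the new ETHTOOL_GGSO/ETHTOOL_SGSO commands travel through the usual SIOCETHTOOL ioctl. A minimal userspace sketch, with the interface name as a placeholder (setting the flag requires CAP_NET_ADMIN, like other ethtool set operations):

	/* Sketch: enable software GSO on eth0 via ETHTOOL_SGSO. */
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <linux/ethtool.h>
	#include <linux/if.h>
	#include <linux/sockios.h>

	int main(void)
	{
		struct ethtool_value eval = { .cmd = ETHTOOL_SGSO, .data = 1 };
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0) {
			perror("socket");
			return 1;
		}

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
		ifr.ifr_data = (void *)&eval;

		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
			perror("ETHTOOL_SGSO");
			return 1;
		}

		return 0;
	}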