From def8b4faff5ca349beafbbfeb2c51f3602a6ef3a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 28 Oct 2008 13:24:06 -0700 Subject: net: reduce structures when XFRM=n ifdef out * struct sk_buff::sp (pointer) * struct dst_entry::xfrm (pointer) * struct sock::sk_policy (2 pointers) Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2725f4e5a9b..487e34507b4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -269,8 +269,9 @@ struct sk_buff { struct dst_entry *dst; struct rtable *rtable; }; +#ifdef CONFIG_XFRM struct sec_path *sp; - +#endif /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you @@ -1864,6 +1865,18 @@ static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_bu to->queue_mapping = from->queue_mapping; } +#ifdef CONFIG_XFRM +static inline struct sec_path *skb_sec_path(struct sk_buff *skb) +{ + return skb->sp; +} +#else +static inline struct sec_path *skb_sec_path(struct sk_buff *skb) +{ + return NULL; +} +#endif + static inline int skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; -- cgit v1.2.3-70-g09d2 From 8b30b1fe368ab03049435884c11c5c50e4c4ef0b Mon Sep 17 00:00:00 2001 From: Sujith Date: Fri, 24 Oct 2008 09:55:27 +0530 Subject: mac80211: Re-enable aggregation Wireless HW without any dedicated queues for aggregation do not need the ampdu_queues mechanism present right now in mac80211. Since mac80211 is still incomplete wrt TX MQ changes, do not allow aggregation sessions for drivers that set ampdu_queues. This is only an interim hack until Intel fixes the requeue issue. Signed-off-by: Sujith Signed-off-by: Luis Rodriguez Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath9k/main.c | 6 ++-- drivers/net/wireless/iwlwifi/iwl-core.c | 3 +- include/linux/skbuff.h | 4 +++ include/net/mac80211.h | 8 ++--- net/core/skbuff.c | 1 + net/mac80211/ht.c | 60 +++++++++++++++++++-------------- net/mac80211/main.c | 7 ++-- net/mac80211/rx.c | 7 ++-- net/mac80211/tx.c | 19 ++++++++--- net/mac80211/wme.c | 24 ++++++------- 10 files changed, 76 insertions(+), 63 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/drivers/net/wireless/ath9k/main.c b/drivers/net/wireless/ath9k/main.c index 795fed5cadf..f6dc4c82604 100644 --- a/drivers/net/wireless/ath9k/main.c +++ b/drivers/net/wireless/ath9k/main.c @@ -953,10 +953,7 @@ static int ath_attach(u16 devid, &sc->sbands[IEEE80211_BAND_5GHZ]; } - /* FIXME: Have to figure out proper hw init values later */ - hw->queues = 4; - hw->ampdu_queues = 1; /* Register rate control */ hw->rate_control_algorithm = "ath9k_rate_control"; @@ -1745,7 +1742,8 @@ static int ath_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) hw->flags = IEEE80211_HW_RX_INCLUDES_FCS | IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING | IEEE80211_HW_SIGNAL_DBM | - IEEE80211_HW_NOISE_DBM; + IEEE80211_HW_NOISE_DBM | + IEEE80211_HW_AMPDU_AGGREGATION; hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_AP) | diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 20c7ff38291..ba05f5ddc6d 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -871,7 +871,8 @@ int iwl_setup_mac(struct iwl_priv *priv) /* Tell mac80211 our characteristics */ hw->flags = IEEE80211_HW_SIGNAL_DBM | - IEEE80211_HW_NOISE_DBM; + IEEE80211_HW_NOISE_DBM | + IEEE80211_HW_AMPDU_AGGREGATION; hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_STATION) | diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 487e34507b4..a01b6f84e3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -250,6 
+250,9 @@ typedef unsigned char *sk_buff_data_t; * @tc_verd: traffic control verdict * @ndisc_nodetype: router type (from link layer) * @do_not_encrypt: set to prevent encryption of this frame + * @requeue: set to indicate that the wireless core should attempt + * a software retry on this frame if we failed to + * receive an ACK for it * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions * @secmark: security marking @@ -326,6 +329,7 @@ struct sk_buff { #endif #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) __u8 do_not_encrypt:1; + __u8 requeue:1; #endif /* 0/13/14 bit hole */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 16c895969e6..bba96a20388 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -242,7 +242,6 @@ struct ieee80211_bss_conf { * @IEEE80211_TX_CTL_RATE_CTRL_PROBE: internal to mac80211, can be * set by rate control algorithms to indicate probe rate, will * be cleared for fragmented frames (except on the last fragment) - * @IEEE80211_TX_CTL_REQUEUE: REMOVE THIS */ enum mac80211_tx_control_flags { IEEE80211_TX_CTL_REQ_TX_STATUS = BIT(0), @@ -258,9 +257,6 @@ enum mac80211_tx_control_flags { IEEE80211_TX_STAT_AMPDU = BIT(10), IEEE80211_TX_STAT_AMPDU_NO_BACK = BIT(11), IEEE80211_TX_CTL_RATE_CTRL_PROBE = BIT(12), - - /* XXX: remove this */ - IEEE80211_TX_CTL_REQUEUE = BIT(13), }; enum mac80211_rate_control_flags { @@ -847,6 +843,9 @@ enum ieee80211_tkip_key_type { * @IEEE80211_HW_SPECTRUM_MGMT: * Hardware supports spectrum management defined in 802.11h * Measurement, Channel Switch, Quieting, TPC + * + * @IEEE80211_HW_AMPDU_AGGREGATION: + * Hardware supports 11n A-MPDU aggregation. 
*/ enum ieee80211_hw_flags { IEEE80211_HW_RX_INCLUDES_FCS = 1<<1, @@ -858,6 +857,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SIGNAL_DBM = 1<<7, IEEE80211_HW_NOISE_DBM = 1<<8, IEEE80211_HW_SPECTRUM_MGMT = 1<<9, + IEEE80211_HW_AMPDU_AGGREGATION = 1<<10, }; /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cdfe473181a..c4c8a33f341 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -544,6 +544,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(truesize); #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) C(do_not_encrypt); + C(requeue); #endif atomic_set(&n->users, 1); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 42c3e590df9..08009d4b7d6 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -458,7 +458,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) u8 *state; int ret; - if (tid >= STA_TID_NUM) + if ((tid >= STA_TID_NUM) || !(hw->flags & IEEE80211_HW_AMPDU_AGGREGATION)) return -EINVAL; #ifdef CONFIG_MAC80211_HT_DEBUG @@ -515,17 +515,19 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) (unsigned long)&sta->timer_to_tid[tid]; init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); - /* create a new queue for this aggregation */ - ret = ieee80211_ht_agg_queue_add(local, sta, tid); + if (hw->ampdu_queues) { + /* create a new queue for this aggregation */ + ret = ieee80211_ht_agg_queue_add(local, sta, tid); - /* case no queue is available to aggregation - * don't switch to aggregation */ - if (ret) { + /* case no queue is available to aggregation + * don't switch to aggregation */ + if (ret) { #ifdef CONFIG_MAC80211_HT_DEBUG - printk(KERN_DEBUG "BA request denied - queue unavailable for" - " tid %d\n", tid); + printk(KERN_DEBUG "BA request denied - " + "queue unavailable for tid %d\n", tid); #endif /* CONFIG_MAC80211_HT_DEBUG */ - goto err_unlock_queue; + goto err_unlock_queue; + } } sdata = sta->sdata; @@ -544,7 +546,8 @@ int 
ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) /* No need to requeue the packets in the agg queue, since we * held the tx lock: no packet could be enqueued to the newly * allocated queue */ - ieee80211_ht_agg_queue_remove(local, sta, tid, 0); + if (hw->ampdu_queues) + ieee80211_ht_agg_queue_remove(local, sta, tid, 0); #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "BA request denied - HW unavailable for" " tid %d\n", tid); @@ -554,7 +557,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) } /* Will put all the packets in the new SW queue */ - ieee80211_requeue(local, ieee802_1d_to_ac[tid]); + if (hw->ampdu_queues) + ieee80211_requeue(local, ieee802_1d_to_ac[tid]); spin_unlock_bh(&sta->lock); /* send an addBA request */ @@ -622,7 +626,8 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw, ra, tid); #endif /* CONFIG_MAC80211_HT_DEBUG */ - ieee80211_stop_queue(hw, sta->tid_to_tx_q[tid]); + if (hw->ampdu_queues) + ieee80211_stop_queue(hw, sta->tid_to_tx_q[tid]); *state = HT_AGG_STATE_REQ_STOP_BA_MSK | (initiator << HT_AGG_STATE_INITIATOR_SHIFT); @@ -635,7 +640,8 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw, if (ret) { WARN_ON(ret != -EBUSY); *state = HT_AGG_STATE_OPERATIONAL; - ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); + if (hw->ampdu_queues) + ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); goto stop_BA_exit; } @@ -691,7 +697,8 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid) #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid); #endif - ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); + if (hw->ampdu_queues) + ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); } spin_unlock_bh(&sta->lock); rcu_read_unlock(); @@ -745,16 +752,18 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid) ieee80211_send_delba(sta->sdata, ra, tid, WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); - agg_queue = 
sta->tid_to_tx_q[tid]; - - ieee80211_ht_agg_queue_remove(local, sta, tid, 1); - - /* We just requeued the all the frames that were in the - * removed queue, and since we might miss a softirq we do - * netif_schedule_queue. ieee80211_wake_queue is not used - * here as this queue is not necessarily stopped - */ - netif_schedule_queue(netdev_get_tx_queue(local->mdev, agg_queue)); + if (hw->ampdu_queues) { + agg_queue = sta->tid_to_tx_q[tid]; + ieee80211_ht_agg_queue_remove(local, sta, tid, 1); + + /* We just requeued the all the frames that were in the + * removed queue, and since we might miss a softirq we do + * netif_schedule_queue. ieee80211_wake_queue is not used + * here as this queue is not necessarily stopped + */ + netif_schedule_queue(netdev_get_tx_queue(local->mdev, + agg_queue)); + } spin_lock_bh(&sta->lock); *state = HT_AGG_STATE_IDLE; sta->ampdu_mlme.addba_req_num[tid] = 0; @@ -1011,7 +1020,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, *state |= HT_ADDBA_RECEIVED_MSK; sta->ampdu_mlme.addba_req_num[tid] = 0; - if (*state == HT_AGG_STATE_OPERATIONAL) + if (*state == HT_AGG_STATE_OPERATIONAL && + local->hw.ampdu_queues) ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); spin_unlock_bh(&sta->lock); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 88c1975a97a..fa0cc7a1e6b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -386,8 +386,6 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - sta->tx_filtered_count++; /* @@ -434,10 +432,9 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, return; } - if (!test_sta_flags(sta, WLAN_STA_PS) && - !(info->flags & IEEE80211_TX_CTL_REQUEUE)) { + if (!test_sta_flags(sta, WLAN_STA_PS) && !skb->requeue) { /* Software retry the packet once */ - info->flags |= IEEE80211_TX_CTL_REQUEUE; + skb->requeue = 1; 
ieee80211_remove_tx_extra(local, sta->key, skb); dev_queue_xmit(skb); return; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c4c95f1db60..648a1d0e6c8 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -669,7 +669,6 @@ static int ap_sta_ps_end(struct sta_info *sta) struct ieee80211_local *local = sdata->local; struct sk_buff *skb; int sent = 0; - struct ieee80211_tx_info *info; atomic_dec(&sdata->bss->num_sta_ps); @@ -685,13 +684,11 @@ static int ap_sta_ps_end(struct sta_info *sta) /* Send all buffered frames to the station */ while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) { - info = IEEE80211_SKB_CB(skb); sent++; - info->flags |= IEEE80211_TX_CTL_REQUEUE; + skb->requeue = 1; dev_queue_xmit(skb); } while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) { - info = IEEE80211_SKB_CB(skb); local->total_ps_buffered--; sent++; #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG @@ -699,7 +696,7 @@ static int ap_sta_ps_end(struct sta_info *sta) "since STA not sleeping anymore\n", sdata->dev->name, sta->sta.addr, sta->sta.aid); #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */ - info->flags |= IEEE80211_TX_CTL_REQUEUE; + skb->requeue = 1; dev_queue_xmit(skb); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 541e3e64493..d6392af9cd2 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -661,6 +661,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) static ieee80211_tx_result debug_noinline ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; size_t hdrlen, per_fragm, num_fragm, payload_len, left; struct sk_buff **frags, *first, *frag; @@ -677,9 +678,7 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx) * This scenario is handled in __ieee80211_tx_prepare but extra * caution taken here as fragmented ampdu may cause Tx stop. 
*/ - if (WARN_ON(tx->flags & IEEE80211_TX_CTL_AMPDU || - skb_get_queue_mapping(tx->skb) >= - ieee80211_num_regular_queues(&tx->local->hw))) + if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU)) return TX_DROP; first = tx->skb; @@ -951,7 +950,8 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx, struct ieee80211_sub_if_data *sdata; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - int hdrlen; + int hdrlen, tid; + u8 *qc, *state; memset(tx, 0, sizeof(*tx)); tx->skb = skb; @@ -982,6 +982,15 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx, tx->sta = sta_info_get(local, hdr->addr1); + if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) { + qc = ieee80211_get_qos_ctl(hdr); + tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + + state = &tx->sta->ampdu_mlme.tid_state_tx[tid]; + if (*state == HT_AGG_STATE_OPERATIONAL) + info->flags |= IEEE80211_TX_CTL_AMPDU; + } + if (is_multicast_ether_addr(hdr->addr1)) { tx->flags &= ~IEEE80211_TX_UNICAST; info->flags |= IEEE80211_TX_CTL_NO_ACK; @@ -1172,7 +1181,7 @@ retry: * queues, there's no reason for a driver to reject * a frame there, warn and drop it. 
*/ - if (WARN_ON(queue >= ieee80211_num_regular_queues(&local->hw))) + if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU)) goto drop; store = &local->pending_packet[queue]; diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index d27ef7f2d4a..ac71b38f7cb 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -114,8 +114,8 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) { struct ieee80211_master_priv *mpriv = netdev_priv(dev); struct ieee80211_local *local = mpriv->local; + struct ieee80211_hw *hw = &local->hw; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct sta_info *sta; u16 queue; u8 tid; @@ -124,21 +124,19 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) if (unlikely(queue >= local->hw.queues)) queue = local->hw.queues - 1; - if (info->flags & IEEE80211_TX_CTL_REQUEUE) { + if (skb->requeue) { + if (!hw->ampdu_queues) + return queue; + rcu_read_lock(); sta = sta_info_get(local, hdr->addr1); tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; if (sta) { - struct ieee80211_hw *hw = &local->hw; int ampdu_queue = sta->tid_to_tx_q[tid]; if ((ampdu_queue < ieee80211_num_queues(hw)) && - test_bit(ampdu_queue, local->queue_pool)) { + test_bit(ampdu_queue, local->queue_pool)) queue = ampdu_queue; - info->flags |= IEEE80211_TX_CTL_AMPDU; - } else { - info->flags &= ~IEEE80211_TX_CTL_AMPDU; - } } rcu_read_unlock(); @@ -159,20 +157,18 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb) *p++ = ack_policy | tid; *p = 0; + if (!hw->ampdu_queues) + return queue; + rcu_read_lock(); sta = sta_info_get(local, hdr->addr1); if (sta) { int ampdu_queue = sta->tid_to_tx_q[tid]; - struct ieee80211_hw *hw = &local->hw; if ((ampdu_queue < ieee80211_num_queues(hw)) && - test_bit(ampdu_queue, local->queue_pool)) { + test_bit(ampdu_queue, local->queue_pool)) queue = ampdu_queue; - info->flags |= IEEE80211_TX_CTL_AMPDU; - } else { - 
info->flags &= ~IEEE80211_TX_CTL_AMPDU; - } } rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 24 Nov 2008 21:20:15 -0800 Subject: tcp: Try to restore large SKBs while SACK processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During SACK processing, most of the benefits of TSO are eaten by the SACK blocks that one-by-one fragment SKBs to MSS sized chunks. Then we're in problems when cleanup work for them has to be done when a large cumulative ACK comes. Try to return back to pre-split state already while more and more SACK info gets discovered by combining newly discovered SACK areas with the previous skb if that's SACKed as well. This approach has a number of benefits: 1) The processing overhead is spread more equally over the RTT 2) Write queue has less skbs to process (affect everything which has to walk in the queue past the sacked areas) 3) Write queue is consistent whole the time, so no other parts of TCP has to be aware of this (this was not the case with some other approach that was, well, quite intrusive all around). 4) Clean_rtx_queue can release most of the pages using single put_page instead of previous PAGE_SIZE/mss+1 calls In case a hole is fully filled by the new SACK block, we attempt to combine the next skb too which allows construction of skbs that are even larger than what tso split them to and it handles hole per on every nth patterns that often occur during slow start overshoot pretty nicely. Though this to be really useful also a retransmission would have to get lost since cumulative ACKs advance one hole at a time in the most typical case. TODO: handle upwards only merging. That should be rather easy when segment is fully sacked but I'm leaving that as future work item (it won't make very large difference anyway since this current approach already covers quite a lot of normal cases). 
I was earlier thinking of some sophisticated way of tracking timestamps of the first and the last segment but later on realized that it won't be that necessary at all to store the timestamp of the last segment. The cases that can occur are basically either: 1) ambiguous => no sensible measurement can be taken anyway 2) non-ambiguous is due to reordering => having the timestamp of the last segment there is just skewing things more off than does some good since the ack got triggered by one of the holes (besides some subtle issues that would make determining right hole/skb even harder problem). Anyway, it has nothing to do with this change then. I choose to route some abnormal looking cases with goto noop, some could be handled differently (eg., by stopping the walking at that skb but again). In general, they either shouldn't happen at all or are rare enough to make no difference in practice. In theory this change (as whole) could cause some macroscale regression (global) because of cache misses that are taken over the round-trip time but it gets very likely better because of much less (local) cache misses per other write queue walkers and the big recovery clearing cumulative ack. Worth to note that these benefits would be very easy to get also without TSO/GSO being on as long as the data is in pages so that we can merge them. Currently I won't let that happen because DSACK splitting at fragment that would mess up pcounts due to sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets avoided, we have some conditions that can be made less strict. TODO: I will probably have to convert the excessive pointer passing to struct sacktag_state... :-) My testing revealed that considerable amount of skbs couldn't be shifted because they were cloned (most likely still awaiting tx reclaim)... 
[The rest is considering future work instead since I got repeatably EFAULT to tcpdump's recvfrom when I added pskb_expand_head to deal with clones, so I separated that into another, later patch] ...To counter that, I gave up on the fifth advantage: 5) When growing previous SACK block, less allocs for new skbs are done, basically a new alloc is needed only when new hole is detected and when the previous skb runs out of frags space ...which now only happens of if reclaim is fast enough to dispose the clone before the SACK block comes in (the window is RTT long), otherwise we'll have to alloc some. With clones being handled I got these numbers (will be somewhat worse without that), taken with fine-grained mibs: TCPSackShifted 398 TCPSackMerged 877 TCPSackShiftFallback 320 TCPSACKCOLLAPSEFALLBACKGSO 0 TCPSACKCOLLAPSEFALLBACKSKBBITS 0 TCPSACKCOLLAPSEFALLBACKSKBDATA 0 TCPSACKCOLLAPSEFALLBACKBELOW 0 TCPSACKCOLLAPSEFALLBACKFIRST 1 TCPSACKCOLLAPSEFALLBACKPREVBITS 318 TCPSACKCOLLAPSEFALLBACKMSS 1 TCPSACKCOLLAPSEFALLBACKNOHEAD 0 TCPSACKCOLLAPSEFALLBACKSHIFT 0 TCPSACKCOLLAPSENOOPSEQ 0 TCPSACKCOLLAPSENOOPSMALLPCOUNT 0 TCPSACKCOLLAPSENOOPSMALLLEN 0 TCPSACKCOLLAPSEHOLE 12 Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/skbuff.h | 33 +++++++ include/net/tcp.h | 5 + net/core/skbuff.c | 140 +++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 427 insertions(+), 7 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a01b6f84e3b..acf17af45af 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -492,6 +492,19 @@ static inline bool skb_queue_is_last(const struct sk_buff_head *list, return (skb->next == (struct sk_buff *) list); } +/** + * skb_queue_is_first - check if skb is the first entry in the queue + * @list: queue head + * @skb: buffer + * + * Returns true if @skb is the first buffer on the list. 
+ */ +static inline bool skb_queue_is_first(const struct sk_buff_head *list, + const struct sk_buff *skb) +{ + return (skb->prev == (struct sk_buff *) list); +} + /** * skb_queue_next - return the next packet in the queue * @list: queue head @@ -510,6 +523,24 @@ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, return skb->next; } +/** + * skb_queue_prev - return the prev packet in the queue + * @list: queue head + * @skb: current buffer + * + * Return the prev packet in @list before @skb. It is only valid to + * call this if skb_queue_is_first() evaluates to false. + */ +static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, + const struct sk_buff *skb) +{ + /* This BUG_ON may seem severe, but if we just return then we + * are going to dereference garbage. + */ + BUG_ON(skb_queue_is_first(list, skb)); + return skb->prev; +} + /** * skb_get - reference buffer * @skb: buffer to reference @@ -1652,6 +1683,8 @@ extern int skb_splice_bits(struct sk_buff *skb, extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); +extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, + int shiftlen); extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); diff --git a/include/net/tcp.h b/include/net/tcp.h index 90b4c3b4c33..265392470b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1192,6 +1192,11 @@ static inline struct sk_buff *tcp_write_queue_next(struct sock *sk, struct sk_bu return skb_queue_next(&sk->sk_write_queue, skb); } +static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_buff *skb) +{ + return skb_queue_prev(&sk->sk_write_queue, skb); +} + #define tcp_for_write_queue(skb, sk) \ skb_queue_walk(&(sk)->sk_write_queue, skb) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 267185a848f..844b8abeb18 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2018,6 
+2018,146 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_no_header(skb, skb1, len, pos); } +/* Shifting from/to a cloned skb is a no-go. + * + * TODO: handle cloned skbs by using pskb_expand_head() + */ +static int skb_prepare_for_shift(struct sk_buff *skb) +{ + return skb_cloned(skb); +} + +/** + * skb_shift - Shifts paged data partially from skb to another + * @tgt: buffer into which tail data gets added + * @skb: buffer from which the paged data comes from + * @shiftlen: shift up to this many bytes + * + * Attempts to shift up to shiftlen worth of bytes, which may be less than + * the length of the skb, from skb to tgt. Returns number bytes shifted. + * It's up to caller to free skb if everything was shifted. + * + * If @tgt runs out of frags, the whole operation is aborted. + * + * Skb cannot include anything else but paged data while tgt is allowed + * to have non-paged data as well. + * + * TODO: full sized shift could be optimized but that would need + * specialized skb free'er to handle frags without up-to-date nr_frags. 
+ */ +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) +{ + int from, to, merge, todo; + struct skb_frag_struct *fragfrom, *fragto; + + BUG_ON(shiftlen > skb->len); + BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + todo = shiftlen; + from = 0; + to = skb_shinfo(tgt)->nr_frags; + fragfrom = &skb_shinfo(skb)->frags[from]; + + /* Actual merge is delayed until the point when we know we can + * commit all, so that we don't have to undo partial changes + */ + if (!to || + !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) { + merge = -1; + } else { + merge = to - 1; + + todo -= fragfrom->size; + if (todo < 0) { + if (skb_prepare_for_shift(skb) || + skb_prepare_for_shift(tgt)) + return 0; + + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += shiftlen; + fragfrom->size -= shiftlen; + fragfrom->page_offset += shiftlen; + + goto onlymerged; + } + + from++; + } + + /* Skip full, not-fitting skb to avoid expensive operations */ + if ((shiftlen == skb->len) && + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) + return 0; + + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) + return 0; + + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { + if (to == MAX_SKB_FRAGS) + return 0; + + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[to]; + + if (todo >= fragfrom->size) { + *fragto = *fragfrom; + todo -= fragfrom->size; + from++; + to++; + + } else { + get_page(fragfrom->page); + fragto->page = fragfrom->page; + fragto->page_offset = fragfrom->page_offset; + fragto->size = todo; + + fragfrom->page_offset += todo; + fragfrom->size -= todo; + todo = 0; + + to++; + break; + } + } + + /* Ready to "commit" this state change to tgt */ + skb_shinfo(tgt)->nr_frags = to; + + if (merge >= 0) { + fragfrom = &skb_shinfo(skb)->frags[0]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + fragto->size += fragfrom->size; + put_page(fragfrom->page); + } + + /* Reposition in the original 
skb */ + to = 0; + while (from < skb_shinfo(skb)->nr_frags) + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; + skb_shinfo(skb)->nr_frags = to; + + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); + +onlymerged: + /* Most likely the tgt won't ever need its checksum anymore, skb on + * the other hand might need it if it needs to be resent + */ + tgt->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + + /* Yak, is it really working this way? Some helper please? */ + skb->len -= shiftlen; + skb->data_len -= shiftlen; + skb->truesize -= shiftlen; + tgt->len += shiftlen; + tgt->data_len += shiftlen; + tgt->truesize += shiftlen; + + return shiftlen; +} + /** * skb_prepare_seq_read - Prepare a sequential read of skb data * @skb: the buffer to read diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3c8e297e2c3..97d57676b8e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). + * + * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) @@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; - - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - tcp_advance_highest_sack(sk, skb); } /* D-SACK. 
We can detect redundant retransmission in S|R and plain R @@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, return flag; } +static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, unsigned int pcount, + int shifted, int fack_count, int *reord, + int *flag, int mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 dummy_sacked = TCP_SKB_CB(skb)->sacked; /* We discard results */ + + BUG_ON(!pcount); + + TCP_SKB_CB(prev)->end_seq += shifted; + TCP_SKB_CB(skb)->seq += shifted; + + skb_shinfo(prev)->gso_segs += pcount; + BUG_ON(skb_shinfo(skb)->gso_segs < pcount); + skb_shinfo(skb)->gso_segs -= pcount; + + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep + * setting gso_size to something. + */ + if (!skb_shinfo(prev)->gso_size) { + skb_shinfo(prev)->gso_size = mss; + skb_shinfo(prev)->gso_type = sk->sk_gso_type; + } + + /* CHECKME: To clear or not to clear? Mimics normal skb currently */ + if (skb_shinfo(skb)->gso_segs <= 1) { + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + } + + *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked, + pcount); + + /* Difference in this won't matter, both ACKed by the same cumul. 
ACK */ + TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); + + tcp_clear_all_retrans_hints(tp); + + if (skb->len > 0) { + BUG_ON(!tcp_skb_pcount(skb)); + return 0; + } + + /* Whole SKB was eaten :-) */ + + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; + if (skb == tcp_highest_sack(sk)) + tcp_advance_highest_sack(sk, skb); + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + + return 1; +} + +/* I wish gso_size would have a bit more sane initialization than + * something-or-zero which complicates things + */ +static int tcp_shift_mss(struct sk_buff *skb) +{ + int mss = tcp_skb_mss(skb); + + if (!mss) + mss = skb->len; + + return mss; +} + +/* Shifting pages past head area doesn't work */ +static int skb_can_shift(struct sk_buff *skb) +{ + return !skb_headlen(skb) && skb_is_nonlinear(skb); +} + +/* Try collapsing SACK blocks spanning across multiple skbs to a single + * skb. + */ +static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, + u32 start_seq, u32 end_seq, + int dup_sack, int *fack_count, + int *reord, int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *prev; + int mss; + int pcount = 0; + int len; + int in_sack; + + if (!sk_can_gso(sk)) + goto fallback; + + /* Normally R but no L won't result in plain S */ + if (!dup_sack && + (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS) + goto fallback; + if (!skb_can_shift(skb)) + goto fallback; + /* This frame is about to be dropped (was ACKed). 
*/ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + goto fallback; + + /* Can only happen with delayed DSACK + discard craziness */ + if (unlikely(skb == tcp_write_queue_head(sk))) + goto fallback; + prev = tcp_write_queue_prev(sk, skb); + + if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) + goto fallback; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + if (in_sack) { + len = skb->len; + pcount = tcp_skb_pcount(skb); + mss = tcp_shift_mss(skb); + + /* TODO: Fix DSACKs to not fragment already SACKed and we can + * drop this restriction as unnecessary + */ + if (mss != tcp_shift_mss(prev)) + goto fallback; + } else { + if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) + goto noop; + /* CHECKME: This is non-MSS split case only?, this will + * cause skipped skbs due to advancing loop btw, original + * has that feature too + */ + if (tcp_skb_pcount(skb) <= 1) + goto noop; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); + if (!in_sack) { + /* TODO: head merge to next could be attempted here + * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), + * though it might not be worth of the additional hassle + * + * ...we can probably just fallback to what was done + * previously. We could try merging non-SACKed ones + * as well but it probably isn't going to buy off + * because later SACKs might again split them, and + * it would make skb timestamp tracking considerably + * harder problem. + */ + goto fallback; + } + + len = end_seq - TCP_SKB_CB(skb)->seq; + BUG_ON(len < 0); + BUG_ON(len > skb->len); + + /* MSS boundaries should be honoured or else pcount will + * severely break even though it makes things bit trickier. 
+ * Optimize common case to avoid most of the divides + */ + mss = tcp_skb_mss(skb); + + /* TODO: Fix DSACKs to not fragment already SACKed and we can + * drop this restriction as unnecessary + */ + if (mss != tcp_shift_mss(prev)) + goto fallback; + + if (len == mss) { + pcount = 1; + } else if (len < mss) { + goto noop; + } else { + pcount = len / mss; + len = pcount * mss; + } + } + + if (!skb_shift(prev, skb, len)) + goto fallback; + if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord, + flag, mss)) + goto out; + + /* Hole filled allows collapsing with the next as well, this is very + * useful when hole on every nth skb pattern happens + */ + if (prev == tcp_write_queue_tail(sk)) + goto out; + skb = tcp_write_queue_next(sk, prev); + + if (!skb_can_shift(skb)) + goto out; + if (skb == tcp_send_head(sk)) + goto out; + if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) + goto out; + + len = skb->len; + if (skb_shift(prev, skb, len)) { + pcount += tcp_skb_pcount(skb); + tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len, + *fack_count, reord, flag, mss); + } + +out: + *fack_count += pcount; + return prev; + +noop: + return skb; + +fallback: + return NULL; +} + static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, u32 start_seq, u32 end_seq, int dup_sack_in, int *fack_count, int *reord, int *flag) { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *tmp; + tcp_for_write_queue_from(skb, sk) { int in_sack = 0; int dup_sack = dup_sack_in; @@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, dup_sack = 1; } - if (in_sack <= 0) - in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, - end_seq); + /* skb reference here is a bit tricky to get right, since + * shifting can eat and free both this skb and the next, + * so not even _safe variant of the loop is enough. 
+ */ + if (in_sack <= 0) { + tmp = tcp_shift_skb_data(sk, skb, start_seq, + end_seq, dup_sack, + fack_count, reord, flag); + if (tmp != NULL) { + if (tmp != skb) { + skb = tmp; + continue; + } + + in_sack = 0; + } else { + in_sack = tcp_match_skb_to_sack(sk, skb, + start_seq, + end_seq); + } + } + if (unlikely(in_sack < 0)) break; - if (in_sack) + if (in_sack) { *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, *fack_count, &(TCP_SKB_CB(skb)->sacked), tcp_skb_pcount(skb)); + if (!before(TCP_SKB_CB(skb)->seq, + tcp_highest_sack_seq(tp))) + tcp_advance_highest_sack(sk, skb); + } + *fack_count += tcp_skb_pcount(skb); } return skb; -- cgit v1.2.3-70-g09d2 From 71d93b39e52e92aea35f1058d957cf12250d0b75 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:42:33 -0800 Subject: net: Add skb_gro_receive This patch adds the helper skb_gro_receive to merge packets for GRO. The current method is to allocate a new header skb and then chain the original packets to its frag_list. This is done to make it easier to integrate into the existing GSO framework. In future as GSO is moved into the drivers, we can undo this and simply chain the original packets together. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 ++ net/core/skbuff.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acf17af45af..cf2cb50f77d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1687,6 +1687,8 @@ extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); +extern int skb_gro_receive(struct sk_buff **head, + struct sk_buff *skb); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 18e224af05a..b8d0abb2643 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2582,6 +2582,65 @@ err: EXPORT_SYMBOL_GPL(skb_segment); +int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff *p = *head; + struct sk_buff *nskb; + unsigned int headroom; + unsigned int hlen = p->data - skb_mac_header(p); + + if (hlen + p->len + skb->len >= 65536) + return -E2BIG; + + if (skb_shinfo(p)->frag_list) + goto merge; + + headroom = skb_headroom(p); + nskb = netdev_alloc_skb(p->dev, headroom); + if (unlikely(!nskb)) + return -ENOMEM; + + __copy_skb_header(nskb, p); + nskb->mac_len = p->mac_len; + + skb_reserve(nskb, headroom); + + skb_set_mac_header(nskb, -hlen); + skb_set_network_header(nskb, skb_network_offset(p)); + skb_set_transport_header(nskb, skb_transport_offset(p)); + + memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + + *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); + skb_shinfo(nskb)->frag_list = p; + skb_header_release(p); + nskb->prev = p; + + nskb->data_len += p->len; + nskb->truesize += p->len; + nskb->len += p->len; + + *head = nskb; + nskb->next = p->next; + p->next = NULL; + + p = nskb; + +merge: + NAPI_GRO_CB(p)->count++; + p->prev->next = skb; + p->prev = skb; + skb_header_release(skb); + + 
p->data_len += skb->len; + p->truesize += skb->len; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive); + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", -- cgit v1.2.3-70-g09d2