From 712a72213fad36cc9e6ec706b5e020d7eb6e03bc Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Tue, 12 Aug 2014 14:53:16 -0400
Subject: net-timestamp: fix missing ACK timestamp

ACK timestamps are generated in tcp_clean_rtx_queue. The TSO datapath
can break out early, causing the timestamp code to be skipped. Move
the code up before the break.

Reported-by: David S. Miller

Also fix a boundary condition: tp->snd_una is the next unacknowledged
byte and between() tests inclusively (a <= b <= c), so generate an ACK
timestamp if (prior_snd_una <= tskey <= tp->snd_una - 1).

Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a3d47af0190..1a8e89fdd33 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3050,10 +3050,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	first_ackt.v64 = 0;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
 
+		if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+		    between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
 		/* Determine how many packets and what bytes were acked, tso and else */
 		if (after(scb->end_seq, tp->snd_una)) {
 			if (tcp_skb_pcount(skb) == 1 ||
@@ -3107,11 +3112,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 				tp->retrans_stamp = 0;
 		}
 
-		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
-		    between(skb_shinfo(skb)->tskey, prior_snd_una,
-			    tp->snd_una + 1))
-			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
-
 		if (!fully_acked)
 			break;
 
-- cgit v1.2.3-70-g09d2

From a26552afe89438eefbe097512b3187f2c7e929fd Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Thu, 14 Aug 2014 22:06:12 +0200
Subject: tcp: don't allow syn packets without timestamps to pass tcp_tw_recycle logic

tcp_tw_recycle heavily relies on tcp timestamps to build a per-host
ordering of incoming connections and teardowns without the need to
hold state on a specific quadruple for TCP_TIMEWAIT_LEN, but only for
the last measured RTO. To do so, we keep the last seen timestamp in a
per-host indexed data structure and verify if the incoming timestamp
in a connection request is strictly greater than the saved one during
the last connection teardown. Thus we can verify later on that no old
data packets will be accepted by the new connection.

During the move of a socket to time-wait state we already verify if
timestamps were seen on the connection. Only if that was the case do
we let the time-wait socket expire after the RTO; otherwise the normal
TCP_TIMEWAIT_LEN is used. But we don't verify this on incoming SYN
packets. If a connection teardown was less than TCP_PAWS_MSL seconds
in the past, we cannot guarantee that we will not accept data packets
from an old connection if no timestamps are present. We should drop
this SYN packet. This patch closes this loophole.

Please note, this patch does not make tcp_tw_recycle in any way more
usable; it only adds another safety check: sporadic drops of SYN
packets because of reordering in the network or in the socket backlog
queues can happen.
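Restated as a standalone predicate (a simplified sketch, not the kernel
function itself; the constants mirror TCP_PAWS_MSL and TCP_PAWS_WINDOW),
the strengthened tcp_peer_is_proven() check in the diff below amounts to:

#include <stdbool.h>
#include <stdint.h>

#define PAWS_MSL	60	/* seconds, stands in for TCP_PAWS_MSL    */
#define PAWS_WINDOW	 1	/* ticks,   stands in for TCP_PAWS_WINDOW */

/* A SYN is only "proven" safe if the remembered per-host state has aged
 * out, or the SYN carries a timestamp that is not older than the one
 * saved at the last teardown.  A SYN without timestamps inside the
 * window is the loophole this patch closes.
 */
bool syn_is_proven(bool peer_known, uint32_t now_secs, uint32_t last_seen_secs,
		   bool syn_has_tstamp, uint32_t saved_ts, uint32_t syn_ts)
{
	if (!peer_known || now_secs - last_seen_secs >= PAWS_MSL)
		return true;		/* no recent state to contradict */
	if (!syn_has_tstamp)
		return false;		/* cannot prove ordering: drop   */
	return (int32_t)(saved_ts - syn_ts) <= PAWS_WINDOW;
}

In other words, inside the PAWS window a SYN can only be proven safe by
a timestamp; without one, the connection request has to be dropped.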
Users behind NAT trying to connect to a tcp_tw_recycle-enabled server
can get caught in blackholes and their connection requests may
regularly get dropped, because hosts behind an address translator
don't have synchronized tcp timestamp clocks. tcp_tw_recycle cannot
work if peers don't have tcp timestamps enabled. In general, use of
tcp_tw_recycle is not advised.

Cc: Eric Dumazet
Cc: Florian Westphal
Signed-off-by: Hannes Frederic Sowa
Signed-off-by: David S. Miller
---
 include/net/tcp.h      | 2 +-
 net/ipv4/tcp_input.c   | 9 ++++++---
 net/ipv4/tcp_metrics.c | 6 ++++--
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e337e05035b..590e01a476a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -417,7 +417,7 @@ void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
-			bool paws_check);
+			bool paws_check, bool timestamps);
 bool tcp_remember_stamp(struct sock *sk);
 bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
 void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1a8e89fdd33..4f6cfbc5777 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5979,12 +5979,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		 * timewait bucket, so that all the necessary checks
 		 * are made in the function processing timewait state.
 		 */
-		if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+		if (tcp_death_row.sysctl_tw_recycle) {
 			bool strict;
 
 			dst = af_ops->route_req(sk, &fl, req, &strict);
+
 			if (dst && strict &&
-			    !tcp_peer_is_proven(req, dst, true)) {
+			    !tcp_peer_is_proven(req, dst, true,
+						tmp_opt.saw_tstamp)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -5993,7 +5995,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		else if (!sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
-			 !tcp_peer_is_proven(req, dst, false)) {
+			 !tcp_peer_is_proven(req, dst, false,
+					     tmp_opt.saw_tstamp)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0d54e59b9ea..ed9c9a91851 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -576,7 +576,8 @@ reset:
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
+			bool paws_check, bool timestamps)
 {
 	struct tcp_metrics_block *tm;
 	bool ret;
@@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa
 	if (paws_check) {
 		if (tm &&
 		    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
-		    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+		    ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
+		     !timestamps))
 			ret = false;
 		else
 			ret = true;
-- cgit v1.2.3-70-g09d2

From 0c9ab09223fe9922baeb22546c9a90d774a4bde6 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Thu, 14 Aug 2014 16:13:07 -0400
Subject: tcp: fix ssthresh and undo for consecutive short FRTO episodes

Fix TCP FRTO logic so that it always notices when snd_una advances,
indicating that any RTO after that point will be a new and distinct
loss episode.
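A minimal standalone model of the decision being fixed (illustrative
only; the real condition in tcp_enter_loss() also depends on the
congestion-avoidance state) is:

#include <stdbool.h>
#include <stdint.h>

struct conn_state {
	uint32_t retransmits;	/* models icsk_retransmits           */
	uint32_t snd_una;	/* models tp->snd_una (wrap ignored) */
};

/* An RTO is treated as the start of a new loss episode only when no
 * retransmission from the previous episode is still being counted.
 */
bool rto_starts_new_episode(const struct conn_state *c)
{
	return c->retransmits == 0;
}

/* The fix: any cumulative ACK that advances snd_una ends the current
 * retransmission sequence, so the counter is cleared centrally here
 * rather than only on some paths inside tcp_process_loss().
 */
void on_cumulative_ack(struct conn_state *c, uint32_t ack)
{
	if (ack > c->snd_una) {
		c->snd_una = ack;
		c->retransmits = 0;
	}
}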
Previously there was a very specific sequence that could cause FRTO to fail to notice a new loss episode had started: (1) RTO timer fires, enter FRTO and retransmit packet 1 in write queue (2) receiver ACKs packet 1 (3) FRTO sends 2 more packets (4) RTO timer fires again (should start a new loss episode) The problem was in step (3) above, where tcp_process_loss() returned early (in the spot marked "Step 2.b"), so that it never got to the logic to clear icsk_retransmits. Thus icsk_retransmits stayed non-zero. Thus in step (4) tcp_enter_loss() would see the non-zero icsk_retransmits, decide that this RTO is not a new episode, and decide not to cut ssthresh and remember the current cwnd and ssthresh for undo. There were two main consequences to the bug that we have observed. First, ssthresh was not decreased in step (4). Second, when there was a series of such FRTO (1-4) sequences that happened to be followed by an FRTO undo, we would restore the cwnd and ssthresh from before the entire series started (instead of the cwnd and ssthresh from before the most recent RTO). This could result in cwnd and ssthresh being restored to values much bigger than the proper values. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Fixes: e33099f96d99c ("tcp: implement RFC5682 F-RTO") Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4f6cfbc5777..a906e0200ff 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2687,7 +2687,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) */ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) { - struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2713,12 +2712,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) if (recovered) { /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ - icsk->icsk_retransmits = 0; tcp_try_undo_recovery(sk); return; } - if (flag & FLAG_DATA_ACKED) - icsk->icsk_retransmits = 0; if (tcp_is_reno(tp)) { /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. @@ -3405,8 +3401,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) + if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; + icsk->icsk_retransmits = 0; + } prior_fackets = tp->fackets_out; -- cgit v1.2.3-70-g09d2 From 989e04c5bc3ff77d65e1f0d87bf7904dfa30d41c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 22 Aug 2014 14:15:22 -0700 Subject: tcp: improve undo on timeout Upon timeout, undo (via both timestamps/Eifel and DSACKs) was disabled if any retransmits were still in flight. The concern was perhaps that spurious retransmission sent in a previous recovery episode may trigger DSACKs to falsely undo the current recovery. However, this inadvertently misses undo opportunities (using either TCP timestamps or DSACKs) when timeout occurs during a loss episode, i.e. recurring timeouts or timeout during fast recovery. In these cases some retransmissions will be in flight but we should allow undo. Furthermore, we should only reset undo_marker and undo_retrans upon timeout if we are starting a new recovery episode. 
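The shared helper the patch introduces (quoted from the diff further
down) captures this: undo state is armed only when a new episode
starts, and undo_retrans is sized by the retransmissions still in
flight, so one DSACK is required for each of them:

static inline void tcp_init_undo(struct tcp_sock *tp)
{
	tp->undo_marker = tp->snd_una;
	/* Retransmission still in flight may cause DSACKs later. */
	tp->undo_retrans = tp->retrans_out ? : -1;
}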
Finally, when we do reset our undo state, we now do so in a manner similar to tcp_enter_recovery(), so that we require a DSACK for each of the outstsanding retransmissions. This will achieve the original goal by requiring that we receive the same number of DSACKs as retransmissions. This patch increases the undo events by 50% on Google servers. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp_input.c | 26 +++++++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fa5258f322e..e567f0dbf28 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -276,7 +276,7 @@ struct tcp_sock { u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ - u32 undo_marker; /* tracking retrans started here. */ + u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u32 total_retrans; /* Total retransmits for entire connection */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a906e0200ff..aba4926ca09 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1888,21 +1888,21 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static void tcp_clear_retrans_partial(struct tcp_sock *tp) +void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; - tp->undo_marker = 0; tp->undo_retrans = -1; + tp->fackets_out = 0; + tp->sacked_out = 0; } -void tcp_clear_retrans(struct tcp_sock *tp) +static inline void tcp_init_undo(struct tcp_sock *tp) { - tcp_clear_retrans_partial(tp); - - tp->fackets_out = 0; - tp->sacked_out = 0; + tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ + tp->undo_retrans = tp->retrans_out ? : -1; } /* Enter Loss state. If we detect SACK reneging, forget all SACK information @@ -1925,18 +1925,18 @@ void tcp_enter_loss(struct sock *sk) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); + tcp_init_undo(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_clear_retrans_partial(tp); + tp->retrans_out = 0; + tp->lost_out = 0; if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - tp->undo_marker = tp->snd_una; - skb = tcp_write_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { @@ -1950,9 +1950,6 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2671,8 +2668,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out ? : -1; + tcp_init_undo(tp); if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) -- cgit v1.2.3-70-g09d2 From 04317dafd11dd7b0ec19b85f098414abae6ed5f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2014 15:33:32 -0700 Subject: tcp: introduce TCP_SKB_CB(skb)->tcp_tw_isn TCP_SKB_CB(skb)->when has different meaning in output and input paths. 
In output path, it contains a timestamp. In input path, it contains an ISN, chosen by tcp_timewait_state_process() Lets add a different name to ease code comprehension. Note that 'when' field will disappear in following patch, as skb_mstamp already contains timestamp, the anonymous union will promptly disappear as well. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 7 ++++++- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 590e01a476a..0cd7d2c65dc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -698,7 +698,12 @@ struct tcp_skb_cb { } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 when; /* used to compute rtt's */ + union { + /* used in output path */ + __u32 when; /* used to compute rtt's */ + /* used in input path */ + __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ + }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aba4926ca09..9c8b9f1dcf6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5906,7 +5906,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->when; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; bool want_cookie = false, fastopen; struct flowi fl; struct tcp_fastopen_cookie foc = { .len = -1 }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 487e2a41667..02e6cd29ebf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,7 +1627,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1649988bd1b..a058f411d3a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -232,7 +232,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->when = isn; + TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 29964c3d363..5b3c70ff7a7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -738,7 +738,7 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); - if (!TCP_SKB_CB(skb)->when && + if (!TCP_SKB_CB(skb)->tcp_tw_isn && (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { @@ -1412,7 +1412,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3-70-g09d2 From 
7faee5c0d514162853a343d93e4a0b6bb8bfec21 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2014 15:33:33 -0700 Subject: tcp: remove TCP_SKB_CB(skb)->when After commit 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution"), we no longer need to maintain timestamps in two different fields. TCP_SKB_CB(skb)->when can be removed, as same information sits in skb_mstamp.stamp_jiffies Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++------ net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 39 ++++++++++++++++----------------------- net/ipv4/tcp_timer.c | 7 +++---- 5 files changed, 31 insertions(+), 36 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0cd7d2c65dc..a4201ef216e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,6 +672,12 @@ void tcp_send_window_probe(struct sock *sk); */ #define tcp_time_stamp ((__u32)(jiffies)) +static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) +{ + return skb->skb_mstamp.stamp_jiffies; +} + + #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN 0x01 @@ -698,12 +704,7 @@ struct tcp_skb_cb { } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - union { - /* used in output path */ - __u32 when; /* used to compute rtt's */ - /* used in input path */ - __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ - }; + __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c8b9f1dcf6..f97003ad0af 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2967,7 +2967,8 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + const u32 rto_time_stamp = + tcp_skb_timestamp(skb) + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 02e6cd29ebf..3f9bc3f0bba 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -437,8 +437,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); - remaining = icsk->icsk_rto - min(icsk->icsk_rto, - tcp_time_stamp - TCP_SKB_CB(skb)->when); + remaining = icsk->icsk_rto - + min(icsk->icsk_rto, + tcp_time_stamp - tcp_skb_timestamp(skb)); if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5a7c41fbc6d..3b22dcb7bb5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -550,7 +550,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; + opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -618,7 +618,7 @@ static unsigned int tcp_synack_options(struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = tcp_skb_timestamp(skb); opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -647,7 +647,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { - struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; @@ -666,7 +665,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -886,8 +885,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; - /* Our usage of tstamp should remain private */ - skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -975,7 +972,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + if (likely(err <= 0)) return err; @@ -1149,7 +1149,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Looks stupid, but our code really uses when of * skbs, which it never sent before. --ANK */ - TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1874,8 +1873,8 @@ static int tcp_mtu_probe(struct sock *sk) tcp_init_tso_segs(sk, nskb, nskb->len); /* We're ready to send. If this fails, the probe will - * be resegmented into mss-sized pieces by tcp_write_xmit(). */ - TCP_SKB_CB(nskb)->when = tcp_time_stamp; + * be resegmented into mss-sized pieces by tcp_write_xmit(). + */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. 
*/ @@ -1935,8 +1934,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { - /* "when" is used as a start point for the retransmit timer */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; + /* "skb_mstamp" is used as a start point for the retransmit timer */ + skb_mstamp_get(&skb->skb_mstamp); goto repair; /* Skip network transmission */ } @@ -2000,8 +1999,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2499,7 +2496,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom @@ -2544,7 +2540,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Save stamp of the first retransmit. */ if (!tp->retrans_stamp) - tp->retrans_stamp = TCP_SKB_CB(skb)->when; + tp->retrans_stamp = tcp_skb_timestamp(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). @@ -2752,7 +2748,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -2791,7 +2786,6 @@ int tcp_send_synack(struct sock *sk) TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } - TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2835,10 +2829,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, foc) + sizeof(*th); @@ -3086,7 +3080,7 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); @@ -3194,7 +3188,7 @@ void tcp_send_ack(struct sock *sk) tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; + skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } @@ -3226,7 +3220,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * send it. 
*/ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3270,7 +3264,6 @@ int tcp_write_wakeup(struct sock *sk) tcp_set_skb_tso_segs(sk, skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index df90cd1ce37..a339e7ba05a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -135,10 +135,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - if (unlikely(!tcp_sk(sk)->retrans_stamp)) - start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; - else - start_ts = tcp_sk(sk)->retrans_stamp; + start_ts = tcp_sk(sk)->retrans_stamp; + if (unlikely(!start_ts)) + start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); -- cgit v1.2.3-70-g09d2 From e11ecddf5128011c936cc5360780190cbc901fdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Sep 2014 04:19:51 -0700 Subject: tcp: use TCP_SKB_CB(skb)->tcp_flags in input path Input path of TCP do not currently uses TCP_SKB_CB(skb)->tcp_flags, which is only used in output path. tcp_recvmsg(), looks at tcp_hdr(skb)->syn for every skb found in receive queue, and its unfortunate because this bit is located in a cache line right before the payload. We can simplify TCP by copying tcp flags into TCP_SKB_CB(skb)->tcp_flags. This patch does so, and avoids the cache line miss in tcp_recvmsg() Following patches will - allow a segment with FIN being coalesced in tcp_try_coalesce() - simplify tcp_collapse() by not copying the headers. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 18 ++++++++++-------- net/ipv4/tcp_input.c | 10 +++++----- net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541f26a67ba..070aeff1b13 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1510,9 +1510,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; - if (offset < skb->len || tcp_hdr(skb)->fin) { + if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } @@ -1585,7 +1585,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (offset + 1 != skb->len) continue; } - if (tcp_hdr(skb)->fin) { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { sk_eat_skb(sk, skb, false); ++seq; break; @@ -1722,11 +1722,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; if (offset < skb->len) goto found_ok_skb; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", @@ -1959,7 +1959,7 @@ skip_copy: if (used + offset < skb->len) continue; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); @@ -2160,8 +2160,10 @@ void tcp_close(struct sock *sk, long timeout) * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - tcp_hdr(skb)->fin; + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; data_was_unread += len; __kfree_skb(skb); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f97003ad0af..8f639a4face 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4093,7 +4093,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); } } @@ -4513,7 +4513,7 @@ restart: * - bloated or contains data before "start" or * overlaps to the next one. */ - if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && + if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; @@ -4532,7 +4532,8 @@ restart: /* Decided to skip this, advance start seq. 
*/ start = TCP_SKB_CB(skb)->end_seq; } - if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; while (before(start, end)) { @@ -4579,8 +4580,7 @@ restart: skb = tcp_collapse_one(sk, skb, list); if (!skb || skb == tail || - tcp_hdr(skb)->syn || - tcp_hdr(skb)->fin) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7881b96d2b7..006b045716d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1638,6 +1638,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1835480336a..de51a88bec6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1415,6 +1415,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3-70-g09d2 From e93a0435f809d009919a743fb6e93076faac8aa7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Sep 2014 04:19:52 -0700 Subject: tcp: allow segment with FIN in tcp_try_coalesce() We can allow a segment with FIN to be aggregated, if we take care to add tcp flags, and if skb_try_coalesce() takes care of zero sized skbs. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 ++- net/ipv4/tcp_input.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c8259ac3874..29f7f012149 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3936,7 +3936,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return false; if (len <= skb_tailroom(to)) { - BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); + if (len) + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8f639a4face..228bf0c5ff1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4143,9 +4143,6 @@ static bool tcp_try_coalesce(struct sock *sk, *fragstolen = false; - if (tcp_hdr(from)->fin) - return false; - /* Its possible this segment overlaps with prior segment in queue */ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; @@ -4158,6 +4155,7 @@ static bool tcp_try_coalesce(struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; return true; } -- cgit v1.2.3-70-g09d2 From b3d6cb92fd190d720a01075c4d20cdca896663fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Sep 2014 04:19:53 -0700 Subject: tcp: do not copy headers in tcp_collapse() tcp_collapse() wants to shrink skb so that the overhead is minimal. 
Now we store tcp flags into TCP_SKB_CB(skb)->tcp_flags, we no longer need to keep around full headers. Whole available space is dedicated to the payload. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 228bf0c5ff1..ea92f23ffaf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4535,26 +4535,13 @@ restart: return; while (before(start, end)) { + int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; - unsigned int header = skb_headroom(skb); - int copy = SKB_MAX_ORDER(header, 0); - /* Too big header? This can happen with IPv6. */ - if (copy < 0) - return; - if (end - start < copy) - copy = end - start; - nskb = alloc_skb(copy + header, GFP_ATOMIC); + nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) return; - skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - skb_set_network_header(nskb, (skb_network_header(skb) - - skb->head)); - skb_set_transport_header(nskb, (skb_transport_header(skb) - - skb->head)); - skb_reserve(nskb, header); - memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_queue_before(list, skb, nskb); -- cgit v1.2.3-70-g09d2 From cb93471acc42b71fa3f2e46805020f2b323db64f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2014 03:14:42 -0700 Subject: tcp: do not fake tcp headers in tcp_send_rcvq() Now we no longer rely on having tcp headers for skbs in receive queue, tcp repair do not need to build fake ones. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ea92f23ffaf..02fb66d4a01 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4304,24 +4304,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb = NULL; - struct tcphdr *th; + struct sk_buff *skb; bool fragstolen; if (size == 0) return 0; - skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); + skb = alloc_skb(size, sk->sk_allocation); if (!skb) goto err; - if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - th = (struct tcphdr *)skb_put(skb, sizeof(*th)); - skb_reset_transport_header(skb); - memset(th, 0, sizeof(*th)); - if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) goto err_free; @@ -4329,7 +4324,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { + if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } -- cgit v1.2.3-70-g09d2 From fcdd1cf4dd63aecf86c987d7f4ec7187be5c2fbc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2014 13:19:44 -0700 Subject: tcp: avoid possible arithmetic overflows icsk_rto is a 32bit field, and icsk_backoff can reach 15 by default, or more if some sysctl (eg tcp_retries2) are changed. 
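For illustration, assuming HZ = 1000 (so TCP_RTO_MAX corresponds to
120000 jiffies) and a backoff of 16, the 32-bit shift silently wraps
while a 64-bit shift does not; the exact numbers depend on the
configuration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t rto = 120 * 1000;	/* icsk_rto at TCP_RTO_MAX, HZ = 1000 (assumed) */
	unsigned int backoff = 16;	/* reachable once tcp_retries2 is raised        */

	uint32_t shifted32 = rto << backoff;		/* wraps to 3569352704 */
	uint64_t shifted64 = (uint64_t)rto << backoff;	/* correct: 7864320000 */

	printf("32-bit: %u\n64-bit: %llu\n", shifted32, (unsigned long long)shifted64);
	return 0;
}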
Better use 64bit to perform icsk_rto << icsk_backoff operations As Joe Perches suggested, add a helper for this. Yuchung spotted the tcp_v4_err() case. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 9 +++++++++ net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv4/tcp_output.c | 13 ++++++------- net/ipv4/tcp_timer.c | 4 ++-- 5 files changed, 23 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5fbe6568c3c..848e85cb5c6 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -242,6 +242,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, #endif } +static inline unsigned long +inet_csk_rto_backoff(const struct inet_connection_sock *icsk, + unsigned long max_when) +{ + u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff; + + return (unsigned long)min_t(u64, when, max_when); +} + struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); struct request_sock *inet_csk_search_req(const struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 02fb66d4a01..13f3da4762e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3208,9 +3208,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { + unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + when, TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 006b045716d..3b2e49cb2b6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -430,9 +430,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : - TCP_TIMEOUT_INIT) << icsk->icsk_backoff; - tcp_bound_rto(sk); + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : + TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_write_queue_head(sk); BUG_ON(!skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7f1280dcad5..8c61a7c0c88 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3279,6 +3279,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long probe_max; int err; err = tcp_write_wakeup(sk); @@ -3294,9 +3295,7 @@ void tcp_send_probe0(struct sock *sk) if (icsk->icsk_backoff < sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + probe_max = TCP_RTO_MAX; } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember icsk_probes_out. 
@@ -3306,11 +3305,11 @@ void tcp_send_probe0(struct sock *sk) */ if (!icsk->icsk_probes_out) icsk->icsk_probes_out = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, - TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + probe_max = TCP_RESOURCE_PROBE_INTERVAL; } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + inet_csk_rto_backoff(icsk, probe_max), + TCP_RTO_MAX); } int tcp_rtx_synack(struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a339e7ba05a..b24360f6e29 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -180,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = (icsk->icsk_rto < TCP_RTO_MAX); + const int alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -294,7 +294,7 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); + const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); -- cgit v1.2.3-70-g09d2 From bd1e75abf4b3c666f61a5cf90c896aa928a735d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2014 08:26:20 -0700 Subject: tcp: add coalescing attempt in tcp_ofo_queue() In order to make TCP more resilient in presence of reorders, we need to allow coalescing to happen when skbs from out of order queue are transferred into receive queue. LRO/GRO can be completely canceled in some pathological cases, like per packet load balancing on aggregated links. I had to move tcp_try_coalesce() up in the file above tcp_ofo_queue() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 89 +++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 42 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 13f3da4762e..f3f016a15c5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4061,6 +4061,44 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } +/** + * tcp_try_coalesce - try to merge skb to prior one + * @sk: socket + * @to: prior buffer + * @from: buffer to add in queue + * @fragstolen: pointer to boolean + * + * Before queueing skb @from after @to, try to merge them + * to reduce overall memory use and queue lengths, if cost is small. + * Packets in ofo or receive queues can stay a long time. + * Better try to coalesce them right now to avoid future collapses. 
+ * Returns true if caller should free @from instead of queueing it + */ +static bool tcp_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + int delta; + + *fragstolen = false; + + /* Its possible this segment overlaps with prior segment in queue */ + if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) + return false; + + if (!skb_try_coalesce(to, from, fragstolen, &delta)) + return false; + + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; + TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; + return true; +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4068,7 +4106,8 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; - struct sk_buff *skb; + struct sk_buff *skb, *tail; + bool fragstolen, eaten; while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) @@ -4081,9 +4120,9 @@ static void tcp_ofo_queue(struct sock *sk) tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -4091,11 +4130,15 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, &tp->out_of_order_queue); - __skb_queue_tail(&sk->sk_receive_queue, skb); + tail = skb_peek_tail(&sk->sk_receive_queue); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (!eaten) + __skb_queue_tail(&sk->sk_receive_queue, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); + if (eaten) + kfree_skb_partial(skb, fragstolen); } } @@ -4122,44 +4165,6 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, return 0; } -/** - * tcp_try_coalesce - try to merge skb to prior one - * @sk: socket - * @to: prior buffer - * @from: buffer to add in queue - * @fragstolen: pointer to boolean - * - * Before queueing skb @from after @to, try to merge them - * to reduce overall memory use and queue lengths, if cost is small. - * Packets in ofo or receive queues can stay a long time. - * Better try to coalesce them right now to avoid future collapses. 
- * Returns true if caller should free @from instead of queueing it - */ -static bool tcp_try_coalesce(struct sock *sk, - struct sk_buff *to, - struct sk_buff *from, - bool *fragstolen) -{ - int delta; - - *fragstolen = false; - - /* Its possible this segment overlaps with prior segment in queue */ - if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) - return false; - - if (!skb_try_coalesce(to, from, fragstolen, &delta)) - return false; - - atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); - TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; - TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; - TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; - return true; -} - static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3-70-g09d2 From 7bced397510ab569d31de4c70b39e13355046387 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 30 Dec 2013 12:37:29 -0800 Subject: net_dma: simple removal Per commit "77873803363c net_dma: mark broken" net_dma is no longer used and there is no plan to fix it. This is the mechanical removal of bits in CONFIG_NET_DMA ifdef guards. Reverting the remainder of the net_dma induced changes is deferred to subsequent patches. Marked for stable due to Roman's report of a memory leak in dma_pin_iovec_pages(): https://lkml.org/lkml/2014/9/3/177 Cc: Dave Jiang Cc: Vinod Koul Cc: David Whipple Cc: Alexander Duyck Cc: Reported-by: Roman Gushchin Acked-by: David S. Miller Signed-off-by: Dan Williams --- Documentation/ABI/removed/net_dma | 8 + Documentation/networking/ip-sysctl.txt | 6 - drivers/dma/Kconfig | 12 -- drivers/dma/Makefile | 1 - drivers/dma/dmaengine.c | 104 ------------ drivers/dma/ioat/dma.c | 1 - drivers/dma/ioat/dma.h | 7 - drivers/dma/ioat/dma_v2.c | 1 - drivers/dma/ioat/dma_v3.c | 1 - drivers/dma/iovlock.c | 280 --------------------------------- include/linux/dmaengine.h | 22 +-- include/linux/skbuff.h | 8 +- include/linux/tcp.h | 8 - include/net/netdma.h | 32 ---- include/net/sock.h | 19 +-- include/net/tcp.h | 8 - kernel/sysctl_binary.c | 1 - net/core/Makefile | 1 - net/core/dev.c | 10 -- net/core/sock.c | 6 - net/core/user_dma.c | 131 --------------- net/dccp/proto.c | 4 +- net/ipv4/sysctl_net_ipv4.c | 9 -- net/ipv4/tcp.c | 147 ++--------------- net/ipv4/tcp_input.c | 61 ------- net/ipv4/tcp_ipv4.c | 18 +-- net/ipv6/tcp_ipv6.c | 13 +- net/llc/af_llc.c | 10 +- 28 files changed, 35 insertions(+), 894 deletions(-) create mode 100644 Documentation/ABI/removed/net_dma delete mode 100644 drivers/dma/iovlock.c delete mode 100644 include/net/netdma.h delete mode 100644 net/core/user_dma.c (limited to 'net/ipv4/tcp_input.c') diff --git a/Documentation/ABI/removed/net_dma b/Documentation/ABI/removed/net_dma new file mode 100644 index 00000000000..a173aecc2f1 --- /dev/null +++ b/Documentation/ABI/removed/net_dma @@ -0,0 +1,8 @@ +What: tcp_dma_copybreak sysctl +Date: Removed in kernel v3.13 +Contact: Dan Williams +Description: + Formerly the lower limit, in bytes, of the size of socket reads + that will be offloaded to a DMA copy engine. Removed due to + coherency issues of the cpu potentially touching the buffers + while dma is in flight. 
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ab42c95f998..ea8f3b182e7 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -582,12 +582,6 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 -tcp_dma_copybreak - INTEGER - Lower limit, in bytes, of the size of socket reads that will be - offloaded to a DMA copy engine, if one is present in the system - and CONFIG_NET_DMA is enabled. - Default: 4096 - tcp_thin_linear_timeouts - BOOLEAN Enable dynamic triggering of linear timeouts for thin streams. If set, a check is performed upon retransmission by timeout to diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 605b016bcea..6b5f37e01a7 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -368,18 +368,6 @@ config DMA_OF comment "DMA Clients" depends on DMA_ENGINE -config NET_DMA - bool "Network: TCP receive copy offload" - depends on DMA_ENGINE && NET - default (INTEL_IOATDMA || FSL_DMA) - depends on BROKEN - help - This enables the use of DMA engines in the network stack to - offload receive copy-to-user operations, freeing CPU cycles. - - Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise - say N. - config ASYNC_TX_DMA bool "Async_tx: Offload support for the async_tx api" depends on DMA_ENGINE diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index a029d0f4a1b..0c9dc754932 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_DMA_VIRTUAL_CHANNELS) += virt-dma.o obj-$(CONFIG_DMA_ACPI) += acpi-dma.o obj-$(CONFIG_DMA_OF) += of-dma.o -obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_MID_DMAC) += intel_mid_dma.o obj-$(CONFIG_DMATEST) += dmatest.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index ed610b49751..268de183b51 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1084,110 +1084,6 @@ dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) } EXPORT_SYMBOL(dmaengine_get_unmap_data); -/** - * dma_async_memcpy_pg_to_pg - offloaded copy from page to page - * @chan: DMA channel to offload copy to - * @dest_pg: destination page - * @dest_off: offset in page to copy to - * @src_pg: source page - * @src_off: offset in page to copy from - * @len: length - * - * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus - * address according to the DMA mapping API rules for streaming mappings. - * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident - * (kernel memory or locked user space pages). 
- */ -dma_cookie_t -dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, - unsigned int dest_off, struct page *src_pg, unsigned int src_off, - size_t len) -{ - struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - struct dmaengine_unmap_data *unmap; - dma_cookie_t cookie; - unsigned long flags; - - unmap = dmaengine_get_unmap_data(dev->dev, 2, GFP_NOWAIT); - if (!unmap) - return -ENOMEM; - - unmap->to_cnt = 1; - unmap->from_cnt = 1; - unmap->addr[0] = dma_map_page(dev->dev, src_pg, src_off, len, - DMA_TO_DEVICE); - unmap->addr[1] = dma_map_page(dev->dev, dest_pg, dest_off, len, - DMA_FROM_DEVICE); - unmap->len = len; - flags = DMA_CTRL_ACK; - tx = dev->device_prep_dma_memcpy(chan, unmap->addr[1], unmap->addr[0], - len, flags); - - if (!tx) { - dmaengine_unmap_put(unmap); - return -ENOMEM; - } - - dma_set_unmap(tx, unmap); - cookie = tx->tx_submit(tx); - dmaengine_unmap_put(unmap); - - preempt_disable(); - __this_cpu_add(chan->local->bytes_transferred, len); - __this_cpu_inc(chan->local->memcpy_count); - preempt_enable(); - - return cookie; -} -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); - -/** - * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses - * @chan: DMA channel to offload copy to - * @dest: destination address (virtual) - * @src: source address (virtual) - * @len: length - * - * Both @dest and @src must be mappable to a bus address according to the - * DMA mapping API rules for streaming mappings. - * Both @dest and @src must stay memory resident (kernel memory or locked - * user space pages). - */ -dma_cookie_t -dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, - void *src, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, virt_to_page(dest), - (unsigned long) dest & ~PAGE_MASK, - virt_to_page(src), - (unsigned long) src & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); - -/** - * dma_async_memcpy_buf_to_pg - offloaded copy from address to page - * @chan: DMA channel to offload copy to - * @page: destination page - * @offset: offset in page to copy to - * @kdata: source address (virtual) - * @len: length - * - * Both @page/@offset and @kdata must be mappable to a bus address according - * to the DMA mapping API rules for streaming mappings. 
- * Both @page/@offset and @kdata must stay memory resident (kernel memory or - * locked user space pages) - */ -dma_cookie_t -dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, - unsigned int offset, void *kdata, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, page, offset, - virt_to_page(kdata), - (unsigned long) kdata & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); - void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, struct dma_chan *chan) { diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index b76c1485933..940c1502a8b 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -1222,7 +1222,6 @@ int ioat1_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(4096); err = ioat_register(device); if (err) return err; diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index e982f00a984..d63f68b1aa3 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -214,13 +214,6 @@ __dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw, #define dump_desc_dbg(c, d) \ ({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; }) -static inline void ioat_set_tcp_copy_break(unsigned long copybreak) -{ - #ifdef CONFIG_NET_DMA - sysctl_tcp_dma_copybreak = copybreak; - #endif -} - static inline struct ioat_chan_common * ioat_chan_by_index(struct ioatdma_device *device, int index) { diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index 2ce9be49860..695483e6be3 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -900,7 +900,6 @@ int ioat2_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(2048); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 85971d6e964..895f869d6c2 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -1655,7 +1655,6 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(262144); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c deleted file mode 100644 index bb48a57c2fc..00000000000 --- a/drivers/dma/iovlock.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. 
- */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include /* for memcpy_toiovec */ -#include -#include - -static int num_pages_spanned(struct iovec *iov) -{ - return - ((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - - ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT); -} - -/* - * Pin down all the iovec pages needed for len bytes. - * Return a struct dma_pinned_list to keep track of pages pinned down. - * - * We are allocating a single chunk of memory, and then carving it up into - * 3 sections, the latter 2 whose size depends on the number of iovecs and the - * total number of pages, respectively. - */ -struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) -{ - struct dma_pinned_list *local_list; - struct page **pages; - int i; - int ret; - int nr_iovecs = 0; - int iovec_len_used = 0; - int iovec_pages_used = 0; - - /* don't pin down non-user-based iovecs */ - if (segment_eq(get_fs(), KERNEL_DS)) - return NULL; - - /* determine how many iovecs/pages there are, up front */ - do { - iovec_len_used += iov[nr_iovecs].iov_len; - iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]); - nr_iovecs++; - } while (iovec_len_used < len); - - /* single kmalloc for pinned list, page_list[], and the page arrays */ - local_list = kmalloc(sizeof(*local_list) - + (nr_iovecs * sizeof (struct dma_page_list)) - + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); - if (!local_list) - goto out; - - /* list of pages starts right after the page list array */ - pages = (struct page **) &local_list->page_list[nr_iovecs]; - - local_list->nr_iovecs = 0; - - for (i = 0; i < nr_iovecs; i++) { - struct dma_page_list *page_list = &local_list->page_list[i]; - - len -= iov[i].iov_len; - - if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) - goto unpin; - - page_list->nr_pages = num_pages_spanned(&iov[i]); - page_list->base_address = iov[i].iov_base; - - page_list->pages = pages; - pages += page_list->nr_pages; - - /* pin pages down */ - down_read(¤t->mm->mmap_sem); - ret = get_user_pages( - current, - current->mm, - (unsigned long) iov[i].iov_base, - page_list->nr_pages, - 1, /* write */ - 0, /* force */ - page_list->pages, - NULL); - up_read(¤t->mm->mmap_sem); - - if (ret != page_list->nr_pages) - goto unpin; - - local_list->nr_iovecs = i + 1; - } - - return local_list; - -unpin: - dma_unpin_iovec_pages(local_list); -out: - return NULL; -} - -void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list) -{ - int i, j; - - if (!pinned_list) - return; - - for (i = 0; i < pinned_list->nr_iovecs; i++) { - struct dma_page_list *page_list = &pinned_list->page_list[i]; - for (j = 0; j < page_list->nr_pages; j++) { - set_page_dirty_lock(page_list->pages[j]); - page_cache_release(page_list->pages[j]); - } - } - - kfree(pinned_list); -} - - -/* - * We have already pinned down the pages we will be using in the iovecs. - * Each entry in iov array has corresponding entry in pinned_list->page_list. - * Using array indexing to keep iov[] and page_list[] in sync. - * Initial elements in iov array's iov->iov_len will be 0 if already copied into - * by another call. - * iov array length remaining guaranteed to be bigger than len. 
- */ -dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - - if (!chan) - return memcpy_toiovec(iov, kdata, len); - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_buf_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - kdata, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - kdata += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} - -dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, struct page *page, - unsigned int offset, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - int err; - - /* this needs as-yet-unimplemented buf-to-buff, so punt. 
*/ - /* TODO: use dma for this */ - if (!chan || !pinned_list) { - u8 *vaddr = kmap(page); - err = memcpy_toiovec(iov, vaddr + offset, len); - kunmap(page); - return err; - } - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_pg_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - page, - offset, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - offset += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index c5c92d59e53..3e382ecd192 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -903,18 +903,6 @@ static inline void dmaengine_put(void) } #endif -#ifdef CONFIG_NET_DMA -#define net_dmaengine_get() dmaengine_get() -#define net_dmaengine_put() dmaengine_put() -#else -static inline void net_dmaengine_get(void) -{ -} -static inline void net_dmaengine_put(void) -{ -} -#endif - #ifdef CONFIG_ASYNC_TX_DMA #define async_dmaengine_get() dmaengine_get() #define async_dmaengine_put() dmaengine_put() @@ -936,16 +924,8 @@ async_dma_find_channel(enum dma_transaction_type type) return NULL; } #endif /* CONFIG_ASYNC_TX_DMA */ - -dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, - void *dest, void *src, size_t len); -dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, - struct page *page, unsigned int offset, void *kdata, size_t len); -dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, - struct page *dest_pg, unsigned int dest_off, struct page *src_pg, - unsigned int src_off, size_t len); void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, - struct dma_chan *chan); + struct dma_chan *chan); static inline void async_tx_ack(struct dma_async_tx_descriptor *tx) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5e1e6f2d98c..bdbf7afad6b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -515,11 +514,8 @@ struct sk_buff { /* 6/8 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL - union { - unsigned int napi_id; - dma_cookie_t dma_cookie; - }; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4ad0706d40e..90895b8dc7f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -169,13 +168,6 @@ struct tcp_sock { struct iovec *iov; int memory; int len; 
-#ifdef CONFIG_NET_DMA - /* members for async copy */ - struct dma_chan *dma_chan; - int wakeup; - struct dma_pinned_list *pinned_list; - dma_cookie_t dma_cookie; -#endif } ucopy; u32 snd_wl1; /* Sequence for window update */ diff --git a/include/net/netdma.h b/include/net/netdma.h deleted file mode 100644 index 8ba8ce284ee..00000000000 --- a/include/net/netdma.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ -#ifndef NETDMA_H -#define NETDMA_H -#ifdef CONFIG_NET_DMA -#include -#include - -int dma_skb_copy_datagram_iovec(struct dma_chan* chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list); - -#endif /* CONFIG_NET_DMA */ -#endif /* NETDMA_H */ diff --git a/include/net/sock.h b/include/net/sock.h index b9586a137ca..3353b47f3d4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -231,7 +231,6 @@ struct cg_proto; * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_write_queue: Packet sending queue - * @sk_async_wait_queue: DMA copied packets * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward @@ -354,10 +353,6 @@ struct sock { struct sk_filter __rcu *sk_filter; struct socket_wq __rcu *sk_wq; -#ifdef CONFIG_NET_DMA - struct sk_buff_head sk_async_wait_queue; -#endif - #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2214,27 +2209,15 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat - * @copied_early: flag indicating whether DMA operations copied this data early * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. 
*/ -#ifdef CONFIG_NET_DMA -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) -{ - __skb_unlink(skb, &sk->sk_receive_queue); - if (!copied_early) - __kfree_skb(skb); - else - __skb_queue_tail(&sk->sk_async_wait_queue, skb); -} -#else -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } -#endif static inline struct net *sock_net(const struct sock *sk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8c4dd63134d..2c2f24ffa38 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -267,7 +266,6 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1023,12 +1021,6 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp) tp->ucopy.len = 0; tp->ucopy.memory = 0; skb_queue_head_init(&tp->ucopy.prequeue); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - tp->ucopy.wakeup = 0; - tp->ucopy.pinned_list = NULL; - tp->ucopy.dma_cookie = 0; -#endif } bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7a..d457005aced 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, diff --git a/net/core/Makefile b/net/core/Makefile index 9628c20acff..5038f1ea034 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o diff --git a/net/core/dev.c b/net/core/dev.c index b1b0c8d4d7d..5e37e9abe8c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1266,7 +1266,6 @@ static int __dev_open(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1342,7 +1341,6 @@ static int __dev_close_many(struct list_head *head) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); } return 0; @@ -4405,14 +4403,6 @@ static void net_rx_action(struct softirq_action *h) out: net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif - return; softnet_break: diff --git a/net/core/sock.c b/net/core/sock.c index c0fc6bdad1e..2f143c3b190 100644 --- a/net/core/sock.c +++ 
b/net/core/sock.c @@ -1452,9 +1452,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -2265,9 +2262,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 100644 index 1b5fefdb819..00000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include -#include - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. - */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c index eb892b4f481..f9076f295b1 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -848,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { @@ -905,7 +905,7 @@ verify_sock_status: len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); break; } while (1); out: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 44eba052b43..c3d2a48481f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -635,15 +635,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#ifdef CONFIG_NET_DMA - { - .procname = "tcp_dma_copybreak", - .data = &sysctl_tcp_dma_copybreak, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif { .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 97c8f5620c4..28595a364f0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -274,7 +274,6 @@ #include #include #include -#include #include #include @@ -1454,39 +1453,6 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) -{ - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_COMPLETE) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_COMPLETE)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); -} -#endif - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1504,7 +1470,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, 
skb, false); + sk_eat_skb(sk, skb); } return NULL; } @@ -1570,11 +1536,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -1612,7 +1578,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - bool copied_early = false; struct sk_buff *skb; u32 urg_hole = 0; @@ -1655,28 +1620,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable(); - } - } -#endif - do { u32 offset; @@ -1807,16 +1750,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1824,11 +1757,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif - if (user_recv) { int chunk; @@ -1886,43 +1814,13 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1942,19 +1840,15 @@ skip_copy: if (tcp_hdr(skb)->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. 
*/ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1977,16 +1871,6 @@ skip_copy: tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2330,9 +2214,6 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeaac399420..1342e9851f9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -73,7 +73,6 @@ #include #include #include -#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -4970,53 +4969,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, __tcp_checksum_complete_user(sk, skb); } -#ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int dma_cookie; - bool copied_early = false; - - if (tp->ucopy.wakeup) - return false; - - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { - - dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, - tp->ucopy.iov, chunk, - tp->ucopy.pinned_list); - - if (dma_cookie < 0) - goto out; - - tp->ucopy.dma_cookie = dma_cookie; - copied_early = true; - - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - - if ((tp->ucopy.len == 0) || - (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || - (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } - } else if (chunk > 0) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } -out: - return copied_early; -} -#endif /* CONFIG_NET_DMA */ - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. 
*/ @@ -5201,14 +5153,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { -#ifdef CONFIG_NET_DMA - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && - tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { - copied_early = 1; - eaten = 1; - } -#endif if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { __set_current_state(TASK_RUNNING); @@ -5274,11 +5218,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (!copied_early || tp->rcv_nxt != tp->rcv_wup) __tcp_ack_snd_check(sk, 0); no_ack: -#ifdef CONFIG_NET_DMA - if (copied_early) - __skb_queue_tail(&sk->sk_async_wait_queue, skb); - else -#endif if (eaten) kfree_skb_partial(skb, fragstolen); sk->sk_data_ready(sk, 0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3cf97651049..737c2e270ee 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -1999,18 +1998,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -2169,11 +2158,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif -#ifdef CONFIG_NET_DMA - /* Cleans up our sk_async_wait_queue */ - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif - /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 889079b2ea8..cb21fccf208 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -1520,18 +1519,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 0080d2b0a8a..bb9cbc17d92 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -839,7 +839,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (!(flags & MSG_PEEK)) { spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } @@ -861,10 +861,10 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); - *seq = 0; + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + sk_eat_skb(sk, skb); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + *seq = 0; } goto out; -- cgit v1.2.3-70-g09d2 From d27f9bc104375a0a835cf68bb88fc9cec69125da Mon Sep 17 00:00:00 2001 
From: Dan Williams Date: Mon, 30 Dec 2013 11:37:15 -0800 Subject: net_dma: revert 'copied_early' Now that tcp_dma_try_early_copy() is gone nothing ever sets copied_early. Also reverts "53240c208776 tcp: Fix possible double-ack w/ user dma" since it is no longer necessary. Cc: Ali Saidi Cc: James Morris Cc: Patrick McHardy Cc: Eric Dumazet Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Neal Cardwell Reported-by: Dave Jones Acked-by: David S. Miller Signed-off-by: Dan Williams --- net/ipv4/tcp_input.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1342e9851f9..1da0ade9823 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5148,19 +5148,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; - int copied_early = 0; bool fragstolen = false; - if (tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len) { - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && !copied_early) { - __set_current_state(TASK_RUNNING); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sock_owned_by_user(sk)) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) - eaten = 1; - } - if (eaten) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5176,9 +5172,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); + eaten = 1; } - if (copied_early) - tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -5215,8 +5210,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto no_ack; } - if (!copied_early || tp->rcv_nxt != tp->rcv_wup) - __tcp_ack_snd_check(sk, 0); + __tcp_ack_snd_check(sk, 0); no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); -- cgit v1.2.3-70-g09d2 From cd7d8498c9a5d510c64db38d9f4f4fbc41790f09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Sep 2014 04:11:22 -0700 Subject: tcp: change tcp_skb_pcount() location Our goal is to access no more than one cache line access per skb in a write or receive queue when doing the various walks. After recent TCP_SKB_CB() reorganizations, it is almost done. Last part is tcp_skb_pcount() which currently uses skb_shinfo(skb)->gso_segs, which is a terrible choice, because it needs 3 cache lines in current kernel (skb->head, skb->end, and shinfo->gso_segs are all in 3 different cache lines, far from skb->cb) This very simple patch reuses space currently taken by tcp_tw_isn only in input path, as tcp_skb_pcount is only needed for skb stored in write queue. This considerably speeds up tcp_ack(), granted we avoid shinfo->tx_flags to get SKBTX_ACK_TSTAMP, which seems possible. This also speeds up all sack processing in general. This speeds up tcp_sendmsg() because it no longer has to access/dirty shinfo. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 23 +++++++++++++++++++++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 9 ++++++--- 4 files changed, 33 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4dc6641ee99..02a9a2c366b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -698,7 +698,16 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ + union { + /* Note : tcp_tw_isn is used in input path only + * (isn chosen by tcp_timewait_state_process()) + * + * tcp_gso_segs is used in write queue only, + * cf tcp_skb_pcount() + */ + __u32 tcp_tw_isn; + __u32 tcp_gso_segs; + }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ @@ -746,7 +755,17 @@ TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { - return skb_shinfo(skb)->gso_segs; + return TCP_SKB_CB(skb)->tcp_gso_segs; +} + +static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) +{ + TCP_SKB_CB(skb)->tcp_gso_segs = segs; +} + +static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) +{ + TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff tcp_skb_pcount() > 1. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 553b01f52f7..87289e51be0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -963,7 +963,7 @@ new_segment: skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; @@ -1261,7 +1261,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); from += copy; copied += copy; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3f016a15c5..2c0af90231c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1295,9 +1295,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; - skb_shinfo(prev)->gso_segs += pcount; - BUG_ON(skb_shinfo(skb)->gso_segs < pcount); - skb_shinfo(skb)->gso_segs -= pcount; + tcp_skb_pcount_add(prev, pcount); + BUG_ON(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK @@ -1310,7 +1310,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* CHECKME: To clear or not to clear? 
Mimics normal skb currently */ - if (skb_shinfo(skb)->gso_segs <= 1) { + if (tcp_skb_pcount(skb) <= 1) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_type = 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a462fb1db89..4d92703df4c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -384,7 +384,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; @@ -972,6 +972,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1019,11 +1022,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, /* Avoid the costly divide in the normal * non-TSO case. */ - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; } else { - shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); shinfo->gso_size = mss_now; shinfo->gso_type = sk->sk_gso_type; } -- cgit v1.2.3-70-g09d2 From 155c6e1ad4a778cad7f9fe6695afc91b3f5fe1ac Mon Sep 17 00:00:00 2001 From: "Peter Pan(潘卫平)" Date: Wed, 24 Sep 2014 22:17:02 +0800 Subject: tcp: use tcp_flags in tcp_data_queue() This patch is a cleanup which follows the idea in commit e11ecddf5128 (tcp: use TCP_SKB_CB(skb)->tcp_flags in input path), and it may reduce register pressure since skb->cb[] access is fast, bacause skb is probably in a register. v2: remove variable th v3: reword the changelog Signed-off-by: Weiping Pan Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2c0af90231c..5073eefa6fa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4344,7 +4344,6 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; bool fragstolen = false; @@ -4353,7 +4352,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto drop; skb_dst_drop(skb); - __skb_pull(skb, th->doff * 4); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); TCP_ECN_accept_cwr(tp, skb); @@ -4397,7 +4396,7 @@ queue_and_out: tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) tcp_event_data_recv(sk, skb); - if (th->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); if (!skb_queue_empty(&tp->out_of_order_queue)) { -- cgit v1.2.3-70-g09d2 From 30e502a34b8b21fae2c789da102bd9f6e99fef83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2014 22:37:33 +0200 Subject: net: tcp: add flag for ca to indicate that ECN is required This patch adds a flag to TCP congestion algorithms that allows for requesting to mark IPv4/IPv6 sockets with transport as ECN capable, that is, ECT(0), when required by a congestion algorithm. It is currently used and needed in DataCenter TCP (DCTCP), as it requires both peers to assert ECT on all IP packets sent - it uses ECN feedback (i.e. 
CE, Congestion Encountered information) from switches inside the data center to derive feedback to the end hosts. Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's algorithm/behaviour slightly diverges from RFC3168, therefore this is only (!) enabled iff the assigned congestion control ops module has requested this. By that, we can tightly couple this logic really only to the provided congestion control ops. Joint work with Florian Westphal and Glenn Judd. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 61 +++++++++++++++++++++++++++++++++++++-------------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 25 +++++++++++++++------ 3 files changed, 63 insertions(+), 25 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index f99b0c072ee..a12f145cfbc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -733,23 +733,6 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set - * - * If we receive a SYN packet with these bits set, it means a network is - * playing bad games with TOS bits. In order to avoid possible false congestion - * notifications, we disable TCP ECN negociation. - */ -static inline void -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, - struct net *net) -{ - const struct tcphdr *th = tcp_hdr(skb); - - if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr && - INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield)) - inet_rsk(req)->ecn_ok = 1; -} - /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ @@ -791,7 +774,10 @@ enum tcp_ca_event { #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) +/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 +/* Requires ECN/ECT set on all packets */ +#define TCP_CONG_NEEDS_ECN 0x2 struct tcp_congestion_ops { struct list_head list; @@ -840,6 +826,13 @@ u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; +static inline bool tcp_ca_needs_ecn(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; +} + static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -857,6 +850,40 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negociation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We inverse the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) its probably a non-DCTCP aware sender. 
+ */ +static inline void +TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5073eefa6fa..fb0fe97e1c5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5944,7 +5944,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); + TCP_ECN_create_request(req, skb, sk); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4d92703df4c..20e73271d75 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -318,11 +318,15 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. 
*/ @@ -331,17 +335,24 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk)) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } } static __inline__ void -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, + struct sock *sk) { - if (inet_rsk(req)->ecn_ok) + if (inet_rsk(req)->ecn_ok) { th->ece = 1; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to @@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else { + } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -2789,7 +2800,7 @@ int tcp_send_synack(struct sock *sk) } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; - TCP_ECN_send_synack(tcp_sk(sk), skb); + TCP_ECN_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2848,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - TCP_ECN_make_synack(req, th); + TCP_ECN_make_synack(req, th, sk); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is -- cgit v1.2.3-70-g09d2 From 7354c8c389d18719dd71cc810da70b0921d66694 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:34 +0200 Subject: net: tcp: split ack slow/fast events from cwnd_event The congestion control ops "cwnd_event" currently supports CA_EVENT_FAST_ACK and CA_EVENT_SLOW_ACK events (among others). Both FAST and SLOW_ACK are only used by Westwood congestion control algorithm. This removes both flags from cwnd_event and adds a new in_ack_event callback for this. The goal is to be able to provide more detailed information about ACKs, such as whether ECE flag was set, or whether the ACK resulted in a window update. It is required for DataCenter TCP (DCTCP) congestion control algorithm as it makes a different choice depending on ECE being set or not. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/net/tcp.h | 8 ++++++-- net/ipv4/tcp_input.c | 12 ++++++++++-- net/ipv4/tcp_westwood.c | 28 ++++++++++++++++------------ 3 files changed, 32 insertions(+), 16 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index a12f145cfbc..7ec6a28305c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,8 +763,10 @@ enum tcp_ca_event { CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ - CA_EVENT_FAST_ACK, /* in sequence ack */ - CA_EVENT_SLOW_ACK, /* other ack */ +}; + +enum tcp_ca_ack_event_flags { + CA_ACK_SLOWPATH = (1 << 0), }; /* @@ -796,6 +798,8 @@ struct tcp_congestion_ops { void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ + void (*in_ack_event)(struct sock *sk, u32 flags); /* new value of cwnd after loss (optional) */ u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb0fe97e1c5..8a38774cc66 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3362,6 +3362,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } +static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->in_ack_event) + icsk->icsk_ca_ops->in_ack_event(sk, flags); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3421,7 +3429,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(sk, CA_EVENT_FAST_ACK); + tcp_in_ack_event(sk, 0); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { @@ -3439,7 +3447,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - tcp_ca_event(sk, CA_EVENT_SLOW_ACK); + tcp_in_ack_event(sk, CA_ACK_SLOWPATH); } /* We passed data and got it acked, remove any soft error diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 81911a92356..bb63fba47d4 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -220,32 +220,35 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } +static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) +{ + if (ack_flags & CA_ACK_SLOWPATH) { + struct westwood *w = inet_csk_ca(sk); + + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); + + update_rtt_min(w); + return; + } + + westwood_fast_bw(sk); +} + static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); switch (event) { - case CA_EVENT_FAST_ACK: - westwood_fast_bw(sk); - break; - case CA_EVENT_COMPLETE_CWR: tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; - case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; - - case CA_EVENT_SLOW_ACK: - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); - update_rtt_min(w); - break; - default: /* don't care */ break; @@ -274,6 +277,7 @@ static struct 
tcp_congestion_ops tcp_westwood __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .cwnd_event = tcp_westwood_event, + .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, -- cgit v1.2.3-70-g09d2 From 9890092e46b2996bb85f7f973e69424cb5c07bc0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 26 Sep 2014 22:37:35 +0200 Subject: net: tcp: more detailed ACK events and events for CE marked packets DataCenter TCP (DCTCP) determines cwnd growth based on ECN information and ACK properties, e.g. ACK that updates window is treated differently than DUPACK. Also DCTCP needs information whether ACK was delayed ACK. Furthermore, DCTCP also implements a CE state machine that keeps track of CE markings of incoming packets. Therefore, extend the congestion control framework to provide these event types, so that DCTCP can be properly implemented as a normal congestion algorithm module outside of the core stack. Joint work with Daniel Borkmann and Glenn Judd. Signed-off-by: Florian Westphal Signed-off-by: Daniel Borkmann Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 9 ++++++++- net/ipv4/tcp_input.c | 22 ++++++++++++++++++---- net/ipv4/tcp_output.c | 4 ++++ 3 files changed, 30 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7ec6a28305c..1f57c536349 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -763,10 +763,17 @@ enum tcp_ca_event { CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ + CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ + CA_EVENT_NON_DELAYED_ACK, }; +/* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { - CA_ACK_SLOWPATH = (1 << 0), + CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ + CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ + CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8a38774cc66..fc133178c78 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -233,14 +233,21 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode((struct sock *)tp); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } - /* fallinto */ + tp->ecn_flags |= TCP_ECN_SEEN; + break; default: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; + break; } } @@ -3429,10 +3436,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, 0); + tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { + u32 ack_ev_flags = CA_ACK_SLOWPATH; + if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3444,10 +3453,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, 
skb, prior_snd_una, &sack_rtt_us); - if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; + ack_ev_flags |= CA_ACK_ECE; + } + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_SLOWPATH); + tcp_in_ack_event(sk, ack_ev_flags); } /* We passed data and got it acked, remove any soft error diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20e73271d75..124f9e4e459 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3130,6 +3130,8 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; + tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); + if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3186,6 +3188,8 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; + tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); + /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. -- cgit v1.2.3-70-g09d2 From d82bd1229885d550d03926cfa937703f6caa3cc0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Sep 2014 13:08:29 +0200 Subject: tcp: move TCP_ECN_create_request out of header After Octavian Purdilas tcp ipv4/ipv6 unification work this helper only has a single callsite. While at it, convert name to lowercase, suggested by Stephen. Suggested-by: Stephen Hemminger Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/tcp.h | 34 ---------------------------------- net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1f57c536349..545a79aa421 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -861,40 +861,6 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set - * - * If we receive a SYN packet with these bits set, it means a - * network is playing bad games with TOS bits. In order to - * avoid possible false congestion notifications, we disable - * TCP ECN negociation. - * - * Exception: tcp_ca wants ECN. This is required for DCTCP - * congestion control; it requires setting ECT on all packets, - * including SYN. We inverse the test in this case: If our - * local socket wants ECN, but peer only set ece/cwr (but not - * ECT in IP header) its probably a non-DCTCP aware sender. - */ -static inline void -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, - const struct sock *listen_sk) -{ - const struct tcphdr *th = tcp_hdr(skb); - const struct net *net = sock_net(listen_sk); - bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn; - - if (!th_ecn) - return; - - ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - need_ecn = tcp_ca_needs_ecn(listen_sk); - - if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) - inet_rsk(req)->ecn_ok = 1; - else if (ect && need_ecn) - inet_rsk(req)->ecn_ok = 1; -} - /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. 
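
The ecn_ok decision that this patch moves into tcp_input.c (the full function appears in the tcp_input.c hunk below) boils down to a small predicate. The following standalone restatement is illustrative only, not code from the patch; the parameter names are invented here and stand for the values the kernel computes from the TCP header, the IP dsfield, tcp_ca_needs_ecn() and net->ipv4.sysctl_tcp_ecn:

static bool syn_ecn_ok(bool ece, bool cwr, bool ect_on_syn,
                       bool ca_needs_ecn, bool ecn_sysctl_enabled)
{
        bool th_ecn = ece && cwr;

        if (!th_ecn)
                return false;   /* peer did not ask for ECN at all */
        if (ect_on_syn && ca_needs_ecn)
                return true;    /* ECT on the SYN and local CA wants ECN: DCTCP-style peer */
        if (!ect_on_syn && !ca_needs_ecn && ecn_sysctl_enabled)
                return true;    /* classic RFC 3168 negotiation */
        return false;           /* ECT on the SYN without a CA that needs it, or ECN disabled */
}
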
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fc133178c78..174181e28ef 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5906,6 +5906,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) #endif } +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negociation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We inverse the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) its probably a non-DCTCP aware sender. + */ +static void tcp_ecn_create_request(struct request_sock *req, + const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -5966,7 +6000,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sk); + tcp_ecn_create_request(req, skb, sk); if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); -- cgit v1.2.3-70-g09d2 From 735d383117e113403442d971b23e7cfa2f876c7c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 29 Sep 2014 13:08:30 +0200 Subject: tcp: change TCP_ECN prefixes to lower case Suggested by Stephen. Also drop inline keyword and let compiler decide. gcc 4.7.3 decides to no longer inline tcp_ecn_check_ce, so split it up. The actual evaluation is not inlined anymore while the ECN_OK test is. Suggested-by: Stephen Hemminger Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 41 ++++++++++++++++++++++------------------- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv4/tcp_output.c | 18 +++++++++--------- 3 files changed, 34 insertions(+), 31 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 174181e28ef..aa38f98b788 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -201,28 +201,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk) return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } -static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } -static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) +static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) { - if (!(tp->ecn_flags & TCP_ECN_OK)) - return; - switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, @@ -251,19 +248,25 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s } } -static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +{ + if (tp->ecn_flags & TCP_ECN_OK) + __tcp_ecn_check_ce(tp, skb); +} + +static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) +static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return true; @@ -660,7 +663,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } icsk->icsk_ack.lrcvtime = now; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (skb->len >= 128) tcp_grow_window(sk, skb); @@ -1976,7 +1979,7 @@ void tcp_enter_loss(struct sock *sk) sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); + tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous * loss recovery is underway except recurring timeout(s) on @@ -2368,7 +2371,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; - TCP_ECN_withdraw_cwr(tp); + tcp_ecn_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); @@ -2498,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tp->prr_delivered = 0; tp->prr_out = 0; tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); + tcp_ecn_queue_cwr(tp); } static void 
tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, @@ -3453,7 +3456,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_rtt_us); - if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) { + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; ack_ev_flags |= CA_ACK_ECE; } @@ -4193,7 +4196,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) struct sk_buff *skb1; u32 seq, end_seq; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); @@ -4376,7 +4379,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); - TCP_ECN_accept_cwr(tp, skb); + tcp_ecn_accept_cwr(tp, skb); tp->rx_opt.dsack = 0; @@ -5457,7 +5460,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * state to ESTABLISHED..." */ - TCP_ECN_rcv_synack(tp, th); + tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); @@ -5576,7 +5579,7 @@ discard: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - TCP_ECN_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 47b73506b77..63d2680b65d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -393,8 +393,8 @@ void tcp_openreq_init_rwin(struct request_sock *req, } EXPORT_SYMBOL(tcp_openreq_init_rwin); -static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, - struct request_sock *req) +static void tcp_ecn_openreq_child(struct tcp_sock *tp, + const struct request_sock *req) { tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; } @@ -507,7 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - TCP_ECN_openreq_child(newtp, req); + tcp_ecn_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 86a0216fcaa..ee567e9e98c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -318,7 +318,7 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) +static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); @@ -330,7 +330,7 @@ static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) } /* Packet ECN state for a SYN. 
*/ -static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) +static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -344,8 +344,8 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) } } -static __inline__ void -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, +static void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, struct sock *sk) { if (inet_rsk(req)->ecn_ok) { @@ -358,7 +358,7 @@ TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ -static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, +static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); @@ -960,7 +960,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_options_write((__be32 *)(th + 1), tp, &opts); if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) - TCP_ECN_send(sk, skb, tcp_header_size); + tcp_ecn_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ @@ -2800,7 +2800,7 @@ int tcp_send_synack(struct sock *sk) } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; - TCP_ECN_send_synack(sk, skb); + tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2859,7 +2859,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - TCP_ECN_make_synack(req, th, sk); + tcp_ecn_make_synack(req, th, sk); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -3098,7 +3098,7 @@ int tcp_connect(struct sock *sk) tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); - TCP_ECN_send_syn(sk, buff); + tcp_ecn_send_syn(sk, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : -- cgit v1.2.3-70-g09d2 From ad971f616aa98ea2503f1a1064637bfb4ef7b21e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Oct 2014 15:17:29 -0700 Subject: tcp: fix tcp_ack() performance problem We worked hard to improve tcp_ack() performance, by not accessing skb_shinfo() in fast path (cd7d8498c9a5 tcp: change tcp_skb_pcount() location) We still have one spurious access because of ACK timestamping, added in commit e1c8a607b281 ("net-timestamp: ACK timestamp for bytestreams") By checking if sk_tsflags has SOF_TIMESTAMPING_TX_ACK set, we can avoid two cache line misses for the common case. While we are at it, add two prefetchw() : One in tcp_ack() to bring skb at the head of write queue. One in tcp_clean_rtx_queue() loop to bring following skb, as we will delete skb from the write queue and dirty skb->next->prev. Add a couple of [un]likely() clauses. After this patch, tcp_ack() is no longer the most consuming function in tcp stack. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Van Jacobson Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 00a41499d52..a12b455928e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -3029,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) return packets_acked; } +static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, + u32 prior_snd_una) +{ + const struct skb_shared_info *shinfo; + + /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ + if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))) + return; + + shinfo = skb_shinfo(skb); + if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && + between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); +} + /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. @@ -3052,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { - struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; - if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + tcp_ack_tstamp(sk, skb, prior_snd_una); /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3073,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, fully_acked = false; } else { + /* Speedup tcp_unlink_write_queue() and next loop */ + prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } - if (sacked & TCPCB_RETRANS) { + if (unlikely(sacked & TCPCB_RETRANS)) { if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; @@ -3107,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if (!(scb->tcp_flags & TCPHDR_SYN)) { + if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; @@ -3119,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - if (skb == tp->retransmit_skb_hint) + if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; - if (skb == tp->lost_skb_hint) + if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; } @@ -3132,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_SACK_RENEGING; skb_mstamp_get(&now); - if (first_ackt.v64) { + if (likely(first_ackt.v64)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); } @@ -3394,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int acked = 0; /* Number of packets newly acked */ long sack_rtt_us = -1L; + /* We very likely will need to access write queue head. */ + prefetchw(sk->sk_write_queue.next); + /* If the ack is older than previous acks * then we can probably ignore it. 
*/ -- cgit v1.2.3-70-g09d2 From 1f37bf87aa7523d28e7e4c4f7bb5dba98faa3e00 Mon Sep 17 00:00:00 2001 From: Marcelo Leitner Date: Tue, 4 Nov 2014 17:15:08 -0200 Subject: tcp: zero retrans_stamp if all retrans were acked Ueki Kohei reported that when we are using NewReno with connections that have very low traffic, we may time out the connection too early if a second loss occurs after the first one was successfully acked but no data was transferred later. Below is his description of it: When SACK is disabled, and a socket suffers multiple separate TCP retransmissions, that socket's ETIMEDOUT value is calculated from the time of the *first* retransmission instead of the *latest* retransmission. This happens because the tcp_sock's retrans_stamp is set once then never cleared. Take the following connection:

      Linux                                    remote-machine
                        |                           |
      send#1 ---->  (*1)|--------> data#1 --------->|
                        |                           |
                   RTO  :                           :
                        |                           |
                 ---(*2)|----> data#1(retrans) ---->|
                    (*3)|<---------- ACK <----------|
                        |                           |
                        :                           :
                        :   16 minutes (or more)    :
                        :                           :
                        |                           |
      send#2 ---->  (*4)|--------> data#2 --------->|
                        |                           |
                   RTO  :                           :
                        |                           |
                 ---(*5)|----> data#2(retrans) ---->|
                        |                           |
               RTO*2    :                           :
                        |                           |
      ETIMEDOUT<----(*6)|                           |

(*1) One data packet sent.
(*2) Because no ACK packet is received, the packet is retransmitted.
(*3) The ACK packet is received. The transmitted packet is acknowledged. At this point the first "retransmission event" has passed and been recovered from. Any future retransmission is a completely new "event".
(*4) After 16 minutes (to correspond with retries2=15), a new data packet is sent. Note: No data is transmitted between (*3) and (*4). The socket's timeout SHOULD be calculated from this point in time, but instead it's calculated from the prior "event" 16 minutes ago.
(*5) Because no ACK packet is received, the packet is retransmitted.
(*6) At the time of the 2nd retransmission, the socket returns ETIMEDOUT.

Therefore, now we clear retrans_stamp as soon as all data during the loss window is fully acked. Reported-by: Ueki Kohei Cc: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 60 +++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a12b455928e..88fa2d16068 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2315,6 +2315,35 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) /* Undo procedures. */ +/* We can clear retrans_stamp when there are no retransmissions in the + * window. It would seem that it is trivially available for us in + * tp->retrans_out, however, that kind of assumptions doesn't consider + * what will happen if errors occur when sending retransmission for the + * second time. ...It could the that such segment has only + * TCPCB_EVER_RETRANS set at the present time. It seems that checking + * the head skb is enough except for some reneging corner cases that + * are not worth the effort. + * + * Main reason for all this complexity is the fact that connection dying + * time now depends on the validity of the retrans_stamp, in particular, + * that successive retransmissions of a segment must not advance + * retrans_stamp under any conditions. 
+ */ +static bool tcp_any_retrans_done(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (tp->retrans_out) + return true; + + skb = tcp_write_queue_head(sk); + if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) + return true; + + return false; +} + #if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { @@ -2410,6 +2439,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ tcp_moderate_cwnd(tp); + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; return true; } tcp_set_ca_state(sk, TCP_CA_Open); @@ -2430,35 +2461,6 @@ static bool tcp_try_undo_dsack(struct sock *sk) return false; } -/* We can clear retrans_stamp when there are no retransmissions in the - * window. It would seem that it is trivially available for us in - * tp->retrans_out, however, that kind of assumptions doesn't consider - * what will happen if errors occur when sending retransmission for the - * second time. ...It could the that such segment has only - * TCPCB_EVER_RETRANS set at the present time. It seems that checking - * the head skb is enough except for some reneging corner cases that - * are not worth the effort. - * - * Main reason for all this complexity is the fact that connection dying - * time now depends on the validity of the retrans_stamp, in particular, - * that successive retransmissions of a segment must not advance - * retrans_stamp under any conditions. - */ -static bool tcp_any_retrans_done(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if (tp->retrans_out) - return true; - - skb = tcp_write_queue_head(sk); - if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) - return true; - - return false; -} - /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { -- cgit v1.2.3-70-g09d2
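To make the retrans_stamp behaviour above concrete, here is a small user-space C sketch (not kernel code; the names toy_sock, on_retransmit, on_full_ack and the 120-second threshold are invented for illustration) that models the old rule, where retrans_stamp survives a fully acked loss episode, against the patched rule, where it is cleared once no retransmitted data remains in flight, mirroring the role of tcp_any_retrans_done(). It replays the reporter's scenario: one recovered loss, sixteen minutes of silence, then a second, unrelated loss.

/* toy_retrans_stamp.c - hypothetical stand-alone model, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_sock {
	unsigned int retrans_stamp;	/* time of the first retransmission in
					 * the current loss episode; 0 = none */
	unsigned int retrans_out;	/* retransmitted, not yet acked segments */
};

/* Rough analogue of tcp_any_retrans_done(): the episode is only over
 * once nothing retransmitted is still in flight. */
static bool any_retrans_done(const struct toy_sock *sk)
{
	return sk->retrans_out > 0;
}

static void on_retransmit(struct toy_sock *sk, unsigned int now)
{
	if (!sk->retrans_stamp)		/* only the first retransmission of an
					 * episode records the timestamp */
		sk->retrans_stamp = now;
	sk->retrans_out++;
}

static void on_full_ack(struct toy_sock *sk, bool clear_when_done)
{
	sk->retrans_out = 0;
	/* The fix: once the loss window is fully acked, forget the stamp so
	 * a later, unrelated loss starts its own timeout clock. */
	if (clear_when_done && !any_retrans_done(sk))
		sk->retrans_stamp = 0;
}

static bool etimedout(const struct toy_sock *sk, unsigned int now,
		      unsigned int limit)
{
	return sk->retrans_stamp && now - sk->retrans_stamp > limit;
}

int main(void)
{
	struct toy_sock old = { 0, 0 }, fixed = { 0, 0 };
	const unsigned int limit = 120;	/* pretend overall timeout, seconds */

	/* First loss episode at t=10, fully acked shortly afterwards. */
	on_retransmit(&old, 10);	on_full_ack(&old, false);
	on_retransmit(&fixed, 10);	on_full_ack(&fixed, true);

	/* Sixteen minutes of silence, then a separate loss at t=970. */
	on_retransmit(&old, 970);
	on_retransmit(&fixed, 970);

	printf("old:   timed out at t=971? %s\n",
	       etimedout(&old, 971, limit) ? "yes (premature)" : "no");
	printf("fixed: timed out at t=971? %s\n",
	       etimedout(&fixed, 971, limit) ? "yes" : "no (correct)");
	return 0;
}

With the old rule the model reports a timeout one second into the second episode, which is the premature ETIMEDOUT described in the report; the actual patch obtains the same effect by zeroing tp->retrans_stamp in tcp_try_undo_recovery() when tcp_any_retrans_done() returns false.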