From d2cf43674e17ca1c16c68d46d987d2f17bf7c371 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 May 2013 19:25:55 +0000 Subject: tcp: speedup tcp_fixup_rcvbuf() tcp_fixup_rcvbuf() contains a loop to estimate initial socket rcv space needed for a given mss. With large MTU (like 64K on lo), we can loop ~500 times and consume a lot of cpu cycles. perf top of 200 concurrent netperf -t TCP_CRR 5.62% netperf [kernel.kallsyms] [k] tcp_init_buffer_space 1.71% netperf [kernel.kallsyms] [k] _raw_spin_lock 1.55% netperf [kernel.kallsyms] [k] kmem_cache_free 1.51% netperf [kernel.kallsyms] [k] tcp_transmit_skb 1.50% netperf [kernel.kallsyms] [k] tcp_ack Let's use a 100% factor, and remove the loop. 100% is needed anyway for the tcp_adv_win_scale=1 default value, and is also the maximum factor. Refs: commit b49960a05e32 ("tcp: change tcp_adv_win_scale and tcp_rmem[2]") Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 08bbe609652..b358e8c9860 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -360,9 +360,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) if (mss > 1460) icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < mss) - rcvmem += 128; + rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER); rcvmem *= icwnd; -- cgit v1.2.3-70-g09d2 From 3e59cb0ddfd2c59991f38e89352ad8a3c71b2374 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 17 May 2013 13:45:05 +0000 Subject: tcp: remove bad timeout logic in fast recovery tcp_timeout_skbs() was intended to trigger fast recovery on timeout, unfortunately in reality it often causes spurious retransmission storms during fast recovery. The particular sign is a fast retransmit over the highest sacked sequence (SND.FACK). Currently the RTO timer re-arming (as in RFC6298) offers a nice cushion to avoid spurious timeout: when SND.UNA advances the sender re-arms RTO and extends the timeout by icsk_rto. The sender does not offset the time elapsed since the packet at SND.UNA was sent. But if the next (DUP)ACK arrives later than ~RTTVAR and triggers tcp_fastretrans_alert(), then tcp_timeout_skbs() will mark any packet sent before the icsk_rto interval lost, including one that's above the highest sacked sequence. Most likely a large part of the scoreboard will be marked. If most packets are not lost then the subsequent DUPACKs with new SACK blocks will cause the sender to continue to retransmit packets beyond SND.FACK spuriously. Even if only one packet is lost the sender may falsely retransmit almost the entire window. The situation becomes common in the world of bufferbloat: the RTT continues to grow as the queue builds up but RTTVAR remains small and close to the minimum 200ms. If a data packet is lost and the DUPACK triggered by the next data packet is slightly delayed, then a spurious retransmission storm forms. As the original comment on tcp_timeout_skbs() suggests: the usefulness of this feature is questionable. It also wastes cycles walking the sack scoreboard and is actually harmful because of false recovery. It's time to remove this. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Nandita Dukkipati Signed-off-by: David S.
Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 65 +--------------------------------------------------- 3 files changed, 1 insertion(+), 66 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5adbc33d1ab..472120b4fac 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -246,7 +246,6 @@ struct tcp_sock { /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; - struct sk_buff *scoreboard_skb_hint; struct sk_buff *retransmit_skb_hint; struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 5bba80fbd1d..e1c3723f161 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1193,7 +1193,6 @@ static inline void tcp_mib_init(struct net *net) static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; - tp->scoreboard_skb_hint = NULL; } static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b358e8c9860..d7d369428ae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1255,8 +1255,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; - if (skb == tp->scoreboard_skb_hint) - tp->scoreboard_skb_hint = prev; if (skb == tp->lost_skb_hint) { tp->lost_skb_hint = prev; tp->lost_cnt_hint -= tcp_skb_pcount(prev); @@ -1964,20 +1962,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) return true; } -static inline int tcp_skb_timedout(const struct sock *sk, - const struct sk_buff *skb) -{ - return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; -} - -static inline int tcp_head_timedout(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->packets_out && - tcp_skb_timedout(sk, tcp_write_queue_head(sk)); -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2084,12 +2068,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#3 : when we use RFC2988 timer restart, fast - * retransmit can be triggered by timeout of queue head. - */ - if (tcp_is_fack(tp) && tcp_head_timedout(sk)) - return true; - /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? */ @@ -2126,44 +2104,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) return false; } -/* New heuristics: it is possible only after we switched to restart timer - * each time when something is ACKed. Hence, we can detect timed out packets - * during fast retransmit without falling to slow start. - * - * Usefulness of this as is very questionable, since we should know which of - * the segments is the next to timeout which is relatively expensive to find - * in general case unless we add some data structure just for that. The - * current approach certainly won't find the right one too often and when it - * finally does find _something_ it usually marks large part of the window - * right away (because a retransmission with a larger timestamp blocks the - * loop from advancing). 
-ij - */ -static void tcp_timeout_skbs(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) - return; - - skb = tp->scoreboard_skb_hint; - if (tp->scoreboard_skb_hint == NULL) - skb = tcp_write_queue_head(sk); - - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (!tcp_skb_timedout(sk, skb)) - break; - - tcp_skb_mark_lost(tp, skb); - } - - tp->scoreboard_skb_hint = skb; - - tcp_verify_left_out(tp); -} - /* Detect loss in event "A" above by marking head of queue up as lost. * For FACK or non-SACK(Reno) senders, the first "packets" number of segments * are considered lost. For RFC3517 SACK, a segment is considered lost if it @@ -2249,8 +2189,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) else if (fast_rexmit) tcp_mark_head_lost(sk, 1, 1); } - - tcp_timeout_skbs(sk); } /* CWND moderation, preventing bursts due to too big ACKs @@ -2842,7 +2780,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, fast_rexmit = 1; } - if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); tcp_xmit_retransmit_queue(sk); @@ -3075,7 +3013,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - tp->scoreboard_skb_hint = NULL; if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = NULL; if (skb == tp->lost_skb_hint) -- cgit v1.2.3-70-g09d2 From 1f6afc81088a1f5a472b272408730c73b72c68aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 May 2013 15:03:54 +0000 Subject: tcp: remove one indentation level in tcp_rcv_state_process() Remove one level of indentation 'introduced' in commit c3ae62af8e75 (tcp: should drop incoming frames without ACK flag set) if (true) { ... } @acceptable variable is a boolean. This patch is a pure cleanup. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 269 +++++++++++++++++++++++++-------------------------- 1 file changed, 133 insertions(+), 136 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8230cd6243a..40614257d2c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5536,6 +5536,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *req; int queued = 0; + bool acceptable; tp->rx_opt.saw_tstamp = 0; @@ -5606,157 +5607,153 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; /* step 5: check the ACK field */ - if (true) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT) > 0; - - switch (sk->sk_state) { - case TCP_SYN_RECV: - if (acceptable) { - /* Once we leave TCP_SYN_RECV, we no longer - * need req so release it. - */ - if (req) { - tcp_synack_rtt_meas(sk, req); - tp->total_retrans = req->num_retrans; - - reqsk_fastopen_remove(sk, req, false); - } else { - /* Make sure socket is routed, for - * correct metrics. 
- */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_init_congestion_control(sk); + acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT) > 0; - tcp_mtup_init(sk); - tcp_init_buffer_space(sk); - tp->copied_seq = tp->rcv_nxt; - } - smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - sk->sk_state_change(sk); + switch (sk->sk_state) { + case TCP_SYN_RECV: + if (acceptable) { + /* Once we leave TCP_SYN_RECV, we no longer + * need req so release it. + */ + if (req) { + tcp_synack_rtt_meas(sk, req); + tp->total_retrans = req->num_retrans; - /* Note, that this wakeup is only for marginal - * crossed SYN case. Passively open sockets - * are not waked up, because sk->sk_sleep == - * NULL and sk->sk_socket == NULL. + reqsk_fastopen_remove(sk, req, false); + } else { + /* Make sure socket is routed, for + * correct metrics. */ - if (sk->sk_socket) - sk_wake_async(sk, - SOCK_WAKE_IO, POLL_OUT); - - tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = ntohs(th->window) << - tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - - if (req) { - /* Re-arm the timer because data may - * have been sent out. This is similar - * to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more - * aggressive and retranmitting any data - * sooner based on when they were sent - * out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_congestion_control(sk); - /* Prevent spurious tcp_cwnd_restart() on - * first data packet. + tcp_mtup_init(sk); + tcp_init_buffer_space(sk); + tp->copied_seq = tp->rcv_nxt; + } + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + + /* Note, that this wakeup is only for marginal + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sk_sleep == + * NULL and sk->sk_socket == NULL. + */ + if (sk->sk_socket) + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << + tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + + if (req) { + /* Re-arm the timer because data may + * have been sent out. This is similar + * to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive + * and retransmitting any data sooner based + * on when they are sent out. */ - tp->lsndtime = tcp_time_stamp; + tcp_rearm_rto(sk); + } else + tcp_init_metrics(sk); - tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); - } else { - return 1; - } - break; + /* Prevent spurious tcp_cwnd_restart() on + * first data packet. + */ + tp->lsndtime = tcp_time_stamp; - case TCP_FIN_WAIT1: - /* If we enter the TCP_FIN_WAIT1 state and we are a - * Fast Open socket and this is the first acceptable - * ACK we have received, this would have acknowledged - * our SYNACK so stop the SYNACK timer. + tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); + } else { + return 1; + } + break; + + case TCP_FIN_WAIT1: + /* If we enter the TCP_FIN_WAIT1 state and we are a + * Fast Open socket and this is the first acceptable + * ACK we have received, this would have acknowledged + * our SYNACK so stop the SYNACK timer. + */ + if (req != NULL) { + /* Return RST if ack_seq is invalid. 
+ * Note that RFC793 only says to generate a + * DUPACK for it but for TCP Fast Open it seems + * better to treat this case like TCP_SYN_RECV + * above. */ - if (req != NULL) { - /* Return RST if ack_seq is invalid. - * Note that RFC793 only says to generate a - * DUPACK for it but for TCP Fast Open it seems - * better to treat this case like TCP_SYN_RECV - * above. - */ - if (!acceptable) + if (!acceptable) + return 1; + /* We no longer need the request sock. */ + reqsk_fastopen_remove(sk, req, false); + tcp_rearm_rto(sk); + } + if (tp->snd_una == tp->write_seq) { + struct dst_entry *dst; + + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); + + if (!sock_flag(sk, SOCK_DEAD)) { + /* Wake up lingering close() */ + sk->sk_state_change(sk); + } else { + int tmo; + + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; - /* We no longer need the request sock. */ - reqsk_fastopen_remove(sk, req, false); - tcp_rearm_rto(sk); - } - if (tp->snd_una == tp->write_seq) { - struct dst_entry *dst; - - tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; - - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - - if (!sock_flag(sk, SOCK_DEAD)) - /* Wake up lingering close() */ - sk->sk_state_change(sk); - else { - int tmo; - - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { - tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; - } + } - tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); - } else if (th->fin || sock_owned_by_user(sk)) { - /* Bad case. We could lose such FIN otherwise. - * It is not a big problem, but it looks confusing - * and not so rare event. We still can lose it now, - * if it spins in bh_lock_sock(), but it is really - * marginal case. - */ - inet_csk_reset_keepalive_timer(sk, tmo); - } else { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); - goto discard; - } + tmo = tcp_fin_time(sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. 
+ */ + inet_csk_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; } } - break; + } + break; - case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk, TCP_TIME_WAIT, 0); - goto discard; - } - break; + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; - case TCP_LAST_ACK: - if (tp->snd_una == tp->write_seq) { - tcp_update_metrics(sk); - tcp_done(sk); - goto discard; - } - break; + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; } + break; } /* step 6: check the URG bit */ -- cgit v1.2.3-70-g09d2 From 61eb900352ff731d990d5415ce9f04e4af6a6136 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 24 May 2013 18:36:13 +0000 Subject: tcp: Remove another indentation level in tcp_rcv_state_process case TCP_SYN_RECV: can have another indentation level removed by converting if (acceptable) { ...; } else { return 1; } to if (!acceptable) return 1; ...; Reflow code and comments to fit 80 columns. Another pure cleanup patch. Signed-off-by: Joe Perches Improved-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 110 ++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 59 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 40614257d2c..413b480b932 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5612,70 +5612,62 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, switch (sk->sk_state) { case TCP_SYN_RECV: - if (acceptable) { - /* Once we leave TCP_SYN_RECV, we no longer - * need req so release it. - */ - if (req) { - tcp_synack_rtt_meas(sk, req); - tp->total_retrans = req->num_retrans; + if (!acceptable) + return 1; - reqsk_fastopen_remove(sk, req, false); - } else { - /* Make sure socket is routed, for - * correct metrics. - */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_init_congestion_control(sk); + /* Once we leave TCP_SYN_RECV, we no longer need req + * so release it. + */ + if (req) { + tcp_synack_rtt_meas(sk, req); + tp->total_retrans = req->num_retrans; - tcp_mtup_init(sk); - tcp_init_buffer_space(sk); - tp->copied_seq = tp->rcv_nxt; - } - smp_mb(); - tcp_set_state(sk, TCP_ESTABLISHED); - sk->sk_state_change(sk); - - /* Note, that this wakeup is only for marginal - * crossed SYN case. Passively open sockets - * are not waked up, because sk->sk_sleep == - * NULL and sk->sk_socket == NULL. - */ - if (sk->sk_socket) - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - - tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = ntohs(th->window) << - tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - - if (req) { - /* Re-arm the timer because data may - * have been sent out. This is similar - * to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive - * and retransmitting any data sooner based - * on when they are sent out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); + reqsk_fastopen_remove(sk, req, false); + } else { + /* Make sure socket is routed, for correct metrics. 
*/ + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_congestion_control(sk); + + tcp_mtup_init(sk); + tcp_init_buffer_space(sk); + tp->copied_seq = tp->rcv_nxt; + } + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); - /* Prevent spurious tcp_cwnd_restart() on - * first data packet. + /* Note, that this wakeup is only for marginal crossed SYN case. + * Passively open sockets are not waked up, because + * sk->sk_sleep == NULL and sk->sk_socket == NULL. + */ + if (sk->sk_socket) + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + + if (req) { + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. */ - tp->lsndtime = tcp_time_stamp; + tcp_rearm_rto(sk); + } else + tcp_init_metrics(sk); - tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); - } else { - return 1; - } + /* Prevent spurious tcp_cwnd_restart() on first data packet */ + tp->lsndtime = tcp_time_stamp; + + tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); break; case TCP_FIN_WAIT1: -- cgit v1.2.3-70-g09d2 From c48b22daa6062fff9eded311b4d6974c29b40487 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 24 May 2013 18:06:58 +0000 Subject: tcp: Remove 2 indentation levels in tcp_rcv_state_process case TCP_FIN_WAIT1 can also be simplified by reversing tests and adding breaks; Add braces after case and move automatic definitions. Signed-off-by: Joe Perches Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 76 +++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 37 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 413b480b932..9579e1a5a14 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5670,7 +5670,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_fast_path_on(tp); break; - case TCP_FIN_WAIT1: + case TCP_FIN_WAIT1: { + struct dst_entry *dst; + int tmo; + /* If we enter the TCP_FIN_WAIT1 state and we are a * Fast Open socket and this is the first acceptable * ACK we have received, this would have acknowledged @@ -5689,48 +5692,47 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, reqsk_fastopen_remove(sk, req, false); tcp_rearm_rto(sk); } - if (tp->snd_una == tp->write_seq) { - struct dst_entry *dst; + if (tp->snd_una != tp->write_seq) + break; - tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); + dst = __sk_dst_get(sk); + if (dst) + dst_confirm(dst); - if (!sock_flag(sk, SOCK_DEAD)) { - /* Wake up lingering close() */ - sk->sk_state_change(sk); - } else { - int tmo; - - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { - tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; - } + if (!sock_flag(sk, SOCK_DEAD)) { + /* Wake up lingering close() */ + sk->sk_state_change(sk); + break; + } - tmo = tcp_fin_time(sk); - if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); - } else if (th->fin || sock_owned_by_user(sk)) { - /* Bad case. We could lose such FIN otherwise. - * It is not a big problem, but it looks confusing - * and not so rare event. We still can lose it now, - * if it spins in bh_lock_sock(), but it is really - * marginal case. - */ - inet_csk_reset_keepalive_timer(sk, tmo); - } else { - tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); - goto discard; - } - } + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + + tmo = tcp_fin_time(sk); + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + inet_csk_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; } break; + } case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { -- cgit v1.2.3-70-g09d2 From 6804973ffb4288bba14d53223e2fbb2bbd1d2e1b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 29 May 2013 14:20:11 +0000 Subject: tcp: consolidate PRR packet accounting This patch series fixes an undo bug in fast recovery: the sender mistakenly undos the cwnd too early but continues fast retransmits until all pending data are acked. This also multiplies the SNMP stat PARTIALUNDO events by the degree of the network reordering. 
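The consolidation described in the next paragraph reduces the per-ACK delivery count to a single subtraction from a snapshot taken before the ACK is processed. A minimal standalone sketch of that arithmetic follows; the struct and helper are illustrative stand-ins for the tcp_sock fields, not kernel code.

/* Illustrative only: derive the per-ACK delivery count from one
 * snapshot taken before the ACK is processed, instead of updating
 * newly_acked_sacked every time sacked_out changes.
 */
struct prr_snapshot {
	unsigned int packets_out;	/* segments in flight before the ACK */
	unsigned int sacked_out;	/* segments SACKed before the ACK */
};

static int newly_acked_sacked(const struct prr_snapshot *prior,
			      unsigned int packets_out_now,
			      unsigned int sacked_out_now)
{
	int prior_unsacked = prior->packets_out - prior->sacked_out;

	/* Same expression the patch moves into tcp_cwnd_reduction(). */
	return prior_unsacked - (int)(packets_out_now - sacked_out_now);
}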
The first patch prepares the fix by consolidating the accounting of newly_acked_sacked in tcp_cwnd_reduction(), instead of updating newly_acked_sacked everytime sacked_out is adjusted. Also pass acked and prior_unsacked as const type because they are readonly in the rest of recovery processing. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9579e1a5a14..86b5fa72ff9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2430,12 +2430,14 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) TCP_ECN_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, +static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); + int newly_acked_sacked = prior_unsacked - + (tp->packets_out - tp->sacked_out); tp->prr_delivered += newly_acked_sacked; if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { @@ -2492,7 +2494,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) +static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2509,7 +2511,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) tcp_moderate_cwnd(tp); } else { - tcp_cwnd_reduction(sk, newly_acked_sacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0); } } @@ -2678,15 +2680,14 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). 
*/ -static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int prior_sacked, int prior_packets, +static void tcp_fastretrans_alert(struct sock *sk, const int acked, + const int prior_unsacked, bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int newly_acked_sacked = 0; int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) @@ -2739,9 +2740,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else - do_lost = tcp_try_undo_partial(sk, pkts_acked); - newly_acked_sacked = prior_packets - tp->packets_out + - tp->sacked_out - prior_sacked; + do_lost = tcp_try_undo_partial(sk, acked); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); @@ -2755,14 +2754,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (is_dupack) tcp_add_reno_sack(sk); } - newly_acked_sacked = prior_packets - tp->packets_out + - tp->sacked_out - prior_sacked; if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag, newly_acked_sacked); + tcp_try_to_open(sk, flag, prior_unsacked); return; } @@ -2784,7 +2781,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); tcp_xmit_retransmit_queue(sk); } @@ -3268,9 +3265,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 prior_in_flight; u32 prior_fackets; int prior_packets = tp->packets_out; - int prior_sacked = tp->sacked_out; - int pkts_acked = 0; - int previous_packets_out = 0; + const int prior_unsacked = tp->packets_out - tp->sacked_out; + int acked = 0; /* Number of packets newly acked */ /* If the ack is older than previous acks * then we can probably ignore it. @@ -3345,18 +3341,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - previous_packets_out = tp->packets_out; + acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); - - pkts_acked = previous_packets_out - tp->packets_out; + acked -= tp->packets_out; if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - prior_packets, is_dupack, flag); + tcp_fastretrans_alert(sk, acked, prior_unsacked, + is_dupack, flag); } else { if (flag & FLAG_DATA_ACKED) tcp_cong_avoid(sk, ack, prior_in_flight); @@ -3378,8 +3373,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - prior_packets, is_dupack, flag); + tcp_fastretrans_alert(sk, acked, prior_unsacked, + is_dupack, flag); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
@@ -3401,8 +3396,8 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - prior_packets, is_dupack, flag); + tcp_fastretrans_alert(sk, acked, prior_unsacked, + is_dupack, flag); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); -- cgit v1.2.3-70-g09d2 From 6a63df46a7363833a0dc0c431027f522b3487972 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 29 May 2013 14:20:12 +0000 Subject: tcp: refactor undo functions Refactor and relocate various functions or variables to prepare the undo fix. Remove some unused function arguments. Rename tcp_undo_cwr to tcp_undo_cwnd_reduction to be consistent with the rest of CWR related function names. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 97 +++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 47 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 86b5fa72ff9..fcb668d1860 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2243,10 +2243,23 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) +static void tcp_undo_cwnd_reduction(struct sock *sk, const bool undo_ssthresh, + bool unmark_loss) { struct tcp_sock *tp = tcp_sk(sk); + if (unmark_loss) { + struct sk_buff *skb; + + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + tp->lost_out = 0; + tcp_clear_all_retrans_hints(tp); + } + if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2263,6 +2276,9 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; + + if (undo_ssthresh) + tp->undo_marker = 0; } static inline bool tcp_may_undo(const struct tcp_sock *tp) @@ -2282,14 +2298,13 @@ static bool tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, true); + tcp_undo_cwnd_reduction(sk, true, false); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else mib_idx = LINUX_MIB_TCPFULLUNDO; NET_INC_STATS_BH(sock_net(sk), mib_idx); - tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2309,8 +2324,7 @@ static void tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, true); - tp->undo_marker = 0; + tcp_undo_cwnd_reduction(sk, true, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } } @@ -2344,60 +2358,20 @@ static bool tcp_any_retrans_done(const struct sock *sk) return false; } -/* Undo during fast recovery after partial ACK. */ - -static int tcp_try_undo_partial(struct sock *sk, int acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); - - if (tcp_may_undo(tp)) { - /* Plain luck! Hole if filled with delayed - * packet, rather than with a retransmit. 
- */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; - - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - - DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, false); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); - - /* So... Do not make Hoe's retransmit yet. - * If the first packet was delayed, the rest - * ones are most probably delayed as well. - */ - failed = 0; - } - return failed; -} - /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); if (frto_undo || tcp_may_undo(tp)) { - struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - } - - tcp_clear_all_retrans_hints(tp); + tcp_undo_cwnd_reduction(sk, true, true); DBGUNDO(sk, "partial loss"); - tp->lost_out = 0; - tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); if (frto_undo) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - tp->undo_marker = 0; if (frto_undo || tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return true; @@ -2669,6 +2643,35 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tcp_xmit_retransmit_queue(sk); } +/* Undo during fast recovery after partial ACK. */ +static bool tcp_try_undo_partial(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + /* Partial ACK arrived. Force Hoe's retransmit. */ + bool failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); + + if (tcp_may_undo(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + + DBGUNDO(sk, "Hoe"); + tcp_undo_cwnd_reduction(sk, false, false); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); + + /* So... Do not make Hoe's retransmit yet. + * If the first packet was delayed, the rest + * ones are most probably delayed as well. + */ + failed = false; + } + return failed; +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2686,7 +2689,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && + bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); int fast_rexmit = 0; -- cgit v1.2.3-70-g09d2 From 7026b912f97d912476dff5465ed9a127be094208 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 29 May 2013 14:20:13 +0000 Subject: tcp: fix undo on partial ack in recovery Upon detecting spurious fast retransmit via timestamps during recovery, use PRR to clock out new data packets instead of retransmissions. Once all retransmissions are proven spurious, the sender then reverts the cwnd reduction and congestion state to open or disorder. The current code does the opposite: it undoes cwnd as soon as any retransmission is spurious and continues to retransmit until all data are acked. This nullifies the point of undoing the cwnd because the sender is still retransmitting spuriously. This patch fixes it. The undo_ssthresh argument of tcp_undo_cwnd_reduction() is no longer needed and is removed.
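Roughly, the partial-ACK handling this patch installs reduces to the decision flow sketched below: a simplified restatement of the tcp_try_undo_partial() hunk that follows, using the same kernel helpers, with the DBGUNDO/SNMP bookkeeping and retrans_stamp reset omitted.

/* Simplified sketch of the logic the patch puts in tcp_try_undo_partial(). */
static bool try_undo_partial_sketch(struct sock *sk, int acked, int prior_unsacked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->undo_marker || !tcp_packet_delayed(tp))
		return false;	/* no timestamp evidence of a spurious rexmit */

	/* Reordering was underestimated; widen the estimate. */
	tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);

	if (tp->retrans_out) {
		/* Retransmissions still in flight: keep clocking out new
		 * data with PRR, but do not undo yet.
		 */
		tcp_cwnd_reduction(sk, prior_unsacked, 0);
		return true;
	}

	/* Every retransmission proved spurious: revert cwnd/ssthresh. */
	tcp_undo_cwnd_reduction(sk, true);
	tcp_try_keep_open(sk);
	return true;
}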
Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 59 +++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 26 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fcb668d1860..c35b2275198 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2243,8 +2243,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwnd_reduction(struct sock *sk, const bool undo_ssthresh, - bool unmark_loss) +static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) { struct tcp_sock *tp = tcp_sk(sk); @@ -2268,7 +2267,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, const bool undo_ssthresh, else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { + if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } @@ -2276,9 +2275,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, const bool undo_ssthresh, tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tp->snd_cwnd_stamp = tcp_time_stamp; - - if (undo_ssthresh) - tp->undo_marker = 0; + tp->undo_marker = 0; } static inline bool tcp_may_undo(const struct tcp_sock *tp) @@ -2298,7 +2295,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwnd_reduction(sk, true, false); + tcp_undo_cwnd_reduction(sk, false); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else @@ -2324,7 +2321,7 @@ static void tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwnd_reduction(sk, true, false); + tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } } @@ -2364,7 +2361,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) struct tcp_sock *tp = tcp_sk(sk); if (frto_undo || tcp_may_undo(tp)) { - tcp_undo_cwnd_reduction(sk, true, true); + tcp_undo_cwnd_reduction(sk, true); DBGUNDO(sk, "partial loss"); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); @@ -2644,32 +2641,37 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, int acked) +static bool tcp_try_undo_partial(struct sock *sk, const int acked, + const int prior_unsacked) { struct tcp_sock *tp = tcp_sk(sk); - /* Partial ACK arrived. Force Hoe's retransmit. */ - bool failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); - if (tcp_may_undo(tp)) { + if (tp->undo_marker && tcp_packet_delayed(tp)) { /* Plain luck! Hole if filled with delayed * packet, rather than with a retransmit. */ + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + + /* We are getting evidence that the reordering degree is higher + * than we realized. If there are no retransmits out then we + * can undo. Otherwise we clock out new packets but do not + * mark more packets lost or retransmit more. 
+ */ + if (tp->retrans_out) { + tcp_cwnd_reduction(sk, prior_unsacked, 0); + return true; + } + if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; - tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - - DBGUNDO(sk, "Hoe"); - tcp_undo_cwnd_reduction(sk, false, false); + DBGUNDO(sk, "partial recovery"); + tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); - - /* So... Do not make Hoe's retransmit yet. - * If the first packet was delayed, the rest - * ones are most probably delayed as well. - */ - failed = false; + tcp_try_keep_open(sk); + return true; } - return failed; + return false; } /* Process an event, which can update packets-in-flight not trivially. @@ -2742,8 +2744,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else - do_lost = tcp_try_undo_partial(sk, acked); + } else { + if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + return; + /* Partial ACK arrived. Force fast retransmit. */ + do_lost = tcp_is_reno(tp) || + tcp_fackets_out(tp) > tp->reordering; + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); -- cgit v1.2.3-70-g09d2 From c7d9d6a185a7ea383b719b79c428d34ec1470275 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 29 May 2013 14:20:14 +0000 Subject: tcp: undo on DSACK during recovery If the receiver supports DSACK, sender can detect false recoveries and revert cwnd reductions triggered by either severe network reordering or concurrent reordering and loss event. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c35b2275198..907311c9a01 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2315,7 +2315,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk) +static bool tcp_try_undo_dsack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2323,7 +2323,9 @@ static void tcp_try_undo_dsack(struct sock *sk) DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); + return true; } + return false; } /* We can clear retrans_stamp when there are no retransmissions in the @@ -2751,6 +2753,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, do_lost = tcp_is_reno(tp) || tcp_fackets_out(tp) > tp->reordering; } + if (tcp_try_undo_dsack(sk)) { + tcp_try_keep_open(sk); + return; + } break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); -- cgit v1.2.3-70-g09d2 From 85f16525a2eb66e6092cbd8dcf42371df8334ed0 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 11 Jun 2013 15:35:32 -0700 Subject: tcp: properly send new data in fast recovery in first RTT Linux sends new unset data during disorder and recovery state if all (suspected) lost packets have been retransmitted ( RFC5681, section 3.2 step 1 & 2, RFC3517 section 4, NexSeg() Rule 2). One requirement is to keep the receive window about twice the estimated sender's congestion window (tcp_rcv_space_adjust()), assuming the fast retransmits repair the losses in the next round trip. 
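As a standalone sketch of the doubling the tcp_output.c hunk below introduces (assuming TCP_INIT_CWND is 10, as in kernels of this era):

/* Sketch of the initial receive window the patch selects, simplified
 * from the tcp_default_init_rwnd() hunk further down.
 */
#define TCP_INIT_CWND	10	/* initial congestion window, in segments */

static unsigned int default_init_rwnd(unsigned int mss)
{
	unsigned int init_rwnd = TCP_INIT_CWND * 2;	/* twice the initial cwnd */

	/* For a large MSS, scale down so the window stays near
	 * 2 * 10 * 1460 bytes, but never offer fewer than two segments.
	 */
	if (mss > 1460) {
		init_rwnd = (1460 * init_rwnd) / mss;
		if (init_rwnd < 2)
			init_rwnd = 2;
	}
	return init_rwnd;
}

For mss = 1460 this gives 20 segments (about 29 KB of window); for a 64 KB loopback MSS the 2-segment floor applies.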
But currently it's not the case on the first round trip in either normal or Fast Open connection, beucase the initial receive window is identical to (expected) sender's initial congestion window. The fix is to double it. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 5 ++--- net/ipv4/tcp_input.c | 13 ++----------- net/ipv4/tcp_output.c | 33 ++++++++++++++++++--------------- 3 files changed, 22 insertions(+), 29 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0d637e9403a..6fa80831dc4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -61,9 +61,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); */ #define MAX_TCP_WINDOW 32767U -/* Offer an initial receive window of 10 mss. */ -#define TCP_DEFAULT_INIT_RCVWND 10 - /* Minimal accepted MSS. It is (60+60+8) - (20+20). */ #define TCP_MIN_MSS 88U @@ -1047,6 +1044,8 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) rx_opt->num_sacks = 0; } +extern u32 tcp_default_init_rwnd(u32 mss); + /* Determine a window scaling and initial window to offer. */ extern void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 907311c9a01..46271cdcf08 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -347,22 +347,13 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) } /* 3. Tuning rcvbuf, when connection enters established state. */ - static void tcp_fixup_rcvbuf(struct sock *sk) { u32 mss = tcp_sk(sk)->advmss; - u32 icwnd = TCP_DEFAULT_INIT_RCVWND; int rcvmem; - /* Limit to 10 segments if mss <= 1460, - * or 14600/mss segments, with a minimum of two segments. - */ - if (mss > 1460) - icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - - rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER); - - rcvmem *= icwnd; + rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * + tcp_default_init_rwnd(mss); if (sk->sk_rcvbuf < rcvmem) sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ec335fabd5c..3dd46eab3b0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -181,6 +181,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } + +u32 tcp_default_init_rwnd(u32 mss) +{ + /* Initial receive window should be twice of TCP_INIT_CWND to + * enable proper sending of new unset data during fast recovery + * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a + * limit when mss is larger than 1460. + */ + u32 init_rwnd = TCP_INIT_CWND * 2; + + if (mss > 1460) + init_rwnd = max((1460 * init_rwnd) / mss, 2U); + return init_rwnd; +} + /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. @@ -230,22 +245,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to a value enough for senders starting with - * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place - * a limit on the initial window when mss is larger than 1460. 
- */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = TCP_DEFAULT_INIT_RCVWND; - if (mss > 1460) - init_cwnd = - max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2); - /* when initializing use the value from init_rcv_wnd - * rather than the default from above - */ - if (init_rcv_wnd) - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - else - *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); + if (!init_rcv_wnd) /* Use default unless specified otherwise */ + init_rcv_wnd = tcp_default_init_rwnd(mss); + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); } /* Set the clamp no higher than max representable value */ -- cgit v1.2.3-70-g09d2 From bcefe17cffd06efdda3e7ad679ea743236e6271a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 15 Jun 2013 09:39:18 +0800 Subject: tcp: introduce a per-route knob for quick ack In previous discussions, I tried to find some reasonable heuristics for delayed ACK, however this seems not possible, according to Eric: "ACKS might also be delayed because of bidirectional traffic, and is more controlled by the application response time. TCP stack can not easily estimate it." "ACK can be incredibly useful to recover from losses in a short time. The vast majority of TCP sessions are small lived, and we send one ACK per received segment anyway at beginning or retransmits to let the sender smoothly increase its cwnd, so an auto-tuning facility wont help them that much." and according to David: "ACKs are the only information we have to detect loss. And, for the same reasons that TCP VEGAS is fundamentally broken, we cannot measure the pipe or some other receiver-side-visible piece of information to determine when it's "safe" to stretch ACK. And even if it's "safe", we should not do it so that losses are accurately detected and we don't spuriously retransmit. The only way to know when the bandwidth increases is to "test" it, by sending more and more packets until drops happen. That's why all successful congestion control algorithms must operate on explicited tested pieces of information. Similarly, it's not really possible to universally know if it's safe to stretch ACK or not." It still makes sense to enable or disable quick ack mode like what TCP_QUICK_ACK does. Similar to TCP_QUICK_ACK option, but for people who can't modify the source code and still wants to control TCP delayed ACK behavior. As David suggested, this should belong to per-path scope, since different pathes may want different behaviors. Cc: Eric Dumazet Cc: Rick Jones Cc: Stephen Hemminger Cc: "David S. Miller" Cc: Thomas Graf CC: David Laight Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/uapi/linux/rtnetlink.h | 2 ++ net/ipv4/tcp_input.c | 5 ++++- net/ipv4/tcp_output.c | 6 ++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 7a2144e1afa..eb0f1a554d7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -386,6 +386,8 @@ enum { #define RTAX_RTO_MIN RTAX_RTO_MIN RTAX_INITRWND, #define RTAX_INITRWND RTAX_INITRWND + RTAX_QUICKACK, +#define RTAX_QUICKACK RTAX_QUICKACK __RTAX_MAX }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 46271cdcf08..28af45abe06 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3717,6 +3717,7 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -3728,7 +3729,9 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + dst = __sk_dst_get(sk); + if (!dst || !dst_metric(dst, RTAX_QUICKACK)) + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2c1333ee31..3d609490f11 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,6 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -170,8 +171,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && + (!dst || !dst_metric(dst, RTAX_QUICKACK))) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ -- cgit v1.2.3-70-g09d2
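A usage note on the new knob: the patch only reads the metric, so the value is expected to come from the routing table. Below is a minimal sketch of the receive-path check added above, followed by a hypothetical iproute2 invocation (assuming an iproute2 build that understands the quickack route attribute).

/* Sketch of the check added on the receive path: enter delayed-ACK
 * (pingpong) mode only when the route does not request quick ACKs.
 */
static void maybe_enter_pingpong(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);

	if (!dst || !dst_metric(dst, RTAX_QUICKACK))
		inet_csk(sk)->icsk_ack.pingpong = 1;
}

Hypothetical per-destination setting: ip route change 192.0.2.0/24 dev eth0 quickack 1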