From 7691367d71fd77ab668ff3b6edb4340cecddc805 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 21 Feb 2009 23:52:29 -0800 Subject: tcp: Always set urgent pointer if it's beyond snd_nxt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our TCP stack does not set the urgent flag if the urgent pointer does not fit in 16 bits, i.e., if it is more than 64K from the sequence number of a packet. This behaviour is different from the BSDs, and clearly contradicts the purpose of urgent mode, which is to send the notification (though not necessarily the associated data) as soon as possible. Our current behaviour may in fact delay the urgent notification indefinitely if the receiver window does not open up. Simply matching BSD however may break legacy applications which incorrectly rely on the out-of-band delivery of urgent data, and conversely the in-band delivery of non-urgent data. Alexey Kuznetsov suggested a safe solution of following BSD only if the urgent pointer itself has not yet been transmitted. This way we guarantee that when the remote end sees the packet with non-urgent data marked as urgent due to wrap-around we would have advanced the urgent pointer beyond, either to the actual urgent data or to an as-yet untransmitted packet. The only potential downside is that applications on the remote end may see multiple SIGURG notifications. However, this would occur anyway with other TCP stacks. More importantly, the outcome of such a duplicate notification is likely to be harmless since the signal itself does not carry any information other than the fact that we're in urgent mode. Thanks to Ilpo Järvinen for fixing a critical bug in this and Jeff Chua for reporting that bug. Signed-off-by: Herbert Xu Acked-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dda42f0bd7a..f5263c84033 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -663,10 +663,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ - if (unlikely(tcp_urg_mode(tp) && - between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { - th->urg_ptr = htons(tp->snd_up - tcb->seq); - th->urg = 1; + if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { + if (before(tp->snd_up, tcb->seq + 0x10000)) { + th->urg_ptr = htons(tp->snd_up - tcb->seq); + th->urg = 1; + } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { + th->urg_ptr = 0xFFFF; + th->urg = 1; + } } tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); -- cgit v1.2.3-70-g09d2 From ac11ba753f3aa839292c1a3b6c971c637ad2e839 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:27 +0000 Subject: tcp: don't backtrack to sacked skbs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backtracking to sacked skbs is a horrible performance killer since the hint cannot be advanced successfully past them... ...And it's totally unnecessary too. In theory this is 2.6.27..28 regression but I doubt anybody can make .28 to have worse performance because of other TCP improvements. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f6f61b3e677..2471cd4f66d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2065,7 +2065,7 @@ begin_fwd: goto begin_fwd; } else if (!(sacked & TCPCB_LOST)) { - if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; -- cgit v1.2.3-70-g09d2 From 62ad27619cbcf23fb8581ae72f3806c1d90a861d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:29 +0000 Subject: tcp: deferring in middle of queue makes very little sense MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If skb can be sent right away, we certainly should do that if it's in the middle of the queue because it won't get more data into it. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2471cd4f66d..fa3c81aa4e6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1356,6 +1356,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if (limit >= sk->sk_gso_max_size) goto send_now; + /* Middle in queue won't get any more data, full sendable already? 
*/ + if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) + goto send_now; + if (sysctl_tcp_tso_win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); -- cgit v1.2.3-70-g09d2 From d3d2ae454501a4dec360995649e1b002a2ad90c5 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:30 +0000 Subject: tcp: Don't clear hints when tcp_fragmenting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) We didn't remove any skbs, so no need to handle stale refs. 2) scoreboard_skb_hint is trivial, no timestamps were changed so no need to clear that one 3) lost_skb_hint needs tweaking similar to that of tcp_sacktag_one(). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fa3c81aa4e6..3feab4d6929 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -771,7 +771,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, BUG_ON(len > skb->len); - tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -854,6 +853,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_verify_left_out(tp); } tcp_adjust_fackets_out(sk, skb, diff); + + if (tp->lost_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq) && + (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked)) + tp->lost_cnt_hint -= diff; } /* Link BUFF into the send queue. 
*/ -- cgit v1.2.3-70-g09d2 From 02276f3c962fd408fa9d441251067845f948bfcf Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:31 +0000 Subject: tcp: fix corner case issue in segmentation during rexmitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If cur_mss grew very recently so that the previously G/TSOed skb now fits well into a single segment, it would get sent out in parts unless we calculate the # of segments again. This corner case could happen e.g. after an MTU probe completes or when fewer SACK blocks than before are required for the opposite direction. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3feab4d6929..77af7faf38a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1921,6 +1921,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (skb->len > cur_mss) { if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ + } else { + tcp_init_tso_segs(sk, skb, cur_mss); } tcp_retrans_try_collapse(sk, skb, cur_mss); -- cgit v1.2.3-70-g09d2 From e6c7d0857905f1d642cb8dbadae6794bfa1dff30 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:35 +0000 Subject: tcp: drop unnecessary local var in collapse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 77af7faf38a..61445b57610 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1767,11 +1767,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); int skb_size, next_skb_size; - u16 flags; skb_size = skb->len; next_skb_size = next_skb->len; - flags = TCP_SKB_CB(skb)->flags; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); @@ -1791,9 +1789,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - /* Merge over control information. */ - flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags = flags; + /* Merge over control information. This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. -- cgit v1.2.3-70-g09d2 From cabeccbd172cc305f4383f5a4808ae254745275f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:38 +0000 Subject: tcp: kill eff_sacks "cache", the sole user can calculate itself MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also fixes insignificant bug that would cause sending of stale SACK block (would occur in some corner cases). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 15 ++------------- net/ipv4/tcp_minisocks.c | 3 +-- net/ipv4/tcp_output.c | 12 ++++++------ 5 files changed, 9 insertions(+), 23 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0cd99e6baca..4b86ad71e05 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -218,7 +218,6 @@ struct tcp_options_received { snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ /* SACKs data */ - u8 eff_sacks; /* Size of SACK array to send with next packet */ u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0366a559afe..055e4946d4c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -926,7 +926,6 @@ extern void tcp_done(struct sock *sk); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; - rx_opt->eff_sacks = 0; rx_opt->num_sacks = 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 03f5ede8722..e4442a293eb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4099,7 +4099,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1; } } @@ -4154,8 +4153,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. 
*/ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; @@ -4218,7 +4215,6 @@ new_sack: sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -4232,7 +4228,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ if (skb_queue_empty(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.dsack; return; } @@ -4253,11 +4248,8 @@ static void tcp_sack_remove(struct tcp_sock *tp) this_sack++; sp++; } - if (num_sacks != tp->rx_opt.num_sacks) { + if (num_sacks != tp->rx_opt.num_sacks) tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + - tp->rx_opt.dsack; - } } /* This one checks to see if we can put data from the @@ -4333,10 +4325,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) TCP_ECN_accept_cwr(tp, skb); - if (tp->rx_opt.dsack) { + if (tp->rx_opt.dsack) tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. 
@@ -4456,7 +4446,6 @@ drop: if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = 1; tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f67effbb102..bb3d8b35f19 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.saw_tstamp = 0; newtp->rx_opt.dsack = 0; - newtp->rx_opt.eff_sacks = 0; - newtp->rx_opt.num_sacks = 0; + newtp->urg_data = 0; if (sock_flag(newsk, SOCK_KEEPOPEN)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 61445b57610..1555bb73b63 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -441,10 +441,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(sp[this_sack].end_seq); } - if (tp->rx_opt.dsack) { + if (tp->rx_opt.dsack) tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; - } } } @@ -550,6 +548,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_skb_cb *tcb = skb ? 
TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned size = 0; + unsigned int eff_sacks; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); @@ -568,10 +567,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, size += TCPOLEN_TSTAMP_ALIGNED; } - if (unlikely(tp->rx_opt.eff_sacks)) { + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; + if (unlikely(eff_sacks)) { const unsigned remaining = MAX_TCP_OPTION_SPACE - size; opts->num_sack_blocks = - min_t(unsigned, tp->rx_opt.eff_sacks, + min_t(unsigned, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + @@ -1418,7 +1418,7 @@ static int tcp_mtu_probe(struct sock *sk) icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tp->snd_cwnd < 11 || - tp->rx_opt.eff_sacks) + tp->rx_opt.num_sacks || tp->rx_opt.dsack) return -1; /* Very simple search strategy: just double the MSS. */ -- cgit v1.2.3-70-g09d2 From 9ce01461028d595a6f1cd724fbd7a0dd70464fe4 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 28 Feb 2009 04:44:42 +0000 Subject: tcp: get rid of two unnecessary u16s in TCP skb flags copying MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I guess these fields were one day 16-bit in the struct but nowadays they're just using 8 bits anyway. This is just a precaution; it didn't result in any change in my case, but who knows what all those varying gcc versions & options do. I've been told that 16-bit is not so nice with some CPUs. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1555bb73b63..920c57b90de 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -767,7 +767,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, struct sk_buff *buff; int nsize, old_factor; int nlen; - u16 flags; + u8 flags; BUG_ON(len > skb->len); @@ -1282,7 +1282,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, { struct sk_buff *buff; int nlen = skb->len - len; - u16 flags; + u8 flags; /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) -- cgit v1.2.3-70-g09d2 From ee7537b63a28b42b22e48842dfeedc66d96b71f1 Mon Sep 17 00:00:00 2001 From: Hantzis Fotis Date: Mon, 2 Mar 2009 22:42:02 -0800 Subject: tcp: tcp_init_wl / tcp_update_wl argument cleanup The above functions from include/net/tcp.h have been defined with an argument that they never use. The argument is 'u32 ack' which is never used inside the function body, and thus it can be removed. The rest of the patch involves the necessary changes to the function callers of the above two functions. Signed-off-by: Hantzis Fotis Signed-off-by: David S. 
Miller --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_input.c | 9 ++++----- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 055e4946d4c..d74ac301e6b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -825,12 +825,12 @@ static inline void tcp_push_pending_frames(struct sock *sk) __tcp_push_pending_frames(sk, tcp_current_mss(sk, 1), tp->nonagle); } -static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } -static inline void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e4442a293eb..5ecd7aa2597 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3416,7 +3416,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { flag |= FLAG_WIN_UPDATE; - tcp_update_wl(tp, ack, ack_seq); + tcp_update_wl(tp, ack_seq); if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; @@ -3621,7 +3621,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. */ - tcp_update_wl(tp, ack, ack_seq); + tcp_update_wl(tp, ack_seq); tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; @@ -5418,7 +5418,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * never scaled. 
*/ tp->snd_wnd = ntohs(th->window); - tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; @@ -5679,8 +5679,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; - tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, - TCP_SKB_CB(skb)->seq); + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); /* tcp_ack considers this ACK as duplicate * and does not calculate rtt. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bb3d8b35f19..4b0df3e6b60 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_prequeue_init(newtp); - tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); + tcp_init_wl(newtp, treq->rcv_isn); newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 920c57b90de..eb285befdf3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2337,7 +2337,7 @@ static void tcp_connect_init(struct sock *sk) sk->sk_err = 0; sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; - tcp_init_wl(tp, tp->write_seq, 0); + tcp_init_wl(tp, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; -- cgit v1.2.3-70-g09d2 From 5861f8e58dd84fc34b691c2e8d4824dea68c360e Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 14 Mar 2009 14:23:01 +0000 Subject: tcp: remove pointless .dsack/.num_sacks code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the pure assignment case, the earlier zeroing is still in effect. David S. Miller raised concerns if the ifs are there to avoid dirtying cachelines. 
I came to these conclusions: > We'll be dirty it anyway (now that I check), the first "real" statement > in tcp_rcv_established is: > > tp->rx_opt.saw_tstamp = 0; > > ...that'll land on the same dword. :-/ > > I suppose the blocks are there just because they had more complexity > inside when they had to calculate the eff_sacks too (maybe it would > have been better to just remove them in that drop-patch so you would > have had less head-ache :-)). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++----- net/ipv4/tcp_output.c | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5ecd7aa2597..cd39d1d02dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4248,8 +4248,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) this_sack++; sp++; } - if (num_sacks != tp->rx_opt.num_sacks) - tp->rx_opt.num_sacks = num_sacks; + tp->rx_opt.num_sacks = num_sacks; } /* This one checks to see if we can put data from the @@ -4325,8 +4324,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) TCP_ECN_accept_cwr(tp, skb); - if (tp->rx_opt.dsack) - tp->rx_opt.dsack = 0; + tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -4445,7 +4443,6 @@ drop: /* Initial out of order segment, build 1 SACK. 
*/ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->rx_opt.dsack = 0; tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index eb285befdf3..32565803913 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -441,8 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(sp[this_sack].end_seq); } - if (tp->rx_opt.dsack) - tp->rx_opt.dsack = 0; + tp->rx_opt.dsack = 0; } } -- cgit v1.2.3-70-g09d2 From 0c54b85f2828128274f319a1eb3ce7f604fe2a53 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Sat, 14 Mar 2009 14:23:05 +0000 Subject: tcp: simplify tcp_current_mss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's very little need for most of the callsites to get tp->xmit_size_goal updated. That will cost us a divide as is, so slice the function in two. Also, the only users of tp->xmit_size_goal are directly behind tcp_current_mss(), so there's no need to store that variable into tcp_sock at all! The drop of xmit_size_goal currently leaves a 16-bit hole and some reorganization would again be necessary to change that (but I'm aiming to fill that hole with u16 xmit_goal_size_segs to cache the results of the remaining divide to get that tso on regression). Bring xmit_size_goal parts into tcp.c. Signed-off-by: Ilpo Järvinen Cc: Evgeniy Polyakov Cc: Ingo Molnar Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 - include/net/tcp.h | 13 +++++++++++-- net/ipv4/tcp.c | 43 +++++++++++++++++++++++++++++++++++-------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 41 +++++++---------------------------------- 5 files changed, 54 insertions(+), 46 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4b86ad71e05..ad2021ccc55 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -248,7 +248,6 @@ struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ - u16 xmit_size_goal; /* Goal for segmenting output packets */ /* * Header prediction flags diff --git a/include/net/tcp.h b/include/net/tcp.h index 255ca35bea0..e54c76d7549 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -481,7 +481,16 @@ static inline void tcp_clear_xmit_timers(struct sock *sk) } extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); -extern unsigned int tcp_current_mss(struct sock *sk, int large); +extern unsigned int tcp_current_mss(struct sock *sk); + +/* Bound MSS / TSO packet size with the half of the window */ +static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +{ + if (tp->max_window && pktsize > (tp->max_window >> 1)) + return max(tp->max_window >> 1, 68U - tp->tcp_header_len); + else + return pktsize; +} /* tcp.c */ extern void tcp_get_info(struct sock *, struct tcp_info *); @@ -822,7 +831,7 @@ static inline void tcp_push_pending_frames(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - __tcp_push_pending_frames(sk, tcp_current_mss(sk, 1), tp->nonagle); + __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d3f9beee74c..886596ff0aa 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -661,6 +661,37 @@ struct sk_buff 
*sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) return NULL; } +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 xmit_size_goal; + + xmit_size_goal = mss_now; + + if (large_allowed && sk_can_gso(sk)) { + xmit_size_goal = ((sk->sk_gso_max_size - 1) - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); + + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + xmit_size_goal -= (xmit_size_goal % mss_now); + } + + return xmit_size_goal; +} + +static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + return mss_now; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -677,8 +708,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; err = -EPIPE; @@ -761,8 +791,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: @@ -844,8 +873,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. 
*/ iovlen = msg->msg_iovlen; @@ -1007,8 +1035,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 311c30f73ee..fae78e3eccc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2864,7 +2864,7 @@ void tcp_simple_retransmit(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32565803913..c1f259d2d33 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -921,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) * factor and mss. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1)); + tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); return 0; } @@ -982,15 +982,6 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } -/* Bound MSS / TSO packet size with the half of the window */ -static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) -{ - if (tp->max_window && pktsize > (tp->max_window >> 1)) - return max(tp->max_window >> 1, 68U - tp->tcp_header_len); - else - return pktsize; -} - /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -1037,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. 
*/ -unsigned int tcp_current_mss(struct sock *sk, int large_allowed) +unsigned int tcp_current_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; - u16 xmit_size_goal; - int doing_tso = 0; unsigned header_len; struct tcp_out_options opts; struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk)) - doing_tso = 1; - if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) @@ -1070,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now -= delta; } - xmit_size_goal = mss_now; - - if (doing_tso) { - xmit_size_goal = ((sk->sk_gso_max_size - 1) - - inet_csk(sk)->icsk_af_ops->net_header_len - - inet_csk(sk)->icsk_ext_hdr_len - - tp->tcp_header_len); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - xmit_size_goal -= (xmit_size_goal % mss_now); - } - tp->xmit_size_goal = xmit_size_goal; - return mss_now; } @@ -1264,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk) struct sk_buff *skb = tcp_send_head(sk); return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? tp->nonagle : TCP_NAGLE_PUSH))); } @@ -1421,7 +1394,7 @@ static int tcp_mtu_probe(struct sock *sk) return -1; /* Very simple search strategy: just double the MSS. */ - mss_now = tcp_current_mss(sk, 0); + mss_now = tcp_current_mss(sk); probe_size = 2 * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { @@ -1903,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ - cur_mss = tcp_current_mss(sk, 0); + cur_mss = tcp_current_mss(sk); /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. 
The exception is the @@ -2111,7 +2084,7 @@ void tcp_send_fin(struct sock *sk) * unsent frames. But be careful about outgoing SACKS * and IP options. */ - mss_now = tcp_current_mss(sk, 1); + mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; @@ -2523,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk) if ((skb = tcp_send_head(sk)) != NULL && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; - unsigned int mss = tcp_current_mss(sk, 0); + unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) -- cgit v1.2.3-70-g09d2