diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-28 16:26:12 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-28 16:26:12 +0100 |
commit | 7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch) | |
tree | e730a4565e0318140d2fbd2f0415d18a339d7336 /net/ipv4/tcp_output.c | |
parent | 41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff) | |
parent | 0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff) |
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 683 |
1 files changed, 368 insertions, 315 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ad993ecb481..e4c5ac9fe89 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -347,112 +345,240 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->end_seq = seq; } -static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, - __u32 tstamp, __u8 **md5_hash) +static inline int tcp_urg_mode(const struct tcp_sock *tp) { - if (tp->rx_opt.tstamp_ok) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); - *ptr++ = htonl(tp->rx_opt.ts_recent); - } - if (tp->rx_opt.eff_sacks) { - struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; - int this_sack; - - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK))); - - for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { - *ptr++ = htonl(sp[this_sack].start_seq); - *ptr++ = htonl(sp[this_sack].end_seq); - } + return tp->snd_una != tp->snd_up; +} - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks--; - } - } -#ifdef CONFIG_TCP_MD5SIG - if (md5_hash) { +#define OPTION_SACK_ADVERTISE (1 << 0) +#define OPTION_TS (1 << 1) +#define OPTION_MD5 (1 << 2) + +struct tcp_out_options { + u8 options; /* bit field of OPTION_* */ + u8 ws; /* window scale, 0 to disable */ + u8 num_sack_blocks; /* number of SACK blocks to include */ + u16 mss; /* 0 to disable */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ +}; + +/* Beware: Something in the Internet is very sensitive to the ordering of + * TCP options, we learned this through the hard way, so be careful here. + * Luckily we can at least blame others for their non-compliance but from + * inter-operatibility perspective it seems that we're somewhat stuck with + * the ordering which we have been using if we want to keep working with + * those broken things (not that it currently hurts anybody as there isn't + * particular reason why the ordering would need to be changed). + * + * At least SACK_PERM as the first option is known to lead to a disaster + * (but it may well be that other scenarios fail similarly). + */ +static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, + const struct tcp_out_options *opts, + __u8 **md5_hash) { + if (unlikely(OPTION_MD5 & opts->options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); *md5_hash = (__u8 *)ptr; + ptr += 4; + } else { + *md5_hash = NULL; } -#endif -} -/* Construct a tcp options header for a SYN or SYN_ACK packet. - * If this is every changed make sure to change the definition of - * MAX_SYN_SIZE to match the new maximum number of options that you - * can generate. - * - * Note - that with the RFC2385 TCP option, we make room for the - * 16 byte MD5 hash. This will be filled in later, so the pointer for the - * location to be filled is passed back up. - */ -static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, - int offer_wscale, int wscale, __u32 tstamp, - __u32 ts_recent, __u8 **md5_hash) -{ - /* We always get an MSS option. - * The option bytes which will be seen in normal data - * packets should timestamps be used, must be in the MSS - * advertised. But we subtract them from tp->mss_cache so - * that calculations in tcp_sendmsg are simpler etc. - * So account for this fact here if necessary. If we - * don't do this correctly, as a receiver we won't - * recognize data packets as being full sized when we - * should, and thus we won't abide by the delayed ACK - * rules correctly. - * SACKs don't matter, we never delay an ACK when we - * have any of those going out. - */ - *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); - if (ts) { - if (sack) + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + } + + if (likely(OPTION_TS & opts->options)) { + if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - else + } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); /* TSVAL */ - *ptr++ = htonl(ts_recent); /* TSECR */ - } else if (sack) + } + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } + + if (unlikely(OPTION_SACK_ADVERTISE & opts->options && + !(OPTION_TS & opts->options))) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (offer_wscale) + } + + if (unlikely(opts->ws)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | - (wscale)); + opts->ws); + } + + if (unlikely(opts->num_sack_blocks)) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? + tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * + TCPOLEN_SACK_PERBLOCK))); + + for (this_sack = 0; this_sack < opts->num_sack_blocks; + ++this_sack) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; + } + } +} + +static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + #ifdef CONFIG_TCP_MD5SIG - /* - * If MD5 is enabled, then we set the option, and include the size - * (always 18). The actual MD5 hash is added just before the - * packet is sent. - */ - if (md5_hash) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + /* We always get an MSS option. The option bytes which will be seen in + * normal data packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so that + * calculations in tcp_sendmsg are simpler etc. So account for this + * fact here if necessary. If we don't do this correctly, as a + * receiver we won't recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK rules correctly. + * SACKs don't matter, we never delay an ACK when we have any of those + * going out. */ + opts->mss = tcp_advertise_mss(sk); + size += TCPOLEN_MSS_ALIGNED; + + if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(sysctl_tcp_window_scaling)) { + opts->ws = tp->rx_opt.rcv_wscale; + if(likely(opts->ws)) + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(sysctl_tcp_sack)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!(OPTION_TS & opts->options))) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; +} + +static unsigned tcp_synack_options(struct sock *sk, + struct request_sock *req, + unsigned mss, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + unsigned size = 0; + struct inet_request_sock *ireq = inet_rsk(req); + char doing_ts; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + /* we can't fit any SACK blocks in a packet with MD5 + TS + options. There was discussion about disabling SACK rather than TS in + order to fit in better with old, buggy kernels, but that was deemed + to be unnecessary. */ + doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); + + opts->mss = mss; + size += TCPOLEN_MSS_ALIGNED; + + if (likely(ireq->wscale_ok)) { + opts->ws = ireq->rcv_wscale; + if(likely(opts->ws)) + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(doing_ts)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = req->ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(ireq->sack_ok)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!doing_ts)) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; +} + +static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (unlikely(*md5)) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; } +#else + *md5 = NULL; #endif + + if (likely(tp->rx_opt.tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = tcb ? tcb->when : 0; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + + if (unlikely(tp->rx_opt.eff_sacks)) { + const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + opts->num_sack_blocks = + min_t(unsigned, tp->rx_opt.eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } + + return size; } /* This routine actually transmits TCP packets queued in by @@ -473,13 +599,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; - int tcp_header_size; -#ifdef CONFIG_TCP_MD5SIG + struct tcp_out_options opts; + unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif struct tcphdr *th; - int sysctl_flags; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -502,50 +626,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, inet = inet_sk(sk); tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); - tcp_header_size = tp->tcp_header_len; - -#define SYSCTL_FLAG_TSTAMPS 0x1 -#define SYSCTL_FLAG_WSCALE 0x2 -#define SYSCTL_FLAG_SACK 0x4 + memset(&opts, 0, sizeof(opts)); - sysctl_flags = 0; - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if (sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if (sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if (sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (unlikely(tp->rx_opt.eff_sacks)) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK)); - } + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) + tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); + else + tcp_options_size = tcp_established_options(sk, skb, &opts, + &md5); + tcp_header_size = tcp_options_size + sizeof(struct tcphdr); if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); -#ifdef CONFIG_TCP_MD5SIG - /* - * Are we doing MD5 on this segment? If so - make - * room for it. - */ - md5 = tp->af_specific->md5_lookup(sk, sk); - if (md5) - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; -#endif - skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_set_owner_w(skb, sk); @@ -570,45 +662,23 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->check = 0; th->urg_ptr = 0; - if (unlikely(tp->urg_mode && + /* The urg_mode check is necessary during a below snd_una win probe */ + if (unlikely(tcp_urg_mode(tp) && between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_syn_build_options((__be32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent, - -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL); - } else { - tcp_build_and_update_options((__be32 *)(th + 1), - tp, tcb->when, -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL); + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); - } #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - sk, NULL, NULL, - tcp_hdr(skb), - sk->sk_protocol, - skb->len); + md5, sk, NULL, skb); } #endif @@ -621,7 +691,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) @@ -630,10 +700,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_enter_cwr(sk, 1); return net_xmit_eval(err); - -#undef SYSCTL_FLAG_TSTAMPS -#undef SYSCTL_FLAG_WSCALE -#undef SYSCTL_FLAG_SACK } /* This routine just queue's the buffer @@ -963,7 +1029,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. * - * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ @@ -974,10 +1040,13 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) u32 mss_now; u16 xmit_size_goal; int doing_tso = 0; + unsigned header_len; + struct tcp_out_options opts; + struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) + if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp)) doing_tso = 1; if (dst) { @@ -986,14 +1055,16 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now = tcp_sync_mss(sk, mtu); } - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - -#ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk)) - mss_now -= TCPOLEN_MD5SIG_ALIGNED; -#endif + header_len = tcp_established_options(sk, NULL, &opts, &md5) + + sizeof(struct tcphdr); + /* The mss_cache is sized based on tp->tcp_header_len, which assumes + * some common options. If this is an odd packet (because we have SACK + * blocks etc) then our calculated header_len will be different, and + * we have to adjust mss_now correspondingly */ + if (header_len != tp->tcp_header_len) { + int delta = (int) header_len - tp->tcp_header_len; + mss_now -= delta; + } xmit_size_goal = mss_now; @@ -1139,7 +1210,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ - if (tp->urg_mode || (tp->frto_counter == 2) || + if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -1770,6 +1841,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); + if (next_skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = skb; sk_wmem_free_skb(sk, next_skb); } @@ -1784,7 +1857,7 @@ void tcp_simple_retransmit(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); - int lost = 0; + u32 prior_lost = tp->lost_out; tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1795,17 +1868,13 @@ void tcp_simple_retransmit(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - lost = 1; - } + tcp_skb_mark_lost_uncond_verify(tp, skb); } } - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); - if (!lost) + if (prior_lost == tp->lost_out) return; if (tcp_is_reno(tp)) @@ -1880,8 +1949,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Collapse two adjacent packets if worthwhile and we can. */ if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && (skb->len < (cur_mss >> 1)) && - (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (!tcp_skb_is_last(sk, skb)) && + (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && (tcp_skb_pcount(skb) == 1 && @@ -1913,7 +1982,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (err == 0) { /* Update global TCP statistics. */ - TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); tp->total_retrans++; @@ -1942,83 +2011,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } -/* This gets called after a retransmit timeout, and the initially - * retransmitted data is acknowledged. It tries to continue - * resending the rest of the retransmit queue, until either - * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. - */ -void tcp_xmit_retransmit_queue(struct sock *sk) +static int tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int packet_cnt; - - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - packet_cnt = tp->retransmit_cnt_hint; - } else { - skb = tcp_write_queue_head(sk); - packet_cnt = 0; - } - - /* First pass: retransmit lost packets. */ - if (tp->lost_out) { - tcp_for_write_queue_from(skb, sk) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; - - if (skb == tcp_send_head(sk)) - break; - /* we could do better than to assign each time */ - tp->retransmit_skb_hint = skb; - tp->retransmit_cnt_hint = packet_cnt; - - /* Assume this retransmit will generate - * only one packet for congestion window - * calculation purposes. This works because - * tcp_retransmit_skb() will chop up the - * packet to be MSS sized and all the - * packet counting works out. - */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - return; - - if (sacked & TCPCB_LOST) { - if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); - else - NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - } - - packet_cnt += tcp_skb_pcount(skb); - if (packet_cnt >= tp->lost_out) - break; - } - } - } - - /* OK, demanded retransmission is finished. */ /* Forward retransmissions are possible only during Recovery. */ if (icsk->icsk_ca_state != TCP_CA_Recovery) - return; + return 0; /* No forward retransmissions in Reno are possible. */ if (tcp_is_reno(tp)) - return; + return 0; /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... @@ -2029,43 +2033,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ if (tcp_may_send_now(sk)) - return; + return 0; - /* If nothing is SACKed, highest_sack in the loop won't be valid */ - if (!tp->sacked_out) - return; + return 1; +} - if (tp->forward_skb_hint) - skb = tp->forward_skb_hint; - else +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + struct sk_buff *hole = NULL; + u32 last_lost; + int mib_idx; + int fwd_rexmitting = 0; + + if (!tp->lost_out) + tp->retransmit_high = tp->snd_una; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + last_lost = TCP_SKB_CB(skb)->end_seq; + if (after(last_lost, tp->retransmit_high)) + last_lost = tp->retransmit_high; + } else { skb = tcp_write_queue_head(sk); + last_lost = tp->snd_una; + } + /* First pass: retransmit lost packets. */ tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - tp->forward_skb_hint = skb; + __u8 sacked = TCP_SKB_CB(skb)->sacked; - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + if (skb == tcp_send_head(sk)) break; + /* we could do better than to assign each time */ + if (hole == NULL) + tp->retransmit_skb_hint = skb; + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - break; + return; + + if (fwd_rexmitting) { +begin_fwd: + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + break; + mib_idx = LINUX_MIB_TCPFORWARDRETRANS; + + } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + tp->retransmit_high = last_lost; + if (!tcp_can_forward_retransmit(sk)) + break; + /* Backtrack if necessary to non-L'ed skb */ + if (hole != NULL) { + skb = hole; + hole = NULL; + } + fwd_rexmitting = 1; + goto begin_fwd; - if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + } else if (!(sacked & TCPCB_LOST)) { + if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + hole = skb; continue; - /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) { - tp->forward_skb_hint = NULL; - break; + } else { + last_lost = TCP_SKB_CB(skb)->end_seq; + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; + + if (tcp_retransmit_skb(sk, skb)) + return; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - - NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } } @@ -2119,7 +2184,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } @@ -2130,9 +2195,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); - TCP_INC_STATS(TCP_MIB_OUTRSTS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent @@ -2180,11 +2245,11 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; int tcp_header_size; + struct tcp_out_options opts; struct sk_buff *skb; -#ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif + int mss; skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) @@ -2195,18 +2260,30 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->dst = dst_clone(dst); - tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + - (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + - /* SACK_PERM is in the place of NOP NOP of TS */ - ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + mss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; + + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + ireq->wscale_ok, + &rcv_wscale); + ireq->rcv_wscale = rcv_wscale; + } + + memset(&opts, 0, sizeof(opts)); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_header_size = tcp_synack_options(sk, req, mss, + skb, &opts, &md5) + + sizeof(struct tcphdr); -#ifdef CONFIG_TCP_MD5SIG - /* Are we doing MD5 on this segment? If so - make room for it */ - md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); - if (md5) - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; -#endif skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2215,7 +2292,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->syn = 1; th->ack = 1; TCP_ECN_make_synack(req, th); - th->source = inet_sk(sk)->sport; + th->source = ireq->loc_port; th->dest = ireq->rmt_port; /* Setting of flags are superfluous here for callers (and ECE is * not even correctly set) @@ -2224,19 +2301,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - /* tcp_full_space because it is guaranteed to be the first packet */ - tcp_select_initial_window(tcp_full_space(sk), - dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), - &req->rcv_wnd, - &req->window_clamp, - ireq->wscale_ok, - &rcv_wscale); - ireq->rcv_wscale = rcv_wscale; - } /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); @@ -2245,29 +2309,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, - ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, - TCP_SKB_CB(skb)->when, - req->ts_recent, - ( -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL) - ); - + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - NULL, dst, req, - tcp_hdr(skb), sk->sk_protocol, - skb->len); + md5, NULL, req, skb); } #endif @@ -2304,6 +2354,9 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) + tp->advmss = tp->rx_opt.user_mss; + tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), @@ -2322,6 +2375,7 @@ static void tcp_connect_init(struct sock *sk) tcp_init_wl(tp, tp->write_seq, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -2367,7 +2421,7 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -2531,8 +2585,7 @@ int tcp_write_wakeup(struct sock *sk) tcp_event_new_data_sent(sk, skb); return err; } else { - if (tp->urg_mode && - between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1); return tcp_xmit_probe_skb(sk, 0); } |