diff options
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 160 |
1 files changed, 93 insertions, 67 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5a7c41fbc6d..a3d453b9474 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -318,36 +318,47 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. */ -static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) +static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk)) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } } -static __inline__ void -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) +static void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, + struct sock *sk) { - if (inet_rsk(req)->ecn_ok) + if (inet_rsk(req)->ecn_ok) { th->ece = 1; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ -static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, +static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); @@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else { + } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -384,7 +395,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; @@ -550,7 +561,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; + opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -618,7 +629,7 @@ static unsigned int tcp_synack_options(struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = tcp_skb_timestamp(skb); opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -647,7 +658,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { - struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; @@ -666,7 +676,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -829,26 +839,38 @@ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); + int wmem; + + /* Keep one reference on sk_wmem_alloc. + * Will be released by sk_free() from here or tcp_tasklet_func() + */ + wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* If this softirq is serviced by ksoftirqd, we are likely under stress. + * Wait until our queues (qdisc + devices) are drained. + * This gives : + * - less callbacks to tcp_write_xmit(), reducing stress (batches) + * - chance for incoming ACK (processed by another cpu maybe) + * to migrate this flow (skb->ooo_okay will be eventually set) + */ + if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) + goto out; if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { unsigned long flags; struct tsq_tasklet *tsq; - /* Keep a ref on socket. - * This last ref will be released in tcp_tasklet_func() - */ - atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); - /* queue this socket to tasklet queue */ local_irq_save(flags); - tsq = &__get_cpu_var(tsq_tasklet); + tsq = this_cpu_ptr(&tsq_tasklet); list_add(&tp->tsq_node, &tsq->head); tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); - } else { - sock_wfree(skb); + return; } +out: + sk_free(sk); } /* This routine actually transmits TCP packets queued in by @@ -886,8 +908,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; - /* Our usage of tstamp should remain private */ - skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -906,9 +926,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_ca_event(sk, CA_EVENT_TX_START); /* if no packet is in qdisc/device queue, then allow XPS to select - * another queue. + * another queue. We can be called from tcp_tsq_handler() + * which holds one reference to sk_wmem_alloc. + * + * TODO: Ideally, in-flight pure ACK packets should not matter here. + * One way to get this would be to set skb->truesize = 2 on them. */ - skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; + skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -952,7 +976,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_options_write((__be32 *)(th + 1), tp, &opts); if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) - TCP_ECN_send(sk, skb, tcp_header_size); + tcp_ecn_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ @@ -975,7 +999,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; + + /* Cleanup our debris for IP stacks */ + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + if (likely(err <= 0)) return err; @@ -995,7 +1030,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -1014,11 +1049,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, /* Avoid the costly divide in the normal * non-TSO case. */ - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; } else { - shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); shinfo->gso_size = mss_now; shinfo->gso_type = sk->sk_gso_type; } @@ -1146,10 +1181,6 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, buff->ip_summed = skb->ip_summed; - /* Looks stupid, but our code really uses when of - * skbs, which it never sent before. --ANK - */ - TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1171,7 +1202,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, } /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -1675,7 +1706,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_set_skb_tso_segs(sk, buff, mss_now); /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -1874,8 +1905,8 @@ static int tcp_mtu_probe(struct sock *sk) tcp_init_tso_segs(sk, nskb, nskb->len); /* We're ready to send. If this fails, the probe will - * be resegmented into mss-sized pieces by tcp_write_xmit(). */ - TCP_SKB_CB(nskb)->when = tcp_time_stamp; + * be resegmented into mss-sized pieces by tcp_write_xmit(). + */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ @@ -1935,8 +1966,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { - /* "when" is used as a start point for the retransmit timer */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; + /* "skb_mstamp" is used as a start point for the retransmit timer */ + skb_mstamp_get(&skb->skb_mstamp); goto repair; /* Skip network transmission */ } @@ -2000,8 +2031,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2097,10 +2126,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - const struct sk_buff *fclone = skb + 1; - - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(sk, skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; @@ -2499,7 +2525,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom @@ -2544,7 +2569,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Save stamp of the first retransmit. */ if (!tp->retrans_stamp) - tp->retrans_stamp = TCP_SKB_CB(skb)->when; + tp->retrans_stamp = tcp_skb_timestamp(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). @@ -2752,7 +2777,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -2780,7 +2804,7 @@ int tcp_send_synack(struct sock *sk) if (nskb == NULL) return -ENOMEM; tcp_unlink_write_queue(skb, sk); - skb_header_release(nskb); + __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); sk_wmem_free_skb(sk, skb); sk->sk_wmem_queued += nskb->truesize; @@ -2789,9 +2813,8 @@ int tcp_send_synack(struct sock *sk) } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; - TCP_ECN_send_synack(tcp_sk(sk), skb); + tcp_ecn_send_synack(sk, skb); } - TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2835,10 +2858,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, foc) + sizeof(*th); @@ -2849,7 +2872,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - TCP_ECN_make_synack(req, th); + tcp_ecn_make_synack(req, th, sk); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -2956,7 +2979,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; - skb_header_release(skb); + __skb_header_release(skb); __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -3086,9 +3109,9 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); - TCP_ECN_send_syn(sk, buff); + tcp_ecn_send_syn(sk, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3120,6 +3143,8 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; + tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); + if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3176,6 +3201,8 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; + tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); + /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. @@ -3194,9 +3221,10 @@ void tcp_send_ack(struct sock *sk) tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; + skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } +EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. @@ -3226,7 +3254,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3270,7 +3298,6 @@ int tcp_write_wakeup(struct sock *sk) tcp_set_skb_tso_segs(sk, skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); @@ -3289,6 +3316,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long probe_max; int err; err = tcp_write_wakeup(sk); @@ -3304,9 +3332,7 @@ void tcp_send_probe0(struct sock *sk) if (icsk->icsk_backoff < sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + probe_max = TCP_RTO_MAX; } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember icsk_probes_out. @@ -3316,11 +3342,11 @@ void tcp_send_probe0(struct sock *sk) */ if (!icsk->icsk_probes_out) icsk->icsk_probes_out = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, - TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + probe_max = TCP_RESOURCE_PROBE_INTERVAL; } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + inet_csk_rto_backoff(icsk, probe_max), + TCP_RTO_MAX); } int tcp_rtx_synack(struct sock *sk, struct request_sock *req) |