Diffstat (limited to 'net')
-rw-r--r--  net/core/skbuff.c          |  63
-rw-r--r--  net/core/sock.c            |   1
-rw-r--r--  net/ipv4/ip_sockglue.c     |  13
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c |  10
-rw-r--r--  net/ipv4/tcp.c             |  13
-rw-r--r--  net/ipv4/tcp_input.c       | 287
-rw-r--r--  net/ipv4/tcp_ipv4.c        |   2
-rw-r--r--  net/ipv4/tcp_minisocks.c   |   1
-rw-r--r--  net/ipv4/tcp_output.c      |   5
-rw-r--r--  net/ipv4/tcp_timer.c       |   5
-rw-r--r--  net/ipv6/tcp_ipv6.c        |   2
-rw-r--r--  net/sched/sch_choke.c      |   8
-rw-r--r--  net/sched/sch_dsmark.c     |   3
-rw-r--r--  net/sched/sch_htb.c        |   4
-rw-r--r--  net/sched/sch_teql.c       |   4
15 files changed, 261 insertions, 160 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 52ba2b5e803..2c35da818ef 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -829,7 +829,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb_headroom(skb);
-	unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
+	unsigned int size = skb_end_offset(skb) + skb->data_len;
 	struct sk_buff *n = alloc_skb(size, gfp_mask);
 
 	if (!n)
@@ -930,9 +930,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 {
 	int i;
 	u8 *data;
-	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
+	int size = nhead + skb_end_offset(skb) + ntail;
 	long off;
-	bool fastpath;
 
 	BUG_ON(nhead < 0);
@@ -941,27 +940,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 
 	size = SKB_DATA_ALIGN(size);
 
-	/* Check if we can avoid taking references on fragments if we own
-	 * the last reference on skb->head. (see skb_release_data())
-	 */
-	if (!skb->cloned)
-		fastpath = true;
-	else {
-		int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
-		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
-	}
-
-	if (fastpath && !skb->head_frag &&
-	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
-		memmove(skb->head + size, skb_shinfo(skb),
-			offsetof(struct skb_shared_info,
-				 frags[skb_shinfo(skb)->nr_frags]));
-		memmove(skb->head + nhead, skb->head,
-			skb_tail_pointer(skb) - skb->head);
-		off = nhead;
-		goto adjust_others;
-	}
-
 	data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
 		       gfp_mask);
 	if (!data)
@@ -977,9 +955,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	       skb_shinfo(skb),
 	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
-	if (fastpath) {
-		skb_free_head(skb);
-	} else {
+	/*
+	 * if shinfo is shared we must drop the old head gracefully, but if it
+	 * is not we can just drop the old head and let the existing refcount
+	 * be since all we did is relocate the values
+	 */
+	if (skb_cloned(skb)) {
 		/* copy this zero copy skb frags */
 		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 			if (skb_copy_ubufs(skb, gfp_mask))
@@ -992,12 +973,13 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		skb_clone_fraglist(skb);
 
 		skb_release_data(skb);
+	} else {
+		skb_free_head(skb);
 	}
 	off = (data + nhead) - skb->head;
 
 	skb->head     = data;
 	skb->head_frag = 0;
-adjust_others:
 	skb->data    += off;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 	skb->end      = size;
@@ -1699,17 +1681,17 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 			      struct splice_pipe_desc *spd, struct sock *sk)
 {
 	int seg;
-	bool head_is_linear = !skb->head_frag;
 
 	/* map the linear part :
-	 * If skb->head_frag is set, this 'linear' part is backed
-	 * by a fragment, and we can avoid a copy.
+	 * If skb->head_frag is set, this 'linear' part is backed by a
+	 * fragment, and if the head is not shared with any clones then
+	 * we can avoid a copy since we own the head portion of this page.
	 */
	if (__splice_segment(virt_to_page(skb->data),
			     (unsigned long) skb->data & (PAGE_SIZE - 1),
			     skb_headlen(skb),
			     offset, len, skb, spd,
-			     head_is_linear,
+			     skb_head_is_locked(skb),
			     sk, pipe))
		return true;
@@ -2745,14 +2727,13 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
 			if (unlikely(!nskb))
 				goto err;
 
-			hsize = skb_end_pointer(nskb) - nskb->head;
+			hsize = skb_end_offset(nskb);
 			if (skb_cow_head(nskb, doffset + headroom)) {
 				kfree_skb(nskb);
 				goto err;
 			}
 
-			nskb->truesize += skb_end_pointer(nskb) - nskb->head -
-					  hsize;
+			nskb->truesize += skb_end_offset(nskb) - hsize;
 			skb_release_head_state(nskb);
 			__skb_push(nskb, doffset);
 		} else {
@@ -2870,6 +2851,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	unsigned int len = skb_gro_len(skb);
 	unsigned int offset = skb_gro_offset(skb);
 	unsigned int headlen = skb_headlen(skb);
+	unsigned int delta_truesize;
 
 	if (p->len + len >= 65536)
 		return -E2BIG;
@@ -2899,11 +2881,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		frag->page_offset += offset;
 		skb_frag_size_sub(frag, offset);
 
+		/* all fragments truesize : remove (head size + sk_buff) */
+		delta_truesize = skb->truesize -
+				 SKB_TRUESIZE(skb_end_offset(skb));
+
 		skb->truesize -= skb->data_len;
 		skb->len -= skb->data_len;
 		skb->data_len = 0;
 
-		NAPI_GRO_CB(skb)->free = 1;
+		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
 		goto done;
 	} else if (skb->head_frag) {
 		int nr_frags = pinfo->nr_frags;
@@ -2928,6 +2914,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
 		/* We dont need to clear skbinfo->nr_frags here */
 
+		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
 		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
 		goto done;
 	} else if (skb_gro_len(p) != pinfo->gso_size)
@@ -2970,7 +2957,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	p = nskb;
 
 merge:
-	p->truesize += skb->truesize - len;
+	delta_truesize = skb->truesize;
 	if (offset > headlen) {
 		unsigned int eat = offset - headlen;
 
@@ -2990,7 +2977,7 @@ merge:
 done:
 	NAPI_GRO_CB(p)->count++;
 	p->data_len += len;
-	p->truesize += len;
+	p->truesize += delta_truesize;
 	p->len += len;
 
 	NAPI_GRO_CB(skb)->same_flow = 1;
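The delta_truesize logic above fixes GRO's memory accounting: instead of crediting the merged packet with a guessed amount, the donor skb's truesize is split into what actually migrates and what stays behind. A standalone sketch of the three cases, with made-up numbers and constants standing in for SKB_DATA_ALIGN(sizeof(struct sk_buff)) and the shared-info part of SKB_TRUESIZE() (not the kernel's real values):

/* Illustrative replay of the three delta_truesize cases in skb_gro_receive().
 * SKB_SHELL and SKB_OVERHEAD are assumed stand-in constants.
 */
#include <stdio.h>

#define SKB_SHELL    256	/* aligned struct sk_buff (assumed) */
#define SKB_OVERHEAD 320	/* aligned skb_shared_info (assumed) */

int main(void)
{
	unsigned int truesize = 2048, end_offset = 768;

	/* frags-only merge: head + sk_buff stay behind with the donor */
	printf("frags only  : %u\n", truesize - (SKB_SHELL + SKB_OVERHEAD + end_offset));
	/* stolen head page: only the sk_buff shell goes back to its cache */
	printf("stolen head : %u\n", truesize - SKB_SHELL);
	/* full merge path: the whole donor truesize is charged to p */
	printf("full merge  : %u\n", truesize);
	return 0;
}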
diff --git a/net/core/sock.c b/net/core/sock.c
index 1a8835117fd..b8c818e69c2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -113,6 +113,7 @@
 #include <linux/user_namespace.h>
 #include <linux/static_key.h>
 #include <linux/memcontrol.h>
+#include <linux/prefetch.h>
 
 #include <asm/uaccess.h>
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 51c6c672c8a..0d11f234d61 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -673,10 +673,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			break;
 		} else {
 			memset(&mreq, 0, sizeof(mreq));
-			if (optlen >= sizeof(struct in_addr) &&
-			    copy_from_user(&mreq.imr_address, optval,
-					   sizeof(struct in_addr)))
-				break;
+			if (optlen >= sizeof(struct ip_mreq)) {
+				if (copy_from_user(&mreq, optval,
+						   sizeof(struct ip_mreq)))
+					break;
+			} else if (optlen >= sizeof(struct in_addr)) {
+				if (copy_from_user(&mreq.imr_address, optval,
+						   sizeof(struct in_addr)))
+					break;
+			}
 		}
 
 		if (!mreq.imr_ifindex) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 33417f84e07..ef32956ed65 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
 #include <net/tcp_memcontrol.h>
 
 static int zero;
+static int two = 2;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -677,6 +678,15 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "tcp_early_retrans",
+		.data		= &sysctl_tcp_early_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
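The ip_sockglue.c hunk lets IP_MULTICAST_IF accept a full struct ip_mreq, whose second field is the interface address; previously only the first sizeof(struct in_addr) bytes were read, which for an ip_mreq caller is the multicast group rather than the interface. A userspace sketch of the calling convention the new branch handles (the addresses are examples only):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct ip_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	inet_pton(AF_INET, "239.1.2.3", &mreq.imr_multiaddr);  /* example group */
	inet_pton(AF_INET, "192.0.2.10", &mreq.imr_interface); /* example local addr */

	/* A kernel without the patch above reads only the first in_addr,
	 * i.e. the group field, as the interface address; the patched
	 * branch copies the whole structure instead. */
	if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, &mreq, sizeof(mreq)) < 0)
		perror("setsockopt(IP_MULTICAST_IF)");

	close(fd);
	return 0;
}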
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9670af34193..c2cff8b6277 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -395,6 +395,7 @@ void tcp_init_sock(struct sock *sk)
 
 	tp->mss_cache = TCP_MSS_DEFAULT;
 	tp->reordering = sysctl_tcp_reordering;
+	tcp_enable_early_retrans(tp);
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 
 	sk->sk_state = TCP_CLOSE;
@@ -980,8 +981,8 @@ static inline int select_size(const struct sock *sk, bool sg)
 static int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct sk_buff *skb;
-	struct tcp_skb_cb *cb;
 	struct tcphdr *th;
+	bool fragstolen;
 
 	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
 	if (!skb)
@@ -994,14 +995,14 @@ static int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
 		goto err_free;
 
-	cb = TCP_SKB_CB(skb);
-
 	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
 	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
 
-	tcp_queue_rcv(sk, skb, sizeof(*th));
-
+	if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+		WARN_ON_ONCE(fragstolen); /* should not happen */
+		__kfree_skb(skb);
+	}
 	return size;
 
 err_free:
@@ -2495,6 +2496,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		else
 			tp->thin_dupack = val;
+			if (tp->thin_dupack)
+				tcp_disable_early_retrans(tp);
 		break;
 
 	case TCP_REPAIR:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 96a631deb4e..7b2d351f24d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 2;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.	*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -906,6 +907,7 @@ static void tcp_init_metrics(struct sock *sk)
 	if (dst_metric(dst, RTAX_REORDERING) &&
 	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
 		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
 		tp->reordering = dst_metric(dst, RTAX_REORDERING);
 	}
 
@@ -988,6 +990,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 #endif
 		tcp_disable_fack(tp);
 	}
+
+	if (metric > 0)
+		tcp_disable_early_retrans(tp);
 }
 
 /* This must be called before lost_out is incremented */
@@ -2339,6 +2344,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
 
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long delay;
+
+	/* Delay early retransmit and entering fast recovery for
+	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+	 * available, or RTO is scheduled to fire first.
+	 */
+	if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+		return false;
+
+	delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+		return false;
+
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
+	tp->early_retrans_delayed = 1;
+	return true;
+}
+
 static inline int tcp_skb_timedout(const struct sock *sk,
 				   const struct sk_buff *skb)
 {
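In tcp_pause_early_retransmit() above, tp->srtt holds the smoothed RTT scaled by 8 (in jiffies), so tp->srtt >> 5 is RTT/4, and the delay is floored at 2 ms. A small standalone replay of that arithmetic (HZ and the RTT value are assumed for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long hz = 1000;                      /* assumed tick rate */
	unsigned long rtt_ms = 40;                    /* assumed measured RTT */
	unsigned long srtt = 8 * rtt_ms * hz / 1000;  /* kernel keeps 8 * RTT */
	unsigned long quarter_rtt = srtt >> 5;        /* == RTT/4 = 10 ms here */
	unsigned long floor_2ms = 2 * hz / 1000;
	unsigned long delay = quarter_rtt > floor_2ms ? quarter_rtt : floor_2ms;

	printf("ER delay = max(RTT/4, 2ms) = %lu ms\n", delay * 1000 / hz);
	return 0;
}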
@@ -2446,7 +2472,7 @@ static inline int tcp_head_timedout(const struct sock *sk)
  * Main question: may we further continue forward transmission
  * with the same cwnd?
  */
-static int tcp_time_to_recover(struct sock *sk)
+static int tcp_time_to_recover(struct sock *sk, int flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 packets_out;
@@ -2492,6 +2518,16 @@ static int tcp_time_to_recover(struct sock *sk)
 	    tcp_is_sack(tp) && !tcp_send_head(sk))
 		return 1;
 
+	/* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+	 * retransmissions due to small network reorderings, we implement
+	 * Mitigation A.3 in the RFC and delay the retransmission for a short
+	 * interval if appropriate.
+	 */
+	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+	    (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+	    !tcp_may_send_now(sk))
+		return !tcp_pause_early_retransmit(sk, flag);
+
 	return 0;
 }
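The new early-retransmit gate in tcp_time_to_recover() only fires when the flight is so small that three duplicate ACKs can never arrive, nothing has been retransmitted yet, and there is no new data to send. A plain restatement with ordinary integers standing in for the tcp_sock fields:

#include <stdbool.h>
#include <stdio.h>

static bool er_candidate(int packets_out, int sacked_out, int retrans_out,
			 bool do_early_retrans, bool may_send_now)
{
	return do_early_retrans && !retrans_out && sacked_out &&
	       packets_out == sacked_out + 1 && packets_out < 4 &&
	       !may_send_now;
}

int main(void)
{
	/* Tail of a transfer: 3 packets in flight, 2 already SACKed, nothing
	 * left to send -- too few dupacks will ever arrive for classic fast
	 * retransmit, so early retransmit may step in instead. */
	printf("trigger: %d\n", er_candidate(3, 2, 0, true, false));
	return 0;
}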
@@ -3022,6 +3058,38 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
 	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
 }
 
+static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mib_idx;
+
+	if (tcp_is_reno(tp))
+		mib_idx = LINUX_MIB_TCPRENORECOVERY;
+	else
+		mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+	NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+	tp->high_seq = tp->snd_nxt;
+	tp->prior_ssthresh = 0;
+	tp->undo_marker = tp->snd_una;
+	tp->undo_retrans = tp->retrans_out;
+
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+		if (!ece_ack)
+			tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+		TCP_ECN_queue_cwr(tp);
+	}
+
+	tp->bytes_acked = 0;
+	tp->snd_cwnd_cnt = 0;
+	tp->prior_cwnd = tp->snd_cwnd;
+	tp->prr_delivered = 0;
+	tp->prr_out = 0;
+	tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -3041,7 +3109,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 	struct tcp_sock *tp = tcp_sk(sk);
 	int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
 				    (tcp_fackets_out(tp) > tp->reordering));
-	int fast_rexmit = 0, mib_idx;
+	int fast_rexmit = 0;
 
 	if (WARN_ON(!tp->packets_out && tp->sacked_out))
 		tp->sacked_out = 0;
@@ -3125,7 +3193,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
-		if (!tcp_time_to_recover(sk)) {
+		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
 		}
@@ -3142,32 +3210,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 	}
 
 	/* Otherwise enter Recovery state */
-
-	if (tcp_is_reno(tp))
-		mib_idx = LINUX_MIB_TCPRENORECOVERY;
-	else
-		mib_idx = LINUX_MIB_TCPSACKRECOVERY;
-
-	NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
-	tp->high_seq = tp->snd_nxt;
-	tp->prior_ssthresh = 0;
-	tp->undo_marker = tp->snd_una;
-	tp->undo_retrans = tp->retrans_out;
-
-	if (icsk->icsk_ca_state < TCP_CA_CWR) {
-		if (!(flag & FLAG_ECE))
-			tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-		TCP_ECN_queue_cwr(tp);
-	}
-
-	tp->bytes_acked = 0;
-	tp->snd_cwnd_cnt = 0;
-	tp->prior_cwnd = tp->snd_cwnd;
-	tp->prr_delivered = 0;
-	tp->prr_out = 0;
-	tcp_set_ca_state(sk, TCP_CA_Recovery);
+	tcp_enter_recovery(sk, (flag & FLAG_ECE));
 	fast_rexmit = 1;
 }
@@ -3249,16 +3292,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
-static void tcp_rearm_rto(struct sock *sk)
+void tcp_rearm_rto(struct sock *sk)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+		u32 rto = inet_csk(sk)->icsk_rto;
+		/* Offset the time elapsed after installing regular RTO */
+		if (tp->early_retrans_delayed) {
+			struct sk_buff *skb = tcp_write_queue_head(sk);
+			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+			s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+			/* delta may not be positive if the socket is locked
+			 * when the delayed ER timer fires and is rescheduled.
+			 */
+			if (delta > 0)
+				rto = delta;
+		}
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+					  TCP_RTO_MAX);
 	}
+	tp->early_retrans_delayed = 0;
+}
+
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_rearm_rto(sk);
+
+	/* Stop if ER is disabled after the delayed ER timer is scheduled */
+	if (!tp->do_early_retrans)
+		return;
+
+	tcp_enter_recovery(sk, false);
+	tcp_update_scoreboard(sk, 1);
+	tcp_xmit_retransmit_queue(sk);
 }
 
 /* If we get here, the whole TSO packet has not been acked.
  */
@@ -3707,6 +3781,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
+	if (tp->early_retrans_delayed)
+		tcp_rearm_rto(sk);
+
 	if (after(ack, prior_snd_una))
 		flag |= FLAG_SND_UNA_ADVANCED;
 
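When an ACK arrives while the delayed-ER timer is installed, tcp_ack() re-arms the RTO through tcp_rearm_rto(), shown earlier, which charges the time already spent since the head of the write queue was transmitted. A standalone sketch of that remaining-budget computation (the tick values are invented):

#include <stdio.h>

int main(void)
{
	unsigned int rto = 200;        /* full RTO in ticks (assumed) */
	unsigned int sent_when = 1000; /* TCP_SKB_CB(skb)->when of queue head */
	unsigned int now = 1150;       /* tcp_time_stamp */
	int delta = (int)(sent_when + rto - now);

	/* The head was sent 150 ticks ago, so only 50 ticks of the RTO
	 * budget remain; a non-positive delta means the timer should have
	 * fired already, in which case the full RTO is kept. */
	printf("re-armed RTO = %d ticks\n", delta > 0 ? delta : (int)rto);
	return 0;
}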
@@ -4455,6 +4532,7 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
 * @sk: socket
 * @to: prior buffer
 * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
 *
 * Before queueing skb @from after @to, try to merge them
 * to reduce overall memory use and queue lengths, if cost is small.
@@ -4467,59 +4545,82 @@ static bool tcp_try_coalesce(struct sock *sk,
			     struct sk_buff *from,
			     bool *fragstolen)
 {
-	int delta, len = from->len;
+	int i, delta, len = from->len;
 
 	*fragstolen = false;
-	if (tcp_hdr(from)->fin)
+
+	if (tcp_hdr(from)->fin || skb_cloned(to))
 		return false;
+
 	if (len <= skb_tailroom(to)) {
 		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
-merge:
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
-		TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
-		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
-		return true;
+		goto merge;
 	}
+
 	if (skb_has_frag_list(to) || skb_has_frag_list(from))
 		return false;
 
-	if (skb_headlen(from) == 0 &&
-	    (skb_shinfo(to)->nr_frags +
-	     skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) {
-		WARN_ON_ONCE(from->head_frag);
-		delta = from->truesize - ksize(from->head) -
-			SKB_DATA_ALIGN(sizeof(struct sk_buff));
-
-		WARN_ON_ONCE(delta < len);
-copyfrags:
-		memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
-		       skb_shinfo(from)->frags,
-		       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
-		skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
-		skb_shinfo(from)->nr_frags = 0;
-		to->truesize += delta;
-		atomic_add(delta, &sk->sk_rmem_alloc);
-		sk_mem_charge(sk, delta);
-		to->len += len;
-		to->data_len += len;
-		goto merge;
-	}
-	if (from->head_frag) {
+	if (skb_headlen(from) != 0) {
 		struct page *page;
 		unsigned int offset;
 
-		if (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+		if (skb_shinfo(to)->nr_frags +
+		    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
 			return false;
+
+		if (skb_head_is_locked(from))
+			return false;
+
+		delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
 		page = virt_to_head_page(from->head);
 		offset = from->data - (unsigned char *)page_address(page);
+
 		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
 				   page, offset, skb_headlen(from));
 		*fragstolen = true;
-		delta = len; /* we dont know real truesize... */
-		goto copyfrags;
+	} else {
+		if (skb_shinfo(to)->nr_frags +
+		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+			return false;
+
+		delta = from->truesize -
+			SKB_TRUESIZE(skb_end_pointer(from) - from->head);
 	}
-	return false;
+
+	WARN_ON_ONCE(delta < len);
+
+	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+	       skb_shinfo(from)->frags,
+	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+
+	if (!skb_cloned(from))
+		skb_shinfo(from)->nr_frags = 0;
+
+	/* if the skb is cloned this does nothing since we set nr_frags to 0 */
+	for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+		skb_frag_ref(from, i);
+
+	to->truesize += delta;
+	atomic_add(delta, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, delta);
+	to->len += len;
+	to->data_len += len;
+
+merge:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+	return true;
+}
+
+static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+	if (head_stolen)
+		kmem_cache_free(skbuff_head_cache, skb);
+	else
+		__kfree_skb(skb);
 }
 
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
@@ -4565,10 +4666,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
 		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 	} else {
-		if (fragstolen)
-			kmem_cache_free(skbuff_head_cache, skb);
-		else
-			__kfree_skb(skb);
+		kfree_skb_partial(skb, fragstolen);
 		skb = NULL;
 	}
 
@@ -4645,6 +4743,22 @@ end:
 		skb_set_owner_r(skb, sk);
 }
 
+int tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+		  bool *fragstolen)
+{
+	int eaten;
+	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+	__skb_pull(skb, hdrlen);
+	eaten = (tail &&
+		 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+	tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+	if (!eaten) {
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		skb_set_owner_r(skb, sk);
+	}
+	return eaten;
+}
 
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
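In the head-stealing branch of tcp_try_coalesce() above, only the struct sk_buff shell is returned to the slab cache (see kfree_skb_partial()), so everything else in the donor's truesize must be charged to the receiving socket. A toy version of that accounting, with made-up numbers and SKB_SHELL standing in for SKB_DATA_ALIGN(sizeof(struct sk_buff)):

#include <stdio.h>

#define SKB_SHELL 256	/* assumed aligned size of struct sk_buff */

int main(void)
{
	unsigned int from_truesize = 2048; /* donor skb, head in a page frag */
	unsigned int len = 1448;           /* payload length being merged */
	unsigned int delta = from_truesize - SKB_SHELL;

	/* delta (1792) >= len (1448): the WARN_ON_ONCE(delta < len) above
	 * checks exactly this invariant before charging the socket. */
	printf("charge %u bytes to sk_rmem_alloc for %u bytes of data\n",
	       delta, len);
	return 0;
}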
@@ -4691,20 +4805,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (eaten <= 0) {
-		struct sk_buff *tail;
 queue_and_out:
 		if (eaten < 0 &&
 		    tcp_try_rmem_schedule(sk, skb->truesize))
 			goto drop;
 
-		tail = skb_peek_tail(&sk->sk_receive_queue);
-		eaten = (tail &&
-			 tcp_try_coalesce(sk, tail, skb,
-					  &fragstolen)) ? 1 : 0;
-		if (eaten <= 0) {
-			skb_set_owner_r(skb, sk);
-			__skb_queue_tail(&sk->sk_receive_queue, skb);
-		}
+		eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
 	}
 	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 	if (skb->len)
@@ -4727,12 +4833,9 @@ queue_and_out:
 
 		tcp_fast_path_check(sk);
 
-		if (eaten > 0) {
-			if (fragstolen)
-				kmem_cache_free(skbuff_head_cache, skb);
-			else
-				__kfree_skb(skb);
-		} else if (!sock_flag(sk, SOCK_DEAD))
+		if (eaten > 0)
+			kfree_skb_partial(skb, fragstolen);
+		else if (!sock_flag(sk, SOCK_DEAD))
 			sk->sk_data_ready(sk, 0);
 		return;
 	}
@@ -5402,14 +5505,6 @@ discard:
 	return 0;
 }
 
-void tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen)
-{
-	__skb_pull(skb, hdrlen);
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
-	skb_set_owner_r(skb, sk);
-	tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
-}
-
 /*
  *	TCP receive function for the ESTABLISHED state.
  *
@@ -5518,6 +5613,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	} else {
 		int eaten = 0;
 		int copied_early = 0;
+		bool fragstolen = false;
 
 		if (tp->copied_seq == tp->rcv_nxt &&
 		    len - tcp_header_len <= tp->ucopy.len) {
@@ -5575,7 +5671,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
 
 				/* Bulk data transfer: receiver */
-				tcp_queue_rcv(sk, skb, tcp_header_len);
+				eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+						      &fragstolen);
 			}
 
 			tcp_event_data_recv(sk, skb);
@@ -5597,7 +5694,7 @@ no_ack:
 			else
 #endif
 			if (eaten)
-				__kfree_skb(skb);
+				kfree_skb_partial(skb, fragstolen);
 			else
 				sk->sk_data_ready(sk, 0);
 			return 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cf97e9821d7..4ff5e1f70d1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1368,7 +1368,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop_and_free;
 
 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, tcp_hdr(skb));
+		TCP_ECN_create_request(req, skb);
 
 	if (want_cookie) {
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3cabafb5cdd..6f6a9183282 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->sacked_out = 0;
 		newtp->fackets_out = 0;
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+		tcp_enable_early_retrans(newtp);
 
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 834e89fc541..d9473300992 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -78,9 +78,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 		tp->frto_counter = 3;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets)
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+	if (!prior_packets || tp->early_retrans_delayed)
+		tcp_rearm_rto(sk);
 }
 
 /* SND.NXT, if window was not shrunk.
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 34d4a02c2f1..e911e6c523e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -319,6 +319,11 @@ void tcp_retransmit_timer(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	if (tp->early_retrans_delayed) {
+		tcp_resume_early_retransmit(sk);
+		return;
+	}
+
 	if (!tp->packets_out)
 		goto out;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 57b21096983..078d039e8fd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1140,7 +1140,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	treq->rmt_addr = ipv6_hdr(skb)->saddr;
 	treq->loc_addr = ipv6_hdr(skb)->daddr;
 	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, tcp_hdr(skb));
+		TCP_ECN_create_request(req, skb);
 
 	treq->iif = sk->sk_bound_dev_if;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 81445cc8196..cc37dd52ecf 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -332,15 +332,13 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	q->stats.pdrop++;
-	sch->qstats.drops++;
-	kfree_skb(skb);
-	return NET_XMIT_DROP;
+	return qdisc_drop(skb, sch);
 
- congestion_drop:
+congestion_drop:
 	qdisc_drop(skb, sch);
 	return NET_XMIT_CN;
 
- other_drop:
+other_drop:
 	if (ret & __NET_XMIT_BYPASS)
 		sch->qstats.drops++;
 	kfree_skb(skb);
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 389b856c665..3886365cc20 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -265,8 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	return NET_XMIT_SUCCESS;
 
 drop:
-	kfree_skb(skb);
-	sch->qstats.drops++;
+	qdisc_drop(skb, sch);
 	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 }
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 2ea6f196e3c..acae5b0e384 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -558,9 +558,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		__skb_queue_tail(&q->direct_queue, skb);
 		q->direct_pkts++;
 	} else {
-		kfree_skb(skb);
-		sch->qstats.drops++;
-		return NET_XMIT_DROP;
+		return qdisc_drop(skb, sch);
 	}
 #ifdef CONFIG_NET_CLS_ACT
 	} else if (!cl) {
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 45326599fda..ca0c29695d5 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -88,9 +88,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return NET_XMIT_SUCCESS;
 	}
 
-	kfree_skb(skb);
-	sch->qstats.drops++;
-	return NET_XMIT_DROP;
+	return qdisc_drop(skb, sch);
 }
 
 static struct sk_buff *
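All four scheduler hunks replace the open-coded kfree_skb() / qstats.drops++ / return NET_XMIT_DROP sequence with qdisc_drop(). For reference, the helper in include/net/sch_generic.h of this era is essentially:

static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
	kfree_skb(skb);
	sch->qstats.drops++;
	return NET_XMIT_DROP;
}

so each conversion is behavior-preserving: sch_htb and sch_teql return its NET_XMIT_DROP directly, while sch_choke's congestion path and sch_dsmark keep their own return values and ignore the helper's.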