From 1485348d2424e1131ea42efc033cbd9366462b01 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 30 Jul 2012 16:11:42 +0000 Subject: tcp: Apply device TSO segment limit earlier Cache the device gso_max_segs in sock::sk_gso_max_segs and use it to limit the size of TSO skbs. This avoids the need to fall back to software GSO for local TCP senders. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3f1bcff0b10..a7b3ec9b6c3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1522,21 +1522,21 @@ static void tcp_cwnd_validate(struct sock *sk) * when we would be allowed to send the split-due-to-Nagle skb fully. */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int cwnd) + unsigned int mss_now, unsigned int max_segs) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, cwnd_len; + u32 needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; - cwnd_len = mss_now * cwnd; + max_len = mss_now * max_segs; - if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) - return cwnd_len; + if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) + return max_len; needed = min(skb->len, window); - if (cwnd_len <= needed) - return cwnd_len; + if (max_len <= needed) + return max_len; return needed - needed % mss_now; } @@ -1765,7 +1765,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= sk->sk_gso_max_size) + if (limit >= min_t(unsigned int, sk->sk_gso_max_size, + sk->sk_gso_max_segs * tp->mss_cache)) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1999,7 +2000,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - cwnd_quota); + min_t(unsigned int, + cwnd_quota, + sk->sk_gso_max_segs)); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) -- cgit v1.2.3-70-g09d2 From 8e7dfbc8d1ea9ca9058aa641a8fe795ebca320e2 Mon Sep 17 00:00:00 2001 From: Silviu-Mihai Popescu Date: Sat, 4 Aug 2012 09:31:29 +0000 Subject: tcp_output: fix sparse warning for tcp_wfree Fix sparse warning: * symbol 'tcp_wfree' was not declared. Should it be static? Signed-off-by: Silviu-Mihai Popescu Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a7b3ec9b6c3..20dfd892c86 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -940,7 +940,7 @@ void __init tcp_tasklet_init(void) * We cant xmit new skbs from this context, as we might already * hold qdisc lock. */ -void tcp_wfree(struct sk_buff *skb) +static void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3-70-g09d2 From 144d56e91044181ec0ef67aeca91e9a8b5718348 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Aug 2012 00:22:46 +0000 Subject: tcp: fix possible socket refcount problem Commit 6f458dfb40 (tcp: improve latencies of timer triggered events) added bug leading to following trace : [ 2866.131281] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.131726] [ 2866.132188] ========================= [ 2866.132281] [ BUG: held lock freed! ] [ 2866.132281] 3.6.0-rc1+ #622 Not tainted [ 2866.132281] ------------------------- [ 2866.132281] kworker/0:1/652 is freeing memory ffff880019ec0000-ffff880019ec0a1f, with a lock still held there! [ 2866.132281] (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] 4 locks held by kworker/0:1/652: [ 2866.132281] #0: (rpciod){.+.+.+}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #1: ((&task->u.tk_work)){+.+.+.}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #2: (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] #3: (&icsk->icsk_retransmit_timer){+.-...}, at: [] run_timer_softirq+0x1ad/0x35f [ 2866.132281] [ 2866.132281] stack backtrace: [ 2866.132281] Pid: 652, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #622 [ 2866.132281] Call Trace: [ 2866.132281] [] debug_check_no_locks_freed+0x112/0x159 [ 2866.132281] [] ? __sk_free+0xfd/0x114 [ 2866.132281] [] kmem_cache_free+0x6b/0x13a [ 2866.132281] [] __sk_free+0xfd/0x114 [ 2866.132281] [] sk_free+0x1c/0x1e [ 2866.132281] [] tcp_write_timer+0x51/0x56 [ 2866.132281] [] run_timer_softirq+0x218/0x35f [ 2866.132281] [] ? run_timer_softirq+0x1ad/0x35f [ 2866.132281] [] ? rb_commit+0x58/0x85 [ 2866.132281] [] ? tcp_write_timer_handler+0x148/0x148 [ 2866.132281] [] __do_softirq+0xcb/0x1f9 [ 2866.132281] [] ? _raw_spin_unlock+0x29/0x2e [ 2866.132281] [] call_softirq+0x1c/0x30 [ 2866.132281] [] do_softirq+0x4a/0xa6 [ 2866.132281] [] irq_exit+0x51/0xad [ 2866.132281] [] do_IRQ+0x9d/0xb4 [ 2866.132281] [] common_interrupt+0x6f/0x6f [ 2866.132281] [] ? sched_clock_cpu+0x58/0xd1 [ 2866.132281] [] ? _raw_spin_unlock_irqrestore+0x4c/0x56 [ 2866.132281] [] mod_timer+0x178/0x1a9 [ 2866.132281] [] sk_reset_timer+0x19/0x26 [ 2866.132281] [] tcp_rearm_rto+0x99/0xa4 [ 2866.132281] [] tcp_event_new_data_sent+0x6e/0x70 [ 2866.132281] [] tcp_write_xmit+0x7de/0x8e4 [ 2866.132281] [] ? __alloc_skb+0xa0/0x1a1 [ 2866.132281] [] __tcp_push_pending_frames+0x2e/0x8a [ 2866.132281] [] tcp_sendmsg+0xb32/0xcc6 [ 2866.132281] [] inet_sendmsg+0xaa/0xd5 [ 2866.132281] [] ? inet_autobind+0x5f/0x5f [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] sock_sendmsg+0xa3/0xc4 [ 2866.132281] [] ? rb_reserve_next_event+0x26f/0x2d5 [ 2866.132281] [] ? native_sched_clock+0x29/0x6f [ 2866.132281] [] ? sched_clock+0x9/0xd [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] kernel_sendmsg+0x37/0x43 [ 2866.132281] [] xs_send_kvec+0x77/0x80 [ 2866.132281] [] xs_sendpages+0x6f/0x1a0 [ 2866.132281] [] ? try_to_del_timer_sync+0x55/0x61 [ 2866.132281] [] xs_tcp_send_request+0x55/0xf1 [ 2866.132281] [] xprt_transmit+0x89/0x1db [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] call_transmit+0x1c5/0x20e [ 2866.132281] [] __rpc_execute+0x6f/0x225 [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] rpc_async_schedule+0x28/0x34 [ 2866.132281] [] process_one_work+0x24d/0x47f [ 2866.132281] [] ? process_one_work+0x1de/0x47f [ 2866.132281] [] ? __rpc_execute+0x225/0x225 [ 2866.132281] [] worker_thread+0x236/0x317 [ 2866.132281] [] ? process_scheduled_works+0x2f/0x2f [ 2866.132281] [] kthread+0x9a/0xa2 [ 2866.132281] [] kernel_thread_helper+0x4/0x10 [ 2866.132281] [] ? retint_restore_args+0x13/0x13 [ 2866.132281] [] ? __init_kthread_worker+0x5a/0x5a [ 2866.132281] [] ? gs_change+0x13/0x13 [ 2866.308506] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.309689] ============================================================================= [ 2866.310254] BUG TCP (Not tainted): Object already free [ 2866.310254] ----------------------------------------------------------------------------- [ 2866.310254] The bug comes from the fact that timer set in sk_reset_timer() can run before we actually do the sock_hold(). socket refcount reaches zero and we free the socket too soon. timer handler is not allowed to reduce socket refcnt if socket is owned by the user, or we need to change sk_reset_timer() implementation. We should take a reference on the socket in case TCP_DELACK_TIMER_DEFERRED or TCP_DELACK_TIMER_DEFERRED bit are set in tsq_flags Also fix a typo in tcp_delack_timer(), where TCP_WRITE_TIMER_DEFERRED was used instead of TCP_DELACK_TIMER_DEFERRED. For consistency, use same socket refcount change for TCP_MTU_REDUCED_DEFERRED, even if not fired from a timer. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 +++++--- net/ipv4/tcp_output.c | 14 +++++++++----- net/ipv4/tcp_timer.c | 6 ++++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5bf2040b25b..00a748d1406 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ tp->mtu_info = info; - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); - else - set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + sock_hold(sk); + } goto out; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20dfd892c86..d04632673a9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -910,14 +910,18 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_TSQ_DEFERRED)) tcp_tsq_handler(sk); - if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { tcp_write_timer_handler(sk); - - if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { tcp_delack_timer_handler(sk); - - if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { sk->sk_prot->mtu_reduced(sk); + __sock_put(sk); + } } EXPORT_SYMBOL(tcp_release_cb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6df36ad55a3..b774a03bd1d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data) inet_csk(sk)->icsk_ack.blocked = 1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); @@ -481,7 +482,8 @@ static void tcp_write_timer(unsigned long data) tcp_write_timer_handler(sk); } else { /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); -- cgit v1.2.3-70-g09d2