From 4d846f02392a710f9604892ac3329e628e60a230 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2012 23:28:07 +0000
Subject: tcp: fix tcp_grow_window() for large incoming frames

tcp_grow_window() has to grow rcv_ssthresh up to window_clamp, allowing
sender to increase its window.

tcp_grow_window() still assumes a tcp frame is under MSS, but its no
longer true with LRO/GRO.

This patch fixes one of the performance issue we noticed with GRO on.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9944c1d9a21..3ff36406537 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -335,6 +335,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 			incr = __tcp_grow_window(sk, skb);
 
 		if (incr) {
+			incr = max_t(int, incr, 2 * skb->len);
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
 					       tp->window_clamp);
 			inet_csk(sk)->icsk_ack.quick |= 1;
-- 
cgit v1.2.3-70-g09d2


From 651913ce9de2bbcedef608c5d6cf39c244248509 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 27 Apr 2012 11:29:37 -0400
Subject: tcp: clean up use of jiffies in tcp_rcv_rtt_measure()

Clean up a reference to jiffies in tcp_rcv_rtt_measure() that should
instead reference tcp_time_stamp. Since the result of the subtraction
is passed into a function taking u32, this should not change any
behavior (and indeed the generated assembly does not change on
x86_64). However, it seems worth cleaning this up for consistency and
clarity (and perhaps to avoid bugs if this is copied and pasted
somewhere else).

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ff36406537..2a702e3a4ae 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -495,7 +495,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 		goto new_measure;
 	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
 		return;
-	tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
+	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
 
 new_measure:
 	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
-- 
cgit v1.2.3-70-g09d2


From 1cebce36d660c83bd1353e41f3e66abd4686f215 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 30 Apr 2012 06:00:18 +0000
Subject: tcp: fix infinite cwnd in tcp_complete_cwr()

When the cwnd reduction is done, ssthresh may be infinite
if TCP enters CWR via ECN or F-RTO. If cwnd is not undone, i.e.,
undo_marker is set, tcp_complete_cwr() falsely set cwnd to the
infinite ssthresh value. The correct operation is to keep cwnd
intact because it has been updated in ECN or F-RTO.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2a702e3a4ae..d99efd7dfb6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2868,11 +2868,14 @@ static inline void tcp_complete_cwr(struct sock *sk)
 
 	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
 	if (tp->undo_marker) {
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
 			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-		else /* PRR */
+			tp->snd_cwnd_stamp = tcp_time_stamp;
+		} else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
+			/* PRR algorithm. */
 			tp->snd_cwnd = tp->snd_ssthresh;
-		tp->snd_cwnd_stamp = tcp_time_stamp;
+			tp->snd_cwnd_stamp = tcp_time_stamp;
+		}
 	}
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
-- 
cgit v1.2.3-70-g09d2


From b49960a05e32121d29316cfdf653894b88ac9190 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 May 2012 02:28:41 +0000
Subject: tcp: change tcp_adv_win_scale and tcp_rmem[2]

tcp_adv_win_scale default value is 2, meaning we expect a good citizen
skb to have skb->len / skb->truesize ratio of 75% (3/4)

In 2.6 kernels we (mis)accounted for typical MSS=1460 frame :
1536 + 64 + 256 = 1856 'estimated truesize', and 1856 * 3/4 = 1392.
So these skbs were considered as not bloated.

With recent truesize fixes, a typical MSS=1460 frame truesize is now the
more precise :
2048 + 256 = 2304. But 2304 * 3/4 = 1728.
So these skb are not good citizen anymore, because 1460 < 1728

(GRO can escape this problem because it build skbs with a too low
truesize.)

This also means tcp advertises a too optimistic window for a given
allocated rcvspace : When receiving frames, sk_rmem_alloc can hit
sk_rcvbuf limit and we call tcp_prune_queue()/tcp_collapse() too often,
especially when application is slow to drain its receive queue or in
case of losses (netperf is fast, scp is slow). This is a major latency
source.

We should adjust the len/truesize ratio to 50% instead of 75%

This patch :

1) changes tcp_adv_win_scale default to 1 instead of 2

2) increase tcp_rmem[2] limit from 4MB to 6MB to take into account
better truesize tracking and to allow autotuning tcp receive window to
reach same value than before. Note that same amount of kernel memory is
consumed compared to 2.6 kernels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 4 ++--
 net/ipv4/tcp.c                         | 9 +++++----
 net/ipv4/tcp_input.c                   | 2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index bd80ba5847d..1619a8c8087 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -147,7 +147,7 @@ tcp_adv_win_scale - INTEGER
 	(if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
 	if it is <= 0.
 	Possible values are [-31, 31], inclusive.
-	Default: 2
+	Default: 1
 
 tcp_allowed_congestion_control - STRING
 	Show/set the congestion control choices available to non-privileged
@@ -410,7 +410,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
 	net.core.rmem_max.  Calling setsockopt() with SO_RCVBUF disables
 	automatic tuning of that socket's receive buffer size, in which
 	case this value is ignored.
-	Default: between 87380B and 4MB, depending on RAM size.
+	Default: between 87380B and 6MB, depending on RAM size.
 
 tcp_sack - BOOLEAN
 	Enable select acknowledgments (SACKS).
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8bb6adeb62c..1272a88c2a6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3243,7 +3243,7 @@ void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
 	unsigned long limit;
-	int max_share, cnt;
+	int max_rshare, max_wshare, cnt;
 	unsigned int i;
 	unsigned long jiffy = jiffies;
 
@@ -3303,15 +3303,16 @@ void __init tcp_init(void)
 	tcp_init_mem(&init_net);
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
-	max_share = min(4UL*1024*1024, limit);
+	max_wshare = min(4UL*1024*1024, limit);
+	max_rshare = min(6UL*1024*1024, limit);
 
 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
 	sysctl_tcp_wmem[1] = 16*1024;
-	sysctl_tcp_wmem[2] = max(64*1024, max_share);
+	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
 
 	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
 	sysctl_tcp_rmem[1] = 87380;
-	sysctl_tcp_rmem[2] = max(87380, max_share);
+	sysctl_tcp_rmem[2] = max(87380, max_rshare);
 
 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d99efd7dfb6..257b61789ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,7 +85,7 @@ int sysctl_tcp_ecn __read_mostly = 2;
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 2;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
 int sysctl_tcp_stdurg __read_mostly;
-- 
cgit v1.2.3-70-g09d2