From 5ffd5cddd4d353fe18c01b89397dcad02035bb95 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkmann@redhat.com>
Date: Thu, 5 Sep 2013 17:48:47 +0200
Subject: net: netlink: filter particular protocols from analyzers

Fix finer-grained control and let only a whitelist of allowed netlink
protocols pass, in our case related to networking. If later on, other
subsystems decide they want to add their protocol as well to the list
of allowed protocols they shall simply add it. While at it, we also
need to tell what protocol is in use otherwise BPF_S_ANC_PROTOCOL can
not pick it up (as it's not filled out).

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a17dda1bbee..8df7f64c6db 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -168,16 +168,43 @@ int netlink_remove_tap(struct netlink_tap *nt)
 }
 EXPORT_SYMBOL_GPL(netlink_remove_tap);
 
+static bool netlink_filter_tap(const struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	bool pass = false;
+
+	/* We take the more conservative approach and
+	 * whitelist socket protocols that may pass.
+	 */
+	switch (sk->sk_protocol) {
+	case NETLINK_ROUTE:
+	case NETLINK_USERSOCK:
+	case NETLINK_SOCK_DIAG:
+	case NETLINK_NFLOG:
+	case NETLINK_XFRM:
+	case NETLINK_FIB_LOOKUP:
+	case NETLINK_NETFILTER:
+	case NETLINK_GENERIC:
+		pass = true;
+		break;
+	}
+
+	return pass;
+}
+
 static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 				     struct net_device *dev)
 {
 	struct sk_buff *nskb;
+	struct sock *sk = skb->sk;
 	int ret = -ENOMEM;
 
 	dev_hold(dev);
 	nskb = skb_clone(skb, GFP_ATOMIC);
 	if (nskb) {
 		nskb->dev = dev;
+		nskb->protocol = htons((u16) sk->sk_protocol);
+
 		ret = dev_queue_xmit(nskb);
 		if (unlikely(ret > 0))
 			ret = net_xmit_errno(ret);
@@ -192,6 +219,9 @@ static void __netlink_deliver_tap(struct sk_buff *skb)
 	int ret;
 	struct netlink_tap *tmp;
 
+	if (!netlink_filter_tap(skb))
+		return;
+
 	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
 		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
 		if (unlikely(ret))
-- 
cgit v1.2.3-70-g09d2


From 16edfe7ee02dd7fcc13855ea19158333529533b2 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 5 Sep 2013 15:36:09 -0700
Subject: tcp: fix no cwnd growth after timeout

In commit 0f7cc9a3 "tcp: increase throughput when reordering is high",
it only allows cwnd to increase in Open state. This mistakenly disables
slow start after timeout (CA_Loss). Moreover cwnd won't grow if the
state moves from Disorder to Open later in tcp_fastretrans_alert().

Therefore the correct logic should be to allow cwnd to grow as long
as the data is received in order in Open, Loss, or even Disorder state.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1969e16d936..894bc174f47 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3162,16 +3162,14 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 
 	/* If reordering is high then always grow cwnd whenever data is
 	 * delivered regardless of its ordering. Otherwise stay conservative
-	 * and only grow cwnd on in-order delivery in Open state, and retain
-	 * cwnd in Disordered state (RFC5681). A stretched ACK with
+	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
 	 * new SACK or ECE mark may first advance cwnd here and later reduce
 	 * cwnd in tcp_fastretrans_alert() based on more states.
 	 */
 	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
 		return flag & FLAG_FORWARD_PROGRESS;
 
-	return inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
-	       flag & FLAG_DATA_ACKED;
+	return flag & FLAG_DATA_ACKED;
 }
 
 /* Check that window update is acceptable.
-- 
cgit v1.2.3-70-g09d2


From 4e4f1fc226816905c937f9b29dabe351075dfe0f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 6 Sep 2013 10:35:58 -0700
Subject: tcp: properly increase rcv_ssthresh for ofo packets

TCP receive window handling is multi staged.

A socket has a memory budget, static or dynamic, in sk_rcvbuf.

Because we do not really know how this memory budget translates to
a TCP window (payload), TCP announces a small initial window
(about 20 MSS).

When a packet is received, we increase TCP rcv_win depending
on the payload/truesize ratio of this packet. Good citizen
packets give a hint that it's reasonable to have rcv_win = sk_rcvbuf/2

This heuristic takes place in tcp_grow_window()

Problem is : We currently call tcp_grow_window() only for in-order
packets.

This means that reorders or packet losses stop proper grow of
rcv_win, and senders are unable to benefit from fast recovery,
or proper reordering level detection.

Really, a packet being stored in OFO queue is not a bad citizen.
It should be part of the game as in-order packets.

In our traces, we very often see sender is limited by linux small
receive windows, even if linux hosts use autotuning (DRS) and should
allow rcv_win to grow to ~3MB.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 894bc174f47..25a89eaa669 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4139,6 +4139,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
 			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 		} else {
+			tcp_grow_window(sk, skb);
 			kfree_skb_partial(skb, fragstolen);
 			skb = NULL;
 		}
@@ -4214,8 +4215,10 @@ add_sack:
 	if (tcp_is_sack(tp))
 		tcp_sack_new_ofo_skb(sk, seq, end_seq);
 end:
-	if (skb)
+	if (skb) {
+		tcp_grow_window(sk, skb);
 		skb_set_owner_r(skb, sk);
+	}
 }
 
 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
-- 
cgit v1.2.3-70-g09d2