From 4670fd819e7f47392c7c6fc6168ea2857c66d163 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 01:25:47 -0700
Subject: tcp: Get rid of inetpeer special cases.

The get_peer method TCP uses is full of special cases that make no
sense accommodating, and it also gets in the way of doing more
reasonable things here.

First of all, if the socket doesn't have a usable cached route, there
is no sense in trying to optimize timewait recycling.

Likewise for the case where we have IP options, such as SRR enabled,
that make the IP header destination address (and thus the destination
address of the route key) differ from that of the connection's
destination address.

Just return a NULL peer in these cases, and thus we're also able to
get rid of the clumsy inetpeer release logic.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e79aa48d9fc..42459186603 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -327,7 +327,7 @@ extern void tcp_shutdown (struct sock *sk, int how);
 
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
-extern struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it);
+extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
 extern void *tcp_v4_tw_get_peer(struct sock *sk);
 extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
-- 
cgit v1.2.3-70-g09d2


From 2397849baa7c44c242e5d5142d5d16d1e7ed53d0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 9 Jun 2012 14:56:12 -0700
Subject: [PATCH] tcp: Cache inetpeer in timewait socket, and only when
 necessary.

Since it's guarenteed that we will access the inetpeer if we're trying
to do timewait recycling and TCP options were enabled on the
connection, just cache the peer in the timewait socket.

In the future, inetpeer lookups will be context dependent (per routing
realm), and this helps facilitate that as well.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h         |  3 ++-
 include/net/tcp.h           |  1 -
 include/net/timewait_sock.h |  8 --------
 net/ipv4/tcp_ipv4.c         | 10 ----------
 net/ipv4/tcp_minisocks.c    | 27 ++++++++++++++++++++-------
 net/ipv6/tcp_ipv6.c         | 13 -------------
 6 files changed, 22 insertions(+), 40 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4c5b6328337..23e8234f75a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -506,8 +506,9 @@ struct tcp_timewait_sock {
 	u32			  tw_rcv_wnd;
 	u32			  tw_ts_recent;
 	long			  tw_ts_recent_stamp;
+	struct inet_peer	  *tw_peer;
 #ifdef CONFIG_TCP_MD5SIG
-	struct tcp_md5sig_key	*tw_md5_key;
+	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
 	/* Few sockets in timewait have cookies; in that case, then this
 	 * object holds a reference to them (tw_cookie_values->kref).
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 42459186603..9332f342259 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -328,7 +328,6 @@ extern void tcp_shutdown (struct sock *sk, int how);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
-extern void *tcp_v4_tw_get_peer(struct sock *sk);
 extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t size);
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
index 8d6689cb2c6..68f0ecad6c6 100644
--- a/include/net/timewait_sock.h
+++ b/include/net/timewait_sock.h
@@ -22,7 +22,6 @@ struct timewait_sock_ops {
 	int		(*twsk_unique)(struct sock *sk,
 				       struct sock *sktw, void *twp);
 	void		(*twsk_destructor)(struct sock *sk);
-	void		*(*twsk_getpeer)(struct sock *sk);
 };
 
 static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -41,11 +40,4 @@ static inline void twsk_destructor(struct sock *sk)
 		sk->sk_prot->twsk_prot->twsk_destructor(sk);
 }
 
-static inline void *twsk_getpeer(struct sock *sk)
-{
-	if (sk->sk_prot->twsk_prot->twsk_getpeer)
-		return sk->sk_prot->twsk_prot->twsk_getpeer(sk);
-	return NULL;
-}
-
 #endif /* _TIMEWAIT_SOCK_H */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77f049d00db..fda2ca17135 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1835,20 +1835,10 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_v4_get_peer);
 
-void *tcp_v4_tw_get_peer(struct sock *sk)
-{
-	const struct inet_timewait_sock *tw = inet_twsk(sk);
-	struct net *net = sock_net(sk);
-
-	return inet_getpeer_v4(net, tw->tw_daddr, 1);
-}
-EXPORT_SYMBOL(tcp_v4_tw_get_peer);
-
 static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
 	.twsk_destructor= tcp_twsk_destructor,
-	.twsk_getpeer	= tcp_v4_tw_get_peer,
 };
 
 const struct inet_connection_sock_af_ops ipv4_specific = {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index fef9dbf3af0..cb015317c9f 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -77,20 +77,19 @@ static bool tcp_remember_stamp(struct sock *sk)
 
 static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
+	const struct tcp_timewait_sock *tcptw;
 	struct sock *sk = (struct sock *) tw;
 	struct inet_peer *peer;
 
-	peer = twsk_getpeer(sk);
+	tcptw = tcp_twsk(sk);
+	peer = tcptw->tw_peer;
 	if (peer) {
-		const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
-
 		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
 		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
 		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
 			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
 			peer->tcp_ts	   = tcptw->tw_ts_recent;
 		}
-		inet_putpeer(peer);
 		return true;
 	}
 	return false;
@@ -314,9 +313,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	bool recycle_ok = false;
+	bool recycle_on = false;
 
-	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) {
 		recycle_ok = tcp_remember_stamp(sk);
+		recycle_on = true;
+	}
 
 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
 		tw = inet_twsk_alloc(sk, state);
@@ -324,8 +326,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	if (tw != NULL) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+		struct inet_sock *inet = inet_sk(sk);
+		struct inet_peer *peer = NULL;
 
-		tw->tw_transparent	= inet_sk(sk)->transparent;
+		tw->tw_transparent	= inet->transparent;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
@@ -347,6 +351,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		}
 #endif
 
+		if (recycle_on)
+			peer = icsk->icsk_af_ops->get_peer(sk);
+		tcptw->tw_peer = peer;
+		if (peer)
+			atomic_inc(&peer->refcnt);
+
 #ifdef CONFIG_TCP_MD5SIG
 		/*
 		 * The timewait bucket does not have the key DB from the
@@ -398,8 +408,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 
 void tcp_twsk_destructor(struct sock *sk)
 {
-#ifdef CONFIG_TCP_MD5SIG
 	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
+	if (twsk->tw_peer)
+		inet_putpeer(twsk->tw_peer);
+#ifdef CONFIG_TCP_MD5SIG
 	if (twsk->tw_md5_key) {
 		tcp_free_md5sig_pool();
 		kfree_rcu(twsk->tw_md5_key, rcu);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b5ecf37b61a..f91b0bfd12d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1746,23 +1746,10 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk)
 	return rt6_get_peer_create(rt);
 }
 
-static void *tcp_v6_tw_get_peer(struct sock *sk)
-{
-	const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
-	const struct inet_timewait_sock *tw = inet_twsk(sk);
-	struct net *net = sock_net(sk);
-
-	if (tw->tw_family == AF_INET)
-		return tcp_v4_tw_get_peer(sk);
-
-	return inet_getpeer_v6(net, &tw6->tw_v6_daddr, 1);
-}
-
 static struct timewait_sock_ops tcp6_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
 	.twsk_unique	= tcp_twsk_unique,
 	.twsk_destructor= tcp_twsk_destructor,
-	.twsk_getpeer	= tcp_v6_tw_get_peer,
 };
 
 static const struct inet_connection_sock_af_ops ipv6_specific = {
-- 
cgit v1.2.3-70-g09d2


From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 19 Jun 2012 21:22:05 -0700
Subject: ipv4: Early TCP socket demux.

Input packet processing for local sockets involves two major demuxes.
One for the route and one for the socket.

But we can optimize this down to one demux for certain kinds of local
sockets.

Currently we only do this for established TCP sockets, but it could
at least in theory be expanded to other kinds of connections.

If a TCP socket is established then it's identity is fully specified.

This means that whatever input route was used during the three-way
handshake must work equally well for the rest of the connection since
the keys will not change.

Once we move to established state, we cache the receive packet's input
route to use later.

Like the existing cached route in sk->sk_dst_cache used for output
packets, we have to check for route invalidations using dst->obsolete
and dst->ops->check().

Early demux occurs outside of a socket locked section, so when a route
invalidation occurs we defer the fixup of sk->sk_rx_dst until we are
actually inside of established state packet processing and thus have
the socket locked.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h |  4 ++--
 include/net/protocol.h        |  1 +
 include/net/sock.h            |  2 ++
 include/net/tcp.h             |  1 +
 net/core/sock.c               |  5 +++++
 net/ipv4/af_inet.c            | 18 +++++++++--------
 net/ipv4/ip_input.c           | 39 ++++++++++++++++++++++++------------
 net/ipv4/tcp_input.c          | 16 ++++++++++++++-
 net/ipv4/tcp_ipv4.c           | 46 +++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_minisocks.c      |  2 ++
 10 files changed, 110 insertions(+), 24 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 808fc5f76b0..54be0287eb9 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     const __be16 sport,
 					     const __be16 dport)
 {
-	struct sock *sk;
+	struct sock *sk = skb_steal_sock(skb);
 	const struct iphdr *iph = ip_hdr(skb);
 
-	if (unlikely(sk = skb_steal_sock(skb)))
+	if (sk)
 		return sk;
 	else
 		return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
diff --git a/include/net/protocol.h b/include/net/protocol.h
index a1b1b530c33..967b926cbfb 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,6 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index 4a452169956..87b424ae750 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -319,6 +319,7 @@ struct sock {
 	unsigned long 		sk_flags;
 	struct dst_entry	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
+	struct dst_entry	*sk_rx_dst;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_edemux(struct sk_buff *skb);
 
 extern int			sock_setsockopt(struct socket *sock, int level,
 						int op, char __user *optval,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9332f342259..6660ffc4963 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e5b71fda6e..929bdcc2383 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+void sock_edemux(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
 
 int sock_i_uid(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 85a3b176313..07a02f6e969 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
 
 	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	dst_release(sk->sk_rx_dst);
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
 #endif
 
 static const struct net_protocol tcp_protocol = {
-	.handler =	tcp_v4_rcv,
-	.err_handler =	tcp_v4_err,
-	.gso_send_check = tcp_v4_gso_send_check,
-	.gso_segment =	tcp_tso_segment,
-	.gro_receive =	tcp4_gro_receive,
-	.gro_complete =	tcp4_gro_complete,
-	.no_policy =	1,
-	.netns_ok =	1,
+	.early_demux	=	tcp_v4_early_demux,
+	.handler	=	tcp_v4_rcv,
+	.err_handler	=	tcp_v4_err,
+	.gso_send_check	=	tcp_v4_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp4_gro_receive,
+	.gro_complete	=	tcp4_gro_complete,
+	.no_policy	=	1,
+	.netns_ok	=	1,
 };
 
 static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c4fe1d27113..93b092c9a39 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
-		if (unlikely(err)) {
-			if (err == -EHOSTUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INADDRERRORS);
-			else if (err == -ENETUNREACH)
-				IP_INC_STATS_BH(dev_net(skb->dev),
-						IPSTATS_MIB_INNOROUTES);
-			else if (err == -EXDEV)
-				NET_INC_STATS_BH(dev_net(skb->dev),
-						 LINUX_MIB_IPRPFILTER);
-			goto drop;
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+		int err;
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[protocol]);
+		err = -ENOENT;
+		if (ipprot && ipprot->early_demux)
+			err = ipprot->early_demux(skb);
+		rcu_read_unlock();
+
+		if (err) {
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, skb->dev);
+			if (unlikely(err)) {
+				if (err == -EHOSTUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INADDRERRORS);
+				else if (err == -ENETUNREACH)
+					IP_INC_STATS_BH(dev_net(skb->dev),
+							IPSTATS_MIB_INNOROUTES);
+				else if (err == -EXDEV)
+					NET_INC_STATS_BH(dev_net(skb->dev),
+							 LINUX_MIB_IPRPFILTER);
+				goto drop;
+			}
 		}
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8bce8..8416f8a68e6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	int res;
 
+	if (sk->sk_rx_dst) {
+		struct dst_entry *dst = sk->sk_rx_dst;
+		if (unlikely(dst->obsolete)) {
+			if (dst->ops->check(dst, 0) == NULL) {
+				dst_release(dst);
+				sk->sk_rx_dst = NULL;
+			}
+		}
+	}
+	if (unlikely(sk->sk_rx_dst == NULL))
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
-	if (skb != NULL)
+	if (skb != NULL) {
+		sk->sk_rx_dst = dst_clone(skb_dst(skb));
 		security_inet_conn_established(sk, skb);
+	}
 
 	/* Make sure socket is routed, for correct metrics.  */
 	icsk->icsk_af_ops->rebuild_header(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda2ca17135..13857df1dae 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,6 +1671,52 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
+int tcp_v4_early_demux(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct sock *sk;
+	int err;
+
+	err = -ENOENT;
+	if (skb->pkt_type != PACKET_HOST)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+		goto out_err;
+
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto out_err;
+
+	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
+		goto out_err;
+
+	sk = __inet_lookup_established(net, &tcp_hashinfo,
+				       iph->saddr, th->source,
+				       iph->daddr, th->dest,
+				       skb->dev->ifindex);
+	if (sk) {
+		skb->sk = sk;
+		skb->destructor = sock_edemux;
+		if (sk->sk_state != TCP_TIME_WAIT) {
+			struct dst_entry *dst = sk->sk_rx_dst;
+			if (dst)
+				dst = dst_check(dst, 0);
+			if (dst) {
+				skb_dst_set_noref(skb, dst);
+				err = 0;
+			}
+		}
+	}
+
+out_err:
+	return err;
+}
+
 /*
  *	From tcp_input.c
  */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb015317c9f..72b7c63b1a3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
+		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
+
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
 		 * copy any s_data_payload stored at the original socket.
-- 
cgit v1.2.3-70-g09d2


From c074da2810c118b3812f32d6754bd9ead2f169e7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Jun 2012 23:14:15 +0000
Subject: ipv4: tcp: dont cache unconfirmed intput dst

DDOS synflood attacks hit badly IP route cache.

On typical machines, this cache is allowed to hold up to 8 Millions dst
entries, 256 bytes for each, for a total of 2GB of memory.

rt_garbage_collect() triggers and tries to cleanup things.

Eventually route cache is disabled but machine is under fire and might
OOM and crash.

This patch exploits the new TCP early demux, to set a nocache
boolean in case incoming TCP frame is for a not yet ESTABLISHED or
TIMEWAIT socket.

This 'nocache' boolean is then used in case dst entry is not found in
route cache, to create an unhashed dst entry (DST_NOCACHE)

SYN-cookie-ACK sent use a similar mechanism (ipv4: tcp: dont cache
output dst for syncookies), so after this patch, a machine is able to
absorb a DDOS synflood attack without polluting its IP route cache.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h | 2 +-
 include/net/route.h    | 8 ++++----
 include/net/tcp.h      | 2 +-
 net/ipv4/arp.c         | 2 +-
 net/ipv4/ip_fragment.c | 2 +-
 net/ipv4/ip_input.c    | 5 +++--
 net/ipv4/route.c       | 8 +++++---
 net/ipv4/tcp_ipv4.c    | 4 +++-
 net/ipv4/xfrm4_input.c | 2 +-
 9 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 967b926cbfb..7cfc8f76914 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb);
+	int			(*early_demux)(struct sk_buff *skb, bool *nocache);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/route.h b/include/net/route.h
index 47eb25ac1f7..6361f933577 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 }
 
 extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref);
+				 u8 tos, struct net_device *devin, bool noref, bool nocache);
 
 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 				 u8 tos, struct net_device *devin)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, false);
+	return ip_route_input_common(skb, dst, src, tos, devin, false, false);
 }
 
 static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin)
+				       u8 tos, struct net_device *devin, bool nocache)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, true);
+	return ip_route_input_common(skb, dst, src, tos, devin, true, nocache);
 }
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6660ffc4963..917ed2e55e8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
-extern int tcp_v4_early_demux(struct sk_buff *skb);
+extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0c757..6a979594436 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409..978d55f256e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg)
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
 		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev);
+					   iph->tos, head->dev, false);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2a39204de5b..7be54c8dcbe 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -326,6 +326,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 */
 	if (skb_dst(skb) == NULL) {
 		int err = -ENOENT;
+		bool nocache = false;
 
 		if (sysctl_ip_early_demux) {
 			const struct net_protocol *ipprot;
@@ -334,13 +335,13 @@ static int ip_rcv_finish(struct sk_buff *skb)
 			rcu_read_lock();
 			ipprot = rcu_dereference(inet_protos[protocol]);
 			if (ipprot && ipprot->early_demux)
-				err = ipprot->early_demux(skb);
+				err = ipprot->early_demux(skb, &nocache);
 			rcu_read_unlock();
 		}
 
 		if (err) {
 			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-						   iph->tos, skb->dev);
+						   iph->tos, skb->dev, nocache);
 			if (unlikely(err)) {
 				if (err == -EXDEV)
 					NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 81533e3a23d..fdc7900f9d7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			       u8 tos, struct net_device *dev)
+			       u8 tos, struct net_device *dev, bool nocache)
 {
 	struct fib_result res;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -2353,6 +2353,8 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
+	if (nocache)
+		rth->dst.flags |= DST_NOCACHE;
 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
 	err = 0;
@@ -2395,7 +2397,7 @@ martian_source_keep_err:
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref)
+			   u8 tos, struct net_device *dev, bool noref, bool nocache)
 {
 	struct rtable	*rth;
 	unsigned int	hash;
@@ -2471,7 +2473,7 @@ skip_cache:
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache);
 	rcu_read_unlock();
 	return res;
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1781dc650b9..33aabd4fc20 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1673,7 +1673,7 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-int tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
@@ -1719,6 +1719,8 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 				}
 			}
 		}
+	} else {
+		*no_dst_cache = true;
 	}
 
 out_err:
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216d..eee636b191b 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev))
+					 iph->tos, skb->dev, false))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
cgit v1.2.3-70-g09d2


From c10237e077cef50e925f052e49f3b4fead9d71f9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 27 Jun 2012 17:05:06 -0700
Subject: Revert "ipv4: tcp: dont cache unconfirmed intput dst"

This reverts commit c074da2810c118b3812f32d6754bd9ead2f169e7.

This change has several unwanted side effects:

1) Sockets will cache the DST_NOCACHE route in sk->sk_rx_dst and we'll
   thus never create a real cached route.

2) All TCP traffic will use DST_NOCACHE and never use the routing
   cache at all.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h | 2 +-
 include/net/route.h    | 8 ++++----
 include/net/tcp.h      | 2 +-
 net/ipv4/arp.c         | 2 +-
 net/ipv4/ip_fragment.c | 2 +-
 net/ipv4/ip_input.c    | 5 ++---
 net/ipv4/route.c       | 8 +++-----
 net/ipv4/tcp_ipv4.c    | 4 +---
 net/ipv4/xfrm4_input.c | 2 +-
 9 files changed, 15 insertions(+), 20 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 7cfc8f76914..967b926cbfb 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb, bool *nocache);
+	int			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/route.h b/include/net/route.h
index 6361f933577..47eb25ac1f7 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 }
 
 extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref, bool nocache);
+				 u8 tos, struct net_device *devin, bool noref);
 
 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 				 u8 tos, struct net_device *devin)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, false, false);
+	return ip_route_input_common(skb, dst, src, tos, devin, false);
 }
 
 static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin, bool nocache)
+				       u8 tos, struct net_device *devin)
 {
-	return ip_route_input_common(skb, dst, src, tos, devin, true, nocache);
+	return ip_route_input_common(skb, dst, src, tos, devin, true);
 }
 
 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 917ed2e55e8..6660ffc4963 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
-extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache);
+extern int tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 6a979594436..2e560f0c757 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) {
+	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 978d55f256e..8d07c973409 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg)
 		skb_dst_drop(head);
 		iph = ip_hdr(head);
 		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
-					   iph->tos, head->dev, false);
+					   iph->tos, head->dev);
 		if (err)
 			goto out_rcu_unlock;
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7be54c8dcbe..2a39204de5b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -326,7 +326,6 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 */
 	if (skb_dst(skb) == NULL) {
 		int err = -ENOENT;
-		bool nocache = false;
 
 		if (sysctl_ip_early_demux) {
 			const struct net_protocol *ipprot;
@@ -335,13 +334,13 @@ static int ip_rcv_finish(struct sk_buff *skb)
 			rcu_read_lock();
 			ipprot = rcu_dereference(inet_protos[protocol]);
 			if (ipprot && ipprot->early_demux)
-				err = ipprot->early_demux(skb, &nocache);
+				err = ipprot->early_demux(skb);
 			rcu_read_unlock();
 		}
 
 		if (err) {
 			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-						   iph->tos, skb->dev, nocache);
+						   iph->tos, skb->dev);
 			if (unlikely(err)) {
 				if (err == -EXDEV)
 					NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fdc7900f9d7..81533e3a23d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			       u8 tos, struct net_device *dev, bool nocache)
+			       u8 tos, struct net_device *dev)
 {
 	struct fib_result res;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -2353,8 +2353,6 @@ local_input:
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
-	if (nocache)
-		rth->dst.flags |= DST_NOCACHE;
 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
 	err = 0;
@@ -2397,7 +2395,7 @@ martian_source_keep_err:
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref, bool nocache)
+			   u8 tos, struct net_device *dev, bool noref)
 {
 	struct rtable	*rth;
 	unsigned int	hash;
@@ -2473,7 +2471,7 @@ skip_cache:
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache);
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
 	rcu_read_unlock();
 	return res;
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 33aabd4fc20..1781dc650b9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1673,7 +1673,7 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache)
+int tcp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
@@ -1719,8 +1719,6 @@ int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache)
 				}
 			}
 		}
-	} else {
-		*no_dst_cache = true;
 	}
 
 out_err:
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index eee636b191b..06814b6216d 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev, false))
+					 iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
cgit v1.2.3-70-g09d2


From 160eb5a6b14ca2eab5c598bdbbb24c24624bad34 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 27 Jun 2012 22:01:22 -0700
Subject: ipv4: Kill early demux method return value.

It's completely unnecessary.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h |  2 +-
 include/net/tcp.h      |  2 +-
 net/ipv4/ip_input.c    | 42 +++++++++++++++++++-----------------------
 net/ipv4/tcp_ipv4.c    | 19 ++++++-------------
 4 files changed, 27 insertions(+), 38 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 967b926cbfb..057f2d31556 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -37,7 +37,7 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	int			(*early_demux)(struct sk_buff *skb);
+	void			(*early_demux)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	int			(*gso_send_check)(struct sk_buff *skb);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6660ffc4963..53fb7d81417 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
 
-extern int tcp_v4_early_demux(struct sk_buff *skb);
+extern void tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2a39204de5b..b27d4440f52 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -320,33 +320,29 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 
+	if (sysctl_ip_early_demux && !skb_dst(skb)) {
+		const struct net_protocol *ipprot;
+		int protocol = iph->protocol;
+
+		rcu_read_lock();
+		ipprot = rcu_dereference(inet_protos[protocol]);
+		if (ipprot && ipprot->early_demux)
+			ipprot->early_demux(skb);
+		rcu_read_unlock();
+	}
+
 	/*
 	 *	Initialise the virtual path cache for the packet. It describes
 	 *	how the packet travels inside Linux networking.
 	 */
-	if (skb_dst(skb) == NULL) {
-		int err = -ENOENT;
-
-		if (sysctl_ip_early_demux) {
-			const struct net_protocol *ipprot;
-			int protocol = iph->protocol;
-
-			rcu_read_lock();
-			ipprot = rcu_dereference(inet_protos[protocol]);
-			if (ipprot && ipprot->early_demux)
-				err = ipprot->early_demux(skb);
-			rcu_read_unlock();
-		}
-
-		if (err) {
-			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-						   iph->tos, skb->dev);
-			if (unlikely(err)) {
-				if (err == -EXDEV)
-					NET_INC_STATS_BH(dev_net(skb->dev),
-							 LINUX_MIB_IPRPFILTER);
-				goto drop;
-			}
+	if (!skb_dst(skb)) {
+		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					       iph->tos, skb->dev);
+		if (unlikely(err)) {
+			if (err == -EXDEV)
+				NET_INC_STATS_BH(dev_net(skb->dev),
+						 LINUX_MIB_IPRPFILTER);
+			goto drop;
 		}
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1781dc650b9..b4ae1c199f3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1673,30 +1673,28 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-int tcp_v4_early_demux(struct sk_buff *skb)
+void tcp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
 	struct net_device *dev;
 	struct sock *sk;
-	int err;
 
-	err = -ENOENT;
 	if (skb->pkt_type != PACKET_HOST)
-		goto out_err;
+		return;
 
 	if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
-		goto out_err;
+		return;
 
 	iph = ip_hdr(skb);
 	th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
 
 	if (th->doff < sizeof(struct tcphdr) / 4)
-		goto out_err;
+		return;
 
 	if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
-		goto out_err;
+		return;
 
 	dev = skb->dev;
 	sk = __inet_lookup_established(net, &tcp_hashinfo,
@@ -1713,16 +1711,11 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 			if (dst) {
 				struct rtable *rt = (struct rtable *) dst;
 
-				if (rt->rt_iif == dev->ifindex) {
+				if (rt->rt_iif == dev->ifindex)
 					skb_dst_set_noref(skb, dst);
-					err = 0;
-				}
 			}
 		}
 	}
-
-out_err:
-	return err;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 4aabd8ef8c43677cfee3e1e36c5a79edddb41942 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 9 Jul 2012 16:07:30 -0700
Subject: tcp: Move dynamnic metrics handling into seperate file.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |   4 ++
 net/ipv4/Makefile      |   2 +-
 net/ipv4/tcp_input.c   | 188 +-----------------------------------------------
 net/ipv4/tcp_metrics.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 199 insertions(+), 187 deletions(-)
 create mode 100644 net/ipv4/tcp_metrics.c

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 53fb7d81417..98ca797001a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,6 +388,8 @@ extern void tcp_enter_frto(struct sock *sk);
 extern void tcp_enter_loss(struct sock *sk, int how);
 extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
+extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
 extern void tcp_init_sock(struct sock *sk);
 extern unsigned int tcp_poll(struct file * file, struct socket *sock,
@@ -556,6 +558,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 	return (tp->srtt >> 3) + tp->rttvar;
 }
 
+extern void tcp_set_rto(struct sock *sk);
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6..5a23e8b3710 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	     tcp_minisocks.o tcp_cong.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ca0d0e7c977..055ac49b8b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_frto_response __read_mostly;
-int sysctl_tcp_nometrics_save __read_mostly;
 
 int sysctl_tcp_thin_dupack __read_mostly;
 
@@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-static inline void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Old crap is replaced with new one. 8)
@@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk)
 	tcp_bound_rto(sk);
 }
 
-/* Save metrics learned by this TCP session.
-   This function is called only, when TCP finishes successfully
-   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
- */
-void tcp_update_metrics(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct dst_entry *dst = __sk_dst_get(sk);
-
-	if (sysctl_tcp_nometrics_save)
-		return;
-
-	if (dst && (dst->flags & DST_HOST)) {
-		const struct inet_connection_sock *icsk = inet_csk(sk);
-		int m;
-		unsigned long rtt;
-
-		dst_confirm(dst);
-
-		if (icsk->icsk_backoff || !tp->srtt) {
-			/* This session failed to estimate rtt. Why?
-			 * Probably, no packets returned in time.
-			 * Reset our results.
-			 */
-			if (!(dst_metric_locked(dst, RTAX_RTT)))
-				dst_metric_set(dst, RTAX_RTT, 0);
-			return;
-		}
-
-		rtt = dst_metric_rtt(dst, RTAX_RTT);
-		m = rtt - tp->srtt;
-
-		/* If newly calculated rtt larger than stored one,
-		 * store new one. Otherwise, use EWMA. Remember,
-		 * rtt overestimation is always better than underestimation.
-		 */
-		if (!(dst_metric_locked(dst, RTAX_RTT))) {
-			if (m <= 0)
-				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
-			else
-				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
-		}
-
-		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
-			unsigned long var;
-			if (m < 0)
-				m = -m;
-
-			/* Scale deviation to rttvar fixed point */
-			m >>= 1;
-			if (m < tp->mdev)
-				m = tp->mdev;
-
-			var = dst_metric_rtt(dst, RTAX_RTTVAR);
-			if (m >= var)
-				var = m;
-			else
-				var -= (var - m) >> 2;
-
-			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
-		}
-
-		if (tcp_in_initial_slowstart(tp)) {
-			/* Slow start still did not finish. */
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
-			if (!dst_metric_locked(dst, RTAX_CWND) &&
-			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
-		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
-			   icsk->icsk_ca_state == TCP_CA_Open) {
-			/* Cong. avoidance phase, cwnd is reliable. */
-			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH,
-					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_cwnd) >> 1);
-		} else {
-			/* Else slow start did not finish, cwnd is non-sense,
-			   ssthresh may be also invalid.
-			 */
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_ssthresh) >> 1);
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
-		}
-
-		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
-			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
-			    tp->reordering != sysctl_tcp_reordering)
-				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
-		}
-	}
-}
-
 __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
  * Packet counting of FACK is based on in-order assumptions, therefore TCP
  * disables it when reordering is detected
  */
-static void tcp_disable_fack(struct tcp_sock *tp)
+void tcp_disable_fack(struct tcp_sock *tp)
 {
 	/* RFC3517 uses different metric in lost marker => reset on change */
 	if (tcp_is_fack(tp))
@@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 }
 
-/* Initialize metrics on socket. */
-
-static void tcp_init_metrics(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct dst_entry *dst = __sk_dst_get(sk);
-
-	if (dst == NULL)
-		goto reset;
-
-	dst_confirm(dst);
-
-	if (dst_metric_locked(dst, RTAX_CWND))
-		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
-	if (dst_metric(dst, RTAX_SSTHRESH)) {
-		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
-		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
-			tp->snd_ssthresh = tp->snd_cwnd_clamp;
-	} else {
-		/* ssthresh may have been reduced unnecessarily during.
-		 * 3WHS. Restore it back to its initial default.
-		 */
-		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-	}
-	if (dst_metric(dst, RTAX_REORDERING) &&
-	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
-		tcp_disable_fack(tp);
-		tcp_disable_early_retrans(tp);
-		tp->reordering = dst_metric(dst, RTAX_REORDERING);
-	}
-
-	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
-		goto reset;
-
-	/* Initial rtt is determined from SYN,SYN-ACK.
-	 * The segment is small and rtt may appear much
-	 * less than real one. Use per-dst memory
-	 * to make it more realistic.
-	 *
-	 * A bit of theory. RTT is time passed after "normal" sized packet
-	 * is sent until it is ACKed. In normal circumstances sending small
-	 * packets force peer to delay ACKs and calculation is correct too.
-	 * The algorithm is adaptive and, provided we follow specs, it
-	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
-	 * tricks sort of "quick acks" for time long enough to decrease RTT
-	 * to low value, and then abruptly stops to do it and starts to delay
-	 * ACKs, wait for troubles.
-	 */
-	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
-		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
-		tp->rtt_seq = tp->snd_nxt;
-	}
-	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
-		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
-		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
-	}
-	tcp_set_rto(sk);
-reset:
-	if (tp->srtt == 0) {
-		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
-		 * 3WHS. This is most likely due to retransmission,
-		 * including spurious one. Reset the RTO back to 3secs
-		 * from the more aggressive 1sec to avoid more spurious
-		 * retransmission.
-		 */
-		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
-		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
-	}
-	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
-	 * retransmitted. In light of RFC6298 more aggressive 1sec
-	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
-	 * retransmission has occurred.
-	 */
-	if (tp->total_retrans > 1)
-		tp->snd_cwnd = 1;
-	else
-		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
 static void tcp_update_reordering(struct sock *sk, const int metric,
 				  const int ts)
 {
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 00000000000..2793ecf928d
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,192 @@
+#include <linux/cache.h>
+#include <linux/tcp.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+/* Save metrics learned by this TCP session.  This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (sysctl_tcp_nometrics_save)
+		return;
+
+	if (dst && (dst->flags & DST_HOST)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		int m;
+		unsigned long rtt;
+
+		dst_confirm(dst);
+
+		if (icsk->icsk_backoff || !tp->srtt) {
+			/* This session failed to estimate rtt. Why?
+			 * Probably, no packets returned in time.
+			 * Reset our results.
+			 */
+			if (!(dst_metric_locked(dst, RTAX_RTT)))
+				dst_metric_set(dst, RTAX_RTT, 0);
+			return;
+		}
+
+		rtt = dst_metric_rtt(dst, RTAX_RTT);
+		m = rtt - tp->srtt;
+
+		/* If newly calculated rtt larger than stored one,
+		 * store new one. Otherwise, use EWMA. Remember,
+		 * rtt overestimation is always better than underestimation.
+		 */
+		if (!(dst_metric_locked(dst, RTAX_RTT))) {
+			if (m <= 0)
+				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
+			else
+				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
+		}
+
+		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
+			unsigned long var;
+			if (m < 0)
+				m = -m;
+
+			/* Scale deviation to rttvar fixed point */
+			m >>= 1;
+			if (m < tp->mdev)
+				m = tp->mdev;
+
+			var = dst_metric_rtt(dst, RTAX_RTTVAR);
+			if (m >= var)
+				var = m;
+			else
+				var -= (var - m) >> 2;
+
+			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
+		}
+
+		if (tcp_in_initial_slowstart(tp)) {
+			/* Slow start still did not finish. */
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
+			if (!dst_metric_locked(dst, RTAX_CWND) &&
+			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
+				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
+		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+			   icsk->icsk_ca_state == TCP_CA_Open) {
+			/* Cong. avoidance phase, cwnd is reliable. */
+			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH,
+					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+			if (!dst_metric_locked(dst, RTAX_CWND))
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_cwnd) >> 1);
+		} else {
+			/* Else slow start did not finish, cwnd is non-sense,
+			   ssthresh may be also invalid.
+			 */
+			if (!dst_metric_locked(dst, RTAX_CWND))
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_ssthresh) >> 1);
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
+		}
+
+		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
+			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+			    tp->reordering != sysctl_tcp_reordering)
+				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
+		}
+	}
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (dst == NULL)
+		goto reset;
+
+	dst_confirm(dst);
+
+	if (dst_metric_locked(dst, RTAX_CWND))
+		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
+	if (dst_metric(dst, RTAX_SSTHRESH)) {
+		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	} else {
+		/* ssthresh may have been reduced unnecessarily during.
+		 * 3WHS. Restore it back to its initial default.
+		 */
+		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	}
+	if (dst_metric(dst, RTAX_REORDERING) &&
+	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
+		tp->reordering = dst_metric(dst, RTAX_REORDERING);
+	}
+
+	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
+		goto reset;
+
+	/* Initial rtt is determined from SYN,SYN-ACK.
+	 * The segment is small and rtt may appear much
+	 * less than real one. Use per-dst memory
+	 * to make it more realistic.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal circumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+	 */
+	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
+		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
+		tp->rtt_seq = tp->snd_nxt;
+	}
+	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
+		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
+		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+	}
+	tcp_set_rto(sk);
+reset:
+	if (tp->srtt == 0) {
+		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
+		 * 3WHS. This is most likely due to retransmission,
+		 * including spurious one. Reset the RTO back to 3secs
+		 * from the more aggressive 1sec to avoid more spurious
+		 * retransmission.
+		 */
+		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+	}
+	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+	 * retransmitted. In light of RFC6298 more aggressive 1sec
+	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+	 * retransmission has occurred.
+	 */
+	if (tp->total_retrans > 1)
+		tp->snd_cwnd = 1;
+	else
+		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
-- 
cgit v1.2.3-70-g09d2


From ab92bb2f679d66c7e12a6b1c0cdd76fe308f6546 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 9 Jul 2012 16:19:30 -0700
Subject: tcp: Abstract back handling peer aliveness test into helper function.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |  1 +
 net/ipv4/tcp_ipv4.c    |  2 +-
 net/ipv4/tcp_metrics.c | 10 ++++++++++
 net/ipv6/tcp_ipv6.c    |  2 +-
 4 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 98ca797001a..5478356ea8c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -389,6 +389,7 @@ extern void tcp_enter_loss(struct sock *sk, int how);
 extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
 extern void tcp_init_metrics(struct sock *sk);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
 extern void tcp_init_sock(struct sock *sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 64568fa21d0..e9312a8f33a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1405,7 +1405,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 (!peer || !peer->tcp_ts_stamp) &&
-			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			 !tcp_peer_is_proven(req, dst)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 2793ecf928d..9afe703c85c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,7 +1,9 @@
+#include <linux/module.h>
 #include <linux/cache.h>
 #include <linux/tcp.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/request_sock.h>
 #include <net/sock.h>
 #include <net/dst.h>
 #include <net/tcp.h>
@@ -190,3 +192,11 @@ reset:
 		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
+
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+{
+	if (!dst)
+		return false;
+	return dst_metric(dst, RTAX_RTT) ? true : false;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6cc67ed6c2e..75d179555c2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1177,7 +1177,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 (!peer || !peer->tcp_ts_stamp) &&
-			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			 !tcp_peer_is_proven(req, dst)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
-- 
cgit v1.2.3-70-g09d2


From 51c5d0c4b169bf762f09e0d5b283a7f0b2a45739 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 00:49:14 -0700
Subject: tcp: Maintain dynamic metrics in local cache.

Maintain a local hash table of TCP dynamic metrics blobs.

Computed TCP metrics are no longer maintained in the route metrics.

The table uses RCU and an extremely simple hash so that it has low
latency and low overhead.  A simple hash is legitimate because we only
make metrics blobs for fully established connections.

Some tweaking of the default hash table sizes, metric timeouts, and
the hash chain length limit certainly could use some tweaking.  But
the basic design seems sound.

With help from Eric Dumazet and Joe Perches.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h |   3 +
 include/net/tcp.h        |   1 +
 net/ipv4/tcp.c           |   2 +
 net/ipv4/tcp_metrics.c   | 555 +++++++++++++++++++++++++++++++++++++++--------
 4 files changed, 468 insertions(+), 93 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 599e48fa97c..2e089a99d60 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -7,6 +7,7 @@
 
 #include <net/inet_frag.h>
 
+struct tcpm_hash_bucket;
 struct ctl_table_header;
 struct ipv4_devconf;
 struct fib_rules_ops;
@@ -39,6 +40,8 @@ struct netns_ipv4 {
 	struct sock		**icmp_sk;
 	struct sock		*tcp_sock;
 	struct inet_peer_base	*peers;
+	struct tcpm_hash_bucket	*tcp_metrics_hash;
+	unsigned int		tcp_metrics_hash_mask;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5478356ea8c..0900d63d162 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -389,6 +389,7 @@ extern void tcp_enter_loss(struct sock *sk, int how);
 extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
 extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_metrics_init(void);
 extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f60e4..29aa0c800cd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3563,6 +3563,8 @@ void __init tcp_init(void)
 	pr_info("Hash tables configured (established %u bind %u)\n",
 		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
+	tcp_metrics_init();
+
 	tcp_register_congestion_control(&tcp_reno);
 
 	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 9afe703c85c..56223bab251 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,134 +1,431 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
 #include <linux/tcp.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
 #include <net/request_sock.h>
+#include <net/inetpeer.h>
 #include <net/sock.h>
+#include <net/ipv6.h>
 #include <net/dst.h>
 #include <net/tcp.h>
 
 int sysctl_tcp_nometrics_save __read_mostly;
 
+enum tcp_metric_index {
+	TCP_METRIC_RTT,
+	TCP_METRIC_RTTVAR,
+	TCP_METRIC_SSTHRESH,
+	TCP_METRIC_CWND,
+	TCP_METRIC_REORDERING,
+
+	/* Always last.  */
+	TCP_METRIC_MAX,
+};
+
+struct tcp_metrics_block {
+	struct tcp_metrics_block __rcu	*tcpm_next;
+	struct inetpeer_addr		tcpm_addr;
+	unsigned long			tcpm_stamp;
+	u32				tcpm_lock;
+	u32				tcpm_vals[TCP_METRIC_MAX];
+};
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+			      enum tcp_metric_index idx)
+{
+	return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+			  enum tcp_metric_index idx)
+{
+	return tm->tcpm_vals[idx];
+}
+
+static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
+				  enum tcp_metric_index idx)
+{
+	return msecs_to_jiffies(tm->tcpm_vals[idx]);
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+			   enum tcp_metric_index idx,
+			   u32 val)
+{
+	tm->tcpm_vals[idx] = val;
+}
+
+static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
+				 enum tcp_metric_index idx,
+				 u32 val)
+{
+	tm->tcpm_vals[idx] = jiffies_to_msecs(val);
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+		      const struct inetpeer_addr *b)
+{
+	const struct in6_addr *a6, *b6;
+
+	if (a->family != b->family)
+		return false;
+	if (a->family == AF_INET)
+		return a->addr.a4 == b->addr.a4;
+
+	a6 = (const struct in6_addr *) &a->addr.a6[0];
+	b6 = (const struct in6_addr *) &b->addr.a6[0];
+
+	return ipv6_addr_equal(a6, b6);
+}
+
+struct tcpm_hash_bucket {
+	struct tcp_metrics_block __rcu	*chain;
+};
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+	u32 val;
+
+	val = 0;
+	if (dst_metric_locked(dst, RTAX_RTT))
+		val |= 1 << TCP_METRIC_RTT;
+	if (dst_metric_locked(dst, RTAX_RTTVAR))
+		val |= 1 << TCP_METRIC_RTTVAR;
+	if (dst_metric_locked(dst, RTAX_SSTHRESH))
+		val |= 1 << TCP_METRIC_SSTHRESH;
+	if (dst_metric_locked(dst, RTAX_CWND))
+		val |= 1 << TCP_METRIC_CWND;
+	if (dst_metric_locked(dst, RTAX_REORDERING))
+		val |= 1 << TCP_METRIC_REORDERING;
+	tm->tcpm_lock = val;
+
+	tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
+	tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
+	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+}
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+					  struct inetpeer_addr *addr,
+					  unsigned int hash,
+					  bool reclaim)
+{
+	struct tcp_metrics_block *tm;
+	struct net *net;
+
+	spin_lock_bh(&tcp_metrics_lock);
+	net = dev_net(dst->dev);
+	if (unlikely(reclaim)) {
+		struct tcp_metrics_block *oldest;
+
+		oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
+		for (tm = rcu_dereference(oldest->tcpm_next); tm;
+		     tm = rcu_dereference(tm->tcpm_next)) {
+			if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+				oldest = tm;
+		}
+		tm = oldest;
+	} else {
+		tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+		if (!tm)
+			goto out_unlock;
+	}
+	tm->tcpm_addr = *addr;
+	tm->tcpm_stamp = jiffies;
+
+	tcpm_suck_dst(tm, dst);
+
+	if (likely(!reclaim)) {
+		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
+		rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+	}
+
+out_unlock:
+	spin_unlock_bh(&tcp_metrics_lock);
+	return tm;
+}
+
+#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+		tcpm_suck_dst(tm, dst);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH	5
+#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+	if (tm)
+		return tm;
+	if (depth > TCP_METRICS_RECLAIM_DEPTH)
+		return TCP_METRICS_RECLAIM_PTR;
+	return NULL;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
+						   struct net *net, unsigned int hash)
+{
+	struct tcp_metrics_block *tm;
+	int depth = 0;
+
+	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (addr_same(&tm->tcpm_addr, addr))
+			break;
+		depth++;
+	}
+	return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+						       struct dst_entry *dst)
+{
+	struct tcp_metrics_block *tm;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct net *net;
+
+	addr.family = req->rsk_ops->family;
+	switch (addr.family) {
+	case AF_INET:
+		addr.addr.a4 = inet_rsk(req)->rmt_addr;
+		hash = (__force unsigned int) addr.addr.a4;
+		break;
+	case AF_INET6:
+		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
+		hash = ((__force unsigned int) addr.addr.a6[0] ^
+			(__force unsigned int) addr.addr.a6[1] ^
+			(__force unsigned int) addr.addr.a6[2] ^
+			(__force unsigned int) addr.addr.a6[3]);
+		break;
+	default:
+		return NULL;
+	}
+
+	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+	net = dev_net(dst->dev);
+	hash &= net->ipv4.tcp_metrics_hash_mask;
+
+	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (addr_same(&tm->tcpm_addr, &addr))
+			break;
+	}
+	tcpm_check_stamp(tm, dst);
+	return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+						 struct dst_entry *dst,
+						 bool create)
+{
+	struct tcp_metrics_block *tm;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct net *net;
+	bool reclaim;
+
+	addr.family = sk->sk_family;
+	switch (addr.family) {
+	case AF_INET:
+		addr.addr.a4 = inet_sk(sk)->inet_daddr;
+		hash = (__force unsigned int) addr.addr.a4;
+		break;
+	case AF_INET6:
+		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
+		hash = ((__force unsigned int) addr.addr.a6[0] ^
+			(__force unsigned int) addr.addr.a6[1] ^
+			(__force unsigned int) addr.addr.a6[2] ^
+			(__force unsigned int) addr.addr.a6[3]);
+		break;
+	default:
+		return NULL;
+	}
+
+	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+	net = dev_net(dst->dev);
+	hash &= net->ipv4.tcp_metrics_hash_mask;
+
+	tm = __tcp_get_metrics(&addr, net, hash);
+	reclaim = false;
+	if (tm == TCP_METRICS_RECLAIM_PTR) {
+		reclaim = true;
+		tm = NULL;
+	}
+	if (!tm && create)
+		tm = tcpm_new(dst, &addr, hash, reclaim);
+	else
+		tcpm_check_stamp(tm, dst);
+
+	return tm;
+}
+
 /* Save metrics learned by this TCP session.  This function is called
  * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
  * or goes from LAST-ACK to CLOSE.
  */
 void tcp_update_metrics(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_metrics_block *tm;
+	unsigned long rtt;
+	u32 val;
+	int m;
 
-	if (sysctl_tcp_nometrics_save)
+	if (sysctl_tcp_nometrics_save || !dst)
 		return;
 
-	if (dst && (dst->flags & DST_HOST)) {
-		const struct inet_connection_sock *icsk = inet_csk(sk);
-		int m;
-		unsigned long rtt;
-
+	if (dst->flags & DST_HOST)
 		dst_confirm(dst);
 
-		if (icsk->icsk_backoff || !tp->srtt) {
-			/* This session failed to estimate rtt. Why?
-			 * Probably, no packets returned in time.
-			 * Reset our results.
-			 */
-			if (!(dst_metric_locked(dst, RTAX_RTT)))
-				dst_metric_set(dst, RTAX_RTT, 0);
-			return;
-		}
+	rcu_read_lock();
+	if (icsk->icsk_backoff || !tp->srtt) {
+		/* This session failed to estimate rtt. Why?
+		 * Probably, no packets returned in time.  Reset our
+		 * results.
+		 */
+		tm = tcp_get_metrics(sk, dst, false);
+		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+			tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+		goto out_unlock;
+	} else
+		tm = tcp_get_metrics(sk, dst, true);
 
-		rtt = dst_metric_rtt(dst, RTAX_RTT);
-		m = rtt - tp->srtt;
+	if (!tm)
+		goto out_unlock;
 
-		/* If newly calculated rtt larger than stored one,
-		 * store new one. Otherwise, use EWMA. Remember,
-		 * rtt overestimation is always better than underestimation.
-		 */
-		if (!(dst_metric_locked(dst, RTAX_RTT))) {
-			if (m <= 0)
-				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
-			else
-				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
-		}
+	rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
+	m = rtt - tp->srtt;
 
-		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
-			unsigned long var;
-			if (m < 0)
-				m = -m;
+	/* If newly calculated rtt larger than stored one, store new
+	 * one. Otherwise, use EWMA. Remember, rtt overestimation is
+	 * always better than underestimation.
+	 */
+	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+		if (m <= 0)
+			rtt = tp->srtt;
+		else
+			rtt -= (m >> 3);
+		tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
+	}
 
-			/* Scale deviation to rttvar fixed point */
-			m >>= 1;
-			if (m < tp->mdev)
-				m = tp->mdev;
+	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+		unsigned long var;
 
-			var = dst_metric_rtt(dst, RTAX_RTTVAR);
-			if (m >= var)
-				var = m;
-			else
-				var -= (var - m) >> 2;
+		if (m < 0)
+			m = -m;
 
-			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
-		}
+		/* Scale deviation to rttvar fixed point */
+		m >>= 1;
+		if (m < tp->mdev)
+			m = tp->mdev;
 
-		if (tcp_in_initial_slowstart(tp)) {
-			/* Slow start still did not finish. */
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
-			if (!dst_metric_locked(dst, RTAX_CWND) &&
-			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
-		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
-			   icsk->icsk_ca_state == TCP_CA_Open) {
-			/* Cong. avoidance phase, cwnd is reliable. */
-			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH,
-					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_cwnd) >> 1);
-		} else {
-			/* Else slow start did not finish, cwnd is non-sense,
-			   ssthresh may be also invalid.
-			 */
-			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst_metric_set(dst, RTAX_CWND,
-					       (dst_metric(dst, RTAX_CWND) +
-						tp->snd_ssthresh) >> 1);
-			if (dst_metric(dst, RTAX_SSTHRESH) &&
-			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
-			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
-				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
-		}
+		var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+		if (m >= var)
+			var = m;
+		else
+			var -= (var - m) >> 2;
 
-		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
-			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+		tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
+	}
+
+	if (tcp_in_initial_slowstart(tp)) {
+		/* Slow start still did not finish. */
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && (tp->snd_cwnd >> 1) > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tp->snd_cwnd >> 1);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			if (tp->snd_cwnd > val)
+				tcp_metric_set(tm, TCP_METRIC_CWND,
+					       tp->snd_cwnd);
+		}
+	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+		   icsk->icsk_ca_state == TCP_CA_Open) {
+		/* Cong. avoidance phase, cwnd is reliable. */
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, RTAX_CWND, (val + tp->snd_cwnd) >> 1);
+		}
+	} else {
+		/* Else slow start did not finish, cwnd is non-sense,
+		 * ssthresh may be also invalid.
+		 */
+		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+			val = tcp_metric_get(tm, TCP_METRIC_CWND);
+			tcp_metric_set(tm, TCP_METRIC_CWND,
+				       (val + tp->snd_ssthresh) >> 1);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+			if (val && tp->snd_ssthresh > val)
+				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+					       tp->snd_ssthresh);
+		}
+		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+			if (val < tp->reordering &&
 			    tp->reordering != sysctl_tcp_reordering)
-				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
+				tcp_metric_set(tm, TCP_METRIC_REORDERING,
+					       tp->reordering);
 		}
 	}
+	tm->tcpm_stamp = jiffies;
+out_unlock:
+	rcu_read_unlock();
 }
 
 /* Initialize metrics on socket. */
 
 void tcp_init_metrics(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_metrics_block *tm;
+	u32 val;
 
 	if (dst == NULL)
 		goto reset;
 
 	dst_confirm(dst);
 
-	if (dst_metric_locked(dst, RTAX_CWND))
-		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
-	if (dst_metric(dst, RTAX_SSTHRESH)) {
-		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, true);
+	if (!tm) {
+		rcu_read_unlock();
+		goto reset;
+	}
+
+	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+	if (val) {
+		tp->snd_ssthresh = val;
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
 			tp->snd_ssthresh = tp->snd_cwnd_clamp;
 	} else {
@@ -137,16 +434,18 @@ void tcp_init_metrics(struct sock *sk)
 		 */
 		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	}
-	if (dst_metric(dst, RTAX_REORDERING) &&
-	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+	if (val && tp->reordering != val) {
 		tcp_disable_fack(tp);
 		tcp_disable_early_retrans(tp);
-		tp->reordering = dst_metric(dst, RTAX_REORDERING);
+		tp->reordering = val;
 	}
 
-	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
+	val = tcp_metric_get(tm, TCP_METRIC_RTT);
+	if (val == 0 || tp->srtt == 0) {
+		rcu_read_unlock();
 		goto reset;
-
+	}
 	/* Initial rtt is determined from SYN,SYN-ACK.
 	 * The segment is small and rtt may appear much
 	 * less than real one. Use per-dst memory
@@ -161,14 +460,18 @@ void tcp_init_metrics(struct sock *sk)
 	 * to low value, and then abruptly stops to do it and starts to delay
 	 * ACKs, wait for troubles.
 	 */
-	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
-		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
+	val = msecs_to_jiffies(val);
+	if (val > tp->srtt) {
+		tp->srtt = val;
 		tp->rtt_seq = tp->snd_nxt;
 	}
-	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
-		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
+	val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+	if (val > tp->mdev) {
+		tp->mdev = val;
 		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
 	}
+	rcu_read_unlock();
+
 	tcp_set_rto(sk);
 reset:
 	if (tp->srtt == 0) {
@@ -195,8 +498,74 @@ reset:
 
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
 {
+	struct tcp_metrics_block *tm;
+	bool ret;
+
 	if (!dst)
 		return false;
-	return dst_metric(dst, RTAX_RTT) ? true : false;
+
+	rcu_read_lock();
+	tm = __tcp_get_metrics_req(req, dst);
+	if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
+		ret = true;
+	else
+		ret = false;
+	rcu_read_unlock();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
+static unsigned long tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+	ssize_t ret;
+
+	if (!str)
+		return 0;
+
+	ret = kstrtoul(str, 0, &tcpmhash_entries);
+	if (ret)
+		return 0;
+
+	return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+	int slots, size;
+
+	slots = tcpmhash_entries;
+	if (!slots) {
+		if (totalram_pages >= 128 * 1024)
+			slots = 16 * 1024;
+		else
+			slots = 8 * 1024;
+	}
+
+	size = slots * sizeof(struct tcpm_hash_bucket);
+
+	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
+	if (!net->ipv4.tcp_metrics_hash)
+		return -ENOMEM;
+
+	net->ipv4.tcp_metrics_hash_mask = (slots - 1);
+
+	return 0;
+}
+
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+	kfree(net->ipv4.tcp_metrics_hash);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+	.init	=	tcp_net_metrics_init,
+	.exit	=	tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+	register_pernet_subsys(&tcp_net_metrics_ops);
+}
-- 
cgit v1.2.3-70-g09d2


From 81166dd6fa8eb780b2132d32fbc77eb6ac04e44e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2012 03:14:24 -0700
Subject: tcp: Move timestamps from inetpeer to metrics cache.

With help from Lin Ming.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inetpeer.h   |   4 +-
 include/net/tcp.h        |   5 +-
 net/ipv4/inetpeer.c      |   1 -
 net/ipv4/route.c         |   8 +--
 net/ipv4/tcp_ipv4.c      |  30 ++---------
 net/ipv4/tcp_metrics.c   | 136 +++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_minisocks.c |  46 ----------------
 net/ipv6/route.c         |  13 +----
 net/ipv6/tcp_ipv6.c      |  33 ++----------
 9 files changed, 149 insertions(+), 127 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index c27c8f10ebd..1119f6f6cdb 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -46,15 +46,13 @@ struct inet_peer {
 	};
 	/*
 	 * Once inet_peer is queued for deletion (refcnt == -1), following fields
-	 * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp
+	 * are not available: rid, ip_id_count
 	 * We can share memory with rcu_head to help keep inet_peer small.
 	 */
 	union {
 		struct {
 			atomic_t			rid;		/* Frag reception counter */
 			atomic_t			ip_id_count;	/* IP ID for the next packet */
-			__u32				tcp_ts;
-			__u32				tcp_ts_stamp;
 		};
 		struct rcu_head         rcu;
 		struct inet_peer	*gc_next;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0900d63d162..3618fefae04 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -390,7 +390,10 @@ extern void tcp_clear_retrans(struct tcp_sock *tp);
 extern void tcp_update_metrics(struct sock *sk);
 extern void tcp_init_metrics(struct sock *sk);
 extern void tcp_metrics_init(void);
-extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
+extern bool tcp_remember_stamp(struct sock *sk);
+extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
 extern void tcp_init_sock(struct sock *sk);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index da90a8cab61..f457bcb4135 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -508,7 +508,6 @@ relookup:
 				(daddr->family == AF_INET) ?
 					secure_ip_id(daddr->addr.a4) :
 					secure_ipv6_id(daddr->addr.a6));
-		p->tcp_ts_stamp = 0;
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
 		p->rate_last = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d02c91177d3..78d81543766 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2846,7 +2846,7 @@ static int rt_fill_info(struct net *net,
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
-	u32 id = 0, ts = 0, tsage = 0, error;
+	u32 id = 0, error;
 
 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
 	if (nlh == NULL)
@@ -2903,10 +2903,6 @@ static int rt_fill_info(struct net *net,
 		const struct inet_peer *peer = rt_peer_ptr(rt);
 		inet_peer_refcheck(peer);
 		id = atomic_read(&peer->ip_id_count) & 0xffff;
-		if (peer->tcp_ts_stamp) {
-			ts = peer->tcp_ts;
-			tsage = get_seconds() - peer->tcp_ts_stamp;
-		}
 		expires = ACCESS_ONCE(peer->pmtu_expires);
 		if (expires) {
 			if (time_before(jiffies, expires))
@@ -2942,7 +2938,7 @@ static int rt_fill_info(struct net *net,
 				goto nla_put_failure;
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, id, 0, 0,
 			       expires, error) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e9312a8f33a..d406bf7f37d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}
 
 	if (tcp_death_row.sysctl_tw_recycle &&
-	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
-		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
-		/*
-		 * VJ's idea. We save last timestamp seen from
-		 * the destination in peer table, when entering state
-		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
-		 * when trying new connection.
-		 */
-		if (peer) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
-				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
-				tp->rx_opt.ts_recent = peer->tcp_ts;
-			}
-		}
-	}
+	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+		tcp_fetch_timewait_stamp(sk, &rt->dst);
 
 	inet->inet_dport = usin->sin_port;
 	inet->inet_daddr = daddr;
@@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
 		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
-		struct inet_peer *peer = NULL;
 		struct flowi4 fl4;
 
 		/* VJ's idea. We save last timestamp seen
@@ -1390,12 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
 		    (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
-		    fl4.daddr == saddr &&
-		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
-			    (s32)(peer->tcp_ts - req->ts_recent) >
-							TCP_PAWS_WINDOW) {
+		    fl4.daddr == saddr) {
+			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		else if (!sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
-			 (!peer || !peer->tcp_ts_stamp) &&
-			 !tcp_peer_is_proven(req, dst)) {
+			 !tcp_peer_is_proven(req, dst, false)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 56223bab251..1fd83d3118f 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -34,6 +34,8 @@ struct tcp_metrics_block {
 	struct tcp_metrics_block __rcu	*tcpm_next;
 	struct inetpeer_addr		tcpm_addr;
 	unsigned long			tcpm_stamp;
+	u32				tcpm_ts;
+	u32				tcpm_ts_stamp;
 	u32				tcpm_lock;
 	u32				tcpm_vals[TCP_METRIC_MAX];
 };
@@ -114,6 +116,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
 	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
 	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+	tm->tcpm_ts = 0;
+	tm->tcpm_ts_stamp = 0;
 }
 
 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -230,6 +234,45 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 	return tm;
 }
 
+static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
+{
+	struct inet6_timewait_sock *tw6;
+	struct tcp_metrics_block *tm;
+	struct inetpeer_addr addr;
+	unsigned int hash;
+	struct net *net;
+
+	addr.family = tw->tw_family;
+	switch (addr.family) {
+	case AF_INET:
+		addr.addr.a4 = tw->tw_daddr;
+		hash = (__force unsigned int) addr.addr.a4;
+		break;
+	case AF_INET6:
+		tw6 = inet6_twsk((struct sock *)tw);
+		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
+		hash = ((__force unsigned int) addr.addr.a6[0] ^
+			(__force unsigned int) addr.addr.a6[1] ^
+			(__force unsigned int) addr.addr.a6[2] ^
+			(__force unsigned int) addr.addr.a6[3]);
+		break;
+	default:
+		return NULL;
+	}
+
+	hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+	net = twsk_net(tw);
+	hash &= net->ipv4.tcp_metrics_hash_mask;
+
+	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+	     tm = rcu_dereference(tm->tcpm_next)) {
+		if (addr_same(&tm->tcpm_addr, &addr))
+			break;
+	}
+	return tm;
+}
+
 static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 						 struct dst_entry *dst,
 						 bool create)
@@ -496,7 +539,7 @@ reset:
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
 {
 	struct tcp_metrics_block *tm;
 	bool ret;
@@ -506,16 +549,99 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
 
 	rcu_read_lock();
 	tm = __tcp_get_metrics_req(req, dst);
-	if (tm && tcp_metric_get(tm, TCP_METRIC_RTT))
-		ret = true;
-	else
-		ret = false;
+	if (paws_check) {
+		if (tm &&
+		    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
+		    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+			ret = false;
+		else
+			ret = true;
+	} else {
+		if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
+			ret = true;
+		else
+			ret = false;
+	}
 	rcu_read_unlock();
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
 
+void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
+{
+	struct tcp_metrics_block *tm;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, true);
+	if (tm) {
+		struct tcp_sock *tp = tcp_sk(sk);
+
+		if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
+			tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
+			tp->rx_opt.ts_recent = tm->tcpm_ts;
+		}
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
+
+/* VJ's idea. Save last timestamp seen from this destination and hold
+ * it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter
+ * synchronized state.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	bool ret = false;
+
+	if (dst) {
+		struct tcp_metrics_block *tm;
+
+		rcu_read_lock();
+		tm = tcp_get_metrics(sk, dst, true);
+		if (tm) {
+			struct tcp_sock *tp = tcp_sk(sk);
+
+			if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+			    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+			     tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+				tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+				tm->tcpm_ts = tp->rx_opt.ts_recent;
+			}
+			ret = true;
+		}
+		rcu_read_unlock();
+	}
+	return ret;
+}
+
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+	struct tcp_metrics_block *tm;
+	bool ret = false;
+
+	rcu_read_lock();
+	tm = __tcp_get_metrics_tw(tw);
+	if (tw) {
+		const struct tcp_timewait_sock *tcptw;
+		struct sock *sk = (struct sock *) tw;
+
+		tcptw = tcp_twsk(sk);
+		if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+		    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+		     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+			tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+			tm->tcpm_ts	   = tcptw->tw_ts_recent;
+		}
+		ret = true;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static unsigned long tcpmhash_entries;
 static int __init set_tcpmhash_entries(char *str)
 {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 72b7c63b1a3..a51aa534dab 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,52 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-static bool tcp_remember_stamp(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct inet_peer *peer;
-
-	peer = icsk->icsk_af_ops->get_peer(sk);
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
-			peer->tcp_ts = tp->rx_opt.ts_recent;
-		}
-		return true;
-	}
-
-	return false;
-}
-
-static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
-	const struct tcp_timewait_sock *tcptw;
-	struct sock *sk = (struct sock *) tw;
-	struct inet_peer *peer;
-
-	tcptw = tcp_twsk(sk);
-	peer = tcptw->tw_peer;
-	if (peer) {
-		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
-		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
-		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
-			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
-			peer->tcp_ts	   = tcptw->tw_ts_recent;
-		}
-		return true;
-	}
-	return false;
-}
-
 static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
 	if (seq == s_win)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6cc6c881f54..0c068475378 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2348,13 +2348,11 @@ static int rt6_fill_node(struct net *net,
 			 int iif, int type, u32 pid, u32 seq,
 			 int prefix, int nowait, unsigned int flags)
 {
-	const struct inet_peer *peer;
 	struct rtmsg *rtm;
 	struct nlmsghdr *nlh;
 	long expires;
 	u32 table;
 	struct neighbour *n;
-	u32 ts, tsage;
 
 	if (prefix) {	/* user wants prefix routes only */
 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
@@ -2473,16 +2471,7 @@ static int rt6_fill_node(struct net *net,
 	else
 		expires = INT_MAX;
 
-	peer = NULL;
-	if (rt6_has_peer(rt))
-		peer = rt6_peer_ptr(rt);
-	ts = tsage = 0;
-	if (peer && peer->tcp_ts_stamp) {
-		ts = peer->tcp_ts;
-		tsage = get_seconds() - peer->tcp_ts_stamp;
-	}
-
-	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
 			       expires, rt->dst.error) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 75d179555c2..9e96b5f21d2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -277,22 +277,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	rt = (struct rt6_info *) dst;
 	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
-	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) {
-		struct inet_peer *peer = rt6_get_peer(rt);
-		/*
-		 * VJ's idea. We save last timestamp seen from
-		 * the destination in peer table, when entering state
-		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
-		 * when trying new connection.
-		 */
-		if (peer) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
-				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
-				tp->rx_opt.ts_recent = peer->tcp_ts;
-			}
-		}
-	}
+	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr))
+		tcp_fetch_timewait_stamp(sk, dst);
 
 	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
@@ -1134,8 +1120,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		treq->iif = inet6_iif(skb);
 
 	if (!isn) {
-		struct inet_peer *peer = NULL;
-
 		if (ipv6_opt_accepted(sk, skb) ||
 		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
 		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
@@ -1160,14 +1144,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL &&
-		    (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL &&
-		    ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
-				    &treq->rmt_addr)) {
-			inet_peer_refcheck(peer);
-			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
-			    (s32)(peer->tcp_ts - req->ts_recent) >
-							TCP_PAWS_WINDOW) {
+		    (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
+			if (!tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -1176,8 +1154,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		else if (!sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
-			 (!peer || !peer->tcp_ts_stamp) &&
-			 !tcp_peer_is_proven(req, dst)) {
+			 !tcp_peer_is_proven(req, dst, false)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
-- 
cgit v1.2.3-70-g09d2


From 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Jul 2012 05:50:31 +0000
Subject: tcp: TCP Small Queues

This introduce TSQ (TCP Small Queues)

TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.

sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
given time.

TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.

As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2 : It can help to reduce
latencies of high prio packets, having smaller TSO packets.

This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.

Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.

Without reduction of nominal bandwidth, we have reduction of buffering
per bulk sender :
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)

I no longer have 4 MBytes backlogged in qdisc by a single netperf
session, and both side socket autotuning no longer use 4 Mbytes.

As skb destructor cannot restart xmit itself ( as qdisc lock might be
taken at this point ), we delegate the work to a tasklet. We use one
tasklest per cpu for performance reasons.

If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(),
to eventually send new segments.

[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
  but some drivers call it in their start_xmit() handler.
  These drivers should at least use BQL, or else a single TCP
  session can still fill the whole NIC TX ring, since TSQ will
  have no effect.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  14 +++
 include/linux/tcp.h                    |   9 ++
 include/net/sock.h                     |   2 +
 include/net/tcp.h                      |   4 +
 net/core/sock.c                        |   4 +
 net/ipv4/sysctl_net_ipv4.c             |   7 ++
 net/ipv4/tcp.c                         |   6 ++
 net/ipv4/tcp_ipv4.c                    |   1 +
 net/ipv4/tcp_minisocks.c               |   1 +
 net/ipv4/tcp_output.c                  | 154 ++++++++++++++++++++++++++++++++-
 net/ipv6/tcp_ipv6.c                    |   1 +
 11 files changed, 202 insertions(+), 1 deletion(-)

(limited to 'include/net/tcp.h')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79e9b0..e20c17a7d34 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -551,6 +551,20 @@ tcp_thin_dupack - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
+tcp_limit_output_bytes - INTEGER
+	Controls TCP Small Queue limit per tcp socket.
+	TCP bulk sender tends to increase packets in flight until it
+	gets losses notifications. With SNDBUF autotuning, this can
+	result in a large amount of packets queued in qdisc/device
+	on the local machine, hurting latency of other flows, for
+	typical pfifo_fast qdiscs.
+	tcp_limit_output_bytes limits the number of bytes on qdisc
+	or device to reduce artificial RTT/cwnd and reduce bufferbloat.
+	Note: For GSO/TSO enabled flows, we try to have at least two
+	packets in flight. Reducing tcp_limit_output_bytes might also
+	reduce the size of individual GSO packet (64KB being the max)
+	Default: 131072
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2de9cf46f9f..1888169e07c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -339,6 +339,9 @@ struct tcp_sock {
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
 	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 
+	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	unsigned long	tsq_flags;
+
 	/* Data for direct copy to user */
 	struct {
 		struct sk_buff_head	prequeue;
@@ -494,6 +497,12 @@ struct tcp_sock {
 	struct tcp_cookie_values  *cookie_values;
 };
 
+enum tsq_flags {
+	TSQ_THROTTLED,
+	TSQ_QUEUED,
+	TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */
+};
+
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
diff --git a/include/net/sock.h b/include/net/sock.h
index dcb54a0793e..88de092df50 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -858,6 +858,8 @@ struct proto {
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
 
+	void		(*release_cb)(struct sock *sk);
+
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3618fefae04..439984b9af4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_limit_output_bytes;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -321,6 +322,8 @@ extern struct proto tcp_prot;
 
 extern void tcp_init_mem(struct net *net);
 
+extern void tcp_tasklet_init(void);
+
 extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
@@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t size);
 extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
+extern void tcp_release_cb(struct sock *sk);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 const struct tcphdr *th, unsigned int len);
diff --git a/net/core/sock.c b/net/core/sock.c
index 929bdcc2383..24039ac1242 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk)
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
+
+	if (sk->sk_prot->release_cb)
+		sk->sk_prot->release_cb(sk);
+
 	sk->sk_lock.owned = 0;
 	if (waitqueue_active(&sk->sk_lock.wq))
 		wake_up(&sk->sk_lock.wq);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 12aa0c5867c..70730f7aeaf 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_limit_output_bytes",
+		.data		= &sysctl_tcp_limit_output_bytes,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d902da96d15..4252cd8f39f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk)
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
+	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
@@ -796,6 +797,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 				  inet_csk(sk)->icsk_ext_hdr_len -
 				  tp->tcp_header_len);
 
+		/* TSQ : try to have two TSO segments in flight */
+		xmit_size_goal = min_t(u32, xmit_size_goal,
+				       sysctl_tcp_limit_output_bytes >> 1);
+
 		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
 
 		/* We try hard to avoid divides here */
@@ -3574,4 +3579,5 @@ void __init tcp_init(void)
 	tcp_secret_primary = &tcp_secret_one;
 	tcp_secret_retiring = &tcp_secret_two;
 	tcp_secret_secondary = &tcp_secret_two;
+	tcp_tasklet_init();
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ddefd39ac0c..01545a3fc0f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2588,6 +2588,7 @@ struct proto tcp_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 65608863fde..c66f2ede160 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -424,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
 
 		tcp_prequeue_init(newtp);
+		INIT_LIST_HEAD(&newtp->tsq_node);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e51e2..03854abfd9d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklest per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transfering tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		bh_lock_sock(sk);
+
+		if (!sock_owned_by_user(sk)) {
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+				tcp_write_xmit(sk,
+					       tcp_current_mss(sk),
+					       0, 0,
+					       GFP_ATOMIC);
+		} else {
+			/* defer the work to tcp_release_cb() */
+			set_bit(TSQ_OWNED, &tp->tsq_flags);
+		}
+		bh_unlock_sock(sk);
+
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+		sk_free(sk);
+	}
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+			tcp_write_xmit(sk,
+				       tcp_current_mss(sk),
+				       0, 0,
+				       GFP_ATOMIC);
+	}
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_init(&tsq->tasklet,
+			     tcp_tasklet_func,
+			     (unsigned long)tsq);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We cant xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+		unsigned long flags;
+		struct tsq_tasklet *tsq;
+
+		/* Keep a ref on socket.
+		 * This last ref will be released in tcp_tasklet_func()
+		 */
+		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+		/* queue this socket to tasklet queue */
+		local_irq_save(flags);
+		tsq = &__get_cpu_var(tsq_tasklet);
+		list_add(&tp->tsq_node, &tsq->head);
+		tasklet_schedule(&tsq->tasklet);
+		local_irq_restore(flags);
+	} else {
+		sock_wfree(skb);
+	}
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts skb truesize,
+		 * including skb overhead. But thats OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 61175cb2478..70458a9cd83 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1970,6 +1970,7 @@ struct proto tcpv6_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
-- 
cgit v1.2.3-70-g09d2


From 282f23c6ee343126156dd41218b22ece96d747e3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Jul 2012 10:13:05 +0200
Subject: tcp: implement RFC 5961 3.2

Implement the RFC 5691 mitigation against Blind
Reset attack using RST bit.

Idea is to validate incoming RST sequence,
to match RCV.NXT value, instead of previouly accepted
window : (RCV.NXT <= SEG.SEQ < RCV.NXT+RCV.WND)

If sequence is in window but not an exact match, send
a "challenge ACK", so that the other part can resend an
RST with the appropriate sequence.

Add a new sysctl, tcp_challenge_ack_limit, to limit
number of challenge ACK sent per second.

Add a new SNMP counter to count number of challenge acks sent.
(netstat -s | grep TCPChallengeACK)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kiran Kumar Kella <kkiran@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  5 +++++
 include/linux/snmp.h                   |  1 +
 include/net/tcp.h                      |  1 +
 net/ipv4/proc.c                        |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp_input.c                   | 31 ++++++++++++++++++++++++++++++-
 6 files changed, 45 insertions(+), 1 deletion(-)

(limited to 'include/net/tcp.h')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e20c17a7d34..e1e021594cf 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -565,6 +565,11 @@ tcp_limit_output_bytes - INTEGER
 	reduce the size of individual GSO packet (64KB being the max)
 	Default: 131072
 
+tcp_challenge_ack_limit - INTEGER
+	Limits number of Challenge ACK sent per second, as recommended
+	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
+	Default: 100
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 6e4c5112382..673e0e928b2 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -237,6 +237,7 @@ enum
 	LINUX_MIB_TCPOFOQUEUE,			/* TCPOFOQueue */
 	LINUX_MIB_TCPOFODROP,			/* TCPOFODrop */
 	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
+	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 439984b9af4..85c5090bfe2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -254,6 +254,7 @@ extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
+extern int sysctl_tcp_challenge_ack_limit;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index dae25e7622c..3e8e78f12a3 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -261,6 +261,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
 	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
 	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 70730f7aeaf..3f6a1e762e9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -605,6 +605,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_challenge_ack_limit",
+		.data		= &sysctl_tcp_challenge_ack_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cc4e12f1f2f..c841a899037 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -88,6 +88,9 @@ int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challenge_ack_limit = 100;
+
 int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -5247,6 +5250,23 @@ out:
 }
 #endif /* CONFIG_NET_DMA */
 
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+	/* unprotected vars, we dont care of overwrites */
+	static u32 challenge_timestamp;
+	static unsigned int challenge_count;
+	u32 now = jiffies / HZ;
+
+	if (now != challenge_timestamp) {
+		challenge_timestamp = now;
+		challenge_count = 0;
+	}
+	if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+		tcp_send_ack(sk);
+	}
+}
+
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5283,7 +5303,16 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
 	/* Step 2: check RST bit */
 	if (th->rst) {
-		tcp_reset(sk);
+		/* RFC 5961 3.2 :
+		 * If sequence number exactly matches RCV.NXT, then
+		 *     RESET the connection
+		 * else
+		 *     Send a challenge ACK
+		 */
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+			tcp_reset(sk);
+		else
+			tcp_send_challenge_ack(sk);
 		goto discard;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 2100c8d2d9db23c0a09901a782bb4e3b21bee298 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:05 +0000
Subject: net-tcp: Fast Open base

This patch impelements the common code for both the client and server.

1. TCP Fast Open option processing. Since Fast Open does not have an
   option number assigned by IANA yet, it shares the experiment option
   code 254 by implementing draft-ietf-tcpm-experimental-options
   with a 16 bits magic number 0xF989. This enables global experiments
   without clashing the scarce(2) experimental options available for TCP.

   When the draft status becomes standard (maybe), the client should
   switch to the new option number assigned while the server supports
   both numbers for transistion.

2. The new sysctl tcp_fastopen

3. A place holder init function

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h        | 10 ++++++++++
 include/net/tcp.h          |  9 ++++++++-
 net/ipv4/Makefile          |  2 +-
 net/ipv4/syncookies.c      |  2 +-
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 net/ipv4/tcp_fastopen.c    | 11 +++++++++++
 net/ipv4/tcp_input.c       | 26 ++++++++++++++++++++++----
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/tcp_minisocks.c   |  4 ++--
 net/ipv4/tcp_output.c      | 25 +++++++++++++++++++++----
 net/ipv6/syncookies.c      |  2 +-
 net/ipv6/tcp_ipv6.c        |  2 +-
 12 files changed, 86 insertions(+), 16 deletions(-)
 create mode 100644 net/ipv4/tcp_fastopen.c

(limited to 'include/net/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1888169e07c..12948f54383 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -243,6 +243,16 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 	return (tcp_hdr(skb)->doff - 5) * 4;
 }
 
+/* TCP Fast Open */
+#define TCP_FASTOPEN_COOKIE_MIN	4	/* Min Fast Open Cookie size in bytes */
+#define TCP_FASTOPEN_COOKIE_MAX	16	/* Max Fast Open Cookie size in bytes */
+
+/* TCP Fast Open Cookie as stored in memory */
+struct tcp_fastopen_cookie {
+	s8	len;
+	u8	val[TCP_FASTOPEN_COOKIE_MAX];
+};
+
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block_wire {
 	__be32	start_seq;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 85c5090bfe2..5aed3718fde 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -170,6 +170,11 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
 #define TCPOPT_COOKIE		253	/* Cookie extension (experimental) */
+#define TCPOPT_EXP		254	/* Experimental */
+/* Magic number to be after the option value for sharing TCP
+ * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
+ */
+#define TCPOPT_FASTOPEN_MAGIC	0xF989
 
 /*
  *     TCP option lengths
@@ -180,6 +185,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERM      2
 #define TCPOLEN_TIMESTAMP      10
 #define TCPOLEN_MD5SIG         18
+#define TCPOLEN_EXP_FASTOPEN_BASE  4
 #define TCPOLEN_COOKIE_BASE    2	/* Cookie-less header extension */
 #define TCPOLEN_COOKIE_PAIR    3	/* Cookie pair header extension */
 #define TCPOLEN_COOKIE_MIN     (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN)
@@ -222,6 +228,7 @@ extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_syncookies;
+extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
 extern int sysctl_tcp_rfc1337;
@@ -418,7 +425,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t len, int nonblock, int flags, int *addr_len);
 extern void tcp_parse_options(const struct sk_buff *skb,
 			      struct tcp_options_received *opt_rx, const u8 **hvpp,
-			      int estab);
+			      int estab, struct tcp_fastopen_cookie *foc);
 extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 /*
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a677d804e53..ae2ccf2890e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	     tcp_minisocks.o tcp_cong.o tcp_metrics.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7fb15d..650e1528e1e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
 
 	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3f6a1e762e9..5840c325572 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -366,6 +366,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 #endif
+	{
+		.procname	= "tcp_fastopen",
+		.data		= &sysctl_tcp_fastopen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "tcp_tw_recycle",
 		.data		= &tcp_death_row.sysctl_tw_recycle,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 00000000000..a7f729c409d
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,11 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+int sysctl_tcp_fastopen;
+
+static int __init tcp_fastopen_init(void)
+{
+	return 0;
+}
+
+late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fdd49f1b7a5..a06bb8959e7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3732,7 +3732,8 @@ old_ack:
  * the fast version below fails.
  */
 void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       const u8 **hvpp, int estab)
+		       const u8 **hvpp, int estab,
+		       struct tcp_fastopen_cookie *foc)
 {
 	const unsigned char *ptr;
 	const struct tcphdr *th = tcp_hdr(skb);
@@ -3839,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
 					break;
 				}
 				break;
-			}
 
+			case TCPOPT_EXP:
+				/* Fast Open option shares code 254 using a
+				 * 16 bits magic number. It's valid only in
+				 * SYN or SYN-ACK with an even size.
+				 */
+				if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
+				    get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
+				    foc == NULL || !th->syn || (opsize & 1))
+					break;
+				foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
+				if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
+				    foc->len <= TCP_FASTOPEN_COOKIE_MAX)
+					memcpy(foc->val, ptr + 2, foc->len);
+				else if (foc->len != 0)
+					foc->len = -1;
+				break;
+
+			}
 			ptr += opsize-2;
 			length -= opsize;
 		}
@@ -3882,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
 		if (tcp_parse_aligned_timestamp(tp, th))
 			return true;
 	}
-	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
+	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
 	return true;
 }
 
@@ -5637,7 +5655,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_cookie_values *cvp = tp->cookie_values;
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
-	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL);
 
 	if (th->ack) {
 		/* rfc793:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d7d2fa50f07..01aa77a9702 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1307,7 +1307,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c66f2ede160..5912ac3fd24 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -97,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
@@ -534,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 15a7c7bc3e5..4849be76ccd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -385,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5		(1 << 2)
 #define OPTION_WSCALE		(1 << 3)
 #define OPTION_COOKIE_EXTENSION	(1 << 4)
+#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
 
 struct tcp_out_options {
-	u8 options;		/* bit field of OPTION_* */
+	u16 options;		/* bit field of OPTION_* */
+	u16 mss;		/* 0 to disable */
 	u8 ws;			/* window scale, 0 to disable */
 	u8 num_sack_blocks;	/* number of SACK blocks to include */
 	u8 hash_size;		/* bytes in hash_location */
-	u16 mss;		/* 0 to disable */
-	__u32 tsval, tsecr;	/* need to include OPTION_TS */
 	__u8 *hash_location;	/* temporary pointer, overloaded */
+	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
 };
 
 /* The sysctl int routines are generic, so check consistency here.
@@ -442,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 			      struct tcp_out_options *opts)
 {
-	u8 options = opts->options;	/* mungable copy */
+	u16 options = opts->options;	/* mungable copy */
 
 	/* Having both authentication and cookies for security is redundant,
 	 * and there's certainly not enough room.  Instead, the cookie-less
@@ -564,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
 		tp->rx_opt.dsack = 0;
 	}
+
+	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+
+		*ptr++ = htonl((TCPOPT_EXP << 24) |
+			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
+			       TCPOPT_FASTOPEN_MAGIC);
+
+		memcpy(ptr, foc->val, foc->len);
+		if ((foc->len & 3) == 2) {
+			u8 *align = ((u8 *)ptr) + foc->len;
+			align[0] = align[1] = TCPOPT_NOP;
+		}
+		ptr += (foc->len + 3) >> 2;
+	}
 }
 
 /* Compute TCP options for SYN packets. This is not the final
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7bf3cc427c2..bb46061c813 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -177,7 +177,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
 
 	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c9dabdd832d..0302ec3fecf 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1033,7 +1033,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
-- 
cgit v1.2.3-70-g09d2


From 1fe4c481ba637660793217769695c146a037bd54 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:06 +0000
Subject: net-tcp: Fast Open client - cookie cache

With help from Eric Dumazet, add Fast Open metrics in tcp metrics cache.
The basic ones are MSS and the cookies. Later patch will cache more to
handle unfriendly middleboxes.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |  4 ++++
 net/ipv4/tcp_metrics.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5aed3718fde..e601da19736 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -405,6 +405,10 @@ extern void tcp_metrics_init(void);
 extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
 extern bool tcp_remember_stamp(struct sock *sk);
 extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+				   struct tcp_fastopen_cookie *cookie);
+extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+				   struct tcp_fastopen_cookie *cookie);
 extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 1a115b66579..d02ff377778 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -30,6 +30,11 @@ enum tcp_metric_index {
 	TCP_METRIC_MAX,
 };
 
+struct tcp_fastopen_metrics {
+	u16	mss;
+	struct	tcp_fastopen_cookie	cookie;
+};
+
 struct tcp_metrics_block {
 	struct tcp_metrics_block __rcu	*tcpm_next;
 	struct inetpeer_addr		tcpm_addr;
@@ -38,6 +43,7 @@ struct tcp_metrics_block {
 	u32				tcpm_ts_stamp;
 	u32				tcpm_lock;
 	u32				tcpm_vals[TCP_METRIC_MAX];
+	struct tcp_fastopen_metrics	tcpm_fastopen;
 };
 
 static bool tcp_metric_locked(struct tcp_metrics_block *tm,
@@ -118,6 +124,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
 	tm->tcpm_ts = 0;
 	tm->tcpm_ts_stamp = 0;
+	tm->tcpm_fastopen.mss = 0;
+	tm->tcpm_fastopen.cookie.len = 0;
 }
 
 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -633,6 +641,49 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
 	return ret;
 }
 
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+			    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_metrics_block *tm;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+	if (tm) {
+		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+		unsigned int seq;
+
+		do {
+			seq = read_seqbegin(&fastopen_seqlock);
+			if (tfom->mss)
+				*mss = tfom->mss;
+			*cookie = tfom->cookie;
+		} while (read_seqretry(&fastopen_seqlock, seq));
+	}
+	rcu_read_unlock();
+}
+
+
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+			    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_metrics_block *tm;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
+	if (tm) {
+		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+
+		write_seqlock_bh(&fastopen_seqlock);
+		tfom->mss = mss;
+		if (cookie->len > 0)
+			tfom->cookie = *cookie;
+		write_sequnlock_bh(&fastopen_seqlock);
+	}
+	rcu_read_unlock();
+}
+
 static unsigned long tcpmhash_entries;
 static int __init set_tcpmhash_entries(char *str)
 {
-- 
cgit v1.2.3-70-g09d2


From 783237e8daf13481ee234997cbbbb823872ac388 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:07 +0000
Subject: net-tcp: Fast Open client - sending SYN-data

This patch implements sending SYN-data in tcp_connect(). The data is
from tcp_sendmsg() with flag MSG_FASTOPEN (implemented in a later patch).

The length of the cookie in tcp_fastopen_req, init'd to 0, controls the
type of the SYN. If the cookie is not cached (len==0), the host sends
data-less SYN with Fast Open cookie request option to solicit a cookie
from the remote. If cookie is not available (len > 0), the host sends
a SYN-data with Fast Open cookie option. If cookie length is negative,
  the SYN will not include any Fast Open option (for fall back operations).

To deal with middleboxes that may drop SYN with data or experimental TCP
option, the SYN-data is only sent once. SYN retransmits do not include
data or Fast Open options. The connection will fall back to regular TCP
handshake.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h  |   1 +
 include/linux/tcp.h   |   6 ++-
 include/net/tcp.h     |   9 ++++
 net/ipv4/af_inet.c    |  10 ++++-
 net/ipv4/proc.c       |   1 +
 net/ipv4/tcp_output.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 130 insertions(+), 12 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index e5fcbd079e4..00bc189cb39 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -238,6 +238,7 @@ enum
 	LINUX_MIB_TCPOFOMERGE,			/* TCPOFOMerge */
 	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
 	LINUX_MIB_TCPSYNCHALLENGE,		/* TCPSYNChallenge */
+	LINUX_MIB_TCPFASTOPENACTIVE,		/* TCPFastOpenActive */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 12948f54383..1edf96afab4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -386,7 +386,8 @@ struct tcp_sock {
 		unused      : 1;
 	u8	repair_queue;
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-		early_retrans_delayed:1; /* Delayed ER timer installed */
+		early_retrans_delayed:1, /* Delayed ER timer installed */
+		syn_fastopen:1;	/* SYN includes Fast Open option */
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
@@ -500,6 +501,9 @@ struct tcp_sock {
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
 
+/* TCP fastopen related information */
+	struct tcp_fastopen_request *fastopen_req;
+
 	/* When the cookie options are generated and exchanged, then this
 	 * object holds a reference to them (cookie_values->kref).  Also
 	 * contains related tcp_cookie_transactions fields.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e601da19736..867557b4244 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1289,6 +1289,15 @@ extern int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff
 extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
 			    const struct tcp_md5sig_key *key);
 
+struct tcp_fastopen_request {
+	/* Fast Open cookie. Size 0 means a cookie request */
+	struct tcp_fastopen_cookie	cookie;
+	struct msghdr			*data;  /* data in MSG_FASTOPEN */
+	u16				copied;	/* queued in tcp_connect() */
+};
+
+void tcp_free_fastopen_req(struct tcp_sock *tp);
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 07a02f6e969..edc414625be 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -556,11 +556,12 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 }
 EXPORT_SYMBOL(inet_dgram_connect);
 
-static long inet_wait_for_connect(struct sock *sk, long timeo)
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
 {
 	DEFINE_WAIT(wait);
 
 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	sk->sk_write_pending += writebias;
 
 	/* Basic assumption: if someone sets sk->sk_err, he _must_
 	 * change state of the socket from TCP_SYN_*.
@@ -576,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
 	finish_wait(sk_sleep(sk), &wait);
+	sk->sk_write_pending -= writebias;
 	return timeo;
 }
 
@@ -634,8 +636,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+				tcp_sk(sk)->fastopen_req &&
+				tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
 		/* Error code is set above */
-		if (!timeo || !inet_wait_for_connect(sk, timeo))
+		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
 			goto out;
 
 		err = sock_intr_errno(timeo);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2a5240b2ea6..957acd12250 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -262,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4849be76ccd..88693281da4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -596,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
 			 tcp_cookie_size_check(cvp->cookie_desired) :
 			 0;
+	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -636,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
 
+	if (fastopen && fastopen->cookie.len >= 0) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = &fastopen->cookie;
+			remaining -= need;
+			tp->syn_fastopen = 1;
+		}
+	}
 	/* Note that timestamps are required by the specification.
 	 *
 	 * Odd numbers of bytes are prohibited by the specification, ensuring
@@ -2824,6 +2835,96 @@ void tcp_connect_init(struct sock *sk)
 	tcp_clear_retrans(tp);
 }
 
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	tcb->end_seq += skb->len;
+	skb_header_release(skb);
+	__tcp_add_write_queue_tail(sk, skb);
+	sk->sk_wmem_queued += skb->truesize;
+	sk_mem_charge(sk, skb->truesize);
+	tp->write_seq = tcb->end_seq;
+	tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_fastopen_request *fo = tp->fastopen_req;
+	int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+	struct sk_buff *syn_data = NULL, *data;
+
+	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
+	if (fo->cookie.len <= 0)
+		goto fallback;
+
+	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+	 * user-MSS. Reserve maximum option space for middleboxes that add
+	 * private TCP options. The cost is reduced data space in SYN :(
+	 */
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+		MAX_TCP_OPTION_SPACE;
+
+	syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+				   sk->sk_allocation);
+	if (syn_data == NULL)
+		goto fallback;
+
+	for (i = 0; i < iovlen && syn_data->len < space; ++i) {
+		struct iovec *iov = &fo->data->msg_iov[i];
+		unsigned char __user *from = iov->iov_base;
+		int len = iov->iov_len;
+
+		if (syn_data->len + len > space)
+			len = space - syn_data->len;
+		else if (i + 1 == iovlen)
+			/* No more data pending in inet_wait_for_connect() */
+			fo->data = NULL;
+
+		if (skb_add_data(syn_data, from, len))
+			goto fallback;
+	}
+
+	/* Queue a data-only packet after the regular SYN for retransmission */
+	data = pskb_copy(syn_data, sk->sk_allocation);
+	if (data == NULL)
+		goto fallback;
+	TCP_SKB_CB(data)->seq++;
+	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
+	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
+	tcp_connect_queue_skb(sk, data);
+	fo->copied = data->len;
+
+	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		goto done;
+	}
+	syn_data = NULL;
+
+fallback:
+	/* Send a regular SYN with Fast Open cookie request option */
+	if (fo->cookie.len > 0)
+		fo->cookie.len = 0;
+	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+	if (err)
+		tp->syn_fastopen = 0;
+	kfree_skb(syn_data);
+done:
+	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
+	return err;
+}
+
 /* Build a SYN and send it off. */
 int tcp_connect(struct sock *sk)
 {
@@ -2841,17 +2942,13 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+	tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tcp_connect_queue_skb(sk, buff);
 	TCP_ECN_send_syn(sk, buff);
 
-	/* Send it off. */
-	TCP_SKB_CB(buff)->when = tcp_time_stamp;
-	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
-	skb_header_release(buff);
-	__tcp_add_write_queue_tail(sk, buff);
-	sk->sk_wmem_queued += buff->truesize;
-	sk_mem_charge(sk, buff->truesize);
-	tp->packets_out += tcp_skb_pcount(buff);
-	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+	/* Send off SYN; include data in Fast Open. */
+	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
 	if (err == -ECONNREFUSED)
 		return err;
 
-- 
cgit v1.2.3-70-g09d2


From cf60af03ca4e71134206809ea892e49b92a88896 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:09 +0000
Subject: net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)

sendmsg() (or sendto()) with MSG_FASTOPEN is a combo of connect(2)
and write(2). The application should replace connect() with it to
send data in the opening SYN packet.

For blocking socket, sendmsg() blocks until all the data are buffered
locally and the handshake is completed like connect() call. It
returns similar errno like connect() if the TCP handshake fails.

For non-blocking socket, it returns the number of bytes queued (and
transmitted in the SYN-data packet) if cookie is available. If cookie
is not available, it transmits a data-less SYN packet with Fast Open
cookie request option and returns -EINPROGRESS like connect().

Using MSG_FASTOPEN on connecting or connected socket will result in
simlar errno like repeating connect() calls. Therefore the application
should only use this flag on new sockets.

The buffer size of sendmsg() is independent of the MSS of the connection.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 11 ++++++
 include/linux/socket.h                 |  1 +
 include/net/inet_common.h              |  6 ++--
 include/net/tcp.h                      |  3 ++
 net/ipv4/af_inet.c                     | 19 ++++++++---
 net/ipv4/tcp.c                         | 61 +++++++++++++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c                    |  3 ++
 7 files changed, 92 insertions(+), 12 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e1e021594cf..03964e08818 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -468,6 +468,17 @@ tcp_syncookies - BOOLEAN
 	SYN flood warnings in logs not being really flooded, your server
 	is seriously misconfigured.
 
+tcp_fastopen - INTEGER
+	Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data
+	in the opening SYN packet. To use this feature, the client application
+	must not use connect(). Instead, it should use sendmsg() or sendto()
+	with MSG_FASTOPEN flag which performs a TCP handshake automatically.
+
+	The values (bitmap) are:
+	1: Enables sending data in the opening SYN on the client
+
+	Default: 0
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 255. Default value
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 25d6322fb63..ba7b2e817cf 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -268,6 +268,7 @@ struct ucred {
 #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
 #define MSG_EOF         MSG_FIN
 
+#define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exit for file
 					   descriptor received through
 					   SCM_RIGHTS */
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 22fac9892b1..234008782c8 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -14,9 +14,11 @@ struct sockaddr;
 struct socket;
 
 extern int inet_release(struct socket *sock);
-extern int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			       int addr_len, int flags);
-extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+				 int addr_len, int flags);
+extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 			      int addr_len, int flags);
 extern int inet_accept(struct socket *sock, struct socket *newsock, int flags);
 extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 867557b4244..c0258100d70 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -212,6 +212,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */
 #define TCP_INIT_CWND		10
 
+/* Bit Flags for sysctl_tcp_fastopen */
+#define	TFO_CLIENT_ENABLE	1
+
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index edc414625be..fe4582ca969 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -585,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
  *	Connect to a remote host. There is regrettably still a little
  *	TCP 'magic' in here.
  */
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-			int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			  int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
 	int err;
@@ -595,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (addr_len < sizeof(uaddr->sa_family))
 		return -EINVAL;
 
-	lock_sock(sk);
-
 	if (uaddr->sa_family == AF_UNSPEC) {
 		err = sk->sk_prot->disconnect(sk, flags);
 		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -663,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	sock->state = SS_CONNECTED;
 	err = 0;
 out:
-	release_sock(sk);
 	return err;
 
 sock_error:
@@ -673,6 +670,18 @@ sock_error:
 		sock->state = SS_DISCONNECTING;
 	goto out;
 }
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	int err;
+
+	lock_sock(sock->sk);
+	err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+	release_sock(sock->sk);
+	return err;
+}
 EXPORT_SYMBOL(inet_stream_connect);
 
 /*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4252cd8f39f..581ecf02c6b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
 #include <linux/slab.h>
 
 #include <net/icmp.h>
+#include <net/inet_common.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -982,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg)
 	return tmp;
 }
 
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+	if (tp->fastopen_req != NULL) {
+		kfree(tp->fastopen_req);
+		tp->fastopen_req = NULL;
+	}
+}
+
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err, flags;
+
+	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+		return -EOPNOTSUPP;
+	if (tp->fastopen_req != NULL)
+		return -EALREADY; /* Another Fast Open is in progress */
+
+	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+				   sk->sk_allocation);
+	if (unlikely(tp->fastopen_req == NULL))
+		return -ENOBUFS;
+	tp->fastopen_req->data = msg;
+
+	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+				    msg->msg_namelen, flags);
+	*size = tp->fastopen_req->copied;
+	tcp_free_fastopen_req(tp);
+	return err;
+}
+
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
 	struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int iovlen, flags, err, copied;
-	int mss_now = 0, size_goal;
+	int iovlen, flags, err, copied = 0;
+	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
 	bool sg;
 	long timeo;
 
 	lock_sock(sk);
 
 	flags = msg->msg_flags;
+	if (flags & MSG_FASTOPEN) {
+		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
+		if (err == -EINPROGRESS && copied_syn > 0)
+			goto out;
+		else if (err)
+			goto out_err;
+		offset = copied_syn;
+	}
+
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
 	/* Wait for a connection to finish. */
 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
-			goto out_err;
+			goto do_error;
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1037,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		unsigned char __user *from = iov->iov_base;
 
 		iov++;
+		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
+			if (offset >= seglen) {
+				offset -= seglen;
+				continue;
+			}
+			seglen -= offset;
+			from += offset;
+			offset = 0;
+		}
 
 		while (seglen > 0) {
 			int copy = 0;
@@ -1199,7 +1250,7 @@ out:
 	if (copied && likely(!tp->repair))
 		tcp_push(sk, flags, mss_now, tp->nonagle);
 	release_sock(sk);
-	return copied;
+	return copied + copied_syn;
 
 do_fault:
 	if (!skb->len) {
@@ -1212,7 +1263,7 @@ do_fault:
 	}
 
 do_error:
-	if (copied)
+	if (copied + copied_syn)
 		goto out;
 out_err:
 	err = sk_stream_error(sk, flags, err);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 01aa77a9702..1d8b75a5898 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1952,6 +1952,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		tp->cookie_values = NULL;
 	}
 
+	/* If socket is aborted during connect operation */
+	tcp_free_fastopen_req(tp);
+
 	sk_sockets_allocated_dec(sk);
 	sock_release_memcg(sk);
 }
-- 
cgit v1.2.3-70-g09d2


From aab4874355679c70f93993cf3b3fd74643b9ac33 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:10 +0000
Subject: net-tcp: Fast Open client - detecting SYN-data drops

On paths with firewalls dropping SYN with data or experimental TCP options,
Fast Open connections will have experience SYN timeout and bad performance.
The solution is to track such incidents in the cookie cache and disables
Fast Open temporarily.

Since only the original SYN includes data and/or Fast Open option, the
SYN-ACK has some tell-tale sign (tcp_rcv_fastopen_synack()) to detect
such drops. If a path has recurring Fast Open SYN drops, Fast Open is
disabled for 2^(recurring_losses) minutes starting from four minutes up to
roughly one and half day. sendmsg with MSG_FASTOPEN flag will succeed but
it behaves as connect() then write().

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      |  6 ++++--
 net/ipv4/tcp_input.c   | 10 +++++++++-
 net/ipv4/tcp_metrics.c | 16 +++++++++++++---
 net/ipv4/tcp_output.c  | 13 +++++++++++--
 4 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c0258100d70..e07878d246a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -409,9 +409,11 @@ extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
 extern bool tcp_remember_stamp(struct sock *sk);
 extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-				   struct tcp_fastopen_cookie *cookie);
+				   struct tcp_fastopen_cookie *cookie,
+				   int *syn_loss, unsigned long *last_syn_loss);
 extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-				   struct tcp_fastopen_cookie *cookie);
+				   struct tcp_fastopen_cookie *cookie,
+				   bool syn_lost);
 extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 38b6a811edf..c49a4fc175b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5652,6 +5652,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *data = tcp_write_queue_head(sk);
 	u16 mss = tp->rx_opt.mss_clamp;
+	bool syn_drop;
 
 	if (mss == tp->rx_opt.user_mss) {
 		struct tcp_options_received opt;
@@ -5664,7 +5665,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		mss = opt.mss_clamp;
 	}
 
-	tcp_fastopen_cache_set(sk, mss, cookie);
+	/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
+	 * the remote receives only the retransmitted (regular) SYNs: either
+	 * the original SYN-data or the corresponding SYN-ACK is lost.
+	 */
+	syn_drop = (cookie->len <= 0 && data &&
+		    inet_csk(sk)->icsk_retransmits);
+
+	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
 
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_retransmit_skb(sk, data);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d02ff377778..99779ae44f6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -32,6 +32,8 @@ enum tcp_metric_index {
 
 struct tcp_fastopen_metrics {
 	u16	mss;
+	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
+	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
 	struct	tcp_fastopen_cookie	cookie;
 };
 
@@ -125,6 +127,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 	tm->tcpm_ts = 0;
 	tm->tcpm_ts_stamp = 0;
 	tm->tcpm_fastopen.mss = 0;
+	tm->tcpm_fastopen.syn_loss = 0;
 	tm->tcpm_fastopen.cookie.len = 0;
 }
 
@@ -644,7 +647,8 @@ bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
 static DEFINE_SEQLOCK(fastopen_seqlock);
 
 void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-			    struct tcp_fastopen_cookie *cookie)
+			    struct tcp_fastopen_cookie *cookie,
+			    int *syn_loss, unsigned long *last_syn_loss)
 {
 	struct tcp_metrics_block *tm;
 
@@ -659,14 +663,15 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
 			if (tfom->mss)
 				*mss = tfom->mss;
 			*cookie = tfom->cookie;
+			*syn_loss = tfom->syn_loss;
+			*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
 		} while (read_seqretry(&fastopen_seqlock, seq));
 	}
 	rcu_read_unlock();
 }
 
-
 void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-			    struct tcp_fastopen_cookie *cookie)
+			    struct tcp_fastopen_cookie *cookie, bool syn_lost)
 {
 	struct tcp_metrics_block *tm;
 
@@ -679,6 +684,11 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 		tfom->mss = mss;
 		if (cookie->len > 0)
 			tfom->cookie = *cookie;
+		if (syn_lost) {
+			++tfom->syn_loss;
+			tfom->last_syn_loss = jiffies;
+		} else
+			tfom->syn_loss = 0;
 		write_sequnlock_bh(&fastopen_seqlock);
 	}
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 88693281da4..c5cfd5ec318 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2860,10 +2860,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
 	struct sk_buff *syn_data = NULL, *data;
+	unsigned long last_syn_loss = 0;
+
+	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+			       &syn_loss, &last_syn_loss);
+	/* Recurring FO SYN losses: revert to regular handshake temporarily */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		fo->cookie.len = -1;
+		goto fallback;
+	}
 
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
 	if (fo->cookie.len <= 0)
 		goto fallback;
 
-- 
cgit v1.2.3-70-g09d2


From 67da22d23fa6f3324e03bcd0580b914b2e4afbf3 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 19 Jul 2012 06:43:11 +0000
Subject: net-tcp: Fast Open client - cookie-less mode

In trusted networks, e.g., intranet, data-center, the client does not
need to use Fast Open cookie to mitigate DoS attacks. In cookie-less
mode, sendmsg() with MSG_FASTOPEN flag will send SYN-data regardless
of cookie availability.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 2 ++
 include/linux/tcp.h                    | 1 +
 include/net/tcp.h                      | 1 +
 net/ipv4/tcp_input.c                   | 8 ++++++--
 net/ipv4/tcp_output.c                  | 6 +++++-
 5 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 03964e08818..5f3ef7f7fce 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -476,6 +476,8 @@ tcp_fastopen - INTEGER
 
 	The values (bitmap) are:
 	1: Enables sending data in the opening SYN on the client
+	5: Enables sending data in the opening SYN on the client regardless
+	   of cookie availability.
 
 	Default: 0
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1edf96afab4..9febfb685c3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -387,6 +387,7 @@ struct tcp_sock {
 	u8	repair_queue;
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
 		early_retrans_delayed:1, /* Delayed ER timer installed */
+		syn_data:1,	/* SYN includes data */
 		syn_fastopen:1;	/* SYN includes Fast Open option */
 
 /* RTT measurement */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e07878d246a..bc7c134ec05 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -214,6 +214,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 /* Bit Flags for sysctl_tcp_fastopen */
 #define	TFO_CLIENT_ENABLE	1
+#define	TFO_CLIENT_NO_COOKIE	4	/* Data in SYN w/o cookie option */
 
 extern struct inet_timewait_death_row tcp_death_row;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c49a4fc175b..e67d685a6c0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5650,7 +5650,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 				    struct tcp_fastopen_cookie *cookie)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *data = tcp_write_queue_head(sk);
+	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
 	u16 mss = tp->rx_opt.mss_clamp;
 	bool syn_drop;
 
@@ -5665,6 +5665,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		mss = opt.mss_clamp;
 	}
 
+	if (!tp->syn_fastopen)  /* Ignore an unsolicited cookie */
+		cookie->len = -1;
+
 	/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
 	 * the remote receives only the retransmitted (regular) SYNs: either
 	 * the original SYN-data or the corresponding SYN-ACK is lost.
@@ -5816,7 +5819,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_finish_connect(sk, skb);
 
-		if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc))
+		if ((tp->syn_fastopen || tp->syn_data) &&
+		    tcp_rcv_fastopen_synack(sk, skb, &foc))
 			return -1;
 
 		if (sk->sk_write_pending ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5cfd5ec318..27a32acfdb6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2864,6 +2864,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	struct sk_buff *syn_data = NULL, *data;
 	unsigned long last_syn_loss = 0;
 
+	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
 	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
 			       &syn_loss, &last_syn_loss);
 	/* Recurring FO SYN losses: revert to regular handshake temporarily */
@@ -2873,7 +2874,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 		goto fallback;
 	}
 
-	if (fo->cookie.len <= 0)
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+		fo->cookie.len = -1;
+	else if (fo->cookie.len <= 0)
 		goto fallback;
 
 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
@@ -2916,6 +2919,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	fo->copied = data->len;
 
 	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+		tp->syn_data = (fo->copied > 0);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
 		goto done;
 	}
-- 
cgit v1.2.3-70-g09d2


From 6f458dfb409272082c9bfa412f77ff2fc21c626f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jul 2012 05:45:50 +0000
Subject: tcp: improve latencies of timer triggered events

Modern TCP stack highly depends on tcp_write_timer() having a small
latency, but current implementation doesn't exactly meet the
expectations.

When a timer fires but finds the socket is owned by the user, it rearms
itself for an additional delay hoping next run will be more
successful.

tcp_write_timer() for example uses a 50ms delay for next try, and it
defeats many attempts to get predictable TCP behavior in term of
latencies.

Use the recently introduced tcp_release_cb(), so that the user owning
the socket will call various handlers right before socket release.

This will permit us to post a followup patch to address the
tcp_tso_should_defer() syndrome (some deferred packets have to wait
RTO timer to be transmitted, while cwnd should allow us to send them
sooner)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: H.K. Jerry Chu <hkchu@google.com>
Cc: John Heffner <johnwheffner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  4 ++-
 include/net/tcp.h     |  2 ++
 net/ipv4/tcp_output.c | 46 ++++++++++++++++++++-------------
 net/ipv4/tcp_timer.c  | 70 +++++++++++++++++++++++++++------------------------
 4 files changed, 71 insertions(+), 51 deletions(-)

(limited to 'include/net/tcp.h')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 9febfb685c3..2761856987b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -515,7 +515,9 @@ struct tcp_sock {
 enum tsq_flags {
 	TSQ_THROTTLED,
 	TSQ_QUEUED,
-	TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */
+	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
+	TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
+	TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index bc7c134ec05..e19124b84cd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -350,6 +350,8 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
 extern void tcp_release_cb(struct sock *sk);
+extern void tcp_write_timer_handler(struct sock *sk);
+extern void tcp_delack_timer_handler(struct sock *sk);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 const struct tcphdr *th, unsigned int len);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 27a32acfdb6..950aebfd996 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -837,6 +837,13 @@ struct tsq_tasklet {
 };
 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
 
+static void tcp_tsq_handler(struct sock *sk)
+{
+	if ((1 << sk->sk_state) &
+	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
+		tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+}
 /*
  * One tasklest per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
@@ -864,16 +871,10 @@ static void tcp_tasklet_func(unsigned long data)
 		bh_lock_sock(sk);
 
 		if (!sock_owned_by_user(sk)) {
-			if ((1 << sk->sk_state) &
-			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
-			     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
-				tcp_write_xmit(sk,
-					       tcp_current_mss(sk),
-					       0, 0,
-					       GFP_ATOMIC);
+			tcp_tsq_handler(sk);
 		} else {
 			/* defer the work to tcp_release_cb() */
-			set_bit(TSQ_OWNED, &tp->tsq_flags);
+			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
 		}
 		bh_unlock_sock(sk);
 
@@ -882,6 +883,9 @@ static void tcp_tasklet_func(unsigned long data)
 	}
 }
 
+#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
+			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
+			  (1UL << TCP_DELACK_TIMER_DEFERRED))
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -892,16 +896,24 @@ static void tcp_tasklet_func(unsigned long data)
 void tcp_release_cb(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long flags, nflags;
 
-	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
-		if ((1 << sk->sk_state) &
-		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
-		     TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
-			tcp_write_xmit(sk,
-				       tcp_current_mss(sk),
-				       0, 0,
-				       GFP_ATOMIC);
-	}
+	/* perform an atomic operation only if at least one flag is set */
+	do {
+		flags = tp->tsq_flags;
+		if (!(flags & TCP_DEFERRED_ALL))
+			return;
+		nflags = flags & ~TCP_DEFERRED_ALL;
+	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+
+	if (flags & (1UL << TCP_TSQ_DEFERRED))
+		tcp_tsq_handler(sk);
+
+	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
+		tcp_write_timer_handler(sk);
+
+	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
+		tcp_delack_timer_handler(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e911e6c523e..6df36ad55a3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
 int sysctl_tcp_orphan_retries __read_mostly;
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
 
-static void tcp_write_timer(unsigned long);
-static void tcp_delack_timer(unsigned long);
-static void tcp_keepalive_timer (unsigned long data);
-
-void tcp_init_xmit_timers(struct sock *sk)
-{
-	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
-				  &tcp_keepalive_timer);
-}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-
 static void tcp_write_err(struct sock *sk)
 {
 	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -205,21 +194,11 @@ static int tcp_write_timeout(struct sock *sk)
 	return 0;
 }
 
-static void tcp_delack_timer(unsigned long data)
+void tcp_delack_timer_handler(struct sock *sk)
 {
-	struct sock *sk = (struct sock *)data;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk)) {
-		/* Try again later. */
-		icsk->icsk_ack.blocked = 1;
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
-		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
-		goto out_unlock;
-	}
-
 	sk_mem_reclaim_partial(sk);
 
 	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -260,7 +239,21 @@ static void tcp_delack_timer(unsigned long data)
 out:
 	if (sk_under_memory_pressure(sk))
 		sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_delack_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		tcp_delack_timer_handler(sk);
+	} else {
+		inet_csk(sk)->icsk_ack.blocked = 1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+		/* deleguate our work to tcp_release_cb() */
+		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+	}
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
@@ -450,19 +443,11 @@ out_reset_timer:
 out:;
 }
 
-static void tcp_write_timer(unsigned long data)
+void tcp_write_timer_handler(struct sock *sk)
 {
-	struct sock *sk = (struct sock *)data;
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int event;
 
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk)) {
-		/* Try again later */
-		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
-		goto out_unlock;
-	}
-
 	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
 		goto out;
 
@@ -485,7 +470,19 @@ static void tcp_write_timer(unsigned long data)
 
 out:
 	sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		tcp_write_timer_handler(sk);
+	} else {
+		/* deleguate our work to tcp_release_cb() */
+		set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
+	}
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
@@ -602,3 +599,10 @@ out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+				  &tcp_keepalive_timer);
+}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
-- 
cgit v1.2.3-70-g09d2