From 505fbcf035c245a1a42cd80184feecf61ee868dc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Jul 2012 06:23:40 +0000
Subject: ipv4: fix TCP early demux

commit 92101b3b2e317 (ipv4: Prepare for change of rt->rt_iif encoding.)
invalidated TCP early demux, because rx_dst_ifindex is not properly
initialized and checked.

Also remove the use of inet_iif(skb) in favor or skb->skb_iif

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e07a64ca44..aa659e82505 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5603,6 +5603,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
 	if (skb != NULL) {
 		sk->sk_rx_dst = dst_clone(skb_dst(skb));
+		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 		security_inet_conn_established(sk, skb);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 59ea33a68a9083ac98515e4861c00e71efdc49a1 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 27 Jul 2012 10:38:50 +0000
Subject: tcp: perform DMA to userspace only if there is a task waiting for it

Back in 2006, commit 1a2449a87b ("[I/OAT]: TCP recv offload to I/OAT")
added support for receive offloading to IOAT dma engine if available.

The code in tcp_rcv_established() tries to perform early DMA copy if
applicable. It however does so without checking whether the userspace
task is actually expecting the data in the buffer.

This is not a problem under normal circumstances, but there is a corner
case where this doesn't work -- and that's when MSG_TRUNC flag to
recvmsg() is used.

If the IOAT dma engine is not used, the code properly checks whether
there is a valid ucopy.task and the socket is owned by userspace, but
misses the check in the dmaengine case.

This problem can be observed in real trivially -- for example 'tbench' is a
good reproducer, as it makes a heavy use of MSG_TRUNC. On systems utilizing
IOAT, you will soon find tbench waiting indefinitely in sk_wait_data(), as they
have been already early-copied in tcp_rcv_established() using dma engine.

This patch introduces the same check we are performing in the simple
iovec copy case to the IOAT case as well. It fixes the indefinite
recvmsg(MSG_TRUNC) hangs.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aa659e82505..a356e1fecf9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5475,7 +5475,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			if (tp->copied_seq == tp->rcv_nxt &&
 			    len - tcp_header_len <= tp->ucopy.len) {
 #ifdef CONFIG_NET_DMA
-				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+				if (tp->ucopy.task == current &&
+				    sock_owned_by_user(sk) &&
+				    tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
 					copied_early = 1;
 					eaten = 1;
 				}
-- 
cgit v1.2.3-70-g09d2


From 404e0a8b6a55d5e1cd138c6deb1bca9abdf75d8c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 29 Jul 2012 23:20:37 +0000
Subject: net: ipv4: fix RCU races on dst refcounts

commit c6cffba4ffa2 (ipv4: Fix input route performance regression.)
added various fatal races with dst refcounts.

crashes happen on tcp workloads if routes are added/deleted at the same
time.

The dst_free() calls from free_fib_info_rcu() are clearly racy.

We need instead regular dst refcounting (dst_release()) and make
sure dst_release() is aware of RCU grace periods :

Add DST_RCU_FREE flag so that dst_release() respects an RCU grace period
before dst destruction for cached dst

Introduce a new inet_sk_rx_dst_set() helper, using atomic_inc_not_zero()
to make sure we dont increase a zero refcount (On a dst currently
waiting an rcu grace period before destruction)

rt_cache_route() must take a reference on the new cached route, and
release it if was not able to install it.

With this patch, my machines survive various benchmarks.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h        |  7 +------
 include/net/inet_sock.h  | 13 +++++++++++++
 net/core/dst.c           | 26 +++++++++++++++++++++-----
 net/decnet/dn_route.c    |  6 ++++++
 net/ipv4/fib_semantics.c |  4 ++--
 net/ipv4/route.c         | 16 ++++------------
 net/ipv4/tcp_input.c     |  3 +--
 net/ipv4/tcp_ipv4.c      | 12 ++++++------
 net/ipv4/tcp_minisocks.c |  3 +--
 9 files changed, 55 insertions(+), 35 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/include/net/dst.h b/include/net/dst.h
index baf59789006..31a9fd39edb 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -61,6 +61,7 @@ struct dst_entry {
 #define DST_NOPEER		0x0040
 #define DST_FAKE_RTABLE		0x0080
 #define DST_XFRM_TUNNEL		0x0100
+#define DST_RCU_FREE		0x0200
 
 	unsigned short		pending_confirm;
 
@@ -382,12 +383,6 @@ static inline void dst_free(struct dst_entry *dst)
 	__dst_free(dst);
 }
 
-static inline void dst_rcu_free(struct rcu_head *head)
-{
-	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
-	dst_free(dst);
-}
-
 static inline void dst_confirm(struct dst_entry *dst)
 {
 	dst->pending_confirm = 1;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 613cfa40167..e3fd34c83ac 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -249,4 +249,17 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 	return flags;
 }
 
+static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	if (atomic_inc_not_zero(&dst->__refcnt)) {
+		if (!(dst->flags & DST_RCU_FREE))
+			dst->flags |= DST_RCU_FREE;
+
+		sk->sk_rx_dst = dst;
+		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+	}
+}
+
 #endif	/* _INET_SOCK_H */
diff --git a/net/core/dst.c b/net/core/dst.c
index 069d51d2941..d9e33ebe170 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -258,6 +258,15 @@ again:
 }
 EXPORT_SYMBOL(dst_destroy);
 
+static void dst_rcu_destroy(struct rcu_head *head)
+{
+	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+	dst = dst_destroy(dst);
+	if (dst)
+		__dst_free(dst);
+}
+
 void dst_release(struct dst_entry *dst)
 {
 	if (dst) {
@@ -265,10 +274,14 @@ void dst_release(struct dst_entry *dst)
 
 		newrefcnt = atomic_dec_return(&dst->__refcnt);
 		WARN_ON(newrefcnt < 0);
-		if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
-			dst = dst_destroy(dst);
-			if (dst)
-				__dst_free(dst);
+		if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) {
+			if (dst->flags & DST_RCU_FREE) {
+				call_rcu_bh(&dst->rcu_head, dst_rcu_destroy);
+			} else {
+				dst = dst_destroy(dst);
+				if (dst)
+					__dst_free(dst);
+			}
 		}
 	}
 }
@@ -320,11 +333,14 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic);
  */
 void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
 {
+	bool hold;
+
 	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
 	/* If dst not in cache, we must take a reference, because
 	 * dst_release() will destroy dst as soon as its refcount becomes zero
 	 */
-	if (unlikely(dst->flags & DST_NOCACHE)) {
+	hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE;
+	if (unlikely(hold)) {
 		dst_hold(dst);
 		skb_dst_set(skb, dst);
 	} else {
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 85a3604c87c..26719779ad8 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -184,6 +184,12 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
 	return dn_rt_hash_mask & (unsigned int)tmp;
 }
 
+static inline void dst_rcu_free(struct rcu_head *head)
+{
+	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+	dst_free(dst);
+}
+
 static inline void dnrt_free(struct dn_route *rt)
 {
 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da0cc2e6b25..e55171f184f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head)
 		if (nexthop_nh->nh_exceptions)
 			free_nh_exceptions(nexthop_nh);
 		if (nexthop_nh->nh_rth_output)
-			dst_free(&nexthop_nh->nh_rth_output->dst);
+			dst_release(&nexthop_nh->nh_rth_output->dst);
 		if (nexthop_nh->nh_rth_input)
-			dst_free(&nexthop_nh->nh_rth_input->dst);
+			dst_release(&nexthop_nh->nh_rth_input->dst);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fc1a81ca79a..d6eabcfe8a9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1199,11 +1199,6 @@ restart:
 	fnhe->fnhe_stamp = jiffies;
 }
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 {
 	struct rtable *orig, *prev, **p = &nh->nh_rth_output;
@@ -1213,17 +1208,14 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 
 	orig = *p;
 
+	rt->dst.flags |= DST_RCU_FREE;
+	dst_hold(&rt->dst);
 	prev = cmpxchg(p, orig, rt);
 	if (prev == orig) {
 		if (orig)
-			rt_free(orig);
+			dst_release(&orig->dst);
 	} else {
-		/* Routes we intend to cache in the FIB nexthop have
-		 * the DST_NOCACHE bit clear.  However, if we are
-		 * unsuccessful at storing this route into the cache
-		 * we really need to set it.
-		 */
-		rt->dst.flags |= DST_NOCACHE;
+		dst_release(&rt->dst);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a356e1fecf9..9be30b039ae 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
 	if (skb != NULL) {
-		sk->sk_rx_dst = dst_clone(skb_dst(skb));
-		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+		inet_sk_rx_dst_set(sk, skb);
 		security_inet_conn_established(sk, skb);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2fbd9921253..7f91e5ac827 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 #endif
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		struct dst_entry *dst = sk->sk_rx_dst;
+
 		sock_rps_save_rxhash(sk, skb);
-		if (sk->sk_rx_dst) {
-			struct dst_entry *dst = sk->sk_rx_dst;
+		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 			    dst->ops->check(dst, 0) == NULL) {
 				dst_release(dst);
 				sk->sk_rx_dst = NULL;
 			}
 		}
-		if (unlikely(sk->sk_rx_dst == NULL)) {
-			sk->sk_rx_dst = dst_clone(skb_dst(skb));
-			inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
-		}
+		if (unlikely(sk->sk_rx_dst == NULL))
+			inet_sk_rx_dst_set(sk, skb);
+
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
 			goto reset;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3f1cc2028ed..232a90c3ec8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
-		newsk->sk_rx_dst = dst_clone(skb_dst(skb));
-		inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif;
+		inet_sk_rx_dst_set(newsk, skb);
 
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
-- 
cgit v1.2.3-70-g09d2


From c76562b6709fee5eff8a6a779be41c0bce661fd7 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 31 Jul 2012 16:44:41 -0700
Subject: netvm: prevent a stream-specific deadlock

This patch series is based on top of "Swap-over-NBD without deadlocking
v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.

When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate it
with swapon.  In diskless systems this is not an option so if swap if
required then swapping over the network is considered.  The two likely
scenarios are when blade servers are used as part of a cluster where the
form factor or maintenance costs do not allow the use of disks and thin
clients.

The Linux Terminal Server Project recommends the use of the Network Block
Device (NBD) for swap but this is not always an option.  There is no
guarantee that the network attached storage (NAS) device is running Linux
or supports NBD.  However, it is likely that it supports NFS so there are
users that want support for swapping over NFS despite any performance
concern.  Some distributions currently carry patches that support swapping
over NFS but it would be preferable to support it in the mainline kernel.

Patch 1 avoids a stream-specific deadlock that potentially affects TCP.

Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
	reserves.

Patch 3 adds three helpers for filesystems to handle swap cache pages.
	For example, page_file_mapping() returns page->mapping for
	file-backed pages and the address_space of the underlying
	swap file for swap cache pages.

Patch 4 adds two address_space_operations to allow a filesystem
	to pin all metadata relevant to a swapfile in memory. Upon
	successful activation, the swapfile is marked SWP_FILE and
	the address space operation ->direct_IO is used for writing
	and ->readpage for reading in swap pages.

Patch 5 notes that patch 3 is bolting
	filesystem-specific-swapfile-support onto the side and that
	the default handlers have different information to what
	is available to the filesystem. This patch refactors the
	code so that there are generic handlers for each of the new
	address_space operations.

Patch 6 adds an API to allow a vector of kernel addresses to be
	translated to struct pages and pinned for IO.

Patch 7 adds support for using highmem pages for swap by kmapping
	the pages before calling the direct_IO handler.

Patch 8 updates NFS to use the helpers from patch 3 where necessary.

Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.

Patch 10 implements the new swapfile-related address_space operations
	for NFS and teaches the direct IO handler how to manage
	kernel addresses.

Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
	where appropriate.

Patch 12 fixes a NULL pointer dereference that occurs when using
	swap-over-NFS.

With the patches applied, it is possible to mount a swapfile that is on an
NFS filesystem.  Swap performance is not great with a swap stress test
taking roughly twice as long to complete than if the swap device was
backed by NBD.

This patch: netvm: prevent a stream-specific deadlock

It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit.  This will prevent SOCK_MEMALLOC
buffers from receiving data, which will prevent userspace from running,
which is needed to reduce the buffered data.

Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit.  Once
this change it applied, it is important that sockets that set
SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens reclaimed to avoid
accounting errors until the bug is fixed.

[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/sock.h     |  8 +++++---
 net/caif/caif_socket.c |  2 +-
 net/core/sock.c        | 14 +++++++++++++-
 net/ipv4/tcp_input.c   | 21 +++++++++++----------
 net/sctp/ulpevent.c    |  3 ++-
 5 files changed, 32 insertions(+), 16 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/include/net/sock.h b/include/net/sock.h
index 43a470d40d7..b3730239bf1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1329,12 +1329,14 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
 		__sk_mem_schedule(sk, size, SK_MEM_SEND);
 }
 
-static inline bool sk_rmem_schedule(struct sock *sk, int size)
+static inline bool
+sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size)
 {
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV);
+	return size<= sk->sk_forward_alloc ||
+		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+		skb_pfmemalloc(skb);
 }
 
 static inline void sk_mem_reclaim(struct sock *sk)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 78f1cdad5b3..095259f8390 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	err = sk_filter(sk, skb);
 	if (err)
 		return err;
-	if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
+	if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
 		set_rx_flow_off(cf_sk);
 		net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
 		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
diff --git a/net/core/sock.c b/net/core/sock.c
index 32fdcd2d6e8..6b654b3ddfd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -295,6 +295,18 @@ void sk_clear_memalloc(struct sock *sk)
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
 	static_key_slow_dec(&memalloc_socks);
+
+	/*
+	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
+	 * it has rmem allocations there is a risk that the user of the
+	 * socket cannot make forward progress due to exceeding the rmem
+	 * limits. By rights, sk_clear_memalloc() should only be called
+	 * on sockets being torn down but warn and reset the accounting if
+	 * that assumption breaks.
+	 */
+	if (WARN_ON(sk->sk_forward_alloc))
+		sk_mem_reclaim(sk);
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
@@ -396,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		return err;
 
-	if (!sk_rmem_schedule(sk, skb->truesize)) {
+	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 		atomic_inc(&sk->sk_drops);
 		return -ENOBUFS;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a356e1fecf9..00b91b4b866 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk);
 static int tcp_prune_queue(struct sock *sk);
 
-static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+				 unsigned int size)
 {
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    !sk_rmem_schedule(sk, size)) {
+	    !sk_rmem_schedule(sk, skb, size)) {
 
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, size)) {
+		if (!sk_rmem_schedule(sk, skb, size)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
 
-			if (!sk_rmem_schedule(sk, size))
+			if (!sk_rmem_schedule(sk, skb, size))
 				return -1;
 		}
 	}
@@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 	TCP_ECN_check_ce(tp, skb);
 
-	if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
+	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
 		__kfree_skb(skb);
 		return;
@@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 
 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct tcphdr *th;
 	bool fragstolen;
 
-	if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
-		goto err;
-
 	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
 	if (!skb)
 		goto err;
 
+	if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+		goto err_free;
+
 	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
 	skb_reset_transport_header(skb);
 	memset(th, 0, sizeof(*th));
@@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0 &&
-			    tcp_try_rmem_schedule(sk, skb->truesize))
+			    tcp_try_rmem_schedule(sk, skb, skb->truesize))
 				goto drop;
 
 			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 33d89477619..10c018a5b9f 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	if (rx_count >= asoc->base.sk->sk_rcvbuf) {
 
 		if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
-		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
+		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb,
+				       chunk->skb->truesize)))
 			goto fail;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 5d299f3d3c8a2fbc732b1bf03af36333ccec3130 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Aug 2012 05:09:33 +0000
Subject: net: ipv6: fix TCP early demux

IPv6 needs a cookie in dst_check() call.

We need to add rx_dst_cookie and provide a family independent
sk_rx_dst_set(sk, skb) method to properly support IPv6 TCP early demux.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h               |  1 +
 include/net/inet_connection_sock.h |  1 +
 include/net/inet_sock.h            |  9 ---------
 net/ipv4/tcp_input.c               |  4 +++-
 net/ipv4/tcp_ipv4.c                | 13 ++++++++++---
 net/ipv4/tcp_minisocks.c           |  2 +-
 net/ipv6/tcp_ipv6.c                | 27 +++++++++++++++++++++++++--
 7 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'net/ipv4/tcp_input.c')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 379e433e15e..879db26ec40 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -369,6 +369,7 @@ struct ipv6_pinfo {
 	__u8			rcv_tclass;
 
 	__u32			dst_cookie;
+	__u32			rx_dst_cookie;
 
 	struct ipv6_mc_socklist	__rcu *ipv6_mc_list;
 	struct ipv6_ac_socklist	*ipv6_ac_list;
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5ee66f517b4..ba1d3615acb 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -39,6 +39,7 @@ struct inet_connection_sock_af_ops {
 	int	    (*queue_xmit)(struct sk_buff *skb, struct flowi *fl);
 	void	    (*send_check)(struct sock *sk, struct sk_buff *skb);
 	int	    (*rebuild_header)(struct sock *sk);
+	void	    (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb);
 	int	    (*conn_request)(struct sock *sk, struct sk_buff *skb);
 	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
 				      struct request_sock *req,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 83b567fe194..613cfa40167 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -249,13 +249,4 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 	return flags;
 }
 
-static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb_dst(skb);
-
-	dst_hold(dst);
-	sk->sk_rx_dst = dst;
-	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
-}
-
 #endif	/* _INET_SOCK_H */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2fd2bc9e3c6..85308b90df8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5392,6 +5392,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	if (unlikely(sk->sk_rx_dst == NULL))
+		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
 	/*
 	 *	Header prediction.
 	 *	The code loosely follows the one in the famous
@@ -5605,7 +5607,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	tcp_set_state(sk, TCP_ESTABLISHED);
 
 	if (skb != NULL) {
-		inet_sk_rx_dst_set(sk, skb);
+		icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
 		security_inet_conn_established(sk, skb);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 42b2a6a7309..272241f16fc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1627,9 +1627,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 				sk->sk_rx_dst = NULL;
 			}
 		}
-		if (unlikely(sk->sk_rx_dst == NULL))
-			inet_sk_rx_dst_set(sk, skb);
-
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
 			goto reset;
@@ -1872,10 +1869,20 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_destructor= tcp_twsk_destructor,
 };
 
+static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	dst_hold(dst);
+	sk->sk_rx_dst = dst;
+	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+}
+
 const struct inet_connection_sock_af_ops ipv4_specific = {
 	.queue_xmit	   = ip_queue_xmit,
 	.send_check	   = tcp_v4_send_check,
 	.rebuild_header	   = inet_sk_rebuild_header,
+	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
 	.conn_request	   = tcp_v4_conn_request,
 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
 	.net_header_len	   = sizeof(struct iphdr),
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 232a90c3ec8..d9c9dcef2de 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,7 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct tcp_sock *oldtp = tcp_sk(sk);
 		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
 
-		inet_sk_rx_dst_set(newsk, skb);
+		newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb);
 
 		/* TCP Cookie Transactions require space for the cookie pair,
 		 * as it differs for each connection.  There is no need to
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c66b90f71c9..5a439e9a4c0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1447,7 +1447,17 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC));
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		struct dst_entry *dst = sk->sk_rx_dst;
+
 		sock_rps_save_rxhash(sk, skb);
+		if (dst) {
+			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+			    dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
+				dst_release(dst);
+				sk->sk_rx_dst = NULL;
+			}
+		}
+
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len))
 			goto reset;
 		if (opt_skb)
@@ -1705,9 +1715,9 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
 			struct dst_entry *dst = sk->sk_rx_dst;
 			struct inet_sock *icsk = inet_sk(sk);
 			if (dst)
-				dst = dst_check(dst, 0);
+				dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
 			if (dst &&
-			    icsk->rx_dst_ifindex == inet6_iif(skb))
+			    icsk->rx_dst_ifindex == skb->skb_iif)
 				skb_dst_set_noref(skb, dst);
 		}
 	}
@@ -1719,10 +1729,23 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = {
 	.twsk_destructor= tcp_twsk_destructor,
 };
 
+static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	const struct rt6_info *rt = (const struct rt6_info *)dst;
+
+	dst_hold(dst);
+	sk->sk_rx_dst = dst;
+	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+	if (rt->rt6i_node)
+		inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+}
+
 static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.queue_xmit	   = inet6_csk_xmit,
 	.send_check	   = tcp_v6_send_check,
 	.rebuild_header	   = inet6_sk_rebuild_header,
+	.sk_rx_dst_set	   = inet6_sk_rx_dst_set,
 	.conn_request	   = tcp_v6_conn_request,
 	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
 	.net_header_len	   = sizeof(struct ipv6hdr),
-- 
cgit v1.2.3-70-g09d2