From 9ea88a153001ffeb3d8810917e8eea62ca9b6f25 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Thu, 7 Aug 2014 02:38:22 +0400 Subject: tcp: md5: check md5 signature without socket lock Since a8afca032 (tcp: md5: protects md5sig_info with RCU) tcp_md5_do_lookup doesn't require socket lock, rcu_read_lock is enough. Therefore socket lock is no longer required for tcp_v{4,6}_inbound_md5_hash too, so we can move these calls (wrapped with rcu_read_{,un}lock) before bh_lock_sock: from tcp_v{4,6}_do_rcv to tcp_v{4,6}_rcv. Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 22055b09842..f2ce9550239 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -667,7 +667,8 @@ clear_hash_noput: return 1; } -static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +static int __tcp_v6_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; @@ -707,6 +708,18 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) } return 0; } + +static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __tcp_v6_inbound_md5_hash(sk, skb); + rcu_read_unlock(); + + return ret; +} + #endif static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, @@ -1247,11 +1260,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); -#ifdef CONFIG_TCP_MD5SIG - if (tcp_v6_inbound_md5_hash(sk, skb)) - goto discard; -#endif - if (sk_filter(sk, skb)) goto discard; @@ -1424,6 +1432,11 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; +#ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash(sk, skb)) + goto discard_and_relse; +#endif + if (sk_filter(sk, skb)) goto discard_and_relse; -- cgit v1.2.3-70-g09d2 From 4fab9071950c2021d846e18351e0f46a1cffd67b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 14 Aug 2014 12:40:05 -0400 Subject: tcp: fix tcp_release_cb() to dispatch via address family for mtu_reduced() Make sure we use the correct address-family-specific function for handling MTU reductions from within tcp_release_cb(). Previously AF_INET6 sockets were incorrectly always using the IPv6 code path when sometimes they were handling IPv4 traffic and thus had an IPv4 dst. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Diagnosed-by: Willem de Bruijn Fixes: 563d34d057862 ("tcp: dont drop MTU reduction indications") Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. 
Miller --- include/net/inet_connection_sock.h | 1 + include/net/sock.h | 1 - include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 3 ++- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7a431388756..5fbe6568c3c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -62,6 +62,7 @@ struct inet_connection_sock_af_ops { void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); int (*bind_conflict)(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax); + void (*mtu_reduced)(struct sock *sk); }; /** inet_connection_sock - INET connection oriented sock diff --git a/include/net/sock.h b/include/net/sock.h index 38805fa02e4..7f2ab72f321 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -987,7 +987,6 @@ struct proto { struct sk_buff *skb); void (*release_cb)(struct sock *sk); - void (*mtu_reduced)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 36f55254573..e337e05035b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -448,6 +448,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); +void tcp_v4_mtu_reduced(struct sock *sk); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dceff5fe8e6..cd17f009aed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -271,7 +271,7 @@ EXPORT_SYMBOL(tcp_v4_connect); * It can be called through tcp_release_cb() if socket was owned by user * at the time tcp_v4_err() was called to handle ICMP message. 
*/ -static void tcp_v4_mtu_reduced(struct sock *sk) +void tcp_v4_mtu_reduced(struct sock *sk) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -302,6 +302,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } +EXPORT_SYMBOL(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -1787,6 +1788,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific); @@ -2406,7 +2408,6 @@ struct proto tcp_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ff3f0f75cc6..5a7c41fbc6d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -800,7 +800,7 @@ void tcp_release_cb(struct sock *sk) __sock_put(sk); } if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { - sk->sk_prot->mtu_reduced(sk); + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f2ce9550239..29964c3d363 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1595,6 +1595,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v6_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1625,6 +1626,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1864,7 +1866,6 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.3-70-g09d2 From 04317dafd11dd7b0ec19b85f098414abae6ed5f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2014 15:33:32 -0700 Subject: tcp: introduce TCP_SKB_CB(skb)->tcp_tw_isn TCP_SKB_CB(skb)->when has different meaning in output and input paths. In output path, it contains a timestamp. In input path, it contains an ISN, chosen by tcp_timewait_state_process() Lets add a different name to ease code comprehension. Note that 'when' field will disappear in following patch, as skb_mstamp already contains timestamp, the anonymous union will promptly disappear as well. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. 
Miller --- include/net/tcp.h | 7 ++++++- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 590e01a476a..0cd7d2c65dc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -698,7 +698,12 @@ struct tcp_skb_cb { } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ - __u32 when; /* used to compute rtt's */ + union { + /* used in output path */ + __u32 when; /* used to compute rtt's */ + /* used in input path */ + __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ + }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aba4926ca09..9c8b9f1dcf6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5906,7 +5906,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct request_sock *req; struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; - __u32 isn = TCP_SKB_CB(skb)->when; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; bool want_cookie = false, fastopen; struct flowi fl; struct tcp_fastopen_cookie foc = { .len = -1 }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 487e2a41667..02e6cd29ebf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,7 +1627,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1649988bd1b..a058f411d3a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -232,7 +232,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->when = isn; + TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 29964c3d363..5b3c70ff7a7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -738,7 +738,7 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); - if (!TCP_SKB_CB(skb)->when && + if (!TCP_SKB_CB(skb)->tcp_tw_isn && (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { @@ -1412,7 +1412,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3-70-g09d2 From ca777eff51f7fbaebd954e645d8ecb781a906b4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Sep 2014 08:06:07 -0700 Subject: tcp: remove dst refcount false sharing for prequeue mode Alexander Duyck reported high false sharing on dst refcount in tcp stack when prequeue is used. 
prequeue is the mechanism used when a thread is blocked in recvmsg()/read() on a TCP socket, using a blocking model rather than select()/poll()/epoll() non blocking one. We already try to use RCU in input path as much as possible, but we were forced to take a refcount on the dst when skb escaped RCU protected region. When/if the user thread runs on different cpu, dst_release() will then touch dst refcount again. Commit 093162553c33 (tcp: force a dst refcount when prequeue packet) was an example of a race fix. It turns out the only remaining usage of skb->dst for a packet stored in a TCP socket prequeue is IP early demux. We can add a logic to detect when IP early demux is probably going to use skb->dst. Because we do an optimistic check rather than duplicate existing logic, we need to guard inet_sk_rx_dst_set() and inet6_sk_rx_dst_set() from using a NULL dst. Many thanks to Alexander for providing a nice bug report, git bisection, and reproducer. Tested using Alexander script on a 40Gb NIC, 8 RX queues. Hosts have 24 cores, 48 hyper threads. echo 0 >/proc/sys/net/ipv4/tcp_autocorking for i in `seq 0 47` do for j in `seq 0 2` do netperf -H $DEST -t TCP_STREAM -l 1000 \ -c -C -T $i,$i -P 0 -- \ -m 64 -s 64K -D & done done Before patch : ~6Mpps and ~95% cpu usage on receiver After patch : ~9Mpps and ~35% cpu usage on receiver. Signed-off-by: Eric Dumazet Reported-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 20 ++++++++++++++++---- net/ipv6/tcp_ipv6.c | 15 +++++++++------ 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3f9bc3f0bba..7881b96d2b7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1559,7 +1559,17 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) skb_queue_len(&tp->ucopy.prequeue) == 0) return false; - skb_dst_force(skb); + /* Before escaping RCU protected region, we need to take care of skb + * dst. Prequeue is only enabled for established sockets. + * For such sockets, we might need the skb dst only to set sk->sk_rx_dst + * Instead of doing full sk_rx_dst validity here, let's perform + * an optimistic check. 
+ */ + if (likely(sk->sk_rx_dst)) + skb_dst_drop(skb); + else + skb_dst_force(skb); + __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; if (tp->ucopy.memory > sk->sk_rcvbuf) { @@ -1765,9 +1775,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (dst) { + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } } EXPORT_SYMBOL(inet_sk_rx_dst_set); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5b3c70ff7a7..1835480336a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -93,13 +93,16 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + if (dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + } } static void tcp_v6_hash(struct sock *sk) -- cgit v1.2.3-70-g09d2 From e11ecddf5128011c936cc5360780190cbc901fdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Sep 2014 04:19:51 -0700 Subject: tcp: use TCP_SKB_CB(skb)->tcp_flags in input path Input path of TCP do not currently uses TCP_SKB_CB(skb)->tcp_flags, which is only used in output path. tcp_recvmsg(), looks at tcp_hdr(skb)->syn for every skb found in receive queue, and its unfortunate because this bit is located in a cache line right before the payload. We can simplify TCP by copying tcp flags into TCP_SKB_CB(skb)->tcp_flags. This patch does so, and avoids the cache line miss in tcp_recvmsg() Following patches will - allow a segment with FIN being coalesced in tcp_try_coalesce() - simplify tcp_collapse() by not copying the headers. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 18 ++++++++++-------- net/ipv4/tcp_input.c | 10 +++++----- net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 541f26a67ba..070aeff1b13 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1510,9 +1510,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; - if (offset < skb->len || tcp_hdr(skb)->fin) { + if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } @@ -1585,7 +1585,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (offset + 1 != skb->len) continue; } - if (tcp_hdr(skb)->fin) { + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { sk_eat_skb(sk, skb, false); ++seq; break; @@ -1722,11 +1722,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; if (offset < skb->len) goto found_ok_skb; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", @@ -1959,7 +1959,7 @@ skip_copy: if (used + offset < skb->len) continue; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); @@ -2160,8 +2160,10 @@ void tcp_close(struct sock *sk, long timeout) * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - tcp_hdr(skb)->fin; + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; data_was_unread += len; __kfree_skb(skb); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f97003ad0af..8f639a4face 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4093,7 +4093,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); } } @@ -4513,7 +4513,7 @@ restart: * - bloated or contains data before "start" or * overlaps to the next one. */ - if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && + if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; @@ -4532,7 +4532,8 @@ restart: /* Decided to skip this, advance start seq. 
*/ start = TCP_SKB_CB(skb)->end_seq; } - if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; while (before(start, end)) { @@ -4579,8 +4580,7 @@ restart: skb = tcp_collapse_one(sk, skb, list); if (!skb || skb == tail || - tcp_hdr(skb)->syn || - tcp_hdr(skb)->fin) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7881b96d2b7..006b045716d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1638,6 +1638,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1835480336a..de51a88bec6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1415,6 +1415,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; -- cgit v1.2.3-70-g09d2 From 7bced397510ab569d31de4c70b39e13355046387 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 30 Dec 2013 12:37:29 -0800 Subject: net_dma: simple removal Per commit "77873803363c net_dma: mark broken" net_dma is no longer used and there is no plan to fix it. This is the mechanical removal of bits in CONFIG_NET_DMA ifdef guards. Reverting the remainder of the net_dma induced changes is deferred to subsequent patches. Marked for stable due to Roman's report of a memory leak in dma_pin_iovec_pages(): https://lkml.org/lkml/2014/9/3/177 Cc: Dave Jiang Cc: Vinod Koul Cc: David Whipple Cc: Alexander Duyck Cc: Reported-by: Roman Gushchin Acked-by: David S. 
Miller Signed-off-by: Dan Williams --- Documentation/ABI/removed/net_dma | 8 + Documentation/networking/ip-sysctl.txt | 6 - drivers/dma/Kconfig | 12 -- drivers/dma/Makefile | 1 - drivers/dma/dmaengine.c | 104 ------------ drivers/dma/ioat/dma.c | 1 - drivers/dma/ioat/dma.h | 7 - drivers/dma/ioat/dma_v2.c | 1 - drivers/dma/ioat/dma_v3.c | 1 - drivers/dma/iovlock.c | 280 --------------------------------- include/linux/dmaengine.h | 22 +-- include/linux/skbuff.h | 8 +- include/linux/tcp.h | 8 - include/net/netdma.h | 32 ---- include/net/sock.h | 19 +-- include/net/tcp.h | 8 - kernel/sysctl_binary.c | 1 - net/core/Makefile | 1 - net/core/dev.c | 10 -- net/core/sock.c | 6 - net/core/user_dma.c | 131 --------------- net/dccp/proto.c | 4 +- net/ipv4/sysctl_net_ipv4.c | 9 -- net/ipv4/tcp.c | 147 ++--------------- net/ipv4/tcp_input.c | 61 ------- net/ipv4/tcp_ipv4.c | 18 +-- net/ipv6/tcp_ipv6.c | 13 +- net/llc/af_llc.c | 10 +- 28 files changed, 35 insertions(+), 894 deletions(-) create mode 100644 Documentation/ABI/removed/net_dma delete mode 100644 drivers/dma/iovlock.c delete mode 100644 include/net/netdma.h delete mode 100644 net/core/user_dma.c (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/Documentation/ABI/removed/net_dma b/Documentation/ABI/removed/net_dma new file mode 100644 index 00000000000..a173aecc2f1 --- /dev/null +++ b/Documentation/ABI/removed/net_dma @@ -0,0 +1,8 @@ +What: tcp_dma_copybreak sysctl +Date: Removed in kernel v3.13 +Contact: Dan Williams +Description: + Formerly the lower limit, in bytes, of the size of socket reads + that will be offloaded to a DMA copy engine. Removed due to + coherency issues of the cpu potentially touching the buffers + while dma is in flight. diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ab42c95f998..ea8f3b182e7 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -582,12 +582,6 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 -tcp_dma_copybreak - INTEGER - Lower limit, in bytes, of the size of socket reads that will be - offloaded to a DMA copy engine, if one is present in the system - and CONFIG_NET_DMA is enabled. - Default: 4096 - tcp_thin_linear_timeouts - BOOLEAN Enable dynamic triggering of linear timeouts for thin streams. If set, a check is performed upon retransmission by timeout to diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 605b016bcea..6b5f37e01a7 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -368,18 +368,6 @@ config DMA_OF comment "DMA Clients" depends on DMA_ENGINE -config NET_DMA - bool "Network: TCP receive copy offload" - depends on DMA_ENGINE && NET - default (INTEL_IOATDMA || FSL_DMA) - depends on BROKEN - help - This enables the use of DMA engines in the network stack to - offload receive copy-to-user operations, freeing CPU cycles. - - Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise - say N. 
- config ASYNC_TX_DMA bool "Async_tx: Offload support for the async_tx api" depends on DMA_ENGINE diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index a029d0f4a1b..0c9dc754932 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_DMA_VIRTUAL_CHANNELS) += virt-dma.o obj-$(CONFIG_DMA_ACPI) += acpi-dma.o obj-$(CONFIG_DMA_OF) += of-dma.o -obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_MID_DMAC) += intel_mid_dma.o obj-$(CONFIG_DMATEST) += dmatest.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index ed610b49751..268de183b51 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1084,110 +1084,6 @@ dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) } EXPORT_SYMBOL(dmaengine_get_unmap_data); -/** - * dma_async_memcpy_pg_to_pg - offloaded copy from page to page - * @chan: DMA channel to offload copy to - * @dest_pg: destination page - * @dest_off: offset in page to copy to - * @src_pg: source page - * @src_off: offset in page to copy from - * @len: length - * - * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus - * address according to the DMA mapping API rules for streaming mappings. - * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident - * (kernel memory or locked user space pages). - */ -dma_cookie_t -dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, - unsigned int dest_off, struct page *src_pg, unsigned int src_off, - size_t len) -{ - struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - struct dmaengine_unmap_data *unmap; - dma_cookie_t cookie; - unsigned long flags; - - unmap = dmaengine_get_unmap_data(dev->dev, 2, GFP_NOWAIT); - if (!unmap) - return -ENOMEM; - - unmap->to_cnt = 1; - unmap->from_cnt = 1; - unmap->addr[0] = dma_map_page(dev->dev, src_pg, src_off, len, - DMA_TO_DEVICE); - unmap->addr[1] = dma_map_page(dev->dev, dest_pg, dest_off, len, - DMA_FROM_DEVICE); - unmap->len = len; - flags = DMA_CTRL_ACK; - tx = dev->device_prep_dma_memcpy(chan, unmap->addr[1], unmap->addr[0], - len, flags); - - if (!tx) { - dmaengine_unmap_put(unmap); - return -ENOMEM; - } - - dma_set_unmap(tx, unmap); - cookie = tx->tx_submit(tx); - dmaengine_unmap_put(unmap); - - preempt_disable(); - __this_cpu_add(chan->local->bytes_transferred, len); - __this_cpu_inc(chan->local->memcpy_count); - preempt_enable(); - - return cookie; -} -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); - -/** - * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses - * @chan: DMA channel to offload copy to - * @dest: destination address (virtual) - * @src: source address (virtual) - * @len: length - * - * Both @dest and @src must be mappable to a bus address according to the - * DMA mapping API rules for streaming mappings. - * Both @dest and @src must stay memory resident (kernel memory or locked - * user space pages). 
- */ -dma_cookie_t -dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, - void *src, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, virt_to_page(dest), - (unsigned long) dest & ~PAGE_MASK, - virt_to_page(src), - (unsigned long) src & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); - -/** - * dma_async_memcpy_buf_to_pg - offloaded copy from address to page - * @chan: DMA channel to offload copy to - * @page: destination page - * @offset: offset in page to copy to - * @kdata: source address (virtual) - * @len: length - * - * Both @page/@offset and @kdata must be mappable to a bus address according - * to the DMA mapping API rules for streaming mappings. - * Both @page/@offset and @kdata must stay memory resident (kernel memory or - * locked user space pages) - */ -dma_cookie_t -dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, - unsigned int offset, void *kdata, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, page, offset, - virt_to_page(kdata), - (unsigned long) kdata & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); - void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, struct dma_chan *chan) { diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index b76c1485933..940c1502a8b 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -1222,7 +1222,6 @@ int ioat1_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(4096); err = ioat_register(device); if (err) return err; diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index e982f00a984..d63f68b1aa3 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -214,13 +214,6 @@ __dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw, #define dump_desc_dbg(c, d) \ ({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; }) -static inline void ioat_set_tcp_copy_break(unsigned long copybreak) -{ - #ifdef CONFIG_NET_DMA - sysctl_tcp_dma_copybreak = copybreak; - #endif -} - static inline struct ioat_chan_common * ioat_chan_by_index(struct ioatdma_device *device, int index) { diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index 2ce9be49860..695483e6be3 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -900,7 +900,6 @@ int ioat2_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(2048); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 85971d6e964..895f869d6c2 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -1655,7 +1655,6 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(262144); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c deleted file mode 100644 index bb48a57c2fc..00000000000 --- a/drivers/dma/iovlock.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. 
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include /* for memcpy_toiovec */ -#include -#include - -static int num_pages_spanned(struct iovec *iov) -{ - return - ((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - - ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT); -} - -/* - * Pin down all the iovec pages needed for len bytes. - * Return a struct dma_pinned_list to keep track of pages pinned down. - * - * We are allocating a single chunk of memory, and then carving it up into - * 3 sections, the latter 2 whose size depends on the number of iovecs and the - * total number of pages, respectively. - */ -struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) -{ - struct dma_pinned_list *local_list; - struct page **pages; - int i; - int ret; - int nr_iovecs = 0; - int iovec_len_used = 0; - int iovec_pages_used = 0; - - /* don't pin down non-user-based iovecs */ - if (segment_eq(get_fs(), KERNEL_DS)) - return NULL; - - /* determine how many iovecs/pages there are, up front */ - do { - iovec_len_used += iov[nr_iovecs].iov_len; - iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]); - nr_iovecs++; - } while (iovec_len_used < len); - - /* single kmalloc for pinned list, page_list[], and the page arrays */ - local_list = kmalloc(sizeof(*local_list) - + (nr_iovecs * sizeof (struct dma_page_list)) - + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); - if (!local_list) - goto out; - - /* list of pages starts right after the page list array */ - pages = (struct page **) &local_list->page_list[nr_iovecs]; - - local_list->nr_iovecs = 0; - - for (i = 0; i < nr_iovecs; i++) { - struct dma_page_list *page_list = &local_list->page_list[i]; - - len -= iov[i].iov_len; - - if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) - goto unpin; - - page_list->nr_pages = num_pages_spanned(&iov[i]); - page_list->base_address = iov[i].iov_base; - - page_list->pages = pages; - pages += page_list->nr_pages; - - /* pin pages down */ - down_read(¤t->mm->mmap_sem); - ret = get_user_pages( - current, - current->mm, - (unsigned long) iov[i].iov_base, - page_list->nr_pages, - 1, /* write */ - 0, /* force */ - page_list->pages, - NULL); - up_read(¤t->mm->mmap_sem); - - if (ret != page_list->nr_pages) - goto unpin; - - local_list->nr_iovecs = i + 1; - } - - return local_list; - -unpin: - dma_unpin_iovec_pages(local_list); -out: - return NULL; -} - -void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list) -{ - int i, j; - - if (!pinned_list) - return; - - for (i = 0; i < pinned_list->nr_iovecs; i++) { - struct dma_page_list *page_list = 
&pinned_list->page_list[i]; - for (j = 0; j < page_list->nr_pages; j++) { - set_page_dirty_lock(page_list->pages[j]); - page_cache_release(page_list->pages[j]); - } - } - - kfree(pinned_list); -} - - -/* - * We have already pinned down the pages we will be using in the iovecs. - * Each entry in iov array has corresponding entry in pinned_list->page_list. - * Using array indexing to keep iov[] and page_list[] in sync. - * Initial elements in iov array's iov->iov_len will be 0 if already copied into - * by another call. - * iov array length remaining guaranteed to be bigger than len. - */ -dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - - if (!chan) - return memcpy_toiovec(iov, kdata, len); - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_buf_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - kdata, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - kdata += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} - -dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, struct page *page, - unsigned int offset, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - int err; - - /* this needs as-yet-unimplemented buf-to-buff, so punt. 
*/ - /* TODO: use dma for this */ - if (!chan || !pinned_list) { - u8 *vaddr = kmap(page); - err = memcpy_toiovec(iov, vaddr + offset, len); - kunmap(page); - return err; - } - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_pg_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - page, - offset, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - offset += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index c5c92d59e53..3e382ecd192 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -903,18 +903,6 @@ static inline void dmaengine_put(void) } #endif -#ifdef CONFIG_NET_DMA -#define net_dmaengine_get() dmaengine_get() -#define net_dmaengine_put() dmaengine_put() -#else -static inline void net_dmaengine_get(void) -{ -} -static inline void net_dmaengine_put(void) -{ -} -#endif - #ifdef CONFIG_ASYNC_TX_DMA #define async_dmaengine_get() dmaengine_get() #define async_dmaengine_put() dmaengine_put() @@ -936,16 +924,8 @@ async_dma_find_channel(enum dma_transaction_type type) return NULL; } #endif /* CONFIG_ASYNC_TX_DMA */ - -dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, - void *dest, void *src, size_t len); -dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, - struct page *page, unsigned int offset, void *kdata, size_t len); -dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, - struct page *dest_pg, unsigned int dest_off, struct page *src_pg, - unsigned int src_off, size_t len); void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, - struct dma_chan *chan); + struct dma_chan *chan); static inline void async_tx_ack(struct dma_async_tx_descriptor *tx) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5e1e6f2d98c..bdbf7afad6b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -515,11 +514,8 @@ struct sk_buff { /* 6/8 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL - union { - unsigned int napi_id; - dma_cookie_t dma_cookie; - }; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4ad0706d40e..90895b8dc7f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -169,13 +168,6 @@ struct tcp_sock { struct iovec *iov; int memory; int len; 
-#ifdef CONFIG_NET_DMA - /* members for async copy */ - struct dma_chan *dma_chan; - int wakeup; - struct dma_pinned_list *pinned_list; - dma_cookie_t dma_cookie; -#endif } ucopy; u32 snd_wl1; /* Sequence for window update */ diff --git a/include/net/netdma.h b/include/net/netdma.h deleted file mode 100644 index 8ba8ce284ee..00000000000 --- a/include/net/netdma.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ -#ifndef NETDMA_H -#define NETDMA_H -#ifdef CONFIG_NET_DMA -#include -#include - -int dma_skb_copy_datagram_iovec(struct dma_chan* chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list); - -#endif /* CONFIG_NET_DMA */ -#endif /* NETDMA_H */ diff --git a/include/net/sock.h b/include/net/sock.h index b9586a137ca..3353b47f3d4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -231,7 +231,6 @@ struct cg_proto; * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_write_queue: Packet sending queue - * @sk_async_wait_queue: DMA copied packets * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward @@ -354,10 +353,6 @@ struct sock { struct sk_filter __rcu *sk_filter; struct socket_wq __rcu *sk_wq; -#ifdef CONFIG_NET_DMA - struct sk_buff_head sk_async_wait_queue; -#endif - #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2214,27 +2209,15 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat - * @copied_early: flag indicating whether DMA operations copied this data early * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. 
*/ -#ifdef CONFIG_NET_DMA -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) -{ - __skb_unlink(skb, &sk->sk_receive_queue); - if (!copied_early) - __kfree_skb(skb); - else - __skb_queue_tail(&sk->sk_async_wait_queue, skb); -} -#else -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } -#endif static inline struct net *sock_net(const struct sock *sk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8c4dd63134d..2c2f24ffa38 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -267,7 +266,6 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1023,12 +1021,6 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp) tp->ucopy.len = 0; tp->ucopy.memory = 0; skb_queue_head_init(&tp->ucopy.prequeue); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - tp->ucopy.wakeup = 0; - tp->ucopy.pinned_list = NULL; - tp->ucopy.dma_cookie = 0; -#endif } bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7a..d457005aced 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, diff --git a/net/core/Makefile b/net/core/Makefile index 9628c20acff..5038f1ea034 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o diff --git a/net/core/dev.c b/net/core/dev.c index b1b0c8d4d7d..5e37e9abe8c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1266,7 +1266,6 @@ static int __dev_open(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1342,7 +1341,6 @@ static int __dev_close_many(struct list_head *head) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); } return 0; @@ -4405,14 +4403,6 @@ static void net_rx_action(struct softirq_action *h) out: net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif - return; softnet_break: diff --git a/net/core/sock.c b/net/core/sock.c index c0fc6bdad1e..2f143c3b190 100644 --- a/net/core/sock.c +++ 
b/net/core/sock.c @@ -1452,9 +1452,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -2265,9 +2262,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 100644 index 1b5fefdb819..00000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include -#include - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. - */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c index eb892b4f481..f9076f295b1 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -848,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { @@ -905,7 +905,7 @@ verify_sock_status: len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); break; } while (1); out: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 44eba052b43..c3d2a48481f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -635,15 +635,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#ifdef CONFIG_NET_DMA - { - .procname = "tcp_dma_copybreak", - .data = &sysctl_tcp_dma_copybreak, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif { .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 97c8f5620c4..28595a364f0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -274,7 +274,6 @@ #include #include #include -#include #include #include @@ -1454,39 +1453,6 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) -{ - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_COMPLETE) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_COMPLETE)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); -} -#endif - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1504,7 +1470,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, 
skb, false); + sk_eat_skb(sk, skb); } return NULL; } @@ -1570,11 +1536,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -1612,7 +1578,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - bool copied_early = false; struct sk_buff *skb; u32 urg_hole = 0; @@ -1655,28 +1620,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable(); - } - } -#endif - do { u32 offset; @@ -1807,16 +1750,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1824,11 +1757,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif - if (user_recv) { int chunk; @@ -1886,43 +1814,13 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1942,19 +1840,15 @@ skip_copy: if (tcp_hdr(skb)->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. 
*/ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1977,16 +1871,6 @@ skip_copy: tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2330,9 +2214,6 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeaac399420..1342e9851f9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -73,7 +73,6 @@ #include #include #include -#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -4970,53 +4969,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, __tcp_checksum_complete_user(sk, skb); } -#ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int dma_cookie; - bool copied_early = false; - - if (tp->ucopy.wakeup) - return false; - - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { - - dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, - tp->ucopy.iov, chunk, - tp->ucopy.pinned_list); - - if (dma_cookie < 0) - goto out; - - tp->ucopy.dma_cookie = dma_cookie; - copied_early = true; - - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - - if ((tp->ucopy.len == 0) || - (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || - (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } - } else if (chunk > 0) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } -out: - return copied_early; -} -#endif /* CONFIG_NET_DMA */ - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. 
*/ @@ -5201,14 +5153,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { -#ifdef CONFIG_NET_DMA - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && - tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { - copied_early = 1; - eaten = 1; - } -#endif if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { __set_current_state(TASK_RUNNING); @@ -5274,11 +5218,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (!copied_early || tp->rcv_nxt != tp->rcv_wup) __tcp_ack_snd_check(sk, 0); no_ack: -#ifdef CONFIG_NET_DMA - if (copied_early) - __skb_queue_tail(&sk->sk_async_wait_queue, skb); - else -#endif if (eaten) kfree_skb_partial(skb, fragstolen); sk->sk_data_ready(sk, 0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3cf97651049..737c2e270ee 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -1999,18 +1998,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -2169,11 +2158,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif -#ifdef CONFIG_NET_DMA - /* Cleans up our sk_async_wait_queue */ - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif - /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 889079b2ea8..cb21fccf208 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -1520,18 +1519,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 0080d2b0a8a..bb9cbc17d92 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -839,7 +839,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (!(flags & MSG_PEEK)) { spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } @@ -861,10 +861,10 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); - *seq = 0; + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + sk_eat_skb(sk, skb); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + *seq = 0; } goto out; -- cgit v1.2.3-70-g09d2 From a224772db8420ecb7ce91a9ba5d535ee3a50d982 Mon Sep 17 00:00:00 2001 
From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:56 -0700 Subject: ipv6: add a struct inet6_skb_parm param to ipv6_opt_accepted() ipv6_opt_accepted() assumes IP6CB(skb) holds the struct inet6_skb_parm that it needs. Lets not assume this, as TCP stack might use a different place. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 ++- net/dccp/ipv6.c | 2 +- net/ipv6/af_inet6.c | 4 ++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 5 +++-- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7e247e9b876..97f47201243 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb); +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 04cb17d4b0c..ad2acfe1ca6 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e4865a3ebe1..34f726f5981 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); - const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index c643dc907ce..9a2838e93cc 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index de51a88bec6..9400b4326f2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ireq->ir_iif = inet6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || + (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { atomic_inc(&skb->users); @@ -1367,7 +1368,7 @@ ipv6_pktoptions: np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) np->flow_label = 
ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb)) { + if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); } else { -- cgit v1.2.3-70-g09d2 From 971f10eca186cab238c49daa91f703c5a001b0b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2014 09:50:57 -0700 Subject: tcp: better TCP_SKB_CB layout to reduce cache line misses TCP maintains lists of skb in write queue, and in receive queues (in order and out of order queues) Scanning these lists both in input and output path usually requires access to skb->next, TCP_SKB_CB(skb)->seq, and TCP_SKB_CB(skb)->end_seq These fields are currently in two different cache lines, meaning we waste lot of memory bandwidth when these queues are big and flows have either packet drops or packet reorders. We can move TCP_SKB_CB(skb)->header at the end of TCP_SKB_CB, because this header is not used in fast path. This allows TCP to search much faster in the skb lists. Even with regular flows, we save one cache line miss in fast path. Thanks to Christoph Paasch for noticing we need to cleanup skb->cb[] (IPCB/IP6CB) before entering IP stack in tx path, and that I forgot IPCB use in tcp_v4_hnd_req() and tcp_v4_save_options(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 12 ++++++------ net/ipv4/tcp_ipv4.c | 19 ++++++++++++------- net/ipv4/tcp_output.c | 5 +++++ net/ipv6/tcp_ipv6.c | 7 +++++++ 4 files changed, 30 insertions(+), 13 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index a4201ef216e..4dc6641ee99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. 
*/ struct tcp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ @@ -720,6 +714,12 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ /* 1 byte hole */ __u32 ack_seq; /* Sequence number ACK'd */ + union { + struct inet_skb_parm h4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; /* For incoming frames */ }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 28ab90382c0..9ce3eac0295 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -886,18 +886,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action); */ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { - const struct ip_options *opt = &(IPCB(skb)->opt); + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt) { - if (ip_options_echo(&dopt->opt, skb)) { - kfree(dopt); - dopt = NULL; - } + if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; } } return dopt; @@ -1431,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); #endif return sk; } @@ -1636,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb) th = tcp_hdr(skb); iph = ip_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f173b1c4f81..a462fb1db89 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; + + /* Cleanup our debris for IP stacks */ + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9400b4326f2..132bac137ae 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1412,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. 
+ */ + memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), + sizeof(struct inet6_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); -- cgit v1.2.3-70-g09d2 From 870c3151382c980590d4d609babf3b0243e7db93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Oct 2014 09:17:20 -0700 Subject: ipv6: introduce tcp_v6_iif() Commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") added a regression for SO_BINDTODEVICE on IPv6. This is because we still use inet6_iif() which expects that IP6 control block is still at the beginning of skb->cb[] This patch adds tcp_v6_iif() helper and uses it where necessary. Because __inet6_lookup_skb() is used by TCP and DCCP, we add an iif parameter to it. Signed-off-by: Eric Dumazet Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses") Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 5 +++-- include/net/tcp.h | 9 +++++++++ net/dccp/ipv6.c | 3 ++- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 26 +++++++++++++++----------- 5 files changed, 30 insertions(+), 15 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index ae061354430..d1d272843b3 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -80,7 +80,8 @@ static inline struct sock *__inet6_lookup(struct net *net, static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, const __be16 sport, - const __be16 dport) + const __be16 dport, + int iif) { struct sock *sk = skb_steal_sock(skb); @@ -90,7 +91,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), - inet6_iif(skb)); + iif); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, diff --git a/include/net/tcp.h b/include/net/tcp.h index 3a4bbbfdcf8..c9766f89deb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -729,6 +729,15 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + +/* This is the variant of inet6_iif() that must be used by TCP, + * as TCP moves IP6CB into a different location in skb->cb[] + */ +static inline int tcp_v6_iif(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->header.h6.iif; +} + /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ad2acfe1ca6..6bcaa33cd80 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -757,7 +757,8 @@ static int dccp_v6_rcv(struct sk_buff *skb) /* Step 2: * Look up flow ID in table and get corresponding socket */ sk = __inet6_lookup_skb(&dccp_hashinfo, skb, - dh->dccph_sport, dh->dccph_dport); + dh->dccph_sport, dh->dccph_dport, + inet6_iif(skb)); /* * Step 2: * If no socket ... 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 9a2838e93cc..2a86a0f00f2 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -214,7 +214,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); + ireq->ir_iif = tcp_v6_iif(skb); ireq->ir_mark = inet_request_mark(sk, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cf2e45ab2fa..831495529b8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -424,6 +424,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; + /* Note : We use inet6_iif() here, not tcp_v6_iif() */ req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, &hdr->saddr, inet6_iif(skb)); if (!req) @@ -738,7 +739,7 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); + ireq->ir_iif = tcp_v6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || @@ -860,7 +861,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) - fl6.flowi6_oif = inet6_iif(skb); + fl6.flowi6_oif = tcp_v6_iif(skb); else fl6.flowi6_oif = oif; fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); @@ -918,7 +919,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), &tcp_hashinfo, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), inet6_iif(skb)); + ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) return; @@ -1000,13 +1001,14 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) /* Find possible connection requests. 
*/ req = inet6_csk_search_req(sk, &prev, th->source, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, inet6_iif(skb)); + &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev, false); nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); + &ipv6_hdr(skb)->saddr, th->source, + &ipv6_hdr(skb)->daddr, ntohs(th->dest), + tcp_v6_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1090,7 +1092,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (np->repflow) @@ -1174,7 +1176,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, skb_set_owner_r(newnp->pktoptions, newsk); } newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (np->repflow) @@ -1360,7 +1362,7 @@ ipv6_pktoptions: if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - np->mcast_oif = inet6_iif(opt_skb); + np->mcast_oif = tcp_v6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) @@ -1427,7 +1429,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest, + tcp_v6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1514,7 +1517,7 @@ do_time_wait: sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, - ntohs(th->dest), inet6_iif(skb)); + ntohs(th->dest), tcp_v6_iif(skb)); if (sk2 != NULL) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule(tw, &tcp_death_row); @@ -1553,6 +1556,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return; + /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), -- cgit v1.2.3-70-g09d2 From 9e7ceb060754f134231f68cb29d5db31419fe1ed Mon Sep 17 00:00:00 2001 From: Sathya Perla Date: Wed, 22 Oct 2014 21:42:01 +0530 Subject: net: fix saving TX flow hash in sock for outgoing connections The commit "net: Save TX flow hash in sock and set in skbuf on xmit" introduced the inet_set_txhash() and ip6_set_txhash() routines to calculate and record flow hash(sk_txhash) in the socket structure. sk_txhash is used to set skb->hash which is used to spread flows across multiple TXQs. But, the above routines are invoked before the source port of the connection is created. Because of this all outgoing connections that just differ in the source port get hashed into the same TXQ. This patch fixes this problem for IPv4/6 by invoking the the above routines after the source port is available for the socket. 
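A minimal standalone sketch (assuming a toy mix32() hash and toy_txhash() helper in place of the kernel's real flow dissector; the names and the hash itself are illustrative assumptions, not kernel code) makes the ordering problem concrete: hashing the 4-tuple before the ephemeral source port has been chosen gives every connection the same value and hence the same TXQ, while hashing after the port is known spreads them out. That reordering of inet_set_txhash()/ip6_set_txhash() is exactly what the diff below applies.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the kernel's flow hash; illustrative only. */
static uint32_t mix32(uint32_t x)
{
	x ^= x >> 16;
	x *= 0x7feb352dU;
	x ^= x >> 15;
	x *= 0x846ca68bU;
	x ^= x >> 16;
	return x;
}

static uint32_t toy_txhash(uint32_t saddr, uint32_t daddr,
			   uint16_t sport, uint16_t dport)
{
	return mix32(saddr ^ daddr ^ ((uint32_t)sport << 16) ^ dport);
}

int main(void)
{
	uint32_t saddr = 0x0a000001, daddr = 0x0a000002;	/* 10.0.0.1 -> 10.0.0.2 */
	uint16_t dport = 80, sport;

	/* Hash taken before the source port exists: sport is still 0 for
	 * every socket, so all three "connections" land on one value. */
	printf("pre-bind  hash=%08" PRIx32 "\n", toy_txhash(saddr, daddr, 0, dport));
	printf("pre-bind  hash=%08" PRIx32 "\n", toy_txhash(saddr, daddr, 0, dport));
	printf("pre-bind  hash=%08" PRIx32 "\n", toy_txhash(saddr, daddr, 0, dport));

	/* Hash taken after an ephemeral port is assigned: values differ,
	 * so flows can spread across multiple TX queues. */
	for (sport = 40000; sport < 40003; sport++)
		printf("post-bind hash=%08" PRIx32 "\n",
		       toy_txhash(saddr, daddr, sport, dport));
	return 0;
}
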
Fixes: b73c3d0e4("net: Save TX flow hash in sock and set in skbuf on xmit") Signed-off-by: Sathya Perla Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 94d1a7757ff..9c7d7621466 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -206,8 +206,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; - inet_set_txhash(sk); - inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; @@ -224,6 +222,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; + inet_set_txhash(sk); + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 831495529b8..ace29b60813 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -200,8 +200,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; - ip6_set_txhash(sk); - /* * TCP over IPv4 */ @@ -297,6 +295,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) goto late_failure; + ip6_set_txhash(sk); + if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, -- cgit v1.2.3-70-g09d2
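As a closing illustration of the skb->cb[] reshuffle in commits 971f10eca186 and 870c3151382c above, here is a minimal layout sketch; struct fake_parm is an assumed stand-in (the real inet_skb_parm/inet6_skb_parm sizes differ), so only the direction of the offset change is meaningful. It also mirrors why, after the move, TCP code must read the incoming interface through TCP_SKB_CB(skb)->header.h6 (tcp_v6_iif()) rather than through IP6CB(skb).

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed stand-in for inet6_skb_parm; sized only for illustration. */
struct fake_parm {
	char opt[40];
	int iif;
};

/* Sketch of the pre-971f10eca186 layout: header union first. */
struct cb_old {
	struct fake_parm header;
	uint32_t seq;
	uint32_t end_seq;
	uint32_t ack_seq;
};

/* Sketch of the post-971f10eca186 layout: hot fields first, header last. */
struct cb_new {
	uint32_t seq;
	uint32_t end_seq;
	uint32_t ack_seq;
	struct fake_parm header;
};

/* After the move, the interface index sits behind the relocated header
 * union, which is what the kernel's tcp_v6_iif() helper encapsulates. */
static int toy_tcp_v6_iif(const struct cb_new *cb)
{
	return cb->header.iif;
}

int main(void)
{
	struct cb_new cb = { .header.iif = 3 };

	printf("old layout: seq at %zu, end_seq at %zu\n",
	       offsetof(struct cb_old, seq), offsetof(struct cb_old, end_seq));
	printf("new layout: seq at %zu, end_seq at %zu\n",
	       offsetof(struct cb_new, seq), offsetof(struct cb_new, end_seq));
	printf("iif via relocated header: %d\n", toy_tcp_v6_iif(&cb));
	/* In the kernel, cb[] begins just after skb->next/prev, so moving
	 * seq/end_seq to the front of cb[] lets a receive/write queue walk
	 * stay within one cache line instead of touching two. */
	return 0;
}
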