From 9d186cac7ffb1831e9f34cb4a3a8b22abb9dd9d4 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 13 Aug 2014 16:03:10 +0400 Subject: tcp: don't use timestamp from repaired skb-s to calculate RTT (v2) We don't know right timestamp for repaired skb-s. Wrong RTT estimations isn't good, because some congestion modules heavily depends on it. This patch adds the TCPCB_REPAIRED flag, which is included in TCPCB_RETRANS. Thanks to Eric for the advice how to fix this issue. This patch fixes the warning: [ 879.562947] WARNING: CPU: 0 PID: 2825 at net/ipv4/tcp_input.c:3078 tcp_ack+0x11f5/0x1380() [ 879.567253] CPU: 0 PID: 2825 Comm: socket-tcpbuf-l Not tainted 3.16.0-next-20140811 #1 [ 879.567829] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 879.568177] 0000000000000000 00000000c532680c ffff880039643d00 ffffffff817aa2d2 [ 879.568776] 0000000000000000 ffff880039643d38 ffffffff8109afbd ffff880039d6ba80 [ 879.569386] ffff88003a449800 000000002983d6bd 0000000000000000 000000002983d6bc [ 879.569982] Call Trace: [ 879.570264] [] dump_stack+0x4d/0x66 [ 879.570599] [] warn_slowpath_common+0x7d/0xa0 [ 879.570935] [] warn_slowpath_null+0x1a/0x20 [ 879.571292] [] tcp_ack+0x11f5/0x1380 [ 879.571614] [] tcp_rcv_established+0x1ed/0x710 [ 879.571958] [] tcp_v4_do_rcv+0x10a/0x370 [ 879.572315] [] release_sock+0x89/0x1d0 [ 879.572642] [] do_tcp_setsockopt.isra.36+0x120/0x860 [ 879.573000] [] ? rcu_read_lock_held+0x6e/0x80 [ 879.573352] [] tcp_setsockopt+0x32/0x40 [ 879.573678] [] sock_common_setsockopt+0x14/0x20 [ 879.574031] [] SyS_setsockopt+0x80/0xf0 [ 879.574393] [] system_call_fastpath+0x16/0x1b [ 879.574730] ---[ end trace a17cbc38eb8c5c00 ]--- v2: moving setting of skb->when for repaired skb-s in tcp_write_xmit, where it's set for other skb-s. Fixes: 431a91242d8d ("tcp: timestamp SYN+DATA messages") Fixes: 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution") Cc: Eric Dumazet Cc: Pavel Emelyanov Cc: "David S. Miller" Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- include/net/tcp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index dafa1cbc149..36f55254573 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -705,8 +705,10 @@ struct tcp_skb_cb { #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ #define TCPCB_TAGBITS 0x07 /* All tag bits */ +#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp) */ #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ -#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS) +#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ + TCPCB_REPAIRED) __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ /* 1 byte hole */ -- cgit v1.2.3-70-g09d2 From 4fab9071950c2021d846e18351e0f46a1cffd67b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 14 Aug 2014 12:40:05 -0400 Subject: tcp: fix tcp_release_cb() to dispatch via address family for mtu_reduced() Make sure we use the correct address-family-specific function for handling MTU reductions from within tcp_release_cb(). Previously AF_INET6 sockets were incorrectly always using the IPv6 code path when sometimes they were handling IPv4 traffic and thus had an IPv4 dst. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Diagnosed-by: Willem de Bruijn Fixes: 563d34d057862 ("tcp: dont drop MTU reduction indications") Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 1 + include/net/sock.h | 1 - include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 3 ++- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7a431388756..5fbe6568c3c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -62,6 +62,7 @@ struct inet_connection_sock_af_ops { void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); int (*bind_conflict)(const struct sock *sk, const struct inet_bind_bucket *tb, bool relax); + void (*mtu_reduced)(struct sock *sk); }; /** inet_connection_sock - INET connection oriented sock diff --git a/include/net/sock.h b/include/net/sock.h index 38805fa02e4..7f2ab72f321 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -987,7 +987,6 @@ struct proto { struct sk_buff *skb); void (*release_cb)(struct sock *sk); - void (*mtu_reduced)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 36f55254573..e337e05035b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -448,6 +448,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); +void tcp_v4_mtu_reduced(struct sock *sk); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dceff5fe8e6..cd17f009aed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -271,7 +271,7 @@ EXPORT_SYMBOL(tcp_v4_connect); * It can be called through tcp_release_cb() if socket was owned by user * at the time tcp_v4_err() was called to handle ICMP message. */ -static void tcp_v4_mtu_reduced(struct sock *sk) +void tcp_v4_mtu_reduced(struct sock *sk) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -302,6 +302,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } +EXPORT_SYMBOL(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -1787,6 +1788,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific); @@ -2406,7 +2408,6 @@ struct proto tcp_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ff3f0f75cc6..5a7c41fbc6d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -800,7 +800,7 @@ void tcp_release_cb(struct sock *sk) __sock_put(sk); } if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { - sk->sk_prot->mtu_reduced(sk); + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f2ce9550239..29964c3d363 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1595,6 +1595,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v6_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1625,6 +1626,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1864,7 +1866,6 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.3-70-g09d2 From a26552afe89438eefbe097512b3187f2c7e929fd Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 14 Aug 2014 22:06:12 +0200 Subject: tcp: don't allow syn packets without timestamps to pass tcp_tw_recycle logic tcp_tw_recycle heavily relies on tcp timestamps to build a per-host ordering of incoming connections and teardowns without the need to hold state on a specific quadruple for TCP_TIMEWAIT_LEN, but only for the last measured RTO. To do so, we keep the last seen timestamp in a per-host indexed data structure and verify if the incoming timestamp in a connection request is strictly greater than the saved one during last connection teardown. Thus we can verify later on that no old data packets will be accepted by the new connection. During moving a socket to time-wait state we already verify if timestamps where seen on a connection. Only if that was the case we let the time-wait socket expire after the RTO, otherwise normal TCP_TIMEWAIT_LEN will be used. But we don't verify this on incoming SYN packets. If a connection teardown was less than TCP_PAWS_MSL seconds in the past we cannot guarantee to not accept data packets from an old connection if no timestamps are present. We should drop this SYN packet. This patch closes this loophole. Please note, this patch does not make tcp_tw_recycle in any way more usable but only adds another safety check: Sporadic drops of SYN packets because of reordering in the network or in the socket backlog queues can happen. Users behing NAT trying to connect to a tcp_tw_recycle enabled server can get caught in blackholes and their connection requests may regullary get dropped because hosts behind an address translator don't have synchronized tcp timestamp clocks. tcp_tw_recycle cannot work if peers don't have tcp timestamps enabled. In general, use of tcp_tw_recycle is disadvised. Cc: Eric Dumazet Cc: Florian Westphal Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 9 ++++++--- net/ipv4/tcp_metrics.c | 6 ++++-- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e337e05035b..590e01a476a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -417,7 +417,7 @@ void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, - bool paws_check); + bool paws_check, bool timestamps); bool tcp_remember_stamp(struct sock *sk); bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw); void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a8e89fdd33..4f6cfbc5777 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5979,12 +5979,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { + if (tcp_death_row.sysctl_tw_recycle) { bool strict; dst = af_ops->route_req(sk, &fl, req, &strict); + if (dst && strict && - !tcp_peer_is_proven(req, dst, true)) { + !tcp_peer_is_proven(req, dst, true, + tmp_opt.saw_tstamp)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -5993,7 +5995,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { + !tcp_peer_is_proven(req, dst, false, + tmp_opt.saw_tstamp)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0d54e59b9ea..ed9c9a91851 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -576,7 +576,8 @@ reset: tp->snd_cwnd_stamp = tcp_time_stamp; } -bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, + bool paws_check, bool timestamps) { struct tcp_metrics_block *tm; bool ret; @@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa if (paws_check) { if (tm && (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && - (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) + ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW || + !timestamps)) ret = false; else ret = true; -- cgit v1.2.3-70-g09d2 From 5300fdcb7b7e97d83033bc7196582705524d35ea Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 13 Aug 2014 16:38:29 +0200 Subject: rhashtable: RCU annotations for next pointers Properly annotate next pointers as access is RCU protected in the lookup path. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 ++-- lib/rhashtable.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 9cda293c867..8c6048e77f2 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -21,7 +21,7 @@ #include struct rhash_head { - struct rhash_head *next; + struct rhash_head __rcu *next; }; #define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) @@ -97,7 +97,7 @@ u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node, gfp_t); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node, gfp_t); void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head **pprev, gfp_t flags); + struct rhash_head __rcu **pprev, gfp_t flags); bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e6940cf1662..338dd7aa5e1 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); * deletion when combined with walking or lookup. */ void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head **pprev, gfp_t flags) + struct rhash_head __rcu **pprev, gfp_t flags) { struct bucket_table *tbl = rht_dereference(ht->tbl, ht); -- cgit v1.2.3-70-g09d2 From c91eee56dc4f8c3d9ae834bacb835596d47a709e Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 13 Aug 2014 16:38:30 +0200 Subject: rhashtable: unexport and make rht_obj() static No need to export rht_obj(), all inner to outer object translations occur internally. It was intended to be used with rht_for_each() which now primarily serves as the iterator for rhashtable_remove_pprev() to effectively flush and free the full table. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 1 - lib/rhashtable.c | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 8c6048e77f2..af967c4c759 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -117,7 +117,6 @@ void rhashtable_destroy(const struct rhashtable *ht); #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) -/* Internal, use rht_obj() instead */ #define rht_entry(ptr, type, member) container_of(ptr, type, member) #define rht_entry_safe(ptr, type, member) \ ({ \ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 338dd7aa5e1..a2c78810ebc 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -38,16 +38,10 @@ int lockdep_rht_mutex_is_held(const struct rhashtable *ht) EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); #endif -/** - * rht_obj - cast hash head to outer object - * @ht: hash table - * @he: hashed node - */ -void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) +static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (void *) he - ht->p.head_offset; } -EXPORT_SYMBOL_GPL(rht_obj); static u32 __hashfn(const struct rhashtable *ht, const void *key, u32 len, u32 hsize) -- cgit v1.2.3-70-g09d2 From 93f560811e80216e98f3fcec220aa0f8836b09af Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 13 Aug 2014 16:38:31 +0200 Subject: rhashtable: fix annotations for rht_for_each_entry_rcu() Call rcu_deference_raw() directly from within rht_for_each_entry_rcu() as list_for_each_entry_rcu() does. Fixes the following sparse warnings: net/netlink/af_netlink.c:2906:25: expected struct rhash_head const *__mptr net/netlink/af_netlink.c:2906:25: got struct rhash_head [noderef] * Fixes: e341694e3eb57fc ("netlink: Convert netlink_lookup() to use RCU protected hash table") Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index af967c4c759..36826c0166c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -123,11 +123,6 @@ void rhashtable_destroy(const struct rhashtable *ht); typeof(ptr) __ptr = (ptr); \ __ptr ? rht_entry(__ptr, type, member) : NULL; \ }) -#define rht_entry_safe_rcu(ptr, type, member) \ -({ \ - typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \ - __ptr ? container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member) : NULL; \ -}) #define rht_next_entry_safe(pos, ht, member) \ ({ \ @@ -204,9 +199,10 @@ void rhashtable_destroy(const struct rhashtable *ht); * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(pos, head, member) \ - for (pos = rht_entry_safe_rcu(head, typeof(*(pos)), member); \ + for (pos = rht_entry_safe(rcu_dereference_raw(head), \ + typeof(*(pos)), member); \ pos; \ - pos = rht_entry_safe_rcu((pos)->member.next, \ - typeof(*(pos)), member)) + pos = rht_entry_safe(rcu_dereference_raw((pos)->member.next), \ + typeof(*(pos)), member)) #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3-70-g09d2