From 7d98ffd8c2d1da6cec5d84eba42c4aa836a93f85 Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Fri, 5 Nov 2010 01:39:12 +0000 Subject: xfrm: update flowi saddr in icmp_send if unset otherwise xfrm_lookup will fail to find correct policy Signed-off-by: Ulrich Weber Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 96bc7f9475a..e5d1a44bcbd 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -569,6 +569,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* No need to clone since we're just using its address. */ rt2 = rt; + if (!fl.nl_u.ip4_u.saddr) + fl.nl_u.ip4_u.saddr = rt->rt_src; + err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); switch (err) { case 0: -- cgit v1.2.3-70-g09d2 From 7a1c8e5ab120a5f352e78bbc1fa5bb64e6f23639 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 20 Nov 2010 07:46:35 +0000 Subject: net: allow GFP_HIGHMEM in __vmalloc() We forgot to use __GFP_HIGHMEM in several __vmalloc() calls. In ceph, add the missing flag. In fib_trie.c, xfrm_hash.c and request_sock.c, using vzalloc() is cleaner and allows using HIGHMEM pages as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ceph/buffer.c | 2 +- net/core/request_sock.c | 4 +--- net/ipv4/fib_trie.c | 2 +- net/xfrm/xfrm_hash.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 53d8abfa25d..bf3e6a13c21 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -19,7 +19,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (b->vec.iov_base) { b->is_vmalloc = false; } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); + b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 7552495aff7..fceeb37d716 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -45,9 +45,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); if (lopt_size > PAGE_SIZE) - lopt = __vmalloc(lopt_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + lopt = vzalloc(lopt_size); else lopt = kzalloc(lopt_size, GFP_KERNEL); if (lopt == NULL) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 200eb538fbb..0f280348e0f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -365,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size); } static void __tnode_vfree(struct work_struct *arg) diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index a2023ec5232..1e98bc0fe0a 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -19,7 +19,7 @@ struct hlist_head *xfrm_hash_alloc(unsigned int sz) if (sz <= PAGE_SIZE) n = kzalloc(sz, GFP_KERNEL); else if (hashdist) - n = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + n = vzalloc(sz); else n = (struct hlist_head *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, -- cgit v1.2.3-70-g09d2 From c39508d6f118308355468314ff414644115a07f3 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 24 Nov 2010 11:47:22 -0800 Subject: tcp: Make TCP_MAXSEG minimum more correct. Use TCP_MIN_MSS instead of constant 64. Reported-by: Min Zhang Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 08141996948..f15c36a706e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ - if (val < 64 || val > MAX_TCP_WINDOW) { + if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { err = -EINVAL; break; } -- cgit v1.2.3-70-g09d2 From 8475ef9fd16cadbfc692f78e608d1941a340beb2 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 22 Nov 2010 03:26:12 +0000 Subject: netns: Don't leak others' openreq-s in proc The /proc/net/tcp leaks openreq sockets from other namespaces. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 69ccbc1dde9..e13da6de1fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2043,7 +2043,9 @@ get_req: } get_sk: sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { cur = sk; goto out; } -- cgit v1.2.3-70-g09d2 From 0147fc058d11bd4009b126d09974d2c8f48fef15 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 22 Nov 2010 12:54:21 +0000 Subject: tcp: restrict net.ipv4.tcp_adv_win_scale (#20312) tcp_win_from_space() does the following: if (sysctl_tcp_adv_win_scale <= 0) return space >> (-sysctl_tcp_adv_win_scale); else return space - (space >> sysctl_tcp_adv_win_scale); "space" is int. As per C99 6.5.7 (3) shifting int for 32 or more bits is undefined behaviour. Indeed, if sysctl_tcp_adv_win_scale is exactly 32, space >> 32 equals space and function returns 0. Which means we busyloop in tcp_fixup_rcvbuf(). Restrict net.ipv4.tcp_adv_win_scale to [-31, 31]. Fix https://bugzilla.kernel.org/show_bug.cgi?id=20312 Steps to reproduce: echo 32 >/proc/sys/net/ipv4/tcp_adv_win_scale wget www.kernel.org [softlockup] Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 1 + net/ipv4/sysctl_net_ipv4.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index fe95105992c..3c5e465296e 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -144,6 +144,7 @@ tcp_adv_win_scale - INTEGER Count buffering overhead as bytes/2^tcp_adv_win_scale (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), if it is <= 0. + Possible values are [-31, 31], inclusive. Default: 2 tcp_allowed_congestion_control - STRING diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e91911d7aae..1b4ec21497a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,8 @@ static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; +static int tcp_adv_win_scale_min = -31; +static int tcp_adv_win_scale_max = 31; /* Update system visible IP port range */ static void set_local_port_range(int range[2]) @@ -426,7 +428,9 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, }, { .procname = "tcp_tw_reuse", -- cgit v1.2.3-70-g09d2 From b4ff3c90e6066bacc8a92111752fe9e4f4c45cca Mon Sep 17 00:00:00 2001 From: Nagendra Tomar Date: Fri, 26 Nov 2010 14:26:27 +0000 Subject: inet: Fix __inet_inherit_port() to correctly increment bsockets and num_owners inet sockets corresponding to passive connections are added to the bind hash using ___inet_inherit_port(). These sockets are later removed from the bind hash using __inet_put_port(). These two functions are not exactly symmetrical. __inet_put_port() decrements hashinfo->bsockets and tb->num_owners, whereas ___inet_inherit_port() does not increment them. This results in both of these going to -ve values. This patch fixes this by calling inet_bind_hash() from ___inet_inherit_port(), which does the right thing. 'bsockets' and 'num_owners' were introduced by commit a9d8f9110d7e953c (inet: Allowing more than 64k connections and heavily optimize bind(0)) Signed-off-by: Nagendra Singh Tomar Acked-by: Eric Dumazet Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1b344f30b46..3c0369a3a66 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -133,8 +133,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) } } } - sk_add_bind_node(child, &tb->owners); - inet_csk(child)->icsk_bind_hash = tb; + inet_bind_hash(child, tb, port); spin_unlock(&head->lock); return 0; -- cgit v1.2.3-70-g09d2 From b1afde60f2b9ee8444fba4e012dc99a3b28d224d Mon Sep 17 00:00:00 2001 From: Nandita Dukkipati Date: Fri, 3 Dec 2010 13:33:44 +0000 Subject: tcp: Bug fix in initialization of receive window. The bug has to do with boundary checks on the initial receive window. If the initial receive window falls between init_cwnd and the receive window specified by the user, the initial window is incorrectly brought down to init_cwnd. The correct behavior is to allow it to remain unchanged. Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 05b1ecf3676..3c59ab42df2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -231,11 +231,10 @@ void tcp_select_initial_window(int __space, __u32 mss, /* when initializing use the value from init_rcv_wnd * rather than the default from above */ - if (init_rcv_wnd && - (*rcv_wnd > init_rcv_wnd * mss)) - *rcv_wnd = init_rcv_wnd * mss; - else if (*rcv_wnd > init_cwnd * mss) - *rcv_wnd = init_cwnd * mss; + if (init_rcv_wnd) + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); + else + *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); } /* Set the clamp no higher than max representable value */ -- cgit v1.2.3-70-g09d2 From 67631510a318d5a930055fe927607f483716e100 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 8 Dec 2010 12:16:33 -0800 Subject: tcp: Replace time wait bucket msg by counter Rather than printing the message to the log, use a mib counter to keep track of the count of occurences of time wait bucket overflow. Reduces spam in logs. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_minisocks.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index ebb0c80ffd6..12b2b18e50c 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -230,6 +230,7 @@ enum LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */ LINUX_MIB_TCPDEFERACCEPTDROP, LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */ + LINUX_MIB_TCPTIMEWAITOVERFLOW, /* TCPTimeWaitOverflow */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1b48eb1ed45..b14ec7d03b6 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), + SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 43cf901d765..a66735f7596 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -347,7 +347,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * socket up. We've got bigger problems than * non-graceful socket closings. */ - LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); -- cgit v1.2.3-70-g09d2 From ad9f4f50fe9288bbe65b7dfd76d8820afac6a24c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Dec 2010 12:03:55 +0000 Subject: tcp: avoid a possible divide by zero sysctl_tcp_tso_win_divisor might be set to zero while one cpu runs in tcp_tso_should_defer(). Make sure we dont allow a divide by zero by reading sysctl_tcp_tso_win_divisor exactly once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3c59ab42df2..0d4a3cebfb4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1512,6 +1512,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; + int win_divisor; if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) goto send_now; @@ -1543,13 +1544,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - if (sysctl_tcp_tso_win_divisor) { + win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ - chunk /= sysctl_tcp_tso_win_divisor; + chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { -- cgit v1.2.3-70-g09d2 From f19872575ff7819a3723154657a497d9bca66b33 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Dec 2010 12:20:47 +0000 Subject: tcp: protect sysctl_tcp_cookie_size reads Make sure sysctl_tcp_cookie_size is read once in tcp_cookie_size_check(), or we might return an illegal value to caller if sysctl_tcp_cookie_size is changed by another cpu. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: William Allen Simpson Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0d4a3cebfb4..61c2463e275 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,27 +385,30 @@ struct tcp_out_options { */ static u8 tcp_cookie_size_check(u8 desired) { - if (desired > 0) { + int cookie_size; + + if (desired > 0) /* previously specified */ return desired; - } - if (sysctl_tcp_cookie_size <= 0) { + + cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); + if (cookie_size <= 0) /* no default specified */ return 0; - } - if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { + + if (cookie_size <= TCP_COOKIE_MIN) /* value too small, specify minimum */ return TCP_COOKIE_MIN; - } - if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { + + if (cookie_size >= TCP_COOKIE_MAX) /* value too large, specify maximum */ return TCP_COOKIE_MAX; - } - if (0x1 & sysctl_tcp_cookie_size) { + + if (cookie_size & 1) /* 8-bit multiple, illegal, fix it */ - return (u8)(sysctl_tcp_cookie_size + 0x1); - } - return (u8)sysctl_tcp_cookie_size; + cookie_size++; + + return (u8)cookie_size; } /* Write previously computed TCP options to the packet. -- cgit v1.2.3-70-g09d2