From fa2b04f4502d74659e4e4b1294c6d88e08ece032 Mon Sep 17 00:00:00 2001 From: David Ward Date: Tue, 5 Mar 2013 17:06:32 +0000 Subject: net/ipv4: Timestamp option cannot overflow with prespecified addresses When a router forwards a packet that contains the IPv4 timestamp option, if there is no space left in the option for the router to add its own timestamp, then the router increments the Overflow value in the option. However, if the addresses of the routers are prespecified in the option, then the overflow condition cannot happen: the option is structured so that each prespecified router has a place to write its timestamp. Other routers do not add a timestamp, so there will never be a lack of space. This fix ensures that the Overflow value in the IPv4 timestamp option is not incremented when the addresses of the routers are prespecified, even if the Pointer value is greater than the Length value. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index f6289bf6f33..310a3647c83 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -423,7 +423,7 @@ int ip_options_compile(struct net *net, put_unaligned_be32(midtime, timeptr); opt->is_changed = 1; } - } else { + } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; -- cgit v1.2.3-70-g09d2 From c10cb5fc0fc9fa605e01f715118bde5ba5a98616 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 7 Mar 2013 02:34:33 +0000 Subject: Fix: sparse warning in inet_csk_prepare_forced_close In e337e24d66 (inet: Fix kmemleak in tcp_v4/6_syn_recv_sock and dccp_v4/6_request_recv_sock) I introduced the function inet_csk_prepare_forced_close, which does a call to bh_unlock_sock(). This produces a sparse-warning. This patch adds the missing __releases. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7d1874be1df..786d97aee75 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -735,6 +735,7 @@ EXPORT_SYMBOL(inet_csk_destroy_sock); * tcp/dccp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) + __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); -- cgit v1.2.3-70-g09d2 From 4660c7f498c07c43173142ea95145e9dac5a6d14 Mon Sep 17 00:00:00 2001 From: David Ward Date: Mon, 11 Mar 2013 10:43:39 +0000 Subject: net/ipv4: Ensure that location of timestamp option is stored This is needed in order to detect if the timestamp option appears more than once in a packet, to remove the option if the packet is fragmented, etc. My previous change neglected to store the option location when the router addresses were prespecified and Pointer > Length. But now the option location is also stored when Flag is an unrecognized value, to ensure these option handling behaviors are still performed. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 310a3647c83..ec7264514a8 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -370,7 +370,6 @@ int ip_options_compile(struct net *net, } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: - opt->ts = optptr - iph; if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; @@ -381,7 +380,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); @@ -396,7 +394,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); @@ -429,12 +426,12 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 3; goto error; } - opt->ts = optptr - iph; if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } + opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { -- cgit v1.2.3-70-g09d2 From 16fad69cfe4adbbfa813de516757b87bcae36d93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2013 05:40:32 +0000 Subject: tcp: fix skb_availroom() Chrome OS team reported a crash on a Pixel ChromeBook in TCP stack : https://code.google.com/p/chromium/issues/detail?id=182056 commit a21d45726acac (tcp: avoid order-1 allocations on wifi and tx path) did a poor choice adding an 'avail_size' field to skb, while what we really needed was a 'reserved_tailroom' one. It would have avoided commit 22b4a4f22da (tcp: fix retransmit of partially acked frames) and this commit. Crash occurs because skb_split() is not aware of the 'avail_size' management (and should not be aware) Signed-off-by: Eric Dumazet Reported-by: Mukesh Agrawal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 1 - 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 821c7f45d2a..6f2bb860e05 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -500,7 +500,7 @@ struct sk_buff { union { __u32 mark; __u32 dropcount; - __u32 avail_size; + __u32 reserved_tailroom; }; sk_buff_data_t inner_transport_header; @@ -1447,7 +1447,10 @@ static inline int skb_tailroom(const struct sk_buff *skb) */ static inline int skb_availroom(const struct sk_buff *skb) { - return skb_is_nonlinear(skb) ? 0 : skb->avail_size - skb->len; + if (skb_is_nonlinear(skb)) + return 0; + + return skb->end - skb->tail - skb->reserved_tailroom; } /** diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 47e854fcae2..e2202079070 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -775,7 +775,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ - skb->avail_size = size; + skb->reserved_tailroom = skb->end - skb->tail - size; return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2b4461074d..817fbb396bc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1298,7 +1298,6 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); - skb->avail_size -= eat; len -= eat; if (!len) return; -- cgit v1.2.3-70-g09d2 From 8c6216d7f118a128678270824b6a1286a63863ca Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Wed, 13 Mar 2013 02:37:49 +0000 Subject: Revert "ip_gre: make ipgre_tunnel_xmit() not parse network header as IP unconditionally" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 412ed94744d16806fbec3bd250fd94e71cde5a1f. The commit is wrong as tiph points to the outer IPv4 header which is installed at ipgre_header() and not the inner one which is protocol dependant. This commit broke succesfully opennhrp which use PF_PACKET socket with ETH_P_NHRP protocol. Additionally ssl_addr is set to the link-layer IPv4 address. This address is written by ipgre_header() to the skb earlier, and this is the IPv4 header tiph should point to - regardless of the inner protocol payload. Signed-off-by: Timo Teräs Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d0ef0e674ec..91d66dbde9c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -798,10 +798,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (dev->header_ops && dev->type == ARPHRD_IPGRE) { gre_hlen = 0; - if (skb->protocol == htons(ETH_P_IP)) - tiph = (const struct iphdr *)skb->data; - else - tiph = &tunnel->parms.iph; + tiph = (const struct iphdr *)skb->data; } else { gre_hlen = tunnel->hlen; tiph = &tunnel->parms.iph; -- cgit v1.2.3-70-g09d2 From 0d4f0608619de59fd8169dd8e72aadc28d80e715 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Mar 2013 07:01:28 +0000 Subject: tcp: dont handle MTU reduction on LISTEN socket When an ICMP ICMP_FRAG_NEEDED (or ICMPV6_PKT_TOOBIG) message finds a LISTEN socket, and this socket is currently owned by the user, we set TCP_MTU_REDUCED_DEFERRED flag in listener tsq_flags. This is bad because if we clone the parent before it had a chance to clear the flag, the child inherits the tsq_flags value, and next tcp_release_cb() on the child will decrement sk_refcnt. Result is that we might free a live TCP socket, as reported by Dormando. IPv4: Attempt to release TCP socket in state 1 Fix this issue by testing sk_state against TCP_LISTEN early, so that we set TCP_MTU_REDUCED_DEFERRED on appropriate sockets (not a LISTEN one) This bug was introduced in commit 563d34d05786 (tcp: dont drop MTU reduction indications) Reported-by: dormando Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 14 +++++++------- net/ipv6/tcp_ipv6.c | 7 +++++++ 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4a8ec457310..d09203c6326 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -274,13 +274,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 mtu = tcp_sk(sk)->mtu_info; - /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs - * send out by Linux are always <576bytes so they should go through - * unfragmented). - */ - if (sk->sk_state == TCP_LISTEN) - return; - dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -408,6 +401,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; + tp->mtu_info = info; if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9b6460055df..f6d629fd6ae 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -389,6 +389,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (type == ICMPV6_PKT_TOOBIG) { + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; + tp->mtu_info = ntohl(info); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); -- cgit v1.2.3-70-g09d2 From 5a3da1fe9561828d0ca7eca664b16ec2b9bf0055 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 15 Mar 2013 11:32:30 +0000 Subject: inet: limit length of fragment queue hash table bucket lists This patch introduces a constant limit of the fragment queue hash table bucket list lengths. Currently the limit 128 is choosen somewhat arbitrary and just ensures that we can fill up the fragment cache with empty packets up to the default ip_frag_high_thresh limits. It should just protect from list iteration eating considerable amounts of cpu. If we reach the maximum length in one hash bucket a warning is printed. This is implemented on the caller side of inet_frag_find to distinguish between the different users of inet_fragment.c. I dropped the out of memory warning in the ipv4 fragment lookup path, because we already get a warning by the slab allocator. Cc: Eric Dumazet Cc: Jesper Dangaard Brouer Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 9 +++++++++ net/ipv4/inet_fragment.c | 20 +++++++++++++++++++- net/ipv4/ip_fragment.c | 11 ++++------- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 ++++++------ net/ipv6/reassembly.c | 8 ++++++-- 5 files changed, 44 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 76c3fe5ecc2..0a1dcc2fa2f 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -43,6 +43,13 @@ struct inet_frag_queue { #define INETFRAGS_HASHSZ 64 +/* averaged: + * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / + * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or + * struct frag_queue)) + */ +#define INETFRAGS_MAXDEPTH 128 + struct inet_frags { struct hlist_head hash[INETFRAGS_HASHSZ]; /* This rwlock is a global lock (seperate per IPv4, IPv6 and @@ -76,6 +83,8 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) __releases(&f->lock); +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix); static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 245ae078a07..f4fd23de9b1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -21,6 +21,7 @@ #include #include +#include #include static void inet_frag_secret_rebuild(unsigned long dummy) @@ -277,6 +278,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, __releases(&f->lock) { struct inet_frag_queue *q; + int depth = 0; hlist_for_each_entry(q, &f->hash[hash], list) { if (q->net == nf && f->match(q, key)) { @@ -284,9 +286,25 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, read_unlock(&f->lock); return q; } + depth++; } read_unlock(&f->lock); - return inet_frag_create(nf, f, key); + if (depth <= INETFRAGS_MAXDEPTH) + return inet_frag_create(nf, f, key); + else + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); + +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix) +{ + static const char msg[] = "inet_frag_find: Fragment hash bucket" + " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) + ". Dropping fragment.\n"; + + if (PTR_ERR(q) == -ENOBUFS) + LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); +} +EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b6d30acb600..a6445b843ef 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -292,14 +292,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (q == NULL) - goto out_nomem; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct ipq, q); - -out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); - return NULL; } /* Is the fragment too far ahead to be part of ipq? */ diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 54087e96d7b..6700069949d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -14,6 +14,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv6-nf: " fmt + #include #include #include @@ -180,13 +182,11 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); local_bh_enable(); - if (q == NULL) - goto oom; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct frag_queue, q); - -oom: - return NULL; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 3c6a77290c6..196ab9347ad 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -26,6 +26,9 @@ * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ + +#define pr_fmt(fmt) "IPv6: " fmt + #include #include #include @@ -185,9 +188,10 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6 hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); - if (q == NULL) + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; - + } return container_of(q, struct frag_queue, q); } -- cgit v1.2.3-70-g09d2