From b71d1d426d263b0b6cb5760322efebbfc89d4463 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 22 Apr 2011 04:53:02 +0000
Subject: inet: constify ip headers and in6_addr

Add const qualifiers to structs iphdr, ipv6hdr and in6_addr pointers
where possible, to make code intention more obvious.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 807d83c02ef..cae75ef21fe 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1186,7 +1186,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 
 static int inet_gso_send_check(struct sk_buff *skb)
 {
-	struct iphdr *iph;
+	const struct iphdr *iph;
 	const struct net_protocol *ops;
 	int proto;
 	int ihl;
@@ -1293,7 +1293,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	const struct net_protocol *ops;
 	struct sk_buff **pp = NULL;
 	struct sk_buff *p;
-	struct iphdr *iph;
+	const struct iphdr *iph;
 	unsigned int hlen;
 	unsigned int off;
 	unsigned int id;
-- 
cgit v1.2.3-70-g09d2


From 2d7192d6cbab20e153c47fa1559ffd41ceef0e79 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 26 Apr 2011 13:28:44 -0700
Subject: ipv4: Sanitize and simplify ip_route_{connect,newports}()

These functions are used together as a unit for route resolution
during connect().  They address the chicken-and-egg problem that
exists when ports need to be allocated during connect() processing,
yet such port allocations require addressing information from the
routing code.

It's currently more heavy handed than it needs to be, and in
particular we allocate and initialize a flow object twice.

Let the callers provide the on-stack flow object.  That way we only
need to initialize it once in the ip_route_connect() call.

Later, if ip_route_newports() needs to do anything, it re-uses that
flow object as-is except for the ports which it updates before the
route re-lookup.

Also, describe why this set of facilities are needed and how it works
in a big comment.

Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/route.h | 88 ++++++++++++++++++++++++++++++++++-------------------
 net/dccp/ipv4.c     | 10 +++---
 net/ipv4/af_inet.c  |  3 +-
 net/ipv4/datagram.c |  3 +-
 net/ipv4/tcp_ipv4.c | 10 +++---
 net/l2tp/l2tp_ip.c  |  8 ++---
 6 files changed, 74 insertions(+), 48 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/include/net/route.h b/include/net/route.h
index 3684c3edbae..79530da31b3 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -217,17 +217,37 @@ static inline char rt_tos2priority(u8 tos)
 	return ip_tos2prio[IPTOS_TOS(tos)>>1];
 }
 
-static inline struct rtable *ip_route_connect(__be32 dst, __be32 src, u32 tos,
-					      int oif, u8 protocol,
-					      __be16 sport, __be16 dport,
-					      struct sock *sk, bool can_sleep)
+/* ip_route_connect() and ip_route_newports() work in tandem whilst
+ * binding a socket for a new outgoing connection.
+ *
+ * In order to use IPSEC properly, we must, in the end, have a
+ * route that was looked up using all available keys including source
+ * and destination ports.
+ *
+ * However, if a source port needs to be allocated (the user specified
+ * a wildcard source port) we need to obtain addressing information
+ * in order to perform that allocation.
+ *
+ * So ip_route_connect() looks up a route using wildcarded source and
+ * destination ports in the key, simply so that we can get a pair of
+ * addresses to use for port allocation.
+ *
+ * Later, once the ports are allocated, ip_route_newports() will make
+ * another route lookup if needed to make sure we catch any IPSEC
+ * rules keyed on the port information.
+ *
+ * The callers allocate the flow key on their stack, and must pass in
+ * the same flowi4 object to both the ip_route_connect() and the
+ * ip_route_newports() calls.
+ */
+
+static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src,
+					 u32 tos, int oif, u8 protocol,
+					 __be16 sport, __be16 dport,
+					 struct sock *sk, bool can_sleep)
 {
-	struct net *net = sock_net(sk);
-	struct rtable *rt;
-	struct flowi4 fl4;
-	__u8 flow_flags;
+	__u8 flow_flags = 0;
 
-	flow_flags = 0;
 	if (inet_sk(sk)->transparent)
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 	if (protocol == IPPROTO_TCP)
@@ -235,41 +255,45 @@ static inline struct rtable *ip_route_connect(__be32 dst, __be32 src, u32 tos,
 	if (can_sleep)
 		flow_flags |= FLOWI_FLAG_CAN_SLEEP;
 
-	flowi4_init_output(&fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
+	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 			   protocol, flow_flags, dst, src, dport, sport);
+}
+
+static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
+					      __be32 dst, __be32 src, u32 tos,
+					      int oif, u8 protocol,
+					      __be16 sport, __be16 dport,
+					      struct sock *sk, bool can_sleep)
+{
+	struct net *net = sock_net(sk);
+	struct rtable *rt;
+
+	ip_route_connect_init(fl4, dst, src, tos, oif, protocol,
+			      sport, dport, sk, can_sleep);
 
 	if (!dst || !src) {
-		rt = __ip_route_output_key(net, &fl4);
+		rt = __ip_route_output_key(net, fl4);
 		if (IS_ERR(rt))
 			return rt;
-		fl4.daddr = rt->rt_dst;
-		fl4.saddr = rt->rt_src;
+		fl4->daddr = rt->rt_dst;
+		fl4->saddr = rt->rt_src;
 		ip_rt_put(rt);
 	}
-	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
-	return ip_route_output_flow(net, &fl4, sk);
+	security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+	return ip_route_output_flow(net, fl4, sk);
 }
 
-static inline struct rtable *ip_route_newports(struct rtable *rt,
-					       u8 protocol, __be16 orig_sport,
-					       __be16 orig_dport, __be16 sport,
-					       __be16 dport, struct sock *sk)
+static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt,
+					       __be16 orig_sport, __be16 orig_dport,
+					       __be16 sport, __be16 dport,
+					       struct sock *sk)
 {
 	if (sport != orig_sport || dport != orig_dport) {
-		struct flowi4 fl4;
-		__u8 flow_flags;
-
-		flow_flags = 0;
-		if (inet_sk(sk)->transparent)
-			flow_flags |= FLOWI_FLAG_ANYSRC;
-		if (protocol == IPPROTO_TCP)
-			flow_flags |= FLOWI_FLAG_PRECOW_METRICS;
-		flowi4_init_output(&fl4, rt->rt_oif, rt->rt_mark, rt->rt_tos,
-				   RT_SCOPE_UNIVERSE, protocol, flow_flags,
-				   rt->rt_dst, rt->rt_src, dport, sport);
+		fl4->fl4_dport = dport;
+		fl4->fl4_sport = sport;
 		ip_rt_put(rt);
-		security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
-		return ip_route_output_flow(sock_net(sk), &fl4, sk);
+		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+		return ip_route_output_flow(sock_net(sk), fl4, sk);
 	}
 	return rt;
 }
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index ae451c6d83b..b92ab655d44 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -40,12 +40,13 @@
 
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
+	const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 	struct inet_sock *inet = inet_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
-	const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 	__be16 orig_sport, orig_dport;
-	struct rtable *rt;
 	__be32 daddr, nexthop;
+	struct flowi4 fl4;
+	struct rtable *rt;
 	int err;
 
 	dp->dccps_role = DCCP_ROLE_CLIENT;
@@ -65,7 +66,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	orig_sport = inet->inet_sport;
 	orig_dport = usin->sin_port;
-	rt = ip_route_connect(nexthop, inet->inet_saddr,
+	rt = ip_route_connect(&fl4, nexthop, inet->inet_saddr,
 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 			      IPPROTO_DCCP,
 			      orig_sport, orig_dport, sk, true);
@@ -101,8 +102,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (err != 0)
 		goto failure;
 
-	rt = ip_route_newports(rt, IPPROTO_DCCP,
-			       orig_sport, orig_dport,
+	rt = ip_route_newports(&fl4, rt, orig_sport, orig_dport,
 			       inet->inet_sport, inet->inet_dport, sk);
 	if (IS_ERR(rt)) {
 		rt = NULL;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cae75ef21fe..0413af3e228 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1103,6 +1103,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	__be32 old_saddr = inet->inet_saddr;
 	__be32 daddr = inet->inet_daddr;
+	struct flowi4 fl4;
 	struct rtable *rt;
 	__be32 new_saddr;
 
@@ -1110,7 +1111,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 		daddr = inet->opt->faddr;
 
 	/* Query new route. */
-	rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk),
+	rt = ip_route_connect(&fl4, daddr, 0, RT_CONN_FLAGS(sk),
 			      sk->sk_bound_dev_if, sk->sk_protocol,
 			      inet->inet_sport, inet->inet_dport, sk, false);
 	if (IS_ERR(rt))
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 85bd24ca4f6..216ba2338b6 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+	struct flowi4 fl4;
 	struct rtable *rt;
 	__be32 saddr;
 	int oif;
@@ -46,7 +47,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		if (!saddr)
 			saddr = inet->mc_addr;
 	}
-	rt = ip_route_connect(usin->sin_addr.s_addr, saddr,
+	rt = ip_route_connect(&fl4, usin->sin_addr.s_addr, saddr,
 			      RT_CONN_FLAGS(sk), oif,
 			      sk->sk_protocol,
 			      inet->inet_sport, usin->sin_port, sk, true);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index edf18bd74b8..310454c2f4d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -146,12 +146,13 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 /* This will initiate an outgoing connection. */
 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
+	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 	struct inet_sock *inet = inet_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 	__be16 orig_sport, orig_dport;
-	struct rtable *rt;
 	__be32 daddr, nexthop;
+	struct flowi4 fl4;
+	struct rtable *rt;
 	int err;
 
 	if (addr_len < sizeof(struct sockaddr_in))
@@ -169,7 +170,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	orig_sport = inet->inet_sport;
 	orig_dport = usin->sin_port;
-	rt = ip_route_connect(nexthop, inet->inet_saddr,
+	rt = ip_route_connect(&fl4, nexthop, inet->inet_saddr,
 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 			      IPPROTO_TCP,
 			      orig_sport, orig_dport, sk, true);
@@ -236,8 +237,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (err)
 		goto failure;
 
-	rt = ip_route_newports(rt, IPPROTO_TCP,
-			       orig_sport, orig_dport,
+	rt = ip_route_newports(&fl4, rt, orig_sport, orig_dport,
 			       inet->inet_sport, inet->inet_dport, sk);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index fce9bd3bd3f..cc673677c5d 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -296,12 +296,12 @@ out_in_use:
 
 static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
-	int rc;
-	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct flowi4 fl4;
 	struct rtable *rt;
 	__be32 saddr;
-	int oif;
+	int oif, rc;
 
 	rc = -EINVAL;
 	if (addr_len < sizeof(*lsa))
@@ -320,7 +320,7 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 	if (ipv4_is_multicast(lsa->l2tp_addr.s_addr))
 		goto out;
 
-	rt = ip_route_connect(lsa->l2tp_addr.s_addr, saddr,
+	rt = ip_route_connect(&fl4, lsa->l2tp_addr.s_addr, saddr,
 			      RT_CONN_FLAGS(sk), oif,
 			      IPPROTO_L2TP,
 			      0, 0, sk, true);
-- 
cgit v1.2.3-70-g09d2


From f6d8bd051c391c1c0458a30b2a7abcd939329259 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 21 Apr 2011 09:45:37 +0000
Subject: inet: add RCU protection to inet->opt

We lack proper synchronization to manipulate inet->opt ip_options

Problem is ip_make_skb() calls ip_setup_cork() and
ip_setup_cork() possibly makes a copy of ipc->opt (struct ip_options),
without any protection against another thread manipulating inet->opt.

Another thread can change inet->opt pointer and free old one under us.

Use RCU to protect inet->opt (changed to inet->inet_opt).

Instead of handling atomic refcounts, just copy ip_options when
necessary, to avoid cache line dirtying.

We cant insert an rcu_head in struct ip_options since its included in
skb->cb[], so this patch is large because I had to introduce a new
ip_options_rcu structure.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h         |  14 +++--
 include/net/ip.h                |  11 ++--
 net/dccp/ipv4.c                 |  16 +++---
 net/dccp/ipv6.c                 |   2 +-
 net/ipv4/af_inet.c              |  17 ++++--
 net/ipv4/cipso_ipv4.c           | 113 ++++++++++++++++++++++------------------
 net/ipv4/icmp.c                 |  23 ++++----
 net/ipv4/inet_connection_sock.c |   6 +--
 net/ipv4/ip_options.c           |  38 +++++++-------
 net/ipv4/ip_output.c            |  44 ++++++++--------
 net/ipv4/ip_sockglue.c          |  35 +++++++++----
 net/ipv4/raw.c                  |  19 +++++--
 net/ipv4/syncookies.c           |   4 +-
 net/ipv4/tcp_ipv4.c             |  34 +++++++-----
 net/ipv4/udp.c                  |  21 ++++++--
 net/ipv6/tcp_ipv6.c             |   2 +-
 net/l2tp/l2tp_ip.c              |  10 ++--
 17 files changed, 241 insertions(+), 168 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 7a37369f8ea..ed2ba6eca72 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -57,7 +57,15 @@ struct ip_options {
 	unsigned char	__data[0];
 };
 
-#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
+struct ip_options_rcu {
+	struct rcu_head rcu;
+	struct ip_options opt;
+};
+
+struct ip_options_data {
+	struct ip_options_rcu	opt;
+	char			data[40];
+};
 
 struct inet_request_sock {
 	struct request_sock	req;
@@ -78,7 +86,7 @@ struct inet_request_sock {
 				acked	   : 1,
 				no_srccheck: 1;
 	kmemcheck_bitfield_end(flags);
-	struct ip_options	*opt;
+	struct ip_options_rcu	*opt;
 };
 
 static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
@@ -140,7 +148,7 @@ struct inet_sock {
 	__be16			inet_sport;
 	__u16			inet_id;
 
-	struct ip_options	*opt;
+	struct ip_options_rcu __rcu	*inet_opt;
 	__u8			tos;
 	__u8			min_ttl;
 	__u8			mc_ttl;
diff --git a/include/net/ip.h b/include/net/ip.h
index 7c416583b71..3a59bf99aa3 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -52,7 +52,7 @@ static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
 struct ipcm_cookie {
 	__be32			addr;
 	int			oif;
-	struct ip_options	*opt;
+	struct ip_options_rcu	*opt;
 	__u8			tx_flags;
 };
 
@@ -92,7 +92,7 @@ extern int		igmp_mc_proc_init(void);
 
 extern int		ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 					      __be32 saddr, __be32 daddr,
-					      struct ip_options *opt);
+					      struct ip_options_rcu *opt);
 extern int		ip_rcv(struct sk_buff *skb, struct net_device *dev,
 			       struct packet_type *pt, struct net_device *orig_dev);
 extern int		ip_local_deliver(struct sk_buff *skb);
@@ -416,14 +416,15 @@ extern int ip_forward(struct sk_buff *skb);
  *	Functions provided by ip_options.c
  */
  
-extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag);
+extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
+			     __be32 daddr, struct rtable *rt, int is_frag);
 extern int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb);
 extern void ip_options_fragment(struct sk_buff *skb);
 extern int ip_options_compile(struct net *net,
 			      struct ip_options *opt, struct sk_buff *skb);
-extern int ip_options_get(struct net *net, struct ip_options **optp,
+extern int ip_options_get(struct net *net, struct ip_options_rcu **optp,
 			  unsigned char *data, int optlen);
-extern int ip_options_get_from_user(struct net *net, struct ip_options **optp,
+extern int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
 				    unsigned char __user *data, int optlen);
 extern void ip_options_undo(struct ip_options * opt);
 extern void ip_forward_options(struct sk_buff *skb);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b92ab655d44..cbbcc6c036e 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -48,6 +48,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct flowi4 fl4;
 	struct rtable *rt;
 	int err;
+	struct ip_options_rcu *inet_opt;
 
 	dp->dccps_role = DCCP_ROLE_CLIENT;
 
@@ -58,10 +59,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		return -EAFNOSUPPORT;
 
 	nexthop = daddr = usin->sin_addr.s_addr;
-	if (inet->opt != NULL && inet->opt->srr) {
+
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	if (inet_opt != NULL && inet_opt->opt.srr) {
 		if (daddr == 0)
 			return -EINVAL;
-		nexthop = inet->opt->faddr;
+		nexthop = inet_opt->opt.faddr;
 	}
 
 	orig_sport = inet->inet_sport;
@@ -78,7 +82,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		return -ENETUNREACH;
 	}
 
-	if (inet->opt == NULL || !inet->opt->srr)
+	if (inet_opt == NULL || !inet_opt->opt.srr)
 		daddr = rt->rt_dst;
 
 	if (inet->inet_saddr == 0)
@@ -89,8 +93,8 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->inet_daddr = daddr;
 
 	inet_csk(sk)->icsk_ext_hdr_len = 0;
-	if (inet->opt != NULL)
-		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
+	if (inet_opt)
+		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 	/*
 	 * Socket identity is still unknown (sport may be zero).
 	 * However we set state to DCCP_REQUESTING and not releasing socket
@@ -405,7 +409,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newinet->inet_daddr	= ireq->rmt_addr;
 	newinet->inet_rcv_saddr = ireq->loc_addr;
 	newinet->inet_saddr	= ireq->loc_addr;
-	newinet->opt	   = ireq->opt;
+	newinet->inet_opt	= ireq->opt;
 	ireq->opt	   = NULL;
 	newinet->mc_index  = inet_iif(skb);
 	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 73add237324..8dc4348774a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -573,7 +573,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	   First: no IPv4 options.
 	 */
-	newinet->opt = NULL;
+	newinet->inet_opt = NULL;
 
 	/* Clone RX bits */
 	newnp->rxopt.all = np->rxopt.all;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0413af3e228..963a621e75c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk)
 	WARN_ON(sk->sk_wmem_queued);
 	WARN_ON(sk->sk_forward_alloc);
 
-	kfree(inet->opt);
+	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
 	sk_refcnt_debug_dec(sk);
 }
@@ -1106,9 +1106,12 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct flowi4 fl4;
 	struct rtable *rt;
 	__be32 new_saddr;
+	struct ip_options_rcu *inet_opt;
 
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
 
 	/* Query new route. */
 	rt = ip_route_connect(&fl4, daddr, 0, RT_CONN_FLAGS(sk),
@@ -1148,6 +1151,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
 	__be32 daddr;
+	struct ip_options_rcu *inet_opt;
 	int err;
 
 	/* Route is OK, nothing to do. */
@@ -1155,9 +1159,12 @@ int inet_sk_rebuild_header(struct sock *sk)
 		return 0;
 
 	/* Reroute. */
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
 	daddr = inet->inet_daddr;
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	rcu_read_unlock();
 	rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr,
 				   inet->inet_dport, inet->inet_sport,
 				   sk->sk_protocol, RT_CONN_FLAGS(sk),
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index a0af7ea8787..2b3c23c287c 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1857,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
 	return CIPSO_V4_HDR_LEN + ret_val;
 }
 
+static void opt_kfree_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct ip_options_rcu, rcu));
+}
+
 /**
  * cipso_v4_sock_setattr - Add a CIPSO option to a socket
  * @sk: the socket
@@ -1879,7 +1884,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 	unsigned char *buf = NULL;
 	u32 buf_len;
 	u32 opt_len;
-	struct ip_options *opt = NULL;
+	struct ip_options_rcu *old, *opt = NULL;
 	struct inet_sock *sk_inet;
 	struct inet_connection_sock *sk_conn;
 
@@ -1915,22 +1920,25 @@ int cipso_v4_sock_setattr(struct sock *sk,
 		ret_val = -ENOMEM;
 		goto socket_setattr_failure;
 	}
-	memcpy(opt->__data, buf, buf_len);
-	opt->optlen = opt_len;
-	opt->cipso = sizeof(struct iphdr);
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	opt->opt.cipso = sizeof(struct iphdr);
 	kfree(buf);
 	buf = NULL;
 
 	sk_inet = inet_sk(sk);
+
+	old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
 	if (sk_inet->is_icsk) {
 		sk_conn = inet_csk(sk);
-		if (sk_inet->opt)
-			sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen;
-		sk_conn->icsk_ext_hdr_len += opt->optlen;
+		if (old)
+			sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+		sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
 		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
 	}
-	opt = xchg(&sk_inet->opt, opt);
-	kfree(opt);
+	rcu_assign_pointer(sk_inet->inet_opt, opt);
+	if (old)
+		call_rcu(&old->rcu, opt_kfree_rcu);
 
 	return 0;
 
@@ -1960,7 +1968,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
 	unsigned char *buf = NULL;
 	u32 buf_len;
 	u32 opt_len;
-	struct ip_options *opt = NULL;
+	struct ip_options_rcu *opt = NULL;
 	struct inet_request_sock *req_inet;
 
 	/* We allocate the maximum CIPSO option size here so we are probably
@@ -1988,15 +1996,16 @@ int cipso_v4_req_setattr(struct request_sock *req,
 		ret_val = -ENOMEM;
 		goto req_setattr_failure;
 	}
-	memcpy(opt->__data, buf, buf_len);
-	opt->optlen = opt_len;
-	opt->cipso = sizeof(struct iphdr);
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	opt->opt.cipso = sizeof(struct iphdr);
 	kfree(buf);
 	buf = NULL;
 
 	req_inet = inet_rsk(req);
 	opt = xchg(&req_inet->opt, opt);
-	kfree(opt);
+	if (opt)
+		call_rcu(&opt->rcu, opt_kfree_rcu);
 
 	return 0;
 
@@ -2016,34 +2025,34 @@ req_setattr_failure:
  * values on failure.
  *
  */
-static int cipso_v4_delopt(struct ip_options **opt_ptr)
+static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
 {
 	int hdr_delta = 0;
-	struct ip_options *opt = *opt_ptr;
+	struct ip_options_rcu *opt = *opt_ptr;
 
-	if (opt->srr || opt->rr || opt->ts || opt->router_alert) {
+	if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
 		u8 cipso_len;
 		u8 cipso_off;
 		unsigned char *cipso_ptr;
 		int iter;
 		int optlen_new;
 
-		cipso_off = opt->cipso - sizeof(struct iphdr);
-		cipso_ptr = &opt->__data[cipso_off];
+		cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+		cipso_ptr = &opt->opt.__data[cipso_off];
 		cipso_len = cipso_ptr[1];
 
-		if (opt->srr > opt->cipso)
-			opt->srr -= cipso_len;
-		if (opt->rr > opt->cipso)
-			opt->rr -= cipso_len;
-		if (opt->ts > opt->cipso)
-			opt->ts -= cipso_len;
-		if (opt->router_alert > opt->cipso)
-			opt->router_alert -= cipso_len;
-		opt->cipso = 0;
+		if (opt->opt.srr > opt->opt.cipso)
+			opt->opt.srr -= cipso_len;
+		if (opt->opt.rr > opt->opt.cipso)
+			opt->opt.rr -= cipso_len;
+		if (opt->opt.ts > opt->opt.cipso)
+			opt->opt.ts -= cipso_len;
+		if (opt->opt.router_alert > opt->opt.cipso)
+			opt->opt.router_alert -= cipso_len;
+		opt->opt.cipso = 0;
 
 		memmove(cipso_ptr, cipso_ptr + cipso_len,
-			opt->optlen - cipso_off - cipso_len);
+			opt->opt.optlen - cipso_off - cipso_len);
 
 		/* determining the new total option length is tricky because of
 		 * the padding necessary, the only thing i can think to do at
@@ -2052,21 +2061,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
 		 * from there we can determine the new total option length */
 		iter = 0;
 		optlen_new = 0;
-		while (iter < opt->optlen)
-			if (opt->__data[iter] != IPOPT_NOP) {
-				iter += opt->__data[iter + 1];
+		while (iter < opt->opt.optlen)
+			if (opt->opt.__data[iter] != IPOPT_NOP) {
+				iter += opt->opt.__data[iter + 1];
 				optlen_new = iter;
 			} else
 				iter++;
-		hdr_delta = opt->optlen;
-		opt->optlen = (optlen_new + 3) & ~3;
-		hdr_delta -= opt->optlen;
+		hdr_delta = opt->opt.optlen;
+		opt->opt.optlen = (optlen_new + 3) & ~3;
+		hdr_delta -= opt->opt.optlen;
 	} else {
 		/* only the cipso option was present on the socket so we can
 		 * remove the entire option struct */
 		*opt_ptr = NULL;
-		hdr_delta = opt->optlen;
-		kfree(opt);
+		hdr_delta = opt->opt.optlen;
+		call_rcu(&opt->rcu, opt_kfree_rcu);
 	}
 
 	return hdr_delta;
@@ -2083,15 +2092,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
 void cipso_v4_sock_delattr(struct sock *sk)
 {
 	int hdr_delta;
-	struct ip_options *opt;
+	struct ip_options_rcu *opt;
 	struct inet_sock *sk_inet;
 
 	sk_inet = inet_sk(sk);
-	opt = sk_inet->opt;
-	if (opt == NULL || opt->cipso == 0)
+	opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+	if (opt == NULL || opt->opt.cipso == 0)
 		return;
 
-	hdr_delta = cipso_v4_delopt(&sk_inet->opt);
+	hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
 	if (sk_inet->is_icsk && hdr_delta > 0) {
 		struct inet_connection_sock *sk_conn = inet_csk(sk);
 		sk_conn->icsk_ext_hdr_len -= hdr_delta;
@@ -2109,12 +2118,12 @@ void cipso_v4_sock_delattr(struct sock *sk)
  */
 void cipso_v4_req_delattr(struct request_sock *req)
 {
-	struct ip_options *opt;
+	struct ip_options_rcu *opt;
 	struct inet_request_sock *req_inet;
 
 	req_inet = inet_rsk(req);
 	opt = req_inet->opt;
-	if (opt == NULL || opt->cipso == 0)
+	if (opt == NULL || opt->opt.cipso == 0)
 		return;
 
 	cipso_v4_delopt(&req_inet->opt);
@@ -2184,14 +2193,18 @@ getattr_return:
  */
 int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
 {
-	struct ip_options *opt;
+	struct ip_options_rcu *opt;
+	int res = -ENOMSG;
 
-	opt = inet_sk(sk)->opt;
-	if (opt == NULL || opt->cipso == 0)
-		return -ENOMSG;
-
-	return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr),
-				secattr);
+	rcu_read_lock();
+	opt = rcu_dereference(inet_sk(sk)->inet_opt);
+	if (opt && opt->opt.cipso)
+		res = cipso_v4_getattr(opt->opt.__data +
+						opt->opt.cipso -
+						sizeof(struct iphdr),
+				       secattr);
+	rcu_read_unlock();
+	return res;
 }
 
 /**
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 74e35e5736e..cfeca3c2152 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -108,8 +108,7 @@ struct icmp_bxm {
 		__be32	       times[3];
 	} data;
 	int head_len;
-	struct ip_options replyopts;
-	unsigned char  optbuf[40];
+	struct ip_options_data replyopts;
 };
 
 /* An array of errno for error messages from dest unreach. */
@@ -333,7 +332,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr;
 
-	if (ip_options_echo(&icmp_param->replyopts, skb))
+	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
 
 	sk = icmp_xmit_lock(net);
@@ -347,10 +346,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
-	if (icmp_param->replyopts.optlen) {
-		ipc.opt = &icmp_param->replyopts;
-		if (ipc.opt->srr)
-			daddr = icmp_param->replyopts.faddr;
+	if (icmp_param->replyopts.opt.opt.optlen) {
+		ipc.opt = &icmp_param->replyopts.opt;
+		if (ipc.opt->opt.srr)
+			daddr = icmp_param->replyopts.opt.opt.faddr;
 	}
 	{
 		struct flowi4 fl4 = {
@@ -379,8 +378,8 @@ static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in,
 					struct icmp_bxm *param)
 {
 	struct flowi4 fl4 = {
-		.daddr = (param->replyopts.srr ?
-			  param->replyopts.faddr : iph->saddr),
+		.daddr = (param->replyopts.opt.opt.srr ?
+			  param->replyopts.opt.opt.faddr : iph->saddr),
 		.saddr = saddr,
 		.flowi4_tos = RT_TOS(tos),
 		.flowi4_proto = IPPROTO_ICMP,
@@ -581,7 +580,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					   IPTOS_PREC_INTERNETCONTROL) :
 					  iph->tos;
 
-	if (ip_options_echo(&icmp_param.replyopts, skb_in))
+	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
 		goto out_unlock;
 
 
@@ -597,7 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	icmp_param.offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
 	ipc.addr = iph->saddr;
-	ipc.opt = &icmp_param.replyopts;
+	ipc.opt = &icmp_param.replyopts.opt;
 	ipc.tx_flags = 0;
 
 	rt = icmp_route_lookup(net, skb_in, iph, saddr, tos,
@@ -613,7 +612,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
-	room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
 	room -= sizeof(struct icmphdr);
 
 	icmp_param.data_len = skb_in->len - icmp_param.offset;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8514db54a7f..3282cb2de39 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -354,20 +354,20 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 {
 	struct rtable *rt;
 	const struct inet_request_sock *ireq = inet_rsk(req);
-	struct ip_options *opt = inet_rsk(req)->opt;
+	struct ip_options_rcu *opt = inet_rsk(req)->opt;
 	struct net *net = sock_net(sk);
 	struct flowi4 fl4;
 
 	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
-			   (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
+			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
 			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
 	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_flow(net, &fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto route_err;
 	return &rt->dst;
 
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 2391b24e825..01fc4096584 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -36,7 +36,7 @@
  * saddr is address of outgoing interface.
  */
 
-void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
+void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
 			    __be32 daddr, struct rtable *rt, int is_frag)
 {
 	unsigned char *iph = skb_network_header(skb);
@@ -83,9 +83,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
  * NOTE: dopt cannot point to skb.
  */
 
-int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
+int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
 {
-	struct ip_options *sopt;
+	const struct ip_options *sopt;
 	unsigned char *sptr, *dptr;
 	int soffset, doffset;
 	int	optlen;
@@ -95,10 +95,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
 
 	sopt = &(IPCB(skb)->opt);
 
-	if (sopt->optlen == 0) {
-		dopt->optlen = 0;
+	if (sopt->optlen == 0)
 		return 0;
-	}
 
 	sptr = skb_network_header(skb);
 	dptr = dopt->__data;
@@ -157,7 +155,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
 		dopt->optlen += optlen;
 	}
 	if (sopt->srr) {
-		unsigned char * start = sptr+sopt->srr;
+		unsigned char *start = sptr+sopt->srr;
 		__be32 faddr;
 
 		optlen  = start[1];
@@ -499,19 +497,19 @@ void ip_options_undo(struct ip_options * opt)
 	}
 }
 
-static struct ip_options *ip_options_get_alloc(const int optlen)
+static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
 {
-	return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3),
+	return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
 		       GFP_KERNEL);
 }
 
-static int ip_options_get_finish(struct net *net, struct ip_options **optp,
-				 struct ip_options *opt, int optlen)
+static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
+				 struct ip_options_rcu *opt, int optlen)
 {
 	while (optlen & 3)
-		opt->__data[optlen++] = IPOPT_END;
-	opt->optlen = optlen;
-	if (optlen && ip_options_compile(net, opt, NULL)) {
+		opt->opt.__data[optlen++] = IPOPT_END;
+	opt->opt.optlen = optlen;
+	if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
 		kfree(opt);
 		return -EINVAL;
 	}
@@ -520,29 +518,29 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp,
 	return 0;
 }
 
-int ip_options_get_from_user(struct net *net, struct ip_options **optp,
+int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
 			     unsigned char __user *data, int optlen)
 {
-	struct ip_options *opt = ip_options_get_alloc(optlen);
+	struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
 
 	if (!opt)
 		return -ENOMEM;
-	if (optlen && copy_from_user(opt->__data, data, optlen)) {
+	if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
 		kfree(opt);
 		return -EFAULT;
 	}
 	return ip_options_get_finish(net, optp, opt, optlen);
 }
 
-int ip_options_get(struct net *net, struct ip_options **optp,
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
 		   unsigned char *data, int optlen)
 {
-	struct ip_options *opt = ip_options_get_alloc(optlen);
+	struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
 
 	if (!opt)
 		return -ENOMEM;
 	if (optlen)
-		memcpy(opt->__data, data, optlen);
+		memcpy(opt->opt.__data, data, optlen);
 	return ip_options_get_finish(net, optp, opt, optlen);
 }
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index bdad3d60aa8..362e66f7d2f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -140,14 +140,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
  *
  */
 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
-			  __be32 saddr, __be32 daddr, struct ip_options *opt)
+			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 
 	/* Build the IP header. */
-	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
 	iph->version  = 4;
@@ -163,9 +163,9 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 	iph->protocol = sk->sk_protocol;
 	ip_select_ident(iph, &rt->dst, sk);
 
-	if (opt && opt->optlen) {
-		iph->ihl += opt->optlen>>2;
-		ip_options_build(skb, opt, daddr, rt, 0);
+	if (opt && opt->opt.optlen) {
+		iph->ihl += opt->opt.optlen>>2;
+		ip_options_build(skb, &opt->opt, daddr, rt, 0);
 	}
 
 	skb->priority = sk->sk_priority;
@@ -316,7 +316,7 @@ int ip_queue_xmit(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
-	struct ip_options *opt = inet->opt;
+	struct ip_options_rcu *inet_opt;
 	struct rtable *rt;
 	struct iphdr *iph;
 	int res;
@@ -325,6 +325,7 @@ int ip_queue_xmit(struct sk_buff *skb)
 	 * f.e. by something like SCTP.
 	 */
 	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
 	rt = skb_rtable(skb);
 	if (rt != NULL)
 		goto packet_routed;
@@ -336,8 +337,8 @@ int ip_queue_xmit(struct sk_buff *skb)
 
 		/* Use correct destination address if we have options. */
 		daddr = inet->inet_daddr;
-		if(opt && opt->srr)
-			daddr = opt->faddr;
+		if (inet_opt && inet_opt->opt.srr)
+			daddr = inet_opt->opt.faddr;
 
 		/* If this fails, retransmit mechanism of transport layer will
 		 * keep trying until route appears or the connection times
@@ -357,11 +358,11 @@ int ip_queue_xmit(struct sk_buff *skb)
 	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto no_route;
 
 	/* OK, we know where to send it, allocate and build IP header. */
-	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
@@ -375,9 +376,9 @@ packet_routed:
 	iph->daddr    = rt->rt_dst;
 	/* Transport layer set skb->h.foo itself. */
 
-	if (opt && opt->optlen) {
-		iph->ihl += opt->optlen >> 2;
-		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
+	if (inet_opt && inet_opt->opt.optlen) {
+		iph->ihl += inet_opt->opt.optlen >> 2;
+		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
 	}
 
 	ip_select_ident_more(iph, &rt->dst, sk,
@@ -1033,7 +1034,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 			 struct ipcm_cookie *ipc, struct rtable **rtp)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	struct ip_options *opt;
+	struct ip_options_rcu *opt;
 	struct rtable *rt;
 
 	/*
@@ -1047,7 +1048,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 			if (unlikely(cork->opt == NULL))
 				return -ENOBUFS;
 		}
-		memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
+		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
 		cork->flags |= IPCORK_OPT;
 		cork->addr = ipc->addr;
 	}
@@ -1451,26 +1452,23 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 		   unsigned int len)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	struct {
-		struct ip_options	opt;
-		char			data[40];
-	} replyopts;
+	struct ip_options_data replyopts;
 	struct ipcm_cookie ipc;
 	__be32 daddr;
 	struct rtable *rt = skb_rtable(skb);
 
-	if (ip_options_echo(&replyopts.opt, skb))
+	if (ip_options_echo(&replyopts.opt.opt, skb))
 		return;
 
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
 
-	if (replyopts.opt.optlen) {
+	if (replyopts.opt.opt.optlen) {
 		ipc.opt = &replyopts.opt;
 
-		if (ipc.opt->srr)
-			daddr = replyopts.opt.faddr;
+		if (replyopts.opt.opt.srr)
+			daddr = replyopts.opt.opt.faddr;
 	}
 
 	{
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 9640900309b..ab0c9efd1ef 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -451,6 +451,11 @@ out:
 }
 
 
+static void opt_kfree_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct ip_options_rcu, rcu));
+}
+
 /*
  *	Socket option code for IP. This is the end of the line after any
  *	TCP,UDP etc options on an IP socket.
@@ -497,13 +502,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	switch (optname) {
 	case IP_OPTIONS:
 	{
-		struct ip_options *opt = NULL;
+		struct ip_options_rcu *old, *opt = NULL;
+
 		if (optlen > 40)
 			goto e_inval;
 		err = ip_options_get_from_user(sock_net(sk), &opt,
 					       optval, optlen);
 		if (err)
 			break;
+		old = rcu_dereference_protected(inet->inet_opt,
+						sock_owned_by_user(sk));
 		if (inet->is_icsk) {
 			struct inet_connection_sock *icsk = inet_csk(sk);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -512,17 +520,18 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			       (TCPF_LISTEN | TCPF_CLOSE)) &&
 			     inet->inet_daddr != LOOPBACK4_IPV6)) {
 #endif
-				if (inet->opt)
-					icsk->icsk_ext_hdr_len -= inet->opt->optlen;
+				if (old)
+					icsk->icsk_ext_hdr_len -= old->opt.optlen;
 				if (opt)
-					icsk->icsk_ext_hdr_len += opt->optlen;
+					icsk->icsk_ext_hdr_len += opt->opt.optlen;
 				icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 			}
 #endif
 		}
-		opt = xchg(&inet->opt, opt);
-		kfree(opt);
+		rcu_assign_pointer(inet->inet_opt, opt);
+		if (old)
+			call_rcu(&old->rcu, opt_kfree_rcu);
 		break;
 	}
 	case IP_PKTINFO:
@@ -1081,12 +1090,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_OPTIONS:
 	{
 		unsigned char optbuf[sizeof(struct ip_options)+40];
-		struct ip_options * opt = (struct ip_options *)optbuf;
+		struct ip_options *opt = (struct ip_options *)optbuf;
+		struct ip_options_rcu *inet_opt;
+
+		inet_opt = rcu_dereference_protected(inet->inet_opt,
+						     sock_owned_by_user(sk));
 		opt->optlen = 0;
-		if (inet->opt)
-			memcpy(optbuf, inet->opt,
-			       sizeof(struct ip_options)+
-			       inet->opt->optlen);
+		if (inet_opt)
+			memcpy(optbuf, &inet_opt->opt,
+			       sizeof(struct ip_options) +
+			       inet_opt->opt.optlen);
 		release_sock(sk);
 
 		if (opt->optlen == 0)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index abf14dbcb3b..a8659e0c4a6 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -460,6 +460,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	__be32 saddr;
 	u8  tos;
 	int err;
+	struct ip_options_data opt_copy;
 
 	err = -EMSGSIZE;
 	if (len > 0xFFFF)
@@ -520,8 +521,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	saddr = ipc.addr;
 	ipc.addr = daddr;
 
-	if (!ipc.opt)
-		ipc.opt = inet->opt;
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
 
 	if (ipc.opt) {
 		err = -EINVAL;
@@ -530,10 +541,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		 */
 		if (inet->hdrincl)
 			goto done;
-		if (ipc.opt->srr) {
+		if (ipc.opt->opt.srr) {
 			if (!daddr)
 				goto done;
-			daddr = ipc.opt->faddr;
+			daddr = ipc.opt->opt.faddr;
 		}
 	}
 	tos = RT_CONN_FLAGS(sk);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 71e02969190..26461492a84 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -321,10 +321,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
 	 */
 	if (opt && opt->optlen) {
-		int opt_size = sizeof(struct ip_options) + opt->optlen;
+		int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
 
 		ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
-		if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
+		if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
 			kfree(ireq->opt);
 			ireq->opt = NULL;
 		}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 310454c2f4d..d60732fe5f2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -154,6 +154,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct flowi4 fl4;
 	struct rtable *rt;
 	int err;
+	struct ip_options_rcu *inet_opt;
 
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
@@ -162,10 +163,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		return -EAFNOSUPPORT;
 
 	nexthop = daddr = usin->sin_addr.s_addr;
-	if (inet->opt && inet->opt->srr) {
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	if (inet_opt && inet_opt->opt.srr) {
 		if (!daddr)
 			return -EINVAL;
-		nexthop = inet->opt->faddr;
+		nexthop = inet_opt->opt.faddr;
 	}
 
 	orig_sport = inet->inet_sport;
@@ -186,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		return -ENETUNREACH;
 	}
 
-	if (!inet->opt || !inet->opt->srr)
+	if (!inet_opt || !inet_opt->opt.srr)
 		daddr = rt->rt_dst;
 
 	if (!inet->inet_saddr)
@@ -222,8 +225,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->inet_daddr = daddr;
 
 	inet_csk(sk)->icsk_ext_hdr_len = 0;
-	if (inet->opt)
-		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
+	if (inet_opt)
+		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 
 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 
@@ -820,17 +823,18 @@ static void syn_flood_warning(const struct sk_buff *skb)
 /*
  * Save and compile IPv4 options into the request_sock if needed.
  */
-static struct ip_options *tcp_v4_save_options(struct sock *sk,
-					      struct sk_buff *skb)
+static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
+						  struct sk_buff *skb)
 {
-	struct ip_options *opt = &(IPCB(skb)->opt);
-	struct ip_options *dopt = NULL;
+	const struct ip_options *opt = &(IPCB(skb)->opt);
+	struct ip_options_rcu *dopt = NULL;
 
 	if (opt && opt->optlen) {
-		int opt_size = optlength(opt);
+		int opt_size = sizeof(*dopt) + opt->optlen;
+
 		dopt = kmalloc(opt_size, GFP_ATOMIC);
 		if (dopt) {
-			if (ip_options_echo(dopt, skb)) {
+			if (ip_options_echo(&dopt->opt, skb)) {
 				kfree(dopt);
 				dopt = NULL;
 			}
@@ -1411,6 +1415,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key *key;
 #endif
+	struct ip_options_rcu *inet_opt;
 
 	if (sk_acceptq_is_full(sk))
 		goto exit_overflow;
@@ -1431,13 +1436,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newinet->inet_daddr   = ireq->rmt_addr;
 	newinet->inet_rcv_saddr = ireq->loc_addr;
 	newinet->inet_saddr	      = ireq->loc_addr;
-	newinet->opt	      = ireq->opt;
+	inet_opt	      = ireq->opt;
+	rcu_assign_pointer(newinet->inet_opt, inet_opt);
 	ireq->opt	      = NULL;
 	newinet->mc_index     = inet_iif(skb);
 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
-	if (newinet->opt)
-		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
+	if (inet_opt)
+		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 	newinet->inet_id = newtp->write_seq ^ jiffies;
 
 	tcp_mtup_init(newsk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bc0dab2593e..544f435d1af 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -804,6 +804,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
 	struct sk_buff *skb;
+	struct ip_options_data opt_copy;
 
 	if (len > 0xFFFF)
 		return -EMSGSIZE;
@@ -877,22 +878,32 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			free = 1;
 		connected = 0;
 	}
-	if (!ipc.opt)
-		ipc.opt = inet->opt;
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
 
 	saddr = ipc.addr;
 	ipc.addr = faddr = daddr;
 
-	if (ipc.opt && ipc.opt->srr) {
+	if (ipc.opt && ipc.opt->opt.srr) {
 		if (!daddr)
 			return -EINVAL;
-		faddr = ipc.opt->faddr;
+		faddr = ipc.opt->opt.faddr;
 		connected = 0;
 	}
 	tos = RT_TOS(inet->tos);
 	if (sock_flag(sk, SOCK_LOCALROUTE) ||
 	    (msg->msg_flags & MSG_DONTROUTE) ||
-	    (ipc.opt && ipc.opt->is_strictroute)) {
+	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
 		tos |= RTO_ONLINK;
 		connected = 0;
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cb7658aceb6..868366470b4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1469,7 +1469,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	   First: no IPv4 options.
 	 */
-	newinet->opt = NULL;
+	newinet->inet_opt = NULL;
 	newnp->ipv6_fl_list = NULL;
 
 	/* Clone RX bits */
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index cc673677c5d..962a607b51d 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -416,7 +416,6 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 	int rc;
 	struct l2tp_ip_sock *lsa = l2tp_ip_sk(sk);
 	struct inet_sock *inet = inet_sk(sk);
-	struct ip_options *opt = inet->opt;
 	struct rtable *rt = NULL;
 	int connected = 0;
 	__be32 daddr;
@@ -471,9 +470,14 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		rt = (struct rtable *) __sk_dst_check(sk, 0);
 
 	if (rt == NULL) {
+		struct ip_options_rcu *inet_opt;
+
+		inet_opt = rcu_dereference_protected(inet->inet_opt,
+						     sock_owned_by_user(sk));
+
 		/* Use correct destination address if we have options. */
-		if (opt && opt->srr)
-			daddr = opt->faddr;
+		if (inet_opt && inet_opt->opt.srr)
+			daddr = inet_opt->opt.faddr;
 
 		/* If this fails, retransmit mechanism of transport layer will
 		 * keep trying until route appears or the connection times
-- 
cgit v1.2.3-70-g09d2


From b883187785004cacea053569f1655fada0dfc299 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 Apr 2011 23:16:53 -0700
Subject: ipv4: Fetch route saddr from flow key in inet_sk_reselect_saddr().

Now that output route lookups update the flow with
source address selection, we can fetch it from
fl4->saddr instead of rt->rt_src

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 963a621e75c..4e734992e26 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1122,7 +1122,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 
 	sk_setup_caps(sk, &rt->dst);
 
-	new_saddr = rt->rt_src;
+	new_saddr = fl4.saddr;
 
 	if (new_saddr == old_saddr)
 		return 0;
-- 
cgit v1.2.3-70-g09d2


From 31e4543db29fb85496a122b965d6482c8d1a2bfe Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 3 May 2011 20:25:42 -0700
Subject: ipv4: Make caller provide on-stack flow key to
 ip_route_output_ports().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/cxgb3/iwch_cm.c |  3 ++-
 drivers/infiniband/hw/cxgb4/cm.c      |  3 ++-
 drivers/net/pptp.c                    |  6 ++++--
 drivers/scsi/cxgbi/libcxgbi.c         |  3 ++-
 include/net/route.h                   | 11 +++++------
 net/ipv4/af_inet.c                    |  3 ++-
 net/ipv4/igmp.c                       |  6 ++++--
 net/ipv4/ip_output.c                  |  3 ++-
 net/ipv4/ipip.c                       | 19 +++++++++++--------
 net/ipv4/ipmr.c                       |  5 +++--
 net/ipv6/ip6_tunnel.c                 |  5 +++--
 net/ipv6/sit.c                        |  6 ++++--
 net/l2tp/l2tp_ip.c                    |  3 ++-
 net/rxrpc/ar-peer.c                   |  3 ++-
 14 files changed, 48 insertions(+), 31 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 3216bcad7e8..23918413899 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -338,8 +338,9 @@ static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip,
 				 __be16 peer_port, u8 tos)
 {
 	struct rtable *rt;
+	struct flowi4 fl4;
 
-	rt = ip_route_output_ports(&init_net, NULL, peer_ip, local_ip,
+	rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
 				   peer_port, local_port, IPPROTO_TCP,
 				   tos, 0);
 	if (IS_ERR(rt))
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 9d8dcfab2b3..6aa53cd6947 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -315,8 +315,9 @@ static struct rtable *find_route(struct c4iw_dev *dev, __be32 local_ip,
 				 __be16 peer_port, u8 tos)
 {
 	struct rtable *rt;
+	struct flowi4 fl4;
 
-	rt = ip_route_output_ports(&init_net, NULL, peer_ip, local_ip,
+	rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
 				   peer_port, local_port, IPPROTO_TCP,
 				   tos, 0);
 	if (IS_ERR(rt))
diff --git a/drivers/net/pptp.c b/drivers/net/pptp.c
index 51dfcf8023c..e771e8d27eb 100644
--- a/drivers/net/pptp.c
+++ b/drivers/net/pptp.c
@@ -175,6 +175,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	struct pptp_opt *opt = &po->proto.pptp;
 	struct pptp_gre_header *hdr;
 	unsigned int header_len = sizeof(*hdr);
+	struct flowi4 fl4;
 	int islcp;
 	int len;
 	unsigned char *data;
@@ -189,7 +190,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	if (sk_pppox(po)->sk_state & PPPOX_DEAD)
 		goto tx_error;
 
-	rt = ip_route_output_ports(&init_net, NULL,
+	rt = ip_route_output_ports(&init_net, &fl4, NULL,
 				   opt->dst_addr.sin_addr.s_addr,
 				   opt->src_addr.sin_addr.s_addr,
 				   0, 0, IPPROTO_GRE,
@@ -434,6 +435,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
 	struct pppox_sock *po = pppox_sk(sk);
 	struct pptp_opt *opt = &po->proto.pptp;
 	struct rtable *rt;
+	struct flowi4 fl4;
 	int error = 0;
 
 	if (sp->sa_protocol != PX_PROTO_PPTP)
@@ -463,7 +465,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
 	po->chan.private = sk;
 	po->chan.ops = &pptp_chan_ops;
 
-	rt = ip_route_output_ports(&init_net, sk,
+	rt = ip_route_output_ports(&init_net, &fl4, sk,
 				   opt->dst_addr.sin_addr.s_addr,
 				   opt->src_addr.sin_addr.s_addr,
 				   0, 0,
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index de764ea7419..0c33d250c7d 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -454,8 +454,9 @@ static struct rtable *find_route_ipv4(__be32 saddr, __be32 daddr,
 				      __be16 sport, __be16 dport, u8 tos)
 {
 	struct rtable *rt;
+	struct flowi4 fl4;
 
-	rt = ip_route_output_ports(&init_net, NULL, daddr, saddr,
+	rt = ip_route_output_ports(&init_net, &fl4, NULL, daddr, saddr,
 				   dport, sport, IPPROTO_TCP, tos, 0);
 	if (IS_ERR(rt))
 		return NULL;
diff --git a/include/net/route.h b/include/net/route.h
index f07609e8314..8c02c871a8c 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -137,20 +137,19 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
 	return ip_route_output_key(net, &fl4);
 }
 
-static inline struct rtable *ip_route_output_ports(struct net *net, struct sock *sk,
+static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4,
+						   struct sock *sk,
 						   __be32 daddr, __be32 saddr,
 						   __be16 dport, __be16 sport,
 						   __u8 proto, __u8 tos, int oif)
 {
-	struct flowi4 fl4;
-
-	flowi4_init_output(&fl4, oif, sk ? sk->sk_mark : 0, tos,
+	flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
 			   RT_SCOPE_UNIVERSE, proto,
 			   sk ? inet_sk_flowi_flags(sk) : 0,
 			   daddr, saddr, dport, sport);
 	if (sk)
-		security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
-	return ip_route_output_flow(net, &fl4, sk);
+		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+	return ip_route_output_flow(net, fl4, sk);
 }
 
 static inline struct rtable *ip_route_output_gre(struct net *net,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4e734992e26..7b91fa8bf83 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1152,6 +1152,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
 	__be32 daddr;
 	struct ip_options_rcu *inet_opt;
+	struct flowi4 fl4;
 	int err;
 
 	/* Route is OK, nothing to do. */
@@ -1165,7 +1166,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 	if (inet_opt && inet_opt->opt.srr)
 		daddr = inet_opt->opt.faddr;
 	rcu_read_unlock();
-	rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr,
+	rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, inet->inet_saddr,
 				   inet->inet_dport, inet->inet_sport,
 				   sk->sk_protocol, RT_CONN_FLAGS(sk),
 				   sk->sk_bound_dev_if);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 8ae0a5702f5..7c2ef59e3f7 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -309,6 +309,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	struct iphdr *pip;
 	struct igmpv3_report *pig;
 	struct net *net = dev_net(dev);
+	struct flowi4 fl4;
 
 	while (1) {
 		skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
@@ -321,7 +322,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	}
 	igmp_skb_size(skb) = size;
 
-	rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0,
+	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
 				   0, 0,
 				   IPPROTO_IGMP, 0, dev->ifindex);
 	if (IS_ERR(rt)) {
@@ -650,6 +651,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	struct net_device *dev = in_dev->dev;
 	struct net *net = dev_net(dev);
 	__be32	group = pmc ? pmc->multiaddr : 0;
+	struct flowi4 fl4;
 	__be32	dst;
 
 	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
@@ -659,7 +661,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	else
 		dst = group;
 
-	rt = ip_route_output_ports(net, NULL, dst, 0,
+	rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
 				   0, 0,
 				   IPPROTO_IGMP, 0, dev->ifindex);
 	if (IS_ERR(rt))
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 362e66f7d2f..3aa4c31e544 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -333,6 +333,7 @@ int ip_queue_xmit(struct sk_buff *skb)
 	/* Make sure we can route this packet. */
 	rt = (struct rtable *)__sk_dst_check(sk, 0);
 	if (rt == NULL) {
+		struct flowi4 fl4;
 		__be32 daddr;
 
 		/* Use correct destination address if we have options. */
@@ -344,7 +345,7 @@ int ip_queue_xmit(struct sk_buff *skb)
 		 * keep trying until route appears or the connection times
 		 * itself out.
 		 */
-		rt = ip_route_output_ports(sock_net(sk), sk,
+		rt = ip_route_output_ports(sock_net(sk), &fl4, sk,
 					   daddr, inet->inet_saddr,
 					   inet->inet_dport,
 					   inet->inet_sport,
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ef16377ec73..88d96bde950 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -442,6 +442,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	__be32 dst = tiph->daddr;
+	struct flowi4 fl4;
 	int    mtu;
 
 	if (skb->protocol != htons(ETH_P_IP))
@@ -460,7 +461,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 			goto tx_error_icmp;
 	}
 
-	rt = ip_route_output_ports(dev_net(dev), NULL,
+	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
 				   dst, tiph->saddr,
 				   0, 0,
 				   IPPROTO_IPIP, RT_TOS(tos),
@@ -578,13 +579,15 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
 	iph = &tunnel->parms.iph;
 
 	if (iph->daddr) {
-		struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL,
-							  iph->daddr, iph->saddr,
-							  0, 0,
-							  IPPROTO_IPIP,
-							  RT_TOS(iph->tos),
-							  tunnel->parms.link);
-
+		struct rtable *rt;
+		struct flowi4 fl4;
+
+		rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
+					   iph->daddr, iph->saddr,
+					   0, 0,
+					   IPPROTO_IPIP,
+					   RT_TOS(iph->tos),
+					   tunnel->parms.link);
 		if (!IS_ERR(rt)) {
 			tdev = rt->dst.dev;
 			ip_rt_put(rt);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3ad38a44958..86033b7a05b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1595,6 +1595,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 	struct vif_device *vif = &mrt->vif_table[vifi];
 	struct net_device *dev;
 	struct rtable *rt;
+	struct flowi4 fl4;
 	int    encap = 0;
 
 	if (vif->dev == NULL)
@@ -1612,7 +1613,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 #endif
 
 	if (vif->flags & VIFF_TUNNEL) {
-		rt = ip_route_output_ports(net, NULL,
+		rt = ip_route_output_ports(net, &fl4, NULL,
 					   vif->remote, vif->local,
 					   0, 0,
 					   IPPROTO_IPIP,
@@ -1621,7 +1622,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 			goto out_free;
 		encap = sizeof(struct iphdr);
 	} else {
-		rt = ip_route_output_ports(net, NULL, iph->daddr, 0,
+		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
 					   0, 0,
 					   IPPROTO_IPIP,
 					   RT_TOS(iph->tos), vif->link);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9dd0e964b8b..3dff27cba95 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -537,6 +537,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	struct sk_buff *skb2;
 	const struct iphdr *eiph;
 	struct rtable *rt;
+	struct flowi4 fl4;
 
 	err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code,
 			  &rel_msg, &rel_info, offset);
@@ -577,7 +578,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	eiph = ip_hdr(skb2);
 
 	/* Try to guess incoming interface */
-	rt = ip_route_output_ports(dev_net(skb->dev), NULL,
+	rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
 				   eiph->saddr, 0,
 				   0, 0,
 				   IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
@@ -590,7 +591,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (rt->rt_flags & RTCF_LOCAL) {
 		ip_rt_put(rt);
 		rt = NULL;
-		rt = ip_route_output_ports(dev_net(skb->dev), NULL,
+		rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
 					   eiph->daddr, eiph->saddr,
 					   0, 0,
 					   IPPROTO_IPIP,
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 34d89642670..a24fb14d91f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -674,6 +674,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	__be32 dst = tiph->daddr;
+	struct flowi4 fl4;
 	int    mtu;
 	const struct in6_addr *addr6;
 	int addr_type;
@@ -733,7 +734,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		dst = addr6->s6_addr32[3];
 	}
 
-	rt = ip_route_output_ports(dev_net(dev), NULL,
+	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
 				   dst, tiph->saddr,
 				   0, 0,
 				   IPPROTO_IPV6, RT_TOS(tos),
@@ -851,12 +852,13 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
 	struct net_device *tdev = NULL;
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
+	struct flowi4 fl4;
 
 	tunnel = netdev_priv(dev);
 	iph = &tunnel->parms.iph;
 
 	if (iph->daddr) {
-		struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL,
+		struct rtable *rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
 							  iph->daddr, iph->saddr,
 							  0, 0,
 							  IPPROTO_IPV6,
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index a4d2dfa1fdb..81899600abe 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -471,6 +471,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 
 	if (rt == NULL) {
 		struct ip_options_rcu *inet_opt;
+		struct flowi4 fl4;
 
 		rcu_read_lock();
 		inet_opt = rcu_dereference(inet->inet_opt);
@@ -485,7 +486,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		 * keep trying until route appears or the connection times
 		 * itself out.
 		 */
-		rt = ip_route_output_ports(sock_net(sk), sk,
+		rt = ip_route_output_ports(sock_net(sk), &fl4, sk,
 					   daddr, inet->inet_saddr,
 					   inet->inet_dport, inet->inet_sport,
 					   sk->sk_protocol, RT_CONN_FLAGS(sk),
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
index 55b93dc60d0..b6ff06351d6 100644
--- a/net/rxrpc/ar-peer.c
+++ b/net/rxrpc/ar-peer.c
@@ -36,10 +36,11 @@ static void rxrpc_destroy_peer(struct work_struct *work);
 static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
 {
 	struct rtable *rt;
+	struct flowi4 fl4;
 
 	peer->if_mtu = 1500;
 
-	rt = ip_route_output_ports(&init_net, NULL,
+	rt = ip_route_output_ports(&init_net, &fl4, NULL,
 				   peer->srx.transport.sin.sin_addr.s_addr, 0,
 				   htons(7000), htons(7001),
 				   IPPROTO_UDP, 0, 0);
-- 
cgit v1.2.3-70-g09d2


From 6e869138101bc607e4780187210b79d531f9b2ce Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 6 May 2011 16:18:04 -0700
Subject: ipv4: Use cork flow in inet_sk_{reselect_saddr,rebuild_header}()

These two functions must be invoked only when the socket is locked
(because socket identity modifications are made non-atomically).

Therefore we can use the cork flow for output route lookups.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/ipv4/af_inet.c')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7b91fa8bf83..851aa056854 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1103,7 +1103,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	__be32 old_saddr = inet->inet_saddr;
 	__be32 daddr = inet->inet_daddr;
-	struct flowi4 fl4;
+	struct flowi4 *fl4;
 	struct rtable *rt;
 	__be32 new_saddr;
 	struct ip_options_rcu *inet_opt;
@@ -1114,7 +1114,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 		daddr = inet_opt->opt.faddr;
 
 	/* Query new route. */
-	rt = ip_route_connect(&fl4, daddr, 0, RT_CONN_FLAGS(sk),
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
 			      sk->sk_bound_dev_if, sk->sk_protocol,
 			      inet->inet_sport, inet->inet_dport, sk, false);
 	if (IS_ERR(rt))
@@ -1122,7 +1123,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 
 	sk_setup_caps(sk, &rt->dst);
 
-	new_saddr = fl4.saddr;
+	new_saddr = fl4->saddr;
 
 	if (new_saddr == old_saddr)
 		return 0;
@@ -1152,7 +1153,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
 	__be32 daddr;
 	struct ip_options_rcu *inet_opt;
-	struct flowi4 fl4;
+	struct flowi4 *fl4;
 	int err;
 
 	/* Route is OK, nothing to do. */
@@ -1166,7 +1167,8 @@ int inet_sk_rebuild_header(struct sock *sk)
 	if (inet_opt && inet_opt->opt.srr)
 		daddr = inet_opt->opt.faddr;
 	rcu_read_unlock();
-	rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, inet->inet_saddr,
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
 				   inet->inet_dport, inet->inet_sport,
 				   sk->sk_protocol, RT_CONN_FLAGS(sk),
 				   sk->sk_bound_dev_if);
-- 
cgit v1.2.3-70-g09d2


From c319b4d76b9e583a5d88d6bf190e079c4e43213d Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segoon@openwall.com>
Date: Fri, 13 May 2011 10:01:00 +0000
Subject: net: ipv4: add IPPROTO_ICMP socket kind

This patch adds IPPROTO_ICMP socket kind.  It makes it possible to send
ICMP_ECHO messages and receive the corresponding ICMP_ECHOREPLY messages
without any special privileges.  In other words, the patch makes it
possible to implement setuid-less and CAP_NET_RAW-less /bin/ping.  In
order not to increase the kernel's attack surface, the new functionality
is disabled by default, but is enabled at bootup by supporting Linux
distributions, optionally with restriction to a group or a group range
(see below).

Similar functionality is implemented in Mac OS X:
http://www.manpagez.com/man/4/icmp/

A new ping socket is created with

    socket(PF_INET, SOCK_DGRAM, PROT_ICMP)

Message identifiers (octets 4-5 of ICMP header) are interpreted as local
ports. Addresses are stored in struct sockaddr_in. No port numbers are
reserved for privileged processes, port 0 is reserved for API ("let the
kernel pick a free number"). There is no notion of remote ports, remote
port numbers provided by the user (e.g. in connect()) are ignored.

Data sent and received include ICMP headers. This is deliberate to:
1) Avoid the need to transport headers values like sequence numbers by
other means.
2) Make it easier to port existing programs using raw sockets.

ICMP headers given to send() are checked and sanitized. The type must be
ICMP_ECHO and the code must be zero (future extensions might relax this,
see below). The id is set to the number (local port) of the socket, the
checksum is always recomputed.

ICMP reply packets received from the network are demultiplexed according
to their id's, and are returned by recv() without any modifications.
IP header information and ICMP errors of those packets may be obtained
via ancillary data (IP_RECVTTL, IP_RETOPTS, and IP_RECVERR). ICMP source
quenches and redirects are reported as fake errors via the error queue
(IP_RECVERR); the next hop address for redirects is saved to ee_info (in
network order).

socket(2) is restricted to the group range specified in
"/proc/sys/net/ipv4/ping_group_range".  It is "1 0" by default, meaning
that nobody (not even root) may create ping sockets.  Setting it to "100
100" would grant permissions to the single group (to either make
/sbin/ping g+s and owned by this group or to grant permissions to the
"netadmins" group), "0 4294967295" would enable it for the world, "100
4294967295" would enable it for the users, but not daemons.

The existing code might be (in the unlikely case anyone needs it)
extended rather easily to handle other similar pairs of ICMP messages
(Timestamp/Reply, Information Request/Reply, Address Mask Request/Reply
etc.).

Userspace ping util & patch for it:
http://openwall.info/wiki/people/segoon/ping

For Openwall GNU/*/Linux it was the last step on the road to the
setuid-less distro.  A revision of this patch (for RHEL5/OpenVZ kernels)
is in use in Owl-current, such as in the 2011/03/12 LiveCD ISOs:
http://mirrors.kernel.org/openwall/Owl/current/iso/

Initially this functionality was written by Pavel Kankovsky for
Linux 2.4.32, but unfortunately it was never made public.

All ping options (-b, -p, -Q, -R, -s, -t, -T, -M, -I), are tested with
the patch.

PATCH v3:
    - switched to flowi4.
    - minor changes to be consistent with raw sockets code.

PATCH v2:
    - changed ping_debug() to pr_debug().
    - removed CONFIG_IP_PING.
    - removed ping_seq_fops.owner field (unused for procfs).
    - switched to proc_net_fops_create().
    - switched to %pK in seq_printf().

PATCH v1:
    - fixed checksumming bug.
    - CAP_NET_RAW may not create icmp sockets anymore.

RFC v2:
    - minor cleanups.
    - introduced sysctl'able group range to restrict socket(2).

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h   |   2 +
 include/net/ping.h         |  57 +++
 net/ipv4/Makefile          |   2 +-
 net/ipv4/af_inet.c         |  22 ++
 net/ipv4/icmp.c            |  12 +-
 net/ipv4/ping.c            | 937 +++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/sysctl_net_ipv4.c |  80 ++++
 7 files changed, 1110 insertions(+), 2 deletions(-)
 create mode 100644 include/net/ping.h
 create mode 100644 net/ipv4/ping.c

(limited to 'net/ipv4/af_inet.c')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 542195d9469..d786b4fc02a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -54,6 +54,8 @@ struct netns_ipv4 {
 	int sysctl_rt_cache_rebuild_count;
 	int current_rt_cache_rebuild_count;
 
+	unsigned int sysctl_ping_group_range[2];
+
 	atomic_t rt_genid;
 	atomic_t dev_addr_genid;
 
diff --git a/include/net/ping.h b/include/net/ping.h
new file mode 100644
index 00000000000..23062c369c6
--- /dev/null
+++ b/include/net/ping.h
@@ -0,0 +1,57 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for the "ping" module.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _PING_H
+#define _PING_H
+
+#include <net/netns/hash.h>
+
+/* PING_HTABLE_SIZE must be power of 2 */
+#define PING_HTABLE_SIZE 	64
+#define PING_HTABLE_MASK 	(PING_HTABLE_SIZE-1)
+
+#define ping_portaddr_for_each_entry(__sk, node, list) \
+	hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
+
+/*
+ * gid_t is either uint or ushort.  We want to pass it to
+ * proc_dointvec_minmax(), so it must not be larger than MAX_INT
+ */
+#define GID_T_MAX (((gid_t)~0U) >> 1)
+
+struct ping_table {
+	struct hlist_nulls_head	hash[PING_HTABLE_SIZE];
+	rwlock_t		lock;
+};
+
+struct ping_iter_state {
+	struct seq_net_private  p;
+	int			bucket;
+};
+
+extern struct proto ping_prot;
+
+
+extern void ping_rcv(struct sk_buff *);
+extern void ping_err(struct sk_buff *, u32 info);
+
+extern void inet_get_ping_group_range_net(struct net *net, unsigned int *low, unsigned int *high);
+
+#ifdef CONFIG_PROC_FS
+extern int __init ping_proc_init(void);
+extern void ping_proc_exit(void);
+#endif
+
+void __init ping_init(void);
+
+
+#endif /* _PING_H */
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 0dc772d0d12..f2dc69cffb5 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -11,7 +11,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
-	     inet_fragment.o
+	     inet_fragment.o ping.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 851aa056854..cc1463156cd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -105,6 +105,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/udplite.h>
+#include <net/ping.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/raw.h>
@@ -1008,6 +1009,14 @@ static struct inet_protosw inetsw_array[] =
 		.flags =      INET_PROTOSW_PERMANENT,
        },
 
+       {
+		.type =       SOCK_DGRAM,
+		.protocol =   IPPROTO_ICMP,
+		.prot =       &ping_prot,
+		.ops =        &inet_dgram_ops,
+		.no_check =   UDP_CSUM_DEFAULT,
+		.flags =      INET_PROTOSW_REUSE,
+       },
 
        {
 	       .type =       SOCK_RAW,
@@ -1527,6 +1536,7 @@ static const struct net_protocol udp_protocol = {
 
 static const struct net_protocol icmp_protocol = {
 	.handler =	icmp_rcv,
+	.err_handler =	ping_err,
 	.no_policy =	1,
 	.netns_ok =	1,
 };
@@ -1642,6 +1652,10 @@ static int __init inet_init(void)
 	if (rc)
 		goto out_unregister_udp_proto;
 
+	rc = proto_register(&ping_prot, 1);
+	if (rc)
+		goto out_unregister_raw_proto;
+
 	/*
 	 *	Tell SOCKET that we are alive...
 	 */
@@ -1697,6 +1711,8 @@ static int __init inet_init(void)
 	/* Add UDP-Lite (RFC 3828) */
 	udplite4_register();
 
+	ping_init();
+
 	/*
 	 *	Set the ICMP layer up
 	 */
@@ -1727,6 +1743,8 @@ static int __init inet_init(void)
 	rc = 0;
 out:
 	return rc;
+out_unregister_raw_proto:
+	proto_unregister(&raw_prot);
 out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
@@ -1751,11 +1769,15 @@ static int __init ipv4_proc_init(void)
 		goto out_tcp;
 	if (udp4_proc_init())
 		goto out_udp;
+	if (ping_proc_init())
+		goto out_ping;
 	if (ip_misc_proc_init())
 		goto out_misc;
 out:
 	return rc;
 out_misc:
+	ping_proc_exit();
+out_ping:
 	udp4_proc_exit();
 out_udp:
 	tcp4_proc_exit();
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 3314394f0aa..3f47585aad6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -83,6 +83,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/raw.h>
+#include <net/ping.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <linux/errno.h>
@@ -781,6 +782,15 @@ static void icmp_redirect(struct sk_buff *skb)
 			       iph->saddr, skb->dev);
 		break;
 	}
+
+	/* Ping wants to see redirects.
+         * Let's pretend they are errors of sorts... */
+	if (iph->protocol == IPPROTO_ICMP &&
+	    iph->ihl >= 5 &&
+	    pskb_may_pull(skb, (iph->ihl<<2)+8)) {
+		ping_err(skb, icmp_hdr(skb)->un.gateway);
+	}
+
 out:
 	return;
 out_err:
@@ -1041,7 +1051,7 @@ error:
  */
 static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
 	[ICMP_ECHOREPLY] = {
-		.handler = icmp_discard,
+		.handler = ping_rcv,
 	},
 	[1] = {
 		.handler = icmp_discard,
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 00000000000..a77e2d788da
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,937 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		"Ping" sockets
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors:	Vasiliy Kulikov / Openwall (for Linux 2.6),
+ *		Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ *
+ */
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+
+struct ping_table ping_table __read_mostly;
+
+u16 ping_port_rover;
+
+static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
+{
+	int res = (num + net_hash_mix(net)) & mask;
+	pr_debug("hash(%d) = %d\n", num, res);
+	return res;
+}
+
+static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
+					     struct net *net, unsigned num)
+{
+	return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
+}
+
+static int ping_v4_get_port(struct sock *sk, unsigned short ident)
+{
+	struct hlist_nulls_node *node;
+	struct hlist_nulls_head *hlist;
+	struct inet_sock *isk, *isk2;
+	struct sock *sk2 = NULL;
+
+	isk = inet_sk(sk);
+	write_lock_bh(&ping_table.lock);
+	if (ident == 0) {
+		u32 i;
+		u16 result = ping_port_rover + 1;
+
+		for (i = 0; i < (1L << 16); i++, result++) {
+			if (!result)
+				result++; /* avoid zero */
+			hlist = ping_hashslot(&ping_table, sock_net(sk),
+					    result);
+			ping_portaddr_for_each_entry(sk2, node, hlist) {
+				isk2 = inet_sk(sk2);
+
+				if (isk2->inet_num == result)
+					goto next_port;
+			}
+
+			/* found */
+			ping_port_rover = ident = result;
+			break;
+next_port:
+			;
+		}
+		if (i >= (1L << 16))
+			goto fail;
+	} else {
+		hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+		ping_portaddr_for_each_entry(sk2, node, hlist) {
+			isk2 = inet_sk(sk2);
+
+			if ((isk2->inet_num == ident) &&
+			    (sk2 != sk) &&
+			    (!sk2->sk_reuse || !sk->sk_reuse))
+				goto fail;
+		}
+	}
+
+	pr_debug("found port/ident = %d\n", ident);
+	isk->inet_num = ident;
+	if (sk_unhashed(sk)) {
+		pr_debug("was not hashed\n");
+		sock_hold(sk);
+		hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	}
+	write_unlock_bh(&ping_table.lock);
+	return 0;
+
+fail:
+	write_unlock_bh(&ping_table.lock);
+	return 1;
+}
+
+static void ping_v4_hash(struct sock *sk)
+{
+	pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+	BUG(); /* "Please do not press this button again." */
+}
+
+static void ping_v4_unhash(struct sock *sk)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+	if (sk_hashed(sk)) {
+		struct hlist_nulls_head *hslot;
+
+		hslot = ping_hashslot(&ping_table, sock_net(sk), isk->inet_num);
+		write_lock_bh(&ping_table.lock);
+		hlist_nulls_del(&sk->sk_nulls_node);
+		sock_put(sk);
+		isk->inet_num = isk->inet_sport = 0;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		write_unlock_bh(&ping_table.lock);
+	}
+}
+
+struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,
+	 u16 ident, int dif)
+{
+	struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
+	struct sock *sk = NULL;
+	struct inet_sock *isk;
+	struct hlist_nulls_node *hnode;
+
+	pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n",
+			 (int)ident, (unsigned long)daddr, dif);
+	read_lock_bh(&ping_table.lock);
+
+	ping_portaddr_for_each_entry(sk, hnode, hslot) {
+		isk = inet_sk(sk);
+
+		pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk,
+			 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr,
+			 sk->sk_bound_dev_if);
+
+		pr_debug("iterate\n");
+		if (isk->inet_num != ident)
+			continue;
+		if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr)
+			continue;
+		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+			continue;
+
+		sock_hold(sk);
+		goto exit;
+	}
+
+	sk = NULL;
+exit:
+	read_unlock_bh(&ping_table.lock);
+
+	return sk;
+}
+
+static int ping_init_sock(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	gid_t group = current_egid();
+	gid_t range[2];
+	struct group_info *group_info = get_current_groups();
+	int i, j, count = group_info->ngroups;
+
+	inet_get_ping_group_range_net(net, range, range+1);
+	if (range[0] <= group && group <= range[1])
+		return 0;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
+
+		for (j = 0; j < cp_count; j++) {
+			group = group_info->blocks[i][j];
+			if (range[0] <= group && group <= range[1])
+				return 0;
+		}
+
+		count -= cp_count;
+	}
+
+	return -EACCES;
+}
+
+static void ping_close(struct sock *sk, long timeout)
+{
+	pr_debug("ping_close(sk=%p,sk->num=%u)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num);
+	pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
+
+	sk_common_release(sk);
+}
+
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ */
+
+static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct inet_sock *isk = inet_sk(sk);
+	unsigned short snum;
+	int chk_addr_ret;
+	int err;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
+		sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+	if (addr->sin_addr.s_addr == INADDR_ANY)
+		chk_addr_ret = RTN_LOCAL;
+
+	if ((sysctl_ip_nonlocal_bind == 0 &&
+	    isk->freebind == 0 && isk->transparent == 0 &&
+	     chk_addr_ret != RTN_LOCAL) ||
+	    chk_addr_ret == RTN_MULTICAST ||
+	    chk_addr_ret == RTN_BROADCAST)
+		return -EADDRNOTAVAIL;
+
+	lock_sock(sk);
+
+	err = -EINVAL;
+	if (isk->inet_num != 0)
+		goto out;
+
+	err = -EADDRINUSE;
+	isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
+	snum = ntohs(addr->sin_port);
+	if (ping_v4_get_port(sk, snum) != 0) {
+		isk->inet_saddr = isk->inet_rcv_saddr = 0;
+		goto out;
+	}
+
+	pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n",
+		(int)isk->inet_num,
+		(unsigned long) isk->inet_rcv_saddr,
+		(int)sk->sk_bound_dev_if);
+
+	err = 0;
+	if (isk->inet_rcv_saddr)
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	isk->inet_sport = htons(isk->inet_num);
+	isk->inet_daddr = 0;
+	isk->inet_dport = 0;
+	sk_dst_reset(sk);
+out:
+	release_sock(sk);
+	pr_debug("ping_v4_bind -> %d\n", err);
+	return err;
+}
+
+/*
+ * Is this a supported type of ICMP message?
+ */
+
+static inline int ping_supported(int type, int code)
+{
+	if (type == ICMP_ECHO && code == 0)
+		return 1;
+	return 0;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ */
+
+static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+void ping_err(struct sk_buff *skb, u32 info)
+{
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+	struct inet_sock *inet_sock;
+	int type = icmph->type;
+	int code = icmph->code;
+	struct net *net = dev_net(skb->dev);
+	struct sock *sk;
+	int harderr;
+	int err;
+
+	/* We assume the packet has already been checked by icmp_unreach */
+
+	if (!ping_supported(icmph->type, icmph->code))
+		return;
+
+	pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
+		code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+	sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
+			    ntohs(icmph->un.echo.id), skb->dev->ifindex);
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		pr_debug("no socket, dropping\n");
+		return;	/* No socket for error */
+	}
+	pr_debug("err on socket %p\n", sk);
+
+	err = 0;
+	harderr = 0;
+	inet_sock = inet_sk(sk);
+
+	switch (type) {
+	default:
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		/* This is not a real error but ping wants to see it.
+		 * Report it with some fake errno. */
+		err = EREMOTEIO;
+		break;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+				err = EMSGSIZE;
+				harderr = 1;
+				break;
+			}
+			goto out;
+		}
+		err = EHOSTUNREACH;
+		if (code <= NR_ICMP_UNREACH) {
+			harderr = icmp_err_convert[code].fatal;
+			err = icmp_err_convert[code].errno;
+		}
+		break;
+	case ICMP_REDIRECT:
+		/* See ICMP_SOURCE_QUENCH */
+		err = EREMOTEIO;
+		break;
+	}
+
+	/*
+	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
+	 *	4.1.3.3.
+	 */
+	if (!inet_sock->recverr) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+	} else {
+		ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+			 info, (u8 *)icmph);
+	}
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+out:
+	sock_put(sk);
+}
+
+/*
+ *	Copy and checksum an ICMP Echo packet from user space into a buffer.
+ */
+
+struct pingfakehdr {
+	struct icmphdr icmph;
+	struct iovec *iov;
+	u32 wcheck;
+};
+
+static int ping_getfrag(void *from, char * to,
+			int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+	struct pingfakehdr *pfh = (struct pingfakehdr *)from;
+
+	if (offset == 0) {
+		if (fraglen < sizeof(struct icmphdr))
+			BUG();
+		if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr),
+			    pfh->iov, 0, fraglen - sizeof(struct icmphdr),
+			    &pfh->wcheck))
+			return -EFAULT;
+
+		return 0;
+	}
+	if (offset < sizeof(struct icmphdr))
+		BUG();
+	if (csum_partial_copy_fromiovecend
+			(to, pfh->iov, offset - sizeof(struct icmphdr),
+			 fraglen, &pfh->wcheck))
+		return -EFAULT;
+	return 0;
+}
+
+static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4)
+{
+	struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+
+	pfh->wcheck = csum_partial((char *)&pfh->icmph,
+		sizeof(struct icmphdr), pfh->wcheck);
+	pfh->icmph.checksum = csum_fold(pfh->wcheck);
+	memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
+	skb->ip_summed = CHECKSUM_NONE;
+	return ip_push_pending_frames(sk, fl4);
+}
+
+int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len)
+{
+	struct net *net = sock_net(sk);
+	struct flowi4 fl4;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct icmphdr user_icmph;
+	struct pingfakehdr pfh;
+	struct rtable *rt = NULL;
+	struct ip_options_data opt_copy;
+	int free = 0;
+	u32 saddr, daddr, faddr;
+	u8  tos;
+	int err;
+
+	pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+
+	/*
+	 *	Check the flags.
+	 */
+
+	/* Mirror BSD error message compatibility */
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/*
+	 *	Fetch the ICMP header provided by the userland.
+	 *	iovec is modified!
+	 */
+
+	if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov,
+			     sizeof(struct icmphdr)))
+		return -EFAULT;
+	if (!ping_supported(user_icmph.type, user_icmph.code))
+		return -EINVAL;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (msg->msg_name) {
+		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET)
+			return -EINVAL;
+		daddr = usin->sin_addr.s_addr;
+		/* no remote port */
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = inet->inet_daddr;
+		/* no remote port */
+	}
+
+	ipc.addr = inet->inet_saddr;
+	ipc.opt = NULL;
+	ipc.oif = sk->sk_bound_dev_if;
+	ipc.tx_flags = 0;
+	err = sock_tx_timestamp(sk, &ipc.tx_flags);
+	if (err)
+		return err;
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		if (err)
+			return err;
+		if (ipc.opt)
+			free = 1;
+	}
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = faddr = daddr;
+
+	if (ipc.opt && ipc.opt->opt.srr) {
+		if (!daddr)
+			return -EINVAL;
+		faddr = ipc.opt->opt.faddr;
+	}
+	tos = RT_TOS(inet->tos);
+	if (sock_flag(sk, SOCK_LOCALROUTE) ||
+	    (msg->msg_flags & MSG_DONTROUTE) ||
+	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
+		tos |= RTO_ONLINK;
+	}
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	}
+
+	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
+			   inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+
+	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+	rt = ip_route_output_flow(net, &fl4, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;
+		if (err == -ENETUNREACH)
+			IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+
+	err = -EACCES;
+	if ((rt->rt_flags & RTCF_BROADCAST) &&
+	    !sock_flag(sk, SOCK_BROADCAST))
+		goto out;
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (!ipc.addr)
+		ipc.addr = fl4.daddr;
+
+	lock_sock(sk);
+
+	pfh.icmph.type = user_icmph.type; /* already checked */
+	pfh.icmph.code = user_icmph.code; /* ditto */
+	pfh.icmph.checksum = 0;
+	pfh.icmph.un.echo.id = inet->inet_sport;
+	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+	pfh.iov = msg->msg_iov;
+	pfh.wcheck = 0;
+
+	err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
+			0, &ipc, &rt, msg->msg_flags);
+	if (err)
+		ip_flush_pending_frames(sk);
+	else
+		err = ping_push_pending_frames(sk, &pfh, &fl4);
+	release_sock(sk);
+
+out:
+	ip_rt_put(rt);
+	if (free)
+		kfree(ipc.opt);
+	if (!err) {
+		icmp_out_count(sock_net(sk), user_icmph.type);
+		return len;
+	}
+	return err;
+
+do_confirm:
+	dst_confirm(&rt->dst);
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+
+/*
+ *	IOCTL requests applicable to the UDP^H^H^HICMP protocol
+ */
+
+int ping_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	pr_debug("ping_ioctl(sk=%p,sk->num=%u,cmd=%d,arg=%lu)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num, cmd, arg);
+	switch (cmd) {
+	case SIOCOUTQ:
+	case SIOCINQ:
+		return udp_ioctl(sk, cmd, arg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len, int noblock, int flags, int *addr_len)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+	struct sk_buff *skb;
+	int copied, err;
+
+	pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+	if (flags & MSG_OOB)
+		goto out;
+
+	if (addr_len)
+		*addr_len = sizeof(*sin);
+
+	if (flags & MSG_ERRQUEUE)
+		return ip_recv_error(sk, msg, len);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	/* Don't bother checking the checksum */
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto done;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_port = 0 /* skb->h.uh->source */;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+	if (isk->cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+	err = copied;
+
+done:
+	skb_free_datagram(sk, skb);
+out:
+	pr_debug("ping_recvmsg -> %d\n", err);
+	return err;
+}
+
+static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num, skb);
+	if (sock_queue_rcv_skb(sk, skb) < 0) {
+		ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS);
+		kfree_skb(skb);
+		pr_debug("ping_queue_rcv_skb -> failed\n");
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ *	All we need to do is get the socket.
+ */
+
+void ping_rcv(struct sk_buff *skb)
+{
+	struct sock *sk;
+	struct net *net = dev_net(skb->dev);
+	struct iphdr *iph = ip_hdr(skb);
+	struct icmphdr *icmph = icmp_hdr(skb);
+	u32 saddr = iph->saddr;
+	u32 daddr = iph->daddr;
+
+	/* We assume the packet has already been checked by icmp_rcv */
+
+	pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+		skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+	/* Push ICMP header back */
+	skb_push(skb, skb->data - (u8 *)icmph);
+
+	sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id),
+			    skb->dev->ifindex);
+	if (sk != NULL) {
+		pr_debug("rcv on socket %p\n", sk);
+		ping_queue_rcv_skb(sk, skb_get(skb));
+		sock_put(sk);
+		return;
+	}
+	pr_debug("no socket, dropping\n");
+
+	/* We're called from icmp_rcv(). kfree_skb() is done there. */
+}
+
+struct proto ping_prot = {
+	.name =		"PING",
+	.owner =	THIS_MODULE,
+	.init =		ping_init_sock,
+	.close =	ping_close,
+	.connect =	ip4_datagram_connect,
+	.disconnect =	udp_disconnect,
+	.ioctl =	ping_ioctl,
+	.setsockopt =	ip_setsockopt,
+	.getsockopt =	ip_getsockopt,
+	.sendmsg =	ping_sendmsg,
+	.recvmsg =	ping_recvmsg,
+	.bind =		ping_bind,
+	.backlog_rcv =	ping_queue_rcv_skb,
+	.hash =		ping_v4_hash,
+	.unhash =	ping_v4_unhash,
+	.get_port =	ping_v4_get_port,
+	.obj_size =	sizeof(struct inet_sock),
+};
+EXPORT_SYMBOL(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+	struct sock *sk;
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
+	     ++state->bucket) {
+		struct hlist_nulls_node *node;
+		struct hlist_nulls_head *hslot = &ping_table.hash[state->bucket];
+
+		if (hlist_nulls_empty(hslot))
+			continue;
+
+		sk_nulls_for_each(sk, node, hslot) {
+			if (net_eq(sock_net(sk), net))
+				goto found;
+		}
+	}
+	sk = NULL;
+found:
+	return sk;
+}
+
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	do {
+		sk = sk_nulls_next(sk);
+	} while (sk && (!net_eq(sock_net(sk), net)));
+
+	if (!sk)
+		return ping_get_first(seq, state->bucket + 1);
+	return sk;
+}
+
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk = ping_get_first(seq, 0);
+
+	if (sk)
+		while (pos && (sk = ping_get_next(seq, sk)) != NULL)
+			--pos;
+	return pos ? NULL : sk;
+}
+
+static void *ping_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ping_iter_state *state = seq->private;
+	state->bucket = 0;
+
+	read_lock_bh(&ping_table.lock);
+
+	return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+
+static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk;
+
+	if (v == SEQ_START_TOKEN)
+		sk = ping_get_idx(seq, 0);
+	else
+		sk = ping_get_next(seq, v);
+
+	++*pos;
+	return sk;
+}
+
+static void ping_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock_bh(&ping_table.lock);
+}
+
+static void ping_format_sock(struct sock *sp, struct seq_file *f,
+		int bucket, int *len)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	__be32 dest = inet->inet_daddr;
+	__be32 src = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp = ntohs(inet->inet_sport);
+
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
+		bucket, src, srcp, dest, destp, sp->sk_state,
+		sk_wmem_alloc_get(sp),
+		sk_rmem_alloc_get(sp),
+		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		atomic_read(&sp->sk_refcnt), sp,
+		atomic_read(&sp->sk_drops), len);
+}
+
+static int ping_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%-127s\n",
+			   "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode ref pointer drops");
+	else {
+		struct ping_iter_state *state = seq->private;
+		int len;
+
+		ping_format_sock(v, seq, state->bucket, &len);
+		seq_printf(seq, "%*s\n", 127 - len, "");
+	}
+	return 0;
+}
+
+static const struct seq_operations ping_seq_ops = {
+	.show		= ping_seq_show,
+	.start		= ping_seq_start,
+	.next		= ping_seq_next,
+	.stop		= ping_seq_stop,
+};
+
+static int ping_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ping_seq_ops,
+			   sizeof(struct ping_iter_state));
+}
+
+static const struct file_operations ping_seq_fops = {
+	.open		= ping_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+static int ping_proc_register(struct net *net)
+{
+	struct proc_dir_entry *p;
+	int rc = 0;
+
+	p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops);
+	if (!p)
+		rc = -ENOMEM;
+	return rc;
+}
+
+static void ping_proc_unregister(struct net *net)
+{
+	proc_net_remove(net, "icmp");
+}
+
+
+static int __net_init ping_proc_init_net(struct net *net)
+{
+	return ping_proc_register(net);
+}
+
+static void __net_exit ping_proc_exit_net(struct net *net)
+{
+	ping_proc_unregister(net);
+}
+
+static struct pernet_operations ping_net_ops = {
+	.init = ping_proc_init_net,
+	.exit = ping_proc_exit_net,
+};
+
+int __init ping_proc_init(void)
+{
+	return register_pernet_subsys(&ping_net_ops);
+}
+
+void ping_proc_exit(void)
+{
+	unregister_pernet_subsys(&ping_net_ops);
+}
+
+#endif
+
+void __init ping_init(void)
+{
+	int i;
+
+	for (i = 0; i < PING_HTABLE_SIZE; i++)
+		INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
+	rwlock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 321e6e84dbc..28e8273bbef 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -13,6 +13,7 @@
 #include <linux/seqlock.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/nsproxy.h>
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
@@ -21,6 +22,7 @@
 #include <net/udp.h>
 #include <net/cipso_ipv4.h>
 #include <net/inet_frag.h>
+#include <net/ping.h>
 
 static int zero;
 static int tcp_retr1_max = 255;
@@ -30,6 +32,8 @@ static int tcp_adv_win_scale_min = -31;
 static int tcp_adv_win_scale_max = 31;
 static int ip_ttl_min = 1;
 static int ip_ttl_max = 255;
+static int ip_ping_group_range_min[] = { 0, 0 };
+static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
 /* Update system visible IP port range */
 static void set_local_port_range(int range[2])
@@ -68,6 +72,65 @@ static int ipv4_local_port_range(ctl_table *table, int write,
 	return ret;
 }
 
+
+void inet_get_ping_group_range_net(struct net *net, gid_t *low, gid_t *high)
+{
+	gid_t *data = net->ipv4.sysctl_ping_group_range;
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&sysctl_local_ports.lock);
+
+		*low = data[0];
+		*high = data[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+}
+
+void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
+{
+	gid_t *data = table->data;
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&sysctl_local_ports.lock);
+
+		*low = data[0];
+		*high = data[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+}
+
+/* Update system visible IP port range */
+static void set_ping_group_range(struct ctl_table *table, int range[2])
+{
+	gid_t *data = table->data;
+	write_seqlock(&sysctl_local_ports.lock);
+	data[0] = range[0];
+	data[1] = range[1];
+	write_sequnlock(&sysctl_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_ping_group_range(ctl_table *table, int write,
+				 void __user *buffer,
+				 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	gid_t range[2];
+	ctl_table tmp = {
+		.data = &range,
+		.maxlen = sizeof(range),
+		.mode = table->mode,
+		.extra1 = &ip_ping_group_range_min,
+		.extra2 = &ip_ping_group_range_max,
+	};
+
+	inet_get_ping_group_range_table(table, range, range + 1);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0)
+		set_ping_group_range(table, range);
+
+	return ret;
+}
+
 static int proc_tcp_congestion_control(ctl_table *ctl, int write,
 				       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -677,6 +740,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ping_group_range",
+		.data		= &init_net.ipv4.sysctl_ping_group_range,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_ping_group_range),
+		.mode		= 0644,
+		.proc_handler	= ipv4_ping_group_range,
+	},
 	{ }
 };
 
@@ -711,8 +781,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 			&net->ipv4.sysctl_icmp_ratemask;
 		table[6].data =
 			&net->ipv4.sysctl_rt_cache_rebuild_count;
+		table[7].data =
+			&net->ipv4.sysctl_ping_group_range;
+
 	}
 
+	/*
+	 * Sane defaults - nobody may create ping sockets.
+	 * Boot scripts should set this to distro-specific group.
+	 */
+	net->ipv4.sysctl_ping_group_range[0] = 1;
+	net->ipv4.sysctl_ping_group_range[1] = 0;
+
 	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
 
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
-- 
cgit v1.2.3-70-g09d2