diff options
Diffstat (limited to 'net/ipv4')
53 files changed, 2181 insertions, 1049 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 0dc772d0d12..f2dc69cffb5 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ - inet_fragment.o + inet_fragment.o ping.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 807d83c02ef..ef1528af7ab 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -105,6 +105,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> +#include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/raw.h> @@ -153,7 +154,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON(sk->sk_wmem_queued); WARN_ON(sk->sk_forward_alloc); - kfree(inet->opt); + kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); sk_refcnt_debug_dec(sk); } @@ -464,6 +465,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; + if (addr->sin_family != AF_INET) { + err = -EAFNOSUPPORT; + goto out; + } + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); /* Not specified by any standard per-se, however it breaks too @@ -672,6 +678,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) lock_sock(sk2); + sock_rps_record_flow(sk2); WARN_ON(!((1 << sk2->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); @@ -1008,6 +1015,14 @@ static struct inet_protosw inetsw_array[] = .flags = INET_PROTOSW_PERMANENT, }, + { + .type = SOCK_DGRAM, + .protocol = IPPROTO_ICMP, + .prot = &ping_prot, + .ops = &inet_dgram_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, + }, { .type = SOCK_RAW, @@ -1103,14 +1118,19 @@ static int inet_sk_reselect_saddr(struct sock *sk) struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; struct rtable *rt; __be32 new_saddr; + struct ip_options_rcu *inet_opt; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; /* Query new route. */ - rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk), + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, sk->sk_protocol, inet->inet_sport, inet->inet_dport, sk, false); if (IS_ERR(rt)) @@ -1118,7 +1138,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) sk_setup_caps(sk, &rt->dst); - new_saddr = rt->rt_src; + new_saddr = fl4->saddr; if (new_saddr == old_saddr) return 0; @@ -1147,6 +1167,8 @@ int inet_sk_rebuild_header(struct sock *sk) struct inet_sock *inet = inet_sk(sk); struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; int err; /* Route is OK, nothing to do. */ @@ -1154,10 +1176,14 @@ int inet_sk_rebuild_header(struct sock *sk) return 0; /* Reroute. */ + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); daddr = inet->inet_daddr; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr, + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + rcu_read_unlock(); + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); @@ -1186,7 +1212,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); static int inet_gso_send_check(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; const struct net_protocol *ops; int proto; int ihl; @@ -1293,7 +1319,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, const struct net_protocol *ops; struct sk_buff **pp = NULL; struct sk_buff *p; - struct iphdr *iph; + const struct iphdr *iph; unsigned int hlen; unsigned int off; unsigned int id; @@ -1516,6 +1542,7 @@ static const struct net_protocol udp_protocol = { static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, + .err_handler = ping_err, .no_policy = 1, .netns_ok = 1, }; @@ -1631,6 +1658,10 @@ static int __init inet_init(void) if (rc) goto out_unregister_udp_proto; + rc = proto_register(&ping_prot, 1); + if (rc) + goto out_unregister_raw_proto; + /* * Tell SOCKET that we are alive... */ @@ -1686,6 +1717,8 @@ static int __init inet_init(void) /* Add UDP-Lite (RFC 3828) */ udplite4_register(); + ping_init(); + /* * Set the ICMP layer up */ @@ -1716,6 +1749,8 @@ static int __init inet_init(void) rc = 0; out: return rc; +out_unregister_raw_proto: + proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: @@ -1740,11 +1775,15 @@ static int __init ipv4_proc_init(void) goto out_tcp; if (udp4_proc_init()) goto out_udp; + if (ping_proc_init()) + goto out_ping; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: + ping_proc_exit(); +out_ping: udp4_proc_exit(); out_udp: tcp4_proc_exit(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 4286fd3cc0e..c1f4154552f 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -73,7 +73,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, * into IP header for icv calculation. Options are already checked * for validity, so paranoia is not required. */ -static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr) +static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) { unsigned char * optptr = (unsigned char*)(iph+1); int l = iph->ihl*4 - sizeof(struct iphdr); @@ -396,7 +396,7 @@ out: static void ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; @@ -404,7 +404,8 @@ static void ah4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + ah->spi, IPPROTO_AH, AF_INET); if (!x) return; printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index a0af7ea8787..2b3c23c287c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1857,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, return CIPSO_V4_HDR_LEN + ret_val; } +static void opt_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct ip_options_rcu, rcu)); +} + /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket @@ -1879,7 +1884,7 @@ int cipso_v4_sock_setattr(struct sock *sk, unsigned char *buf = NULL; u32 buf_len; u32 opt_len; - struct ip_options *opt = NULL; + struct ip_options_rcu *old, *opt = NULL; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; @@ -1915,22 +1920,25 @@ int cipso_v4_sock_setattr(struct sock *sk, ret_val = -ENOMEM; goto socket_setattr_failure; } - memcpy(opt->__data, buf, buf_len); - opt->optlen = opt_len; - opt->cipso = sizeof(struct iphdr); + memcpy(opt->opt.__data, buf, buf_len); + opt->opt.optlen = opt_len; + opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; sk_inet = inet_sk(sk); + + old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk)); if (sk_inet->is_icsk) { sk_conn = inet_csk(sk); - if (sk_inet->opt) - sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen; - sk_conn->icsk_ext_hdr_len += opt->optlen; + if (old) + sk_conn->icsk_ext_hdr_len -= old->opt.optlen; + sk_conn->icsk_ext_hdr_len += opt->opt.optlen; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } - opt = xchg(&sk_inet->opt, opt); - kfree(opt); + rcu_assign_pointer(sk_inet->inet_opt, opt); + if (old) + call_rcu(&old->rcu, opt_kfree_rcu); return 0; @@ -1960,7 +1968,7 @@ int cipso_v4_req_setattr(struct request_sock *req, unsigned char *buf = NULL; u32 buf_len; u32 opt_len; - struct ip_options *opt = NULL; + struct ip_options_rcu *opt = NULL; struct inet_request_sock *req_inet; /* We allocate the maximum CIPSO option size here so we are probably @@ -1988,15 +1996,16 @@ int cipso_v4_req_setattr(struct request_sock *req, ret_val = -ENOMEM; goto req_setattr_failure; } - memcpy(opt->__data, buf, buf_len); - opt->optlen = opt_len; - opt->cipso = sizeof(struct iphdr); + memcpy(opt->opt.__data, buf, buf_len); + opt->opt.optlen = opt_len; + opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; req_inet = inet_rsk(req); opt = xchg(&req_inet->opt, opt); - kfree(opt); + if (opt) + call_rcu(&opt->rcu, opt_kfree_rcu); return 0; @@ -2016,34 +2025,34 @@ req_setattr_failure: * values on failure. * */ -static int cipso_v4_delopt(struct ip_options **opt_ptr) +static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) { int hdr_delta = 0; - struct ip_options *opt = *opt_ptr; + struct ip_options_rcu *opt = *opt_ptr; - if (opt->srr || opt->rr || opt->ts || opt->router_alert) { + if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; int iter; int optlen_new; - cipso_off = opt->cipso - sizeof(struct iphdr); - cipso_ptr = &opt->__data[cipso_off]; + cipso_off = opt->opt.cipso - sizeof(struct iphdr); + cipso_ptr = &opt->opt.__data[cipso_off]; cipso_len = cipso_ptr[1]; - if (opt->srr > opt->cipso) - opt->srr -= cipso_len; - if (opt->rr > opt->cipso) - opt->rr -= cipso_len; - if (opt->ts > opt->cipso) - opt->ts -= cipso_len; - if (opt->router_alert > opt->cipso) - opt->router_alert -= cipso_len; - opt->cipso = 0; + if (opt->opt.srr > opt->opt.cipso) + opt->opt.srr -= cipso_len; + if (opt->opt.rr > opt->opt.cipso) + opt->opt.rr -= cipso_len; + if (opt->opt.ts > opt->opt.cipso) + opt->opt.ts -= cipso_len; + if (opt->opt.router_alert > opt->opt.cipso) + opt->opt.router_alert -= cipso_len; + opt->opt.cipso = 0; memmove(cipso_ptr, cipso_ptr + cipso_len, - opt->optlen - cipso_off - cipso_len); + opt->opt.optlen - cipso_off - cipso_len); /* determining the new total option length is tricky because of * the padding necessary, the only thing i can think to do at @@ -2052,21 +2061,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr) * from there we can determine the new total option length */ iter = 0; optlen_new = 0; - while (iter < opt->optlen) - if (opt->__data[iter] != IPOPT_NOP) { - iter += opt->__data[iter + 1]; + while (iter < opt->opt.optlen) + if (opt->opt.__data[iter] != IPOPT_NOP) { + iter += opt->opt.__data[iter + 1]; optlen_new = iter; } else iter++; - hdr_delta = opt->optlen; - opt->optlen = (optlen_new + 3) & ~3; - hdr_delta -= opt->optlen; + hdr_delta = opt->opt.optlen; + opt->opt.optlen = (optlen_new + 3) & ~3; + hdr_delta -= opt->opt.optlen; } else { /* only the cipso option was present on the socket so we can * remove the entire option struct */ *opt_ptr = NULL; - hdr_delta = opt->optlen; - kfree(opt); + hdr_delta = opt->opt.optlen; + call_rcu(&opt->rcu, opt_kfree_rcu); } return hdr_delta; @@ -2083,15 +2092,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr) void cipso_v4_sock_delattr(struct sock *sk) { int hdr_delta; - struct ip_options *opt; + struct ip_options_rcu *opt; struct inet_sock *sk_inet; sk_inet = inet_sk(sk); - opt = sk_inet->opt; - if (opt == NULL || opt->cipso == 0) + opt = rcu_dereference_protected(sk_inet->inet_opt, 1); + if (opt == NULL || opt->opt.cipso == 0) return; - hdr_delta = cipso_v4_delopt(&sk_inet->opt); + hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (sk_inet->is_icsk && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; @@ -2109,12 +2118,12 @@ void cipso_v4_sock_delattr(struct sock *sk) */ void cipso_v4_req_delattr(struct request_sock *req) { - struct ip_options *opt; + struct ip_options_rcu *opt; struct inet_request_sock *req_inet; req_inet = inet_rsk(req); opt = req_inet->opt; - if (opt == NULL || opt->cipso == 0) + if (opt == NULL || opt->opt.cipso == 0) return; cipso_v4_delopt(&req_inet->opt); @@ -2184,14 +2193,18 @@ getattr_return: */ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { - struct ip_options *opt; + struct ip_options_rcu *opt; + int res = -ENOMSG; - opt = inet_sk(sk)->opt; - if (opt == NULL || opt->cipso == 0) - return -ENOMSG; - - return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), - secattr); + rcu_read_lock(); + opt = rcu_dereference(inet_sk(sk)->inet_opt); + if (opt && opt->opt.cipso) + res = cipso_v4_getattr(opt->opt.__data + + opt->opt.cipso - + sizeof(struct iphdr), + secattr); + rcu_read_unlock(); + return res; } /** diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 85bd24ca4f6..424fafbc8cb 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct flowi4 *fl4; struct rtable *rt; __be32 saddr; int oif; @@ -38,6 +39,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); + lock_sock(sk); + oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { @@ -46,7 +49,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!saddr) saddr = inet->mc_addr; } - rt = ip_route_connect(usin->sin_addr.s_addr, saddr, + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, inet->inet_sport, usin->sin_port, sk, true); @@ -54,26 +58,30 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); - return err; + goto out; } if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { ip_rt_put(rt); - return -EACCES; + err = -EACCES; + goto out; } if (!inet->inet_saddr) - inet->inet_saddr = rt->rt_src; /* Update source address */ + inet->inet_saddr = fl4->saddr; /* Update source address */ if (!inet->inet_rcv_saddr) { - inet->inet_rcv_saddr = rt->rt_src; + inet->inet_rcv_saddr = fl4->saddr; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } - inet->inet_daddr = rt->rt_dst; + inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); - return 0; + err = 0; +out: + release_sock(sk); + return err; } EXPORT_SYMBOL(ip4_datagram_connect); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cd9ca0811cf..0d4a184af16 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1369,7 +1369,7 @@ errout: static size_t inet_get_link_af_size(const struct net_device *dev) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; @@ -1379,7 +1379,7 @@ static size_t inet_get_link_af_size(const struct net_device *dev) static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 03f994bcf7d..a5b413416da 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -276,7 +276,7 @@ error: static int esp_input_done2(struct sk_buff *skb, int err) { - struct iphdr *iph; + const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); struct esp_data *esp = x->data; struct crypto_aead *aead = esp->aead; @@ -484,7 +484,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) static void esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; @@ -492,7 +492,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 451088330bb..22524716fe7 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -44,6 +44,7 @@ #include <net/arp.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> +#include <net/xfrm.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -188,9 +189,9 @@ EXPORT_SYMBOL(inet_dev_addr_type); * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, - struct net_device *dev, __be32 *spec_dst, - u32 *itag, u32 mark) +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, + int oif, struct net_device *dev, __be32 *spec_dst, + u32 *itag) { struct in_device *in_dev; struct flowi4 fl4; @@ -202,7 +203,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, fl4.flowi4_oif = 0; fl4.flowi4_iif = oif; - fl4.flowi4_mark = mark; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -212,10 +212,12 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; - rpf = IN_DEV_RPFILTER(in_dev); + + /* Ignore rp_filter for packets protected by IPsec. */ + rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); + accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); - if (mark && !IN_DEV_SRC_VMARK(in_dev)) - fl4.flowi4_mark = 0; + fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; } if (in_dev == NULL) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 641a5a2a9f9..33e2c35b74b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -141,18 +141,8 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; - /* Release a nexthop info record */ -static void free_fib_info_rcu(struct rcu_head *head) -{ - struct fib_info *fi = container_of(head, struct fib_info, rcu); - - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); - kfree(fi); -} - void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { @@ -166,7 +156,7 @@ void free_fib_info(struct fib_info *fi) } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); - call_rcu(&fi->rcu, free_fib_info_rcu); + kfree_rcu(fi, rcu); } void fib_release_info(struct fib_info *fi) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5fe9b8b41df..58c25ea5a5c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -72,6 +72,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/prefetch.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -126,7 +127,7 @@ struct tnode { struct work_struct work; struct tnode *tnode_free; }; - struct rt_trie_node *child[0]; + struct rt_trie_node __rcu *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,7 +152,7 @@ struct trie_stat { }; struct trie { - struct rt_trie_node *trie; + struct rt_trie_node __rcu *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif @@ -177,16 +178,29 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct rt_trie_node *node) +/* + * caller must hold RTNL + */ +static inline struct tnode *node_parent(const struct rt_trie_node *node) { - return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); + + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } -static inline struct tnode *node_parent_rcu(struct rt_trie_node *node) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) { - struct tnode *ret = node_parent(node); + unsigned long parent; - return rcu_dereference_rtnl(ret); + parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || + lockdep_rtnl_is_held()); + + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } /* Same as rcu_assign_pointer @@ -198,18 +212,24 @@ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) node->parent = (unsigned long)ptr | NODE_TYPE(node); } -static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i) +/* + * caller must hold RTNL + */ +static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) { BUG_ON(i >= 1U << tn->bits); - return tn->child[i]; + return rtnl_dereference(tn->child[i]); } -static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) { - struct rt_trie_node *ret = tnode_get_child(tn, i); + BUG_ON(i >= 1U << tn->bits); - return rcu_dereference_rtnl(ret); + return rcu_dereference_rtnl(tn->child[i]); } static inline int tnode_child_length(const struct tnode *tn) @@ -350,14 +370,9 @@ static inline void free_leaf(struct leaf *l) call_rcu_bh(&l->rcu, __leaf_free_rcu); } -static void __leaf_info_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct leaf_info, rcu)); -} - static inline void free_leaf_info(struct leaf_info *leaf) { - call_rcu(&leaf->rcu, __leaf_info_free_rcu); + kfree_rcu(leaf, rcu); } static struct tnode *tnode_alloc(size_t size) @@ -487,7 +502,7 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull) { - struct rt_trie_node *chi = tn->child[i]; + struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); int isfull; BUG_ON(i >= 1<<tn->bits); @@ -665,7 +680,7 @@ one_child: for (i = 0; i < tnode_child_length(tn); i++) { struct rt_trie_node *n; - n = tn->child[i]; + n = rtnl_dereference(tn->child[i]); if (!n) continue; @@ -679,6 +694,20 @@ one_child: return (struct rt_trie_node *) tn; } + +static void tnode_clean_free(struct tnode *tn) +{ + int i; + struct tnode *tofree; + + for (i = 0; i < tnode_child_length(tn); i++) { + tofree = (struct tnode *)rtnl_dereference(tn->child[i]); + if (tofree) + tnode_free(tofree); + } + tnode_free(tn); +} + static struct tnode *inflate(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; @@ -755,8 +784,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, inode->child[0]); - put_child(t, tn, 2*i+1, inode->child[1]); + put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -797,8 +826,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, inode->child[j]); - put_child(t, right, j, inode->child[j + size]); + put_child(t, left, j, rtnl_dereference(inode->child[j])); + put_child(t, right, j, rtnl_dereference(inode->child[j + size])); } put_child(t, tn, 2*i, resize(t, left)); put_child(t, tn, 2*i+1, resize(t, right)); @@ -808,18 +837,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } static struct tnode *halve(struct trie *t, struct tnode *tn) @@ -890,18 +909,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } /* readside must use rcu_read_lock currently dump routines @@ -1033,7 +1042,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) t_key cindex; pos = 0; - n = t->trie; + n = rtnl_dereference(t->trie); /* If we point to NULL, stop. Either the tree is empty and we should * just put a new leaf in if, or we have reached an empty child slot, @@ -1319,6 +1328,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } } + if (!plen) + tb->tb_num_default++; + list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); @@ -1684,6 +1696,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) list_del_rcu(&fa->fa_list); + if (!plen) + tb->tb_num_default--; + if (list_empty(fa_head)) { hlist_del_rcu(&li->hlist); free_leaf_info(li); @@ -1756,7 +1771,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) continue; if (IS_LEAF(c)) { - prefetch(p->child[idx]); + prefetch(rcu_dereference_rtnl(p->child[idx])); return (struct leaf *) c; } @@ -1974,6 +1989,7 @@ struct fib_table *fib_trie_table(u32 id) tb->tb_id = id; tb->tb_default = -1; + tb->tb_num_default = 0; t = (struct trie *) tb->tb_data; memset(t, 0, sizeof(*t)); @@ -2269,7 +2285,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); - while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { + while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e5f8a71d3a2..5395e45dcce 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -83,6 +83,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/raw.h> +#include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> @@ -108,8 +109,7 @@ struct icmp_bxm { __be32 times[3]; } data; int head_len; - struct ip_options replyopts; - unsigned char optbuf[40]; + struct ip_options_data replyopts; }; /* An array of errno for error messages from dest unreach. */ @@ -234,7 +234,7 @@ static inline void icmp_xmit_unlock(struct sock *sk) */ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - int type, int code) + struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; bool rc = true; @@ -253,7 +253,7 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, /* Limit if icmp type is enabled in ratemask. */ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { if (!rt->peer) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, fl4->daddr, 1); rc = inet_peer_xrlim_allow(rt->peer, net->ipv4.sysctl_icmp_ratelimit); } @@ -291,13 +291,14 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, } static void icmp_push_reply(struct icmp_bxm *icmp_param, + struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rt) { struct sock *sk; struct sk_buff *skb; sk = icmp_sk(dev_net((*rt)->dst.dev)); - if (ip_append_data(sk, icmp_glue_bits, icmp_param, + if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) { @@ -316,7 +317,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, icmp_param->head_len, csum); icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(sk); + ip_push_pending_frames(sk, fl4); } } @@ -329,11 +330,12 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; __be32 daddr; - if (ip_options_echo(&icmp_param->replyopts, skb)) + if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; sk = icmp_xmit_lock(net); @@ -344,65 +346,60 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; - daddr = ipc.addr = rt->rt_src; + daddr = ipc.addr = ip_hdr(skb)->saddr; ipc.opt = NULL; ipc.tx_flags = 0; - if (icmp_param->replyopts.optlen) { - ipc.opt = &icmp_param->replyopts; - if (ipc.opt->srr) - daddr = icmp_param->replyopts.faddr; + if (icmp_param->replyopts.opt.opt.optlen) { + ipc.opt = &icmp_param->replyopts.opt; + if (ipc.opt->opt.srr) + daddr = icmp_param->replyopts.opt.opt.faddr; } - { - struct flowi4 fl4 = { - .daddr = daddr, - .saddr = rt->rt_spec_dst, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), - .flowi4_proto = IPPROTO_ICMP, - }; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - goto out_unlock; - } - if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.saddr = rt->rt_spec_dst; + fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_proto = IPPROTO_ICMP; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + goto out_unlock; + if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, icmp_param->data.icmph.code)) - icmp_push_reply(icmp_param, &ipc, &rt); + icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); } -static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in, - struct iphdr *iph, +static struct rtable *icmp_route_lookup(struct net *net, + struct flowi4 *fl4, + struct sk_buff *skb_in, + const struct iphdr *iph, __be32 saddr, u8 tos, int type, int code, struct icmp_bxm *param) { - struct flowi4 fl4 = { - .daddr = (param->replyopts.srr ? - param->replyopts.faddr : iph->saddr), - .saddr = saddr, - .flowi4_tos = RT_TOS(tos), - .flowi4_proto = IPPROTO_ICMP, - .fl4_icmp_type = type, - .fl4_icmp_code = code, - }; struct rtable *rt, *rt2; int err; - security_skb_classify_flow(skb_in, flowi4_to_flowi(&fl4)); - rt = __ip_route_output_key(net, &fl4); + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = (param->replyopts.opt.opt.srr ? + param->replyopts.opt.opt.faddr : iph->saddr); + fl4->saddr = saddr; + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_proto = IPPROTO_ICMP; + fl4->fl4_icmp_type = type; + fl4->fl4_icmp_code = code; + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) return rt; /* No need to clone since we're just using its address. */ rt2 = rt; - if (!fl4.saddr) - fl4.saddr = rt->rt_src; - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(&fl4), NULL, 0); + flowi4_to_flowi(fl4), NULL, 0); if (!IS_ERR(rt)) { if (rt != rt2) return rt; @@ -411,19 +408,19 @@ static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in, } else return rt; - err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4), AF_INET); + err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(fl4), AF_INET); if (err) goto relookup_failed; - if (inet_addr_type(net, fl4.saddr) == RTN_LOCAL) { - rt2 = __ip_route_output_key(net, &fl4); + if (inet_addr_type(net, fl4->saddr) == RTN_LOCAL) { + rt2 = __ip_route_output_key(net, fl4); if (IS_ERR(rt2)) err = PTR_ERR(rt2); } else { struct flowi4 fl4_2 = {}; unsigned long orefdst; - fl4_2.daddr = fl4.saddr; + fl4_2.daddr = fl4->saddr; rt2 = ip_route_output_key(net, &fl4_2); if (IS_ERR(rt2)) { err = PTR_ERR(rt2); @@ -431,7 +428,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in, } /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ - err = ip_route_input(skb_in, fl4.daddr, fl4.saddr, + err = ip_route_input(skb_in, fl4->daddr, fl4->saddr, RT_TOS(tos), rt2->dst.dev); dst_release(&rt2->dst); @@ -443,7 +440,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in, goto relookup_failed; rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, - flowi4_to_flowi(&fl4), NULL, + flowi4_to_flowi(fl4), NULL, XFRM_LOOKUP_ICMP); if (!IS_ERR(rt2)) { dst_release(&rt->dst); @@ -482,6 +479,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; + struct flowi4 fl4; __be32 saddr; u8 tos; struct net *net; @@ -581,7 +579,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) IPTOS_PREC_INTERNETCONTROL) : iph->tos; - if (ip_options_echo(&icmp_param.replyopts, skb_in)) + if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; @@ -597,15 +595,15 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts; + ipc.opt = &icmp_param.replyopts.opt; ipc.tx_flags = 0; - rt = icmp_route_lookup(net, skb_in, iph, saddr, tos, + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, type, code, &icmp_param); if (IS_ERR(rt)) goto out_unlock; - if (!icmpv4_xrlim_allow(net, rt, type, code)) + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ @@ -613,7 +611,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; + room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); icmp_param.data_len = skb_in->len - icmp_param.offset; @@ -621,7 +619,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); - icmp_push_reply(&icmp_param, &ipc, &rt); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: @@ -637,7 +635,7 @@ EXPORT_SYMBOL(icmp_send); static void icmp_unreach(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; struct icmphdr *icmph; int hash, protocol; const struct net_protocol *ipprot; @@ -656,7 +654,7 @@ static void icmp_unreach(struct sk_buff *skb) goto out_err; icmph = icmp_hdr(skb); - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; @@ -729,7 +727,7 @@ static void icmp_unreach(struct sk_buff *skb) if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) goto out; - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; protocol = iph->protocol; /* @@ -758,7 +756,7 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; if (skb->len < sizeof(struct iphdr)) goto out_err; @@ -769,7 +767,7 @@ static void icmp_redirect(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: @@ -784,6 +782,15 @@ static void icmp_redirect(struct sk_buff *skb) iph->saddr, skb->dev); break; } + + /* Ping wants to see redirects. + * Let's pretend they are errors of sorts... */ + if (iph->protocol == IPPROTO_ICMP && + iph->ihl >= 5 && + pskb_may_pull(skb, (iph->ihl<<2)+8)) { + ping_err(skb, icmp_hdr(skb)->un.gateway); + } + out: return; out_err: @@ -933,12 +940,12 @@ static void icmp_address_reply(struct sk_buff *skb) BUG_ON(mp == NULL); for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (*mp == ifa->ifa_mask && - inet_ifa_match(rt->rt_src, ifa)) + inet_ifa_match(ip_hdr(skb)->saddr, ifa)) break; } if (!ifa && net_ratelimit()) { printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", - mp, dev->name, &rt->rt_src); + mp, dev->name, &ip_hdr(skb)->saddr); } } } @@ -1044,7 +1051,7 @@ error: */ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { - .handler = icmp_discard, + .handler = ping_rcv, }, [1] = { .handler = icmp_discard, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1fd3d9ce839..f1d27f6c935 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -149,17 +149,11 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); - -static void ip_mc_list_reclaim(struct rcu_head *head) -{ - kfree(container_of(head, struct ip_mc_list, rcu)); -} - static void ip_ma_put(struct ip_mc_list *im) { if (atomic_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); - call_rcu(&im->rcu, ip_mc_list_reclaim); + kfree_rcu(im, rcu); } } @@ -309,6 +303,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); + struct flowi4 fl4; while (1) { skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), @@ -321,18 +316,13 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) } igmp_skb_size(skb) = size; - rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0, + rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) { kfree_skb(skb); return NULL; } - if (rt->rt_src == 0) { - kfree_skb(skb); - ip_rt_put(rt); - return NULL; - } skb_dst_set(skb, &rt->dst); skb->dev = dev; @@ -348,8 +338,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; - pip->daddr = rt->rt_dst; - pip->saddr = rt->rt_src; + pip->daddr = fl4.daddr; + pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(pip, &rt->dst, NULL); @@ -655,6 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; + struct flowi4 fl4; __be32 dst; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) @@ -664,17 +655,12 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, else dst = group; - rt = ip_route_output_ports(net, NULL, dst, 0, + rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) return -1; - if (rt->rt_src == 0) { - ip_rt_put(rt); - return -1; - } - skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) { ip_rt_put(rt); @@ -695,7 +681,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; - iph->saddr = rt->rt_src; + iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(iph, &rt->dst, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; @@ -1169,20 +1155,18 @@ static void igmp_group_dropped(struct ip_mc_list *im) if (!in_dev->dead) { if (IGMP_V1_SEEN(in_dev)) - goto done; + return; if (IGMP_V2_SEEN(in_dev)) { if (reporter) igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); - goto done; + return; } /* IGMPv3 */ igmpv3_add_delrec(in_dev, im); igmp_ifc_event(in_dev); } -done: #endif - ip_mc_clear_src(im); } static void igmp_group_added(struct ip_mc_list *im) @@ -1319,6 +1303,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) *ip = i->next_rcu; in_dev->mc_count--; igmp_group_dropped(i); + ip_mc_clear_src(i); if (!in_dev->dead) ip_rt_multicast_event(in_dev); @@ -1428,7 +1413,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev) in_dev->mc_list = i->next_rcu; in_dev->mc_count--; - igmp_group_dropped(i); + /* We've dropped the groups in ip_mc_down already */ + ip_mc_clear_src(i); ip_ma_put(i); } } @@ -1836,12 +1822,6 @@ done: } EXPORT_SYMBOL(ip_mc_join_group); -static void ip_sf_socklist_reclaim(struct rcu_head *rp) -{ - kfree(container_of(rp, struct ip_sf_socklist, rcu)); - /* sk_omem_alloc should have been decreased by the caller*/ -} - static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { @@ -1858,18 +1838,10 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, rcu_assign_pointer(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); - call_rcu(&psf->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psf, rcu); return err; } - -static void ip_mc_socklist_reclaim(struct rcu_head *rp) -{ - kfree(container_of(rp, struct ip_mc_socklist, rcu)); - /* sk_omem_alloc should have been decreased by the caller*/ -} - - /* * Ask a socket to leave a group. */ @@ -1909,7 +1881,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); - call_rcu(&iml->rcu, ip_mc_socklist_reclaim); + kfree_rcu(iml, rcu); return 0; } if (!in_dev) @@ -2026,7 +1998,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); - call_rcu(&psl->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psl, rcu); } rcu_assign_pointer(pmc->sflist, newpsl); psl = newpsl; @@ -2127,7 +2099,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); - call_rcu(&psl->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psl, rcu); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); @@ -2324,7 +2296,7 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); - call_rcu(&iml->rcu, ip_mc_socklist_reclaim); + kfree_rcu(iml, rcu); } rtnl_unlock(); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 38f23e721b8..c14d88ad348 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * This struct holds the first and last local port number. */ struct local_ports sysctl_local_ports __read_mostly = { - .lock = SEQLOCK_UNLOCKED, + .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), .range = { 32768, 61000 }, }; @@ -350,30 +350,24 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, + struct flowi4 *fl4, const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options *opt = inet_rsk(req)->opt; - struct flowi4 fl4 = { - .flowi4_oif = sk->sk_bound_dev_if, - .flowi4_mark = sk->sk_mark, - .daddr = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .saddr = ireq->loc_addr, - .flowi4_tos = RT_CONN_FLAGS(sk), - .flowi4_proto = sk->sk_protocol, - .flowi4_flags = inet_sk_flowi_flags(sk), - .fl4_sport = inet_sk(sk)->inet_sport, - .fl4_dport = ireq->rmt_port, - }; + struct ip_options_rcu *opt = inet_rsk(req)->opt; struct net *net = sock_net(sk); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) goto route_err; return &rt->dst; @@ -385,6 +379,39 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_req); +struct dst_entry *inet_csk_route_child_sock(struct sock *sk, + struct sock *newsk, + const struct request_sock *req) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *newinet = inet_sk(newsk); + struct ip_options_rcu *opt = ireq->opt; + struct net *net = sock_net(sk); + struct flowi4 *fl4; + struct rtable *rt; + + fl4 = &newinet->cork.fl.u.ip4; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) + goto no_route; + if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + goto route_err; + return &rt->dst; + +route_err: + ip_rt_put(rt); +no_route: + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; +} +EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); + static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, const u32 synq_hsize) { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 2ada17129fc..3267d389843 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -124,7 +124,7 @@ static int inet_csk_diag_fill(struct sock *sk, #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &np->rcv_saddr); @@ -437,7 +437,7 @@ static int valid_cc(const void *bc, int len, int cc) return 0; if (cc == len) return 1; - if (op->yes < 4) + if (op->yes < 4 || op->yes & 3) return 0; len -= op->yes; bc += op->yes; @@ -447,11 +447,11 @@ static int valid_cc(const void *bc, int len, int cc) static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { - const unsigned char *bc = bytecode; + const void *bc = bytecode; int len = bytecode_len; while (len > 0) { - struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; + const struct inet_diag_bc_op *op = bc; //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { @@ -462,22 +462,20 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: - if (op->yes < 4 || op->yes > len + 4) - return -EINVAL; case INET_DIAG_BC_JMP: - if (op->no < 4 || op->no > len + 4) + if (op->no < 4 || op->no > len + 4 || op->no & 3) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len - op->no)) return -EINVAL; break; case INET_DIAG_BC_NOP: - if (op->yes < 4 || op->yes > len + 4) - return -EINVAL; break; default: return -EINVAL; } + if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) + return -EINVAL; bc += op->yes; len -= op->yes; } diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 47038cb6c13..85a0f75dae6 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -51,8 +51,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); * Basic tcp checks whether packet is suitable for LRO */ -static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, - int len, struct net_lro_desc *lro_desc) +static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, + int len, const struct net_lro_desc *lro_desc) { /* check ip header: don't aggregate padded frames */ if (ntohs(iph->tot_len) != len) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 9df4e635fb5..ce616d92cc5 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -154,11 +154,9 @@ void __init inet_initpeers(void) /* Called with or without local BH being disabled. */ static void unlink_from_unused(struct inet_peer *p) { - if (!list_empty(&p->unused)) { - spin_lock_bh(&unused_peers.lock); - list_del_init(&p->unused); - spin_unlock_bh(&unused_peers.lock); - } + spin_lock_bh(&unused_peers.lock); + list_del_init(&p->unused); + spin_unlock_bh(&unused_peers.lock); } static int addr_compare(const struct inetpeer_addr *a, @@ -205,6 +203,20 @@ static int addr_compare(const struct inetpeer_addr *a, u; \ }) +static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv) +{ + int cur, old = atomic_read(ptr); + + while (old != u) { + *newv = old + a; + cur = atomic_cmpxchg(ptr, old, *newv); + if (cur == old) + return true; + old = cur; + } + return false; +} + /* * Called with rcu_read_lock() * Because we hold no lock against a writer, its quite possible we fall @@ -213,7 +225,8 @@ static int addr_compare(const struct inetpeer_addr *a, * We exit from this function if number of links exceeds PEER_MAXDEPTH */ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, - struct inet_peer_base *base) + struct inet_peer_base *base, + int *newrefcnt) { struct inet_peer *u = rcu_dereference(base->root); int count = 0; @@ -226,7 +239,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, * distinction between an unused entry (refcnt=0) and * a freed one. */ - if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) + if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt)) u = NULL; return u; } @@ -465,22 +478,23 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; unsigned int sequence; - int invalidated; + int invalidated, newrefcnt = 0; /* Look up for the address quickly, lockless. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); sequence = read_seqbegin(&base->lock); - p = lookup_rcu(daddr, base); + p = lookup_rcu(daddr, base, &newrefcnt); invalidated = read_seqretry(&base->lock, sequence); rcu_read_unlock(); if (p) { - /* The existing node has been found. +found: /* The existing node has been found. * Remove the entry from unused list if it was there. */ - unlink_from_unused(p); + if (newrefcnt == 1) + unlink_from_unused(p); return p; } @@ -494,11 +508,9 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) write_seqlock_bh(&base->lock); p = lookup(daddr, stack, base); if (p != peer_avl_empty) { - atomic_inc(&p->refcnt); + newrefcnt = atomic_inc_return(&p->refcnt); write_sequnlock_bh(&base->lock); - /* Remove the entry from unused list if it was there. */ - unlink_from_unused(p); - return p; + goto found; } p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; if (p) { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 99461f09320..3b34d1c8627 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b1d282f11be..0ad6035f636 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -77,22 +77,40 @@ struct ipq { struct inet_peer *peer; }; -#define IPFRAG_ECN_CLEAR 0x01 /* one frag had INET_ECN_NOT_ECT */ -#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */ +/* RFC 3168 support : + * We want to check ECN values of all fragments, do detect invalid combinations. + * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. + */ +#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ +#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ +#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ +#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ static inline u8 ip4_frag_ecn(u8 tos) { - tos = (tos & INET_ECN_MASK) + 1; - /* - * After the last operation we have (in binary): - * INET_ECN_NOT_ECT => 001 - * INET_ECN_ECT_1 => 010 - * INET_ECN_ECT_0 => 011 - * INET_ECN_CE => 100 - */ - return (tos & 2) ? 0 : tos; + return 1 << (tos & INET_ECN_MASK); } +/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. + * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field + */ +static const u8 ip4_frag_ecn_table[16] = { + /* at least one fragment had CE, and others ECT_0 or ECT_1 */ + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + + /* invalid combinations : drop frame */ + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, +}; + static struct inet_frags ip4_frags; int ip_frag_nqueues(struct net *net) @@ -524,9 +542,15 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; + u8 ecn; ipq_kill(qp); + ecn = ip4_frag_ecn_table[qp->ecn]; + if (unlikely(ecn == 0xff)) { + err = -EINVAL; + goto out_fail; + } /* Make the one we just received the head. */ if (prev) { head = prev->next; @@ -605,17 +629,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - /* RFC3168 5.3 Fragmentation support - * If one fragment had INET_ECN_NOT_ECT, - * reassembled frame also has INET_ECN_NOT_ECT - * Elif one fragment had INET_ECN_CE - * reassembled frame also has INET_ECN_CE - */ - if (qp->ecn & IPFRAG_ECN_CLEAR) - iph->tos &= ~INET_ECN_MASK; - else if (qp->ecn & IPFRAG_ECN_SET_CE) - iph->tos |= INET_ECN_CE; - + iph->tos |= ecn; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index da5941f18c3..8871067560d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -413,11 +413,6 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, dev_net_set(dev, net); - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } - nt = netdev_priv(dev); nt->parms = *parms; dev->rtnl_link_ops = &ipgre_link_ops; @@ -462,7 +457,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) by themself??? */ - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); int grehlen = (iph->ihl<<2) + 4; const int type = icmp_hdr(skb)->type; @@ -534,7 +529,7 @@ out: rcu_read_unlock(); } -static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) { if (INET_ECN_is_ce(iph->tos)) { if (skb->protocol == htons(ETH_P_IP)) { @@ -546,19 +541,19 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) } static inline u8 -ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) +ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb) { u8 inner = 0; if (skb->protocol == htons(ETH_P_IP)) inner = old_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) - inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); + inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); return INET_ECN_encapsulate(tos, inner); } static int ipgre_rcv(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; u8 *h; __be16 flags; __sum16 csum = 0; @@ -697,8 +692,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev { struct ip_tunnel *tunnel = netdev_priv(dev); struct pcpu_tstats *tstats; - struct iphdr *old_iph = ip_hdr(skb); - struct iphdr *tiph; + const struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *tiph; + struct flowi4 fl4; u8 tos; __be16 df; struct rtable *rt; /* Route to the other host */ @@ -714,7 +710,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (dev->header_ops && dev->type == ARPHRD_IPGRE) { gre_hlen = 0; - tiph = (struct iphdr *)skb->data; + tiph = (const struct iphdr *)skb->data; } else { gre_hlen = tunnel->hlen; tiph = &tunnel->parms.iph; @@ -735,14 +731,14 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) { - struct in6_addr *addr6; + const struct in6_addr *addr6; int addr_type; struct neighbour *neigh = skb_dst(skb)->neighbour; if (neigh == NULL) goto tx_error; - addr6 = (struct in6_addr *)&neigh->primary_key; + addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { @@ -766,10 +762,10 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (skb->protocol == htons(ETH_P_IP)) tos = old_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) - tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); + tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); } - rt = ip_route_output_gre(dev_net(dev), dst, tiph->saddr, + rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); if (IS_ERR(rt)) { @@ -873,15 +869,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev iph->frag_off = df; iph->protocol = IPPROTO_GRE; iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; if ((iph->ttl = tiph->ttl) == 0) { if (skb->protocol == htons(ETH_P_IP)) iph->ttl = old_iph->ttl; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) - iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; + iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; #endif else iph->ttl = ip4_dst_hoplimit(&rt->dst); @@ -927,7 +923,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; + const struct iphdr *iph; int hlen = LL_MAX_HEADER; int mtu = ETH_DATA_LEN; int addend = sizeof(struct iphdr) + 4; @@ -938,12 +934,14 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { - struct rtable *rt = ip_route_output_gre(dev_net(dev), - iph->daddr, iph->saddr, - tunnel->parms.o_key, - RT_TOS(iph->tos), - tunnel->parms.link); - + struct flowi4 fl4; + struct rtable *rt; + + rt = ip_route_output_gre(dev_net(dev), &fl4, + iph->daddr, iph->saddr, + tunnel->parms.o_key, + RT_TOS(iph->tos), + tunnel->parms.link); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); @@ -1180,7 +1178,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) { - struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); + const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); memcpy(haddr, &iph->saddr, 4); return 4; } @@ -1196,13 +1194,15 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct rtable *rt = ip_route_output_gre(dev_net(dev), - t->parms.iph.daddr, - t->parms.iph.saddr, - t->parms.o_key, - RT_TOS(t->parms.iph.tos), - t->parms.link); - + struct flowi4 fl4; + struct rtable *rt; + + rt = ip_route_output_gre(dev_net(dev), &fl4, + t->parms.iph.daddr, + t->parms.iph.saddr, + t->parms.o_key, + RT_TOS(t->parms.iph.tos), + t->parms.link); if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d7b2b0987a3..c8f48efc5fd 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -268,7 +268,7 @@ int ip_local_deliver(struct sk_buff *skb) static inline int ip_rcv_options(struct sk_buff *skb) { struct ip_options *opt; - struct iphdr *iph; + const struct iphdr *iph; struct net_device *dev = skb->dev; /* It looks as overkill, because not all @@ -374,7 +374,7 @@ drop: */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct iphdr *iph; + const struct iphdr *iph; u32 len; /* When the interface is in promisc. mode, drop all the crap diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 2391b24e825..ec93335901d 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/types.h> #include <asm/uaccess.h> +#include <asm/unaligned.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> @@ -36,8 +37,8 @@ * saddr is address of outgoing interface. */ -void ip_options_build(struct sk_buff * skb, struct ip_options * opt, - __be32 daddr, struct rtable *rt, int is_frag) +void ip_options_build(struct sk_buff *skb, struct ip_options *opt, + __be32 daddr, struct rtable *rt, int is_frag) { unsigned char *iph = skb_network_header(skb); @@ -50,9 +51,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, if (!is_frag) { if (opt->rr_needaddr) - ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt); if (opt->ts_needaddr) - ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); if (opt->ts_needtime) { struct timespec tv; __be32 midtime; @@ -83,9 +84,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, * NOTE: dopt cannot point to skb. */ -int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) +int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) { - struct ip_options *sopt; + const struct ip_options *sopt; unsigned char *sptr, *dptr; int soffset, doffset; int optlen; @@ -95,10 +96,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) sopt = &(IPCB(skb)->opt); - if (sopt->optlen == 0) { - dopt->optlen = 0; + if (sopt->optlen == 0) return 0; - } sptr = skb_network_header(skb); dptr = dopt->__data; @@ -157,7 +156,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) dopt->optlen += optlen; } if (sopt->srr) { - unsigned char * start = sptr+sopt->srr; + unsigned char *start = sptr+sopt->srr; __be32 faddr; optlen = start[1]; @@ -352,7 +351,7 @@ int ip_options_compile(struct net *net, goto error; } if (optptr[2] <= optlen) { - __be32 *timeptr = NULL; + unsigned char *timeptr = NULL; if (optptr[2]+3 > optptr[1]) { pp_ptr = optptr + 2; goto error; @@ -361,7 +360,7 @@ int ip_options_compile(struct net *net, case IPOPT_TS_TSONLY: opt->ts = optptr - iph; if (skb) - timeptr = (__be32*)&optptr[optptr[2]-1]; + timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; @@ -373,7 +372,7 @@ int ip_options_compile(struct net *net, opt->ts = optptr - iph; if (rt) { memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); - timeptr = (__be32*)&optptr[optptr[2]+3]; + timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; @@ -391,7 +390,7 @@ int ip_options_compile(struct net *net, if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) - timeptr = (__be32*)&optptr[optptr[2]+3]; + timeptr = &optptr[optptr[2]+3]; } opt->ts_needtime = 1; optptr[2] += 8; @@ -405,10 +404,10 @@ int ip_options_compile(struct net *net, } if (timeptr) { struct timespec tv; - __be32 midtime; + u32 midtime; getnstimeofday(&tv); - midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); - memcpy(timeptr, &midtime, sizeof(__be32)); + midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; + put_unaligned_be32(midtime, timeptr); opt->is_changed = 1; } } else { @@ -499,19 +498,19 @@ void ip_options_undo(struct ip_options * opt) } } -static struct ip_options *ip_options_get_alloc(const int optlen) +static struct ip_options_rcu *ip_options_get_alloc(const int optlen) { - return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3), + return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); } -static int ip_options_get_finish(struct net *net, struct ip_options **optp, - struct ip_options *opt, int optlen) +static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, + struct ip_options_rcu *opt, int optlen) { while (optlen & 3) - opt->__data[optlen++] = IPOPT_END; - opt->optlen = optlen; - if (optlen && ip_options_compile(net, opt, NULL)) { + opt->opt.__data[optlen++] = IPOPT_END; + opt->opt.optlen = optlen; + if (optlen && ip_options_compile(net, &opt->opt, NULL)) { kfree(opt); return -EINVAL; } @@ -520,29 +519,29 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp, return 0; } -int ip_options_get_from_user(struct net *net, struct ip_options **optp, +int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, unsigned char __user *data, int optlen) { - struct ip_options *opt = ip_options_get_alloc(optlen); + struct ip_options_rcu *opt = ip_options_get_alloc(optlen); if (!opt) return -ENOMEM; - if (optlen && copy_from_user(opt->__data, data, optlen)) { + if (optlen && copy_from_user(opt->opt.__data, data, optlen)) { kfree(opt); return -EFAULT; } return ip_options_get_finish(net, optp, opt, optlen); } -int ip_options_get(struct net *net, struct ip_options **optp, +int ip_options_get(struct net *net, struct ip_options_rcu **optp, unsigned char *data, int optlen) { - struct ip_options *opt = ip_options_get_alloc(optlen); + struct ip_options_rcu *opt = ip_options_get_alloc(optlen); if (!opt) return -ENOMEM; if (optlen) - memcpy(opt->__data, data, optlen); + memcpy(opt->opt.__data, data, optlen); return ip_options_get_finish(net, optp, opt, optlen); } @@ -555,7 +554,7 @@ void ip_forward_options(struct sk_buff *skb) if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; - ip_rt_get_source(&optptr[optptr[2]-5], rt); + ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); opt->is_changed = 1; } if (opt->srr_is_hit) { @@ -569,19 +568,18 @@ void ip_forward_options(struct sk_buff *skb) ) { if (srrptr + 3 > srrspace) break; - if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) + if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - ip_rt_get_source(&optptr[srrptr-1], rt); - ip_hdr(skb)->daddr = rt->rt_dst; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); if (opt->ts_needaddr) { optptr = raw + opt->ts; - ip_rt_get_source(&optptr[optptr[2]-9], rt); + ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); opt->is_changed = 1; } } @@ -603,7 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) unsigned long orefdst; int err; - if (!opt->srr || !rt) + if (!rt) return 0; if (skb->pkt_type != PACKET_HOST) @@ -637,7 +635,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ - memcpy(&iph->daddr, &optptr[srrptr-1], 4); + iph->daddr = nexthop; opt->is_changed = 1; } if (srrptr <= srrspace) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 459c011b1d4..84f26e8e6c6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -140,14 +140,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - __be32 saddr, __be32 daddr, struct ip_options *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ - skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; @@ -158,14 +158,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); + iph->saddr = saddr; iph->protocol = sk->sk_protocol; ip_select_ident(iph, &rt->dst, sk); - if (opt && opt->optlen) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, daddr, rt, 0); + if (opt && opt->opt.optlen) { + iph->ihl += opt->opt.optlen>>2; + ip_options_build(skb, &opt->opt, daddr, rt, 0); } skb->priority = sk->sk_priority; @@ -312,11 +312,12 @@ int ip_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_queue_xmit(struct sk_buff *skb) +int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); - struct ip_options *opt = inet->opt; + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; @@ -325,6 +326,8 @@ int ip_queue_xmit(struct sk_buff *skb) * f.e. by something like SCTP. */ rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + fl4 = &fl->u.ip4; rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -336,14 +339,14 @@ int ip_queue_xmit(struct sk_buff *skb) /* Use correct destination address if we have options. */ daddr = inet->inet_daddr; - if(opt && opt->srr) - daddr = opt->faddr; + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), sk, + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -357,11 +360,11 @@ int ip_queue_xmit(struct sk_buff *skb) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ - skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); @@ -371,13 +374,13 @@ packet_routed: iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; /* Transport layer set skb->h.foo itself. */ - if (opt && opt->optlen) { - iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, inet->inet_daddr, rt, 0); + if (inet_opt && inet_opt->opt.optlen) { + iph->ihl += inet_opt->opt.optlen >> 2; + ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } ip_select_ident_more(iph, &rt->dst, sk, @@ -773,7 +776,9 @@ static inline int ip_ufo_append_data(struct sock *sk, (length - transhdrlen)); } -static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, +static int __ip_append_data(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, struct inet_cork *cork, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), @@ -794,9 +799,9 @@ static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; - exthdrlen = transhdrlen ? rt->dst.header_len : 0; - length += exthdrlen; - transhdrlen += exthdrlen; + skb = skb_peek_tail(queue); + + exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->fragsize; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -805,7 +810,7 @@ static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (cork->length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu-exthdrlen); return -EMSGSIZE; } @@ -820,12 +825,10 @@ static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, !exthdrlen) csummode = CHECKSUM_PARTIAL; - skb = skb_peek_tail(queue); - cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -878,17 +881,16 @@ alloc_new_skb: else alloclen = fraglen; + alloclen += exthdrlen; + /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ - if (datalen == length + fraggap) { + if (datalen == length + fraggap) alloclen += rt->dst.trailer_len; - /* make sure mtu is not reached */ - if (datalen > mtu - fragheaderlen - rt->dst.trailer_len) - datalen -= ALIGN(rt->dst.trailer_len, 8); - } + if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, @@ -921,11 +923,11 @@ alloc_new_skb: /* * Find where to start putting bytes. */ - data = skb_put(skb, fraglen); + data = skb_put(skb, fraglen + exthdrlen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); - data += fragheaderlen; + data += fragheaderlen + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -1033,7 +1035,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { struct inet_sock *inet = inet_sk(sk); - struct ip_options *opt; + struct ip_options_rcu *opt; struct rtable *rt; /* @@ -1047,7 +1049,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, if (unlikely(cork->opt == NULL)) return -ENOBUFS; } - memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen); + memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); cork->flags |= IPCORK_OPT; cork->addr = ipc->addr; } @@ -1059,7 +1061,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, */ *rtp = NULL; cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + rt->dst.dev->mtu : dst_mtu(&rt->dst); cork->dst = &rt->dst; cork->length = 0; cork->tx_flags = ipc->tx_flags; @@ -1080,7 +1082,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, * * LATER: length must be adjusted by pad at tail, when it is required. */ -int ip_append_data(struct sock *sk, +int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, @@ -1094,24 +1096,25 @@ int ip_append_data(struct sock *sk, return 0; if (skb_queue_empty(&sk->sk_write_queue)) { - err = ip_setup_cork(sk, &inet->cork, ipc, rtp); + err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); if (err) return err; } else { transhdrlen = 0; } - return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag, + return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, from, length, transhdrlen, flags); } -ssize_t ip_append_page(struct sock *sk, struct page *page, +ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, int offset, size_t size, int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct rtable *rt; struct ip_options *opt = NULL; + struct inet_cork *cork; int hh_len; int mtu; int len; @@ -1127,28 +1130,29 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = (struct rtable *)inet->cork.dst; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + cork = &inet->cork.base; + rt = (struct rtable *)cork->dst; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; if (!(rt->dst.dev->features&NETIF_F_SG)) return -EOPNOTSUPP; hh_len = LL_RESERVED_SPACE(rt->dst.dev); - mtu = inet->cork.fragsize; + mtu = cork->fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - if (inet->cork.length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); + if (cork->length + size > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); return -EMSGSIZE; } if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) return -EINVAL; - inet->cork.length += size; + cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { @@ -1243,7 +1247,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return 0; error: - inet->cork.length -= size; + cork->length -= size; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1262,6 +1266,7 @@ static void ip_cork_release(struct inet_cork *cork) * and push them out. */ struct sk_buff *__ip_make_skb(struct sock *sk, + struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork) { @@ -1319,17 +1324,18 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph = (struct iphdr *)skb->data; iph->version = 4; iph->ihl = 5; - if (opt) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, cork->addr, rt, 0); - } iph->tos = inet->tos; iph->frag_off = df; ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; + + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, cork->addr, rt, 0); + } skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -1365,11 +1371,11 @@ int ip_send_skb(struct sk_buff *skb) return err; } -int ip_push_pending_frames(struct sock *sk) +int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) { struct sk_buff *skb; - skb = ip_finish_skb(sk); + skb = ip_finish_skb(sk, fl4); if (!skb) return 0; @@ -1394,17 +1400,18 @@ static void __ip_flush_pending_frames(struct sock *sk, void ip_flush_pending_frames(struct sock *sk) { - __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork); + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } struct sk_buff *ip_make_skb(struct sock *sk, + struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { - struct inet_cork cork = {}; + struct inet_cork cork; struct sk_buff_head queue; int err; @@ -1413,18 +1420,21 @@ struct sk_buff *ip_make_skb(struct sock *sk, __skb_queue_head_init(&queue); + cork.flags = 0; + cork.addr = 0; + cork.opt = NULL; err = ip_setup_cork(sk, &cork, ipc, rtp); if (err) return ERR_PTR(err); - err = __ip_append_data(sk, &queue, &cork, getfrag, + err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, from, length, transhdrlen, flags); if (err) { __ip_flush_pending_frames(sk, &queue, &cork); return ERR_PTR(err); } - return __ip_make_skb(sk, &queue, &cork); + return __ip_make_skb(sk, fl4, &queue, &cork); } /* @@ -1447,48 +1457,39 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * Should run single threaded per socket because it uses the sock * structure to pass arguments. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, - unsigned int len) +void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, + struct ip_reply_arg *arg, unsigned int len) { struct inet_sock *inet = inet_sk(sk); - struct { - struct ip_options opt; - char data[40]; - } replyopts; + struct ip_options_data replyopts; struct ipcm_cookie ipc; - __be32 daddr; + struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); - if (ip_options_echo(&replyopts.opt, skb)) + if (ip_options_echo(&replyopts.opt.opt, skb)) return; - daddr = ipc.addr = rt->rt_src; + ipc.addr = daddr; ipc.opt = NULL; ipc.tx_flags = 0; - if (replyopts.opt.optlen) { + if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; - if (ipc.opt->srr) - daddr = replyopts.opt.faddr; + if (replyopts.opt.opt.srr) + daddr = replyopts.opt.opt.faddr; } - { - struct flowi4 fl4 = { - .flowi4_oif = arg->bound_dev_if, - .daddr = daddr, - .saddr = rt->rt_spec_dst, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), - .fl4_sport = tcp_hdr(skb)->dest, - .fl4_dport = tcp_hdr(skb)->source, - .flowi4_proto = sk->sk_protocol, - .flowi4_flags = ip_reply_arg_flowi_flags(arg), - }; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); - if (IS_ERR(rt)) - return; - } + flowi4_init_output(&fl4, arg->bound_dev_if, 0, + RT_TOS(ip_hdr(skb)->tos), + RT_SCOPE_UNIVERSE, sk->sk_protocol, + ip_reply_arg_flowi_flags(arg), + daddr, rt->rt_spec_dst, + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) + return; /* And let IP do all the hard work. @@ -1501,7 +1502,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) @@ -1509,7 +1510,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(sk); + ip_push_pending_frames(sk, &fl4); } bh_unlock_sock(sk); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3948c86e59c..ab0c9efd1ef 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -131,7 +131,7 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); __be16 *ports = (__be16 *)skb_transport_header(skb); if (skb_transport_offset(skb) + 4 > skb->len) @@ -451,6 +451,11 @@ out: } +static void opt_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct ip_options_rcu, rcu)); +} + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. @@ -497,13 +502,16 @@ static int do_ip_setsockopt(struct sock *sk, int level, switch (optname) { case IP_OPTIONS: { - struct ip_options *opt = NULL; + struct ip_options_rcu *old, *opt = NULL; + if (optlen > 40) goto e_inval; err = ip_options_get_from_user(sock_net(sk), &opt, optval, optlen); if (err) break; + old = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); if (inet->is_icsk) { struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) @@ -512,17 +520,18 @@ static int do_ip_setsockopt(struct sock *sk, int level, (TCPF_LISTEN | TCPF_CLOSE)) && inet->inet_daddr != LOOPBACK4_IPV6)) { #endif - if (inet->opt) - icsk->icsk_ext_hdr_len -= inet->opt->optlen; + if (old) + icsk->icsk_ext_hdr_len -= old->opt.optlen; if (opt) - icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_ext_hdr_len += opt->opt.optlen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } #endif } - opt = xchg(&inet->opt, opt); - kfree(opt); + rcu_assign_pointer(inet->inet_opt, opt); + if (old) + call_rcu(&old->rcu, opt_kfree_rcu); break; } case IP_PKTINFO: @@ -1081,12 +1090,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_OPTIONS: { unsigned char optbuf[sizeof(struct ip_options)+40]; - struct ip_options * opt = (struct ip_options *)optbuf; + struct ip_options *opt = (struct ip_options *)optbuf; + struct ip_options_rcu *inet_opt; + + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); opt->optlen = 0; - if (inet->opt) - memcpy(optbuf, inet->opt, - sizeof(struct ip_options)+ - inet->opt->optlen); + if (inet_opt) + memcpy(optbuf, &inet_opt->opt, + sizeof(struct ip_options) + + inet_opt->opt.optlen); release_sock(sk); if (opt->optlen == 0) diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 629067571f0..c857f6f49b0 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -27,7 +27,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; @@ -36,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) return; spi = htonl(ntohs(ipch->cpi)); - x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index cbff2ecccf3..ab7e5542c1c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -87,8 +87,8 @@ #endif /* Define the friendly delay before and after opening net devices */ -#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */ -#define CONF_POST_OPEN 1 /* After opening: 1 second */ +#define CONF_POST_OPEN 10 /* After opening: 10 msecs */ +#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ @@ -188,14 +188,14 @@ struct ic_device { static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ static struct net_device *ic_dev __initdata = NULL; /* Selected device */ -static bool __init ic_device_match(struct net_device *dev) +static bool __init ic_is_init_dev(struct net_device *dev) { - if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : + if (dev->flags & IFF_LOOPBACK) + return false; + return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && - strncmp(dev->name, "dummy", 5))) - return true; - return false; + strncmp(dev->name, "dummy", 5)); } static int __init ic_open_devs(void) @@ -203,6 +203,7 @@ static int __init ic_open_devs(void) struct ic_device *d, **last; struct net_device *dev; unsigned short oflags; + unsigned long start; last = &ic_first_dev; rtnl_lock(); @@ -216,9 +217,7 @@ static int __init ic_open_devs(void) } for_each_netdev(&init_net, dev) { - if (dev->flags & IFF_LOOPBACK) - continue; - if (ic_device_match(dev)) { + if (ic_is_init_dev(dev)) { int able = 0; if (dev->mtu >= 364) able |= IC_BOOTP; @@ -252,6 +251,17 @@ static int __init ic_open_devs(void) dev->name, able, d->xid)); } } + + /* wait for a carrier on at least one device */ + start = jiffies; + while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { + for_each_netdev(&init_net, dev) + if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) + goto have_carrier; + + msleep(1); + } +have_carrier: rtnl_unlock(); *last = NULL; @@ -1324,14 +1334,13 @@ static int __init wait_for_devices(void) { int i; - msleep(CONF_PRE_OPEN); for (i = 0; i < DEVICE_WAIT_MAX; i++) { struct net_device *dev; int found = 0; rtnl_lock(); for_each_netdev(&init_net, dev) { - if (ic_device_match(dev)) { + if (ic_is_init_dev(dev)) { found = 1; break; } @@ -1378,7 +1387,7 @@ static int __init ip_auto_config(void) return err; /* Give drivers a chance to settle */ - ssleep(CONF_POST_OPEN); + msleep(CONF_POST_OPEN); /* * If the config information is insufficient (e.g., our IP address or diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bfc17c5914e..378b20b7ca6 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -276,11 +276,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, dev_net_set(dev, net); - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } - nt = netdev_priv(dev); nt->parms = *parms; @@ -319,7 +314,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. */ - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; @@ -433,15 +428,16 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct pcpu_tstats *tstats; - struct iphdr *tiph = &tunnel->parms.iph; + const struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; + struct flowi4 fl4; int mtu; if (skb->protocol != htons(ETH_P_IP)) @@ -460,7 +456,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error_icmp; } - rt = ip_route_output_ports(dev_net(dev), NULL, + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, dst, tiph->saddr, 0, 0, IPPROTO_IPIP, RT_TOS(tos), @@ -549,8 +545,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->frag_off = df; iph->protocol = IPPROTO_IPIP; iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; if ((iph->ttl = tiph->ttl) == 0) iph->ttl = old_iph->ttl; @@ -572,19 +568,21 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; + const struct iphdr *iph; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { - struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL, - iph->daddr, iph->saddr, - 0, 0, - IPPROTO_IPIP, - RT_TOS(iph->tos), - tunnel->parms.link); - + struct rtable *rt; + struct flowi4 fl4; + + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), + tunnel->parms.link); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1f62eaeb6de..30a7763c400 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1549,7 +1549,7 @@ static struct notifier_block ip_mr_notifier = { static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; - struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *old_iph = ip_hdr(skb); skb_push(skb, sizeof(struct iphdr)); skb->transport_header = skb->network_header; @@ -1595,6 +1595,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *dev; struct rtable *rt; + struct flowi4 fl4; int encap = 0; if (vif->dev == NULL) @@ -1612,7 +1613,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, #endif if (vif->flags & VIFF_TUNNEL) { - rt = ip_route_output_ports(net, NULL, + rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, 0, 0, IPPROTO_IPIP, @@ -1621,7 +1622,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; encap = sizeof(struct iphdr); } else { - rt = ip_route_output_ports(net, NULL, iph->daddr, 0, + rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, 0, 0, IPPROTO_IPIP, RT_TOS(iph->tos), vif->link); @@ -1788,12 +1789,14 @@ dont_forward: return 0; } -static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct rtable *rt) +static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) { + struct rtable *rt = skb_rtable(skb); + struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { - .daddr = rt->rt_key_dst, - .saddr = rt->rt_key_src, - .flowi4_tos = rt->rt_tos, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = iph->tos, .flowi4_oif = rt->rt_oif, .flowi4_iif = rt->rt_iif, .flowi4_mark = rt->rt_mark, @@ -1825,7 +1828,7 @@ int ip_mr_input(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; - mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) { kfree_skb(skb); return PTR_ERR(mrt); @@ -1957,7 +1960,7 @@ int pim_rcv_v1(struct sk_buff *skb) pim = igmp_hdr(skb); - mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; if (!mrt->mroute_do_pim || @@ -1989,7 +1992,7 @@ static int pim_rcv(struct sk_buff *skb) csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; - mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + mrt = ipmr_rt_fib_lookup(net, skb); if (IS_ERR(mrt)) goto drop; if (__pim_rcv(mrt, skb, sizeof(*pim))) { @@ -2038,20 +2041,20 @@ rtattr_failure: return -EMSGSIZE; } -int ipmr_get_route(struct net *net, - struct sk_buff *skb, struct rtmsg *rtm, int nowait) +int ipmr_get_route(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr, + struct rtmsg *rtm, int nowait) { - int err; - struct mr_table *mrt; struct mfc_cache *cache; - struct rtable *rt = skb_rtable(skb); + struct mr_table *mrt; + int err; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); if (mrt == NULL) return -ENOENT; rcu_read_lock(); - cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); + cache = ipmr_cache_find(mrt, saddr, daddr); if (cache == NULL) { struct sk_buff *skb2; @@ -2084,8 +2087,8 @@ int ipmr_get_route(struct net *net, skb_reset_network_header(skb2); iph = ip_hdr(skb2); iph->ihl = sizeof(struct iphdr) >> 2; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; + iph->saddr = saddr; + iph->daddr = daddr; iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 4614babdc45..2e97e3ec1eb 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,51 +17,35 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; - unsigned long orefdst; + __be32 saddr = iph->saddr; + __u8 flags = 0; unsigned int hh_len; - unsigned int type; - type = inet_addr_type(net, iph->saddr); - if (skb->sk && inet_sk(skb->sk)->transparent) - type = RTN_LOCAL; - if (addr_type == RTN_UNSPEC) - addr_type = type; + if (!skb->sk && addr_type != RTN_LOCAL) { + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(net, saddr); + if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) + flags |= FLOWI_FLAG_ANYSRC; + else + saddr = 0; + } /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ - if (addr_type == RTN_LOCAL) { - fl4.daddr = iph->daddr; - if (type == RTN_LOCAL) - fl4.saddr = iph->saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; - fl4.flowi4_mark = skb->mark; - fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return -1; - - /* Drop old route. */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* non-local src, find valid iif to satisfy - * rp-filter when calling ip_route_input. */ - fl4.daddr = iph->saddr; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return -1; + fl4.daddr = iph->daddr; + fl4.saddr = saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return -1; - orefdst = skb->_skb_refdst; - if (ip_route_input(skb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->dst.dev) != 0) { - dst_release(&rt->dst); - return -1; - } - dst_release(&rt->dst); - refdst_drop(orefdst); - } + /* Drop old route. */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); if (skb_dst(skb)->error) return -1; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 89bc7e66d59..fd7a3f68917 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, void *table_base; const struct xt_table_info *private; struct xt_action_param acpar; + unsigned int addend; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; @@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - xt_info_rdlock_bh(); + local_bh_disable(); + addend = xt_write_recseq_begin(); private = table->private; table_base = private->entries[smp_processor_id()]; @@ -338,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); + xt_write_recseq_end(addend); + local_bh_enable(); if (acpar.hotdrop) return NF_DROP; @@ -712,7 +715,7 @@ static void get_counters(const struct xt_table_info *t, unsigned int i; for_each_possible_cpu(cpu) { - seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; + seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { @@ -720,10 +723,10 @@ static void get_counters(const struct xt_table_info *t, unsigned int start; do { - start = read_seqbegin(lock); + start = read_seqcount_begin(s); bcnt = iter->counters.bcnt; pcnt = iter->counters.pcnt; - } while (read_seqretry(lock, start)); + } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; @@ -1115,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user, int ret = 0; void *loc_cpu_entry; struct arpt_entry *iter; + unsigned int addend; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1171,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user, /* Choose the copy that is on our node */ curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; - xt_info_wrlock(curcpu); + addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); ++i; } - xt_info_wrunlock(curcpu); + xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index d2c1311cb28..5c9b9d96391 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -203,7 +203,8 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) else pmsg->outdev_name[0] = '\0'; - if (entry->indev && entry->skb->dev) { + if (entry->indev && entry->skb->dev && + entry->skb->mac_header != entry->skb->network_header) { pmsg->hw_type = entry->skb->dev->type; pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); @@ -402,7 +403,8 @@ ipq_dev_drop(int ifindex) static inline void __ipq_rcv_skb(struct sk_buff *skb) { - int status, type, pid, flags, nlmsglen, skblen; + int status, type, pid, flags; + unsigned int nlmsglen, skblen; struct nlmsghdr *nlh; skblen = skb->len; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 70491502800..24e556e83a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info) } EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); -/* - We keep a set of rules for each CPU, so we can avoid write-locking - them in the softirq when updating the counters and therefore - only need to read-lock in the softirq; doing a write_lock_bh() in user - context stops packets coming through and allows user context to read - the counters or update the rules. - - Hence the start of any table is given by get_table() below. */ - /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb, unsigned int *stackptr, origptr, cpu; const struct xt_table_info *private; struct xt_action_param acpar; + unsigned int addend; /* Initialization */ ip = ip_hdr(skb); @@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb, acpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - xt_info_rdlock_bh(); + local_bh_disable(); + addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); table_base = private->entries[cpu]; @@ -430,7 +423,9 @@ ipt_do_table(struct sk_buff *skb, pr_debug("Exiting %s; resetting sp from %u to %u\n", __func__, *stackptr, origptr); *stackptr = origptr; - xt_info_rdunlock_bh(); + xt_write_recseq_end(addend); + local_bh_enable(); + #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else @@ -571,7 +566,7 @@ check_entry(const struct ipt_entry *e, const char *name) const struct xt_entry_target *t; if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, par->match->name); + duprintf("ip check failed %p %s.\n", e, name); return -EINVAL; } @@ -886,7 +881,7 @@ get_counters(const struct xt_table_info *t, unsigned int i; for_each_possible_cpu(cpu) { - seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; + seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { @@ -894,10 +889,10 @@ get_counters(const struct xt_table_info *t, unsigned int start; do { - start = read_seqbegin(lock); + start = read_seqcount_begin(s); bcnt = iter->counters.bcnt; pcnt = iter->counters.pcnt; - } while (read_seqretry(lock, start)); + } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ @@ -1312,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user, int ret = 0; void *loc_cpu_entry; struct ipt_entry *iter; + unsigned int addend; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1368,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user, /* Choose the copy that is on our node */ curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; - xt_info_wrlock(curcpu); + addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); ++i; } - xt_info_wrunlock(curcpu); + xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index d609ac3cb9a..5c9e97c7901 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -307,7 +307,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * error messages (RELATED) and information requests (see below) */ if (ip_hdr(skb)->protocol == IPPROTO_ICMP && (ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) + ctinfo == IP_CT_RELATED_REPLY)) return XT_CONTINUE; /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, @@ -321,12 +321,12 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) ct->mark = hash; break; case IP_CT_RELATED: - case IP_CT_RELATED+IP_CT_IS_REPLY: + case IP_CT_RELATED_REPLY: /* FIXME: we don't handle expectations at the * moment. they can arrive on a different node than * the master connection (e.g. FTP passive mode) */ case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: + case IP_CT_ESTABLISHED_REPLY: break; default: break; diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index d2ed9dc74eb..9931152a78b 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) nat = nfct_nat(ct); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + ctinfo == IP_CT_RELATED_REPLY)); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 1ff79e557f9..51f13f8ec72 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -40,7 +40,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _otcph, *tcph; - unsigned int addr_type; /* IP header checks: fragment. */ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) @@ -55,6 +54,9 @@ static void send_reset(struct sk_buff *oldskb, int hook) if (oth->rst) return; + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + /* Check checksum */ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) return; @@ -101,19 +103,11 @@ static void send_reset(struct sk_buff *oldskb, int hook) nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); - addr_type = RTN_UNSPEC; - if (hook != NF_INET_FORWARD -#ifdef CONFIG_BRIDGE_NETFILTER - || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) -#endif - ) - addr_type = RTN_LOCAL; - /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, addr_type)) + if (ip_route_me_harder(nskb, RTN_UNSPEC)) goto free_nskb; niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index af6e9c77834..2b57e52c746 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -25,7 +25,8 @@ MODULE_LICENSE("GPL"); static inline bool match_ip(const struct sk_buff *skb, const struct ipt_ecn_info *einfo) { - return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; + return ((ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect) ^ + !!(einfo->invert & IPT_ECN_OP_MATCH_IP); } static inline bool match_tcp(const struct sk_buff *skb, @@ -76,8 +77,6 @@ static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { - if (ip_hdr(skb)->protocol != IPPROTO_TCP) - return false; if (!match_tcp(skb, info, &par->hotdrop)) return false; } @@ -97,7 +96,7 @@ static int ecn_mt_check(const struct xt_mtchk_param *par) return -EINVAL; if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && - ip->proto != IPPROTO_TCP) { + (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { pr_info("cannot match TCP bits in rule for non-tcp packets\n"); return -EINVAL; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a03c02af99..de9da21113a 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -101,7 +101,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) + if (!ct || ctinfo == IP_CT_RELATED_REPLY) goto out; help = nfct_help(ct); @@ -121,7 +121,9 @@ static unsigned int ipv4_confirm(unsigned int hooknum, return ret; } - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { typeof(nf_nat_seq_adjust_hook) seq_adjust; seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7404bde9599..ab5b27a2916 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Update skb to refer to this connection */ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; skb->nfctinfo = *ctinfo; - return -NF_ACCEPT; + return NF_ACCEPT; } /* Small and modified version of icmp_rcv */ diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 9c71b2755ce..3346de5d94d 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -433,7 +433,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* Must be RELATED */ NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || - skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); + skb->nfctinfo == IP_CT_RELATED_REPLY); /* Redirects on non-null nats must be dropped, else they'll start talking to each other without our translation, and be diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 31427fb57aa..ebc5f8894f9 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -153,14 +153,14 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); -static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data, +static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, int datalen, __sum16 *check, int oldlen) { struct rtable *rt = skb_rtable(skb); if (skb->ip_summed != CHECKSUM_PARTIAL) { if (!(rt->rt_flags & RTCF_LOCAL) && - skb->dev->features & NETIF_F_V4_CSUM) { + (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 21c30426480..733c9abc1cb 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -53,7 +53,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + ctinfo == IP_CT_RELATED_REPLY)); NF_CT_ASSERT(par->out != NULL); return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 7317bdf1d45..483b76d042d 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -116,7 +116,7 @@ nf_nat_fn(unsigned int hooknum, switch (ctinfo) { case IP_CT_RELATED: - case IP_CT_RELATED+IP_CT_IS_REPLY: + case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(ct, ctinfo, hooknum, skb)) @@ -144,7 +144,7 @@ nf_nat_fn(unsigned int hooknum, default: /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); + ctinfo == IP_CT_ESTABLISHED_REPLY); } return nf_nat_packet(ct, ctinfo, hooknum, skb); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c new file mode 100644 index 00000000000..39b403f854c --- /dev/null +++ b/net/ipv4/ping.c @@ -0,0 +1,931 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * "Ping" sockets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on ipv4/udp.c code. + * + * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), + * Pavel Kankovsky (for Linux 2.4.32) + * + * Pavel gave all rights to bugs to Vasiliy, + * none of the bugs are Pavel's now. + * + */ + +#include <asm/system.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/ping.h> +#include <net/udp.h> +#include <net/route.h> +#include <net/inet_common.h> +#include <net/checksum.h> + + +static struct ping_table ping_table; + +static u16 ping_port_rover; + +static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) +{ + int res = (num + net_hash_mix(net)) & mask; + pr_debug("hash(%d) = %d\n", num, res); + return res; +} + +static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, + struct net *net, unsigned num) +{ + return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; +} + +static int ping_v4_get_port(struct sock *sk, unsigned short ident) +{ + struct hlist_nulls_node *node; + struct hlist_nulls_head *hlist; + struct inet_sock *isk, *isk2; + struct sock *sk2 = NULL; + + isk = inet_sk(sk); + write_lock_bh(&ping_table.lock); + if (ident == 0) { + u32 i; + u16 result = ping_port_rover + 1; + + for (i = 0; i < (1L << 16); i++, result++) { + if (!result) + result++; /* avoid zero */ + hlist = ping_hashslot(&ping_table, sock_net(sk), + result); + ping_portaddr_for_each_entry(sk2, node, hlist) { + isk2 = inet_sk(sk2); + + if (isk2->inet_num == result) + goto next_port; + } + + /* found */ + ping_port_rover = ident = result; + break; +next_port: + ; + } + if (i >= (1L << 16)) + goto fail; + } else { + hlist = ping_hashslot(&ping_table, sock_net(sk), ident); + ping_portaddr_for_each_entry(sk2, node, hlist) { + isk2 = inet_sk(sk2); + + if ((isk2->inet_num == ident) && + (sk2 != sk) && + (!sk2->sk_reuse || !sk->sk_reuse)) + goto fail; + } + } + + pr_debug("found port/ident = %d\n", ident); + isk->inet_num = ident; + if (sk_unhashed(sk)) { + pr_debug("was not hashed\n"); + sock_hold(sk); + hlist_nulls_add_head(&sk->sk_nulls_node, hlist); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } + write_unlock_bh(&ping_table.lock); + return 0; + +fail: + write_unlock_bh(&ping_table.lock); + return 1; +} + +static void ping_v4_hash(struct sock *sk) +{ + pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); + BUG(); /* "Please do not press this button again." */ +} + +static void ping_v4_unhash(struct sock *sk) +{ + struct inet_sock *isk = inet_sk(sk); + pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); + if (sk_hashed(sk)) { + write_lock_bh(&ping_table.lock); + hlist_nulls_del(&sk->sk_nulls_node); + sock_put(sk); + isk->inet_num = isk->inet_sport = 0; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&ping_table.lock); + } +} + +static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr, + u16 ident, int dif) +{ + struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); + struct sock *sk = NULL; + struct inet_sock *isk; + struct hlist_nulls_node *hnode; + + pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n", + (int)ident, (unsigned long)daddr, dif); + read_lock_bh(&ping_table.lock); + + ping_portaddr_for_each_entry(sk, hnode, hslot) { + isk = inet_sk(sk); + + pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk, + (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr, + sk->sk_bound_dev_if); + + pr_debug("iterate\n"); + if (isk->inet_num != ident) + continue; + if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr) + continue; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + + sock_hold(sk); + goto exit; + } + + sk = NULL; +exit: + read_unlock_bh(&ping_table.lock); + + return sk; +} + +static void inet_get_ping_group_range_net(struct net *net, gid_t *low, + gid_t *high) +{ + gid_t *data = net->ipv4.sysctl_ping_group_range; + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = data[0]; + *high = data[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} + + +static int ping_init_sock(struct sock *sk) +{ + struct net *net = sock_net(sk); + gid_t group = current_egid(); + gid_t range[2]; + struct group_info *group_info = get_current_groups(); + int i, j, count = group_info->ngroups; + + inet_get_ping_group_range_net(net, range, range+1); + if (range[0] <= group && group <= range[1]) + return 0; + + for (i = 0; i < group_info->nblocks; i++) { + int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); + + for (j = 0; j < cp_count; j++) { + group = group_info->blocks[i][j]; + if (range[0] <= group && group <= range[1]) + return 0; + } + + count -= cp_count; + } + + return -EACCES; +} + +static void ping_close(struct sock *sk, long timeout) +{ + pr_debug("ping_close(sk=%p,sk->num=%u)\n", + inet_sk(sk), inet_sk(sk)->inet_num); + pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); + + sk_common_release(sk); +} + +/* + * We need our own bind because there are no privileged id's == local ports. + * Moreover, we don't allow binding to multi- and broadcast addresses. + */ + +static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *isk = inet_sk(sk); + unsigned short snum; + int chk_addr_ret; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", + sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); + + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + if (addr->sin_addr.s_addr == INADDR_ANY) + chk_addr_ret = RTN_LOCAL; + + if ((sysctl_ip_nonlocal_bind == 0 && + isk->freebind == 0 && isk->transparent == 0 && + chk_addr_ret != RTN_LOCAL) || + chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST) + return -EADDRNOTAVAIL; + + lock_sock(sk); + + err = -EINVAL; + if (isk->inet_num != 0) + goto out; + + err = -EADDRINUSE; + isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; + snum = ntohs(addr->sin_port); + if (ping_v4_get_port(sk, snum) != 0) { + isk->inet_saddr = isk->inet_rcv_saddr = 0; + goto out; + } + + pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n", + (int)isk->inet_num, + (unsigned long) isk->inet_rcv_saddr, + (int)sk->sk_bound_dev_if); + + err = 0; + if (isk->inet_rcv_saddr) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + isk->inet_sport = htons(isk->inet_num); + isk->inet_daddr = 0; + isk->inet_dport = 0; + sk_dst_reset(sk); +out: + release_sock(sk); + pr_debug("ping_v4_bind -> %d\n", err); + return err; +} + +/* + * Is this a supported type of ICMP message? + */ + +static inline int ping_supported(int type, int code) +{ + if (type == ICMP_ECHO && code == 0) + return 1; + return 0; +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. + */ + +static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +void ping_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + struct inet_sock *inet_sock; + int type = icmph->type; + int code = icmph->code; + struct net *net = dev_net(skb->dev); + struct sock *sk; + int harderr; + int err; + + /* We assume the packet has already been checked by icmp_unreach */ + + if (!ping_supported(icmph->type, icmph->code)) + return; + + pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, + code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + + sk = ping_v4_lookup(net, iph->daddr, iph->saddr, + ntohs(icmph->un.echo.id), skb->dev->ifindex); + if (sk == NULL) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + pr_debug("no socket, dropping\n"); + return; /* No socket for error */ + } + pr_debug("err on socket %p\n", sk); + + err = 0; + harderr = 0; + inet_sock = inet_sk(sk); + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + /* This is not a real error but ping wants to see it. + * Report it with some fake errno. */ + err = EREMOTEIO; + break; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; + } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + case ICMP_REDIRECT: + /* See ICMP_SOURCE_QUENCH */ + err = EREMOTEIO; + break; + } + + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. + */ + if (!inet_sock->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { + ip_icmp_error(sk, skb, err, 0 /* no remote port */, + info, (u8 *)icmph); + } + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +/* + * Copy and checksum an ICMP Echo packet from user space into a buffer. + */ + +struct pingfakehdr { + struct icmphdr icmph; + struct iovec *iov; + u32 wcheck; +}; + +static int ping_getfrag(void *from, char * to, + int offset, int fraglen, int odd, struct sk_buff *skb) +{ + struct pingfakehdr *pfh = (struct pingfakehdr *)from; + + if (offset == 0) { + if (fraglen < sizeof(struct icmphdr)) + BUG(); + if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), + pfh->iov, 0, fraglen - sizeof(struct icmphdr), + &pfh->wcheck)) + return -EFAULT; + + return 0; + } + if (offset < sizeof(struct icmphdr)) + BUG(); + if (csum_partial_copy_fromiovecend + (to, pfh->iov, offset - sizeof(struct icmphdr), + fraglen, &pfh->wcheck)) + return -EFAULT; + return 0; +} + +static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, + struct flowi4 *fl4) +{ + struct sk_buff *skb = skb_peek(&sk->sk_write_queue); + + pfh->wcheck = csum_partial((char *)&pfh->icmph, + sizeof(struct icmphdr), pfh->wcheck); + pfh->icmph.checksum = csum_fold(pfh->wcheck); + memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); + skb->ip_summed = CHECKSUM_NONE; + return ip_push_pending_frames(sk, fl4); +} + +static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct net *net = sock_net(sk); + struct flowi4 fl4; + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct icmphdr user_icmph; + struct pingfakehdr pfh; + struct rtable *rt = NULL; + struct ip_options_data opt_copy; + int free = 0; + u32 saddr, daddr, faddr; + u8 tos; + int err; + + pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + + if (len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* + * Fetch the ICMP header provided by the userland. + * iovec is modified! + */ + + if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov, + sizeof(struct icmphdr))) + return -EFAULT; + if (!ping_supported(user_icmph.type, user_icmph.code)) + return -EINVAL; + + /* + * Get and verify the address. + */ + + if (msg->msg_name) { + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return -EINVAL; + if (usin->sin_family != AF_INET) + return -EINVAL; + daddr = usin->sin_addr.s_addr; + /* no remote port */ + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = inet->inet_daddr; + /* no remote port */ + } + + ipc.addr = inet->inet_saddr; + ipc.opt = NULL; + ipc.oif = sk->sk_bound_dev_if; + ipc.tx_flags = 0; + err = sock_tx_timestamp(sk, &ipc.tx_flags); + if (err) + return err; + + if (msg->msg_controllen) { + err = ip_cmsg_send(sock_net(sk), msg, &ipc); + if (err) + return err; + if (ipc.opt) + free = 1; + } + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } + + saddr = ipc.addr; + ipc.addr = faddr = daddr; + + if (ipc.opt && ipc.opt->opt.srr) { + if (!daddr) + return -EINVAL; + faddr = ipc.opt->opt.faddr; + } + tos = RT_TOS(inet->tos); + if (sock_flag(sk, SOCK_LOCALROUTE) || + (msg->msg_flags & MSG_DONTROUTE) || + (ipc.opt && ipc.opt->opt.is_strictroute)) { + tos |= RTO_ONLINK; + } + + if (ipv4_is_multicast(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + if (err == -ENETUNREACH) + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + goto out; + } + + err = -EACCES; + if ((rt->rt_flags & RTCF_BROADCAST) && + !sock_flag(sk, SOCK_BROADCAST)) + goto out; + + if (msg->msg_flags & MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + if (!ipc.addr) + ipc.addr = fl4.daddr; + + lock_sock(sk); + + pfh.icmph.type = user_icmph.type; /* already checked */ + pfh.icmph.code = user_icmph.code; /* ditto */ + pfh.icmph.checksum = 0; + pfh.icmph.un.echo.id = inet->inet_sport; + pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; + pfh.iov = msg->msg_iov; + pfh.wcheck = 0; + + err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, + 0, &ipc, &rt, msg->msg_flags); + if (err) + ip_flush_pending_frames(sk); + else + err = ping_push_pending_frames(sk, &pfh, &fl4); + release_sock(sk); + +out: + ip_rt_put(rt); + if (free) + kfree(ipc.opt); + if (!err) { + icmp_out_count(sock_net(sk), user_icmph.type); + return len; + } + return err; + +do_confirm: + dst_confirm(&rt->dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *isk = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + int copied, err; + + pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + + if (flags & MSG_OOB) + goto out; + + if (addr_len) + *addr_len = sizeof(*sin); + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + /* Don't bother checking the checksum */ + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = 0 /* skb->h.uh->source */; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (isk->cmsg_flags) + ip_cmsg_recv(msg, skb); + err = copied; + +done: + skb_free_datagram(sk, skb); +out: + pr_debug("ping_recvmsg -> %d\n", err); + return err; +} + +static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", + inet_sk(sk), inet_sk(sk)->inet_num, skb); + if (sock_queue_rcv_skb(sk, skb) < 0) { + ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS); + kfree_skb(skb); + pr_debug("ping_queue_rcv_skb -> failed\n"); + return -1; + } + return 0; +} + + +/* + * All we need to do is get the socket. + */ + +void ping_rcv(struct sk_buff *skb) +{ + struct sock *sk; + struct net *net = dev_net(skb->dev); + struct iphdr *iph = ip_hdr(skb); + struct icmphdr *icmph = icmp_hdr(skb); + u32 saddr = iph->saddr; + u32 daddr = iph->daddr; + + /* We assume the packet has already been checked by icmp_rcv */ + + pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", + skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + + /* Push ICMP header back */ + skb_push(skb, skb->data - (u8 *)icmph); + + sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id), + skb->dev->ifindex); + if (sk != NULL) { + pr_debug("rcv on socket %p\n", sk); + ping_queue_rcv_skb(sk, skb_get(skb)); + sock_put(sk); + return; + } + pr_debug("no socket, dropping\n"); + + /* We're called from icmp_rcv(). kfree_skb() is done there. */ +} + +struct proto ping_prot = { + .name = "PING", + .owner = THIS_MODULE, + .init = ping_init_sock, + .close = ping_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .sendmsg = ping_sendmsg, + .recvmsg = ping_recvmsg, + .bind = ping_bind, + .backlog_rcv = ping_queue_rcv_skb, + .hash = ping_v4_hash, + .unhash = ping_v4_unhash, + .get_port = ping_v4_get_port, + .obj_size = sizeof(struct inet_sock), +}; +EXPORT_SYMBOL(ping_prot); + +#ifdef CONFIG_PROC_FS + +static struct sock *ping_get_first(struct seq_file *seq, int start) +{ + struct sock *sk; + struct ping_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; + ++state->bucket) { + struct hlist_nulls_node *node; + struct hlist_nulls_head *hslot; + + hslot = &ping_table.hash[state->bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + + sk_nulls_for_each(sk, node, hslot) { + if (net_eq(sock_net(sk), net)) + goto found; + } + } + sk = NULL; +found: + return sk; +} + +static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) +{ + struct ping_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + do { + sk = sk_nulls_next(sk); + } while (sk && (!net_eq(sock_net(sk), net))); + + if (!sk) + return ping_get_first(seq, state->bucket + 1); + return sk; +} + +static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = ping_get_first(seq, 0); + + if (sk) + while (pos && (sk = ping_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *ping_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ping_iter_state *state = seq->private; + state->bucket = 0; + + read_lock_bh(&ping_table.lock); + + return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; +} + +static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = ping_get_idx(seq, 0); + else + sk = ping_get_next(seq, v); + + ++*pos; + return sk; +} + +static void ping_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&ping_table.lock); +} + +static void ping_format_sock(struct sock *sp, struct seq_file *f, + int bucket, int *len) +{ + struct inet_sock *inet = inet_sk(sp); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + + seq_printf(f, "%5d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + bucket, src, srcp, dest, destp, sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops), len); +} + +static int ping_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode ref pointer drops"); + else { + struct ping_iter_state *state = seq->private; + int len; + + ping_format_sock(v, seq, state->bucket, &len); + seq_printf(seq, "%*s\n", 127 - len, ""); + } + return 0; +} + +static const struct seq_operations ping_seq_ops = { + .show = ping_seq_show, + .start = ping_seq_start, + .next = ping_seq_next, + .stop = ping_seq_stop, +}; + +static int ping_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ping_seq_ops, + sizeof(struct ping_iter_state)); +} + +static const struct file_operations ping_seq_fops = { + .open = ping_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int ping_proc_register(struct net *net) +{ + struct proc_dir_entry *p; + int rc = 0; + + p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops); + if (!p) + rc = -ENOMEM; + return rc; +} + +static void ping_proc_unregister(struct net *net) +{ + proc_net_remove(net, "icmp"); +} + + +static int __net_init ping_proc_init_net(struct net *net) +{ + return ping_proc_register(net); +} + +static void __net_exit ping_proc_exit_net(struct net *net) +{ + ping_proc_unregister(net); +} + +static struct pernet_operations ping_net_ops = { + .init = ping_proc_init_net, + .exit = ping_proc_exit_net, +}; + +int __init ping_proc_init(void) +{ + return register_pernet_subsys(&ping_net_ops); +} + +void ping_proc_exit(void) +{ + unregister_pernet_subsys(&ping_net_ops); +} + +#endif + +void __init ping_init(void) +{ + int i; + + for (i = 0; i < PING_HTABLE_SIZE; i++) + INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); + rwlock_init(&ping_table.lock); +} diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bceaec42c37..c9893d43242 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -154,7 +154,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; @@ -247,7 +247,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) } if (inet->recverr) { - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); if (inet->hdrincl) @@ -265,7 +265,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { int hash; struct sock *raw_sk; - struct iphdr *iph; + const struct iphdr *iph; struct net *net; hash = protocol & (RAW_HTABLE_SIZE - 1); @@ -273,7 +273,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) read_lock(&raw_v4_hashinfo.lock); raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); if (raw_sk != NULL) { - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; net = dev_net(skb->dev); while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, @@ -281,7 +281,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) skb->dev->ifindex)) != NULL) { raw_err(raw_sk, skb, info); raw_sk = sk_next(raw_sk); - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; } } read_unlock(&raw_v4_hashinfo.lock); @@ -314,9 +314,10 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) return 0; } -static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, - struct rtable **rtp, - unsigned int flags) +static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, + void *from, size_t length, + struct rtable **rtp, + unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -327,7 +328,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, struct rtable *rt = *rtp; if (length > rt->dst.dev->mtu) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } @@ -372,7 +373,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, if (iphlen >= sizeof(*iph)) { if (!iph->saddr) - iph->saddr = rt->rt_src; + iph->saddr = fl4->saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) @@ -455,11 +456,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; + struct flowi4 fl4; int free = 0; __be32 daddr; __be32 saddr; u8 tos; int err; + struct ip_options_data opt_copy; err = -EMSGSIZE; if (len > 0xFFFF) @@ -520,8 +523,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, saddr = ipc.addr; ipc.addr = daddr; - if (!ipc.opt) - ipc.opt = inet->opt; + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } if (ipc.opt) { err = -EINVAL; @@ -530,10 +543,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (inet->hdrincl) goto done; - if (ipc.opt->srr) { + if (ipc.opt->opt.srr) { if (!daddr) goto done; - daddr = ipc.opt->faddr; + daddr = ipc.opt->opt.faddr; } } tos = RT_CONN_FLAGS(sk); @@ -547,31 +560,23 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, saddr = inet->mc_addr; } - { - struct flowi4 fl4 = { - .flowi4_oif = ipc.oif, - .flowi4_mark = sk->sk_mark, - .daddr = daddr, - .saddr = saddr, - .flowi4_tos = tos, - .flowi4_proto = (inet->hdrincl ? - IPPROTO_RAW : - sk->sk_protocol), - .flowi4_flags = FLOWI_FLAG_CAN_SLEEP, - }; - if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl4, msg); - if (err) - goto done; - } + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; + if (!inet->hdrincl) { + err = raw_probe_proto_opt(&fl4, msg); + if (err) goto done; - } + } + + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto done; } err = -EACCES; @@ -583,19 +588,20 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, back_from_confirm: if (inet->hdrincl) - err = raw_send_hdrinc(sk, msg->msg_iov, len, - &rt, msg->msg_flags); + err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, + &rt, msg->msg_flags); else { if (!ipc.addr) - ipc.addr = rt->rt_dst; + ipc.addr = fl4.daddr; lock_sock(sk); - err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, - &ipc, &rt, msg->msg_flags); + err = ip_append_data(sk, &fl4, ip_generic_getfrag, + msg->msg_iov, len, 0, + &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { - err = ip_push_pending_frames(sk); + err = ip_push_pending_frames(sk, &fl4); if (err == -ENOBUFS && !inet->recverr) err = 0; } @@ -973,7 +979,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 99e6e4bb1c7..aa13ef10511 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -156,7 +156,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) u32 *p = NULL; if (!rt->peer) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (peer) { @@ -424,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) dst_metric(&r->dst, RTAX_WINDOW), (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + dst_metric(&r->dst, RTAX_RTTVAR)), - r->rt_tos, + r->rt_key_tos, r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, r->dst.hh ? (r->dst.hh->hh_output == dev_queue_xmit) : 0, @@ -724,7 +724,7 @@ static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | (rt1->rt_mark ^ rt2->rt_mark) | - (rt1->rt_tos ^ rt2->rt_tos) | + (rt1->rt_key_tos ^ rt2->rt_key_tos) | (rt1->rt_oif ^ rt2->rt_oif) | (rt1->rt_iif ^ rt2->rt_iif)) == 0; } @@ -968,10 +968,6 @@ static int rt_garbage_collect(struct dst_ops *ops) break; expire >>= 1; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, - dst_entries_get_fast(&ipv4_dst_ops), goal, i); -#endif if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) goto out; @@ -992,10 +988,6 @@ work_done: dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) expire = ip_rt_gc_timeout; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, - dst_entries_get_fast(&ipv4_dst_ops), goal, rover); -#endif out: return 0; } @@ -1179,16 +1171,6 @@ restart: rt->dst.rt_next = rt_hash_table[hash].chain; -#if RT_CACHE_DEBUG >= 2 - if (rt->dst.rt_next) { - struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: %pI4", - hash, &rt->rt_dst); - for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next) - printk(" . %pI4", &trt->rt_dst); - printk("\n"); - } -#endif /* * Since lookup is lockfree, we must make sure * previous writes to rt are committed to memory @@ -1211,11 +1193,11 @@ static u32 rt_peer_genid(void) return atomic_read(&__rt_peer_genid); } -void rt_bind_peer(struct rtable *rt, int create) +void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) { struct inet_peer *peer; - peer = inet_getpeer_v4(rt->rt_dst, create); + peer = inet_getpeer_v4(daddr, create); if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); @@ -1249,7 +1231,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) if (rt) { if (rt->peer == NULL) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, rt->rt_dst, 1); /* If peer is attached to destination, it is never detached, so that we need not to grab a lock to dereference it. @@ -1334,6 +1316,23 @@ reject_redirect: ; } +static bool peer_pmtu_expired(struct inet_peer *peer) +{ + unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + + return orig && + time_after_eq(jiffies, orig) && + cmpxchg(&peer->pmtu_expires, orig, 0) == orig; +} + +static bool peer_pmtu_cleaned(struct inet_peer *peer) +{ + unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + + return orig && + cmpxchg(&peer->pmtu_expires, orig, 0) == orig; +} + static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; @@ -1347,20 +1346,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, rt->rt_oif, rt_genid(dev_net(dst->dev))); -#if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", - &rt->rt_dst, rt->rt_tos); -#endif rt_del(hash, rt); ret = NULL; - } else if (rt->peer && - rt->peer->pmtu_expires && - time_after_eq(jiffies, rt->peer->pmtu_expires)) { - unsigned long orig = rt->peer->pmtu_expires; - - if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) - dst_metric_set(dst, RTAX_MTU, - rt->peer->pmtu_orig); + } else if (rt->peer && peer_pmtu_expired(rt->peer)) { + dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); } } return ret; @@ -1399,7 +1388,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) rcu_read_unlock(); if (!rt->peer) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); @@ -1435,7 +1424,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) peer->rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", - &rt->rt_src, rt->rt_iif, + &ip_hdr(skb)->saddr, rt->rt_iif, &rt->rt_dst, &rt->rt_gateway); #endif } @@ -1467,7 +1456,7 @@ static int ip_error(struct sk_buff *skb) } if (!rt->peer) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; send = true; @@ -1507,7 +1496,7 @@ static inline unsigned short guess_mtu(unsigned short old_mtu) return 68; } -unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, +unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, unsigned short new_mtu, struct net_device *dev) { @@ -1553,8 +1542,10 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) { - unsigned long expires = peer->pmtu_expires; + unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); + if (!expires) + return; if (time_before(jiffies, expires)) { u32 orig_dst_mtu = dst_mtu(dst); if (peer->pmtu_learned < orig_dst_mtu) { @@ -1574,13 +1565,14 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) dst_confirm(dst); if (!rt->peer) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (peer) { + unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; - if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { - unsigned long pmtu_expires; + if (!pmtu_expires || mtu < peer->pmtu_learned) { pmtu_expires = jiffies + ip_rt_mtu_expires; if (!pmtu_expires) @@ -1631,16 +1623,17 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) struct inet_peer *peer; if (!rt->peer) - rt_bind_peer(rt, 0); + rt_bind_peer(rt, rt->rt_dst, 0); peer = rt->peer; - if (peer && peer->pmtu_expires) + if (peer) { check_peer_pmtu(dst, peer); - if (peer && peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) { - if (check_peer_redir(dst, peer)) - return NULL; + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + if (check_peer_redir(dst, peer)) + return NULL; + } } rt->rt_peer_genid = rt_peer_genid(); @@ -1671,14 +1664,8 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt && - rt->peer && - rt->peer->pmtu_expires) { - unsigned long orig = rt->peer->pmtu_expires; - - if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) - dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); - } + if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) + dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); } static int ip_rt_bug(struct sk_buff *skb) @@ -1687,6 +1674,7 @@ static int ip_rt_bug(struct sk_buff *skb) &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); + WARN_ON(1); return 0; } @@ -1699,22 +1687,26 @@ static int ip_rt_bug(struct sk_buff *skb) in IP options! */ -void ip_rt_get_source(u8 *addr, struct rtable *rt) +void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; - struct fib_result res; if (rt_is_output_route(rt)) - src = rt->rt_src; + src = ip_hdr(skb)->saddr; else { - struct flowi4 fl4 = { - .daddr = rt->rt_key_dst, - .saddr = rt->rt_key_src, - .flowi4_tos = rt->rt_tos, - .flowi4_oif = rt->rt_oif, - .flowi4_iif = rt->rt_iif, - .flowi4_mark = rt->rt_mark, - }; + struct fib_result res; + struct flowi4 fl4; + struct iphdr *iph; + + iph = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + fl4.flowi4_tos = iph->tos; + fl4.flowi4_oif = rt->dst.dev->ifindex; + fl4.flowi4_iif = skb->dev->ifindex; + fl4.flowi4_mark = skb->mark; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) @@ -1767,7 +1759,7 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst) return mtu; } -static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4, +static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, struct fib_info *fi) { struct inet_peer *peer; @@ -1776,7 +1768,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4, /* If a peer entry exists for this destination, we must hook * it up in order to get at cached metrics. */ - if (oldflp4 && (oldflp4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) + if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) create = 1; rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); @@ -1787,8 +1779,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4, sizeof(u32) * RTAX_MAX); dst_init_metrics(&rt->dst, peer->metrics, false); - if (peer->pmtu_expires) - check_peer_pmtu(&rt->dst, peer); + check_peer_pmtu(&rt->dst, peer); if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; @@ -1803,7 +1794,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4, } } -static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4, +static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, const struct fib_result *res, struct fib_info *fi, u16 type, u32 itag) { @@ -1813,7 +1804,7 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4, if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - rt_init_metrics(rt, oldflp4, fi); + rt_init_metrics(rt, fl4, fi); #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = FIB_RES_NH(*res).nh_tclassid; #endif @@ -1830,20 +1821,15 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4, #endif set_class_tag(rt, itag); #endif - rt->rt_type = type; } -static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm) +static struct rtable *rt_dst_alloc(struct net_device *dev, + bool nopolicy, bool noxfrm) { - struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1); - if (rt) { - rt->dst.obsolete = -1; - - rt->dst.flags = DST_HOST | - (nopolicy ? DST_NOPOLICY : 0) | - (noxfrm ? DST_NOXFRM : 0); - } - return rt; + return dst_alloc(&ipv4_dst_ops, dev, 1, -1, + DST_HOST | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); } /* called in rcu_read_lock() section */ @@ -1871,36 +1857,38 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag, 0); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); if (err < 0) goto e_err; } - rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + rth = rt_dst_alloc(init_net.loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +#endif rth->dst.output = ip_rt_bug; rth->rt_key_dst = daddr; - rth->rt_dst = daddr; - rth->rt_tos = tos; - rth->rt_mark = skb->mark; rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(dev_net(dev)); + rth->rt_flags = RTCF_MULTICAST; + rth->rt_type = RTN_MULTICAST; + rth->rt_key_tos = tos; + rth->rt_dst = daddr; rth->rt_src = saddr; -#ifdef CONFIG_IP_ROUTE_CLASSID - rth->dst.tclassid = itag; -#endif rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; - rth->dst.dev = init_net.loopback_dev; - dev_hold(rth->dst.dev); rth->rt_oif = 0; + rth->rt_mark = skb->mark; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->rt_genid = rt_genid(dev_net(dev)); - rth->rt_flags = RTCF_MULTICAST; - rth->rt_type = RTN_MULTICAST; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1914,9 +1902,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); rth = rt_intern_hash(hash, rth, skb, dev->ifindex); - err = 0; - if (IS_ERR(rth)) - err = PTR_ERR(rth); + return IS_ERR(rth) ? PTR_ERR(rth) : 0; e_nobufs: return -ENOBUFS; @@ -1981,8 +1967,8 @@ static int __mkroute_input(struct sk_buff *skb, } - err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag, skb->mark); + err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, &spec_dst, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2013,7 +1999,8 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + rth = rt_dst_alloc(out_dev->dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; @@ -2021,27 +2008,28 @@ static int __mkroute_input(struct sk_buff *skb, } rth->rt_key_dst = daddr; - rth->rt_dst = daddr; - rth->rt_tos = tos; - rth->rt_mark = skb->mark; rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); + rth->rt_flags = flags; + rth->rt_type = res->type; + rth->rt_key_tos = tos; + rth->rt_dst = daddr; rth->rt_src = saddr; - rth->rt_gateway = daddr; rth->rt_route_iif = in_dev->dev->ifindex; rth->rt_iif = in_dev->dev->ifindex; - rth->dst.dev = (out_dev)->dev; - dev_hold(rth->dst.dev); rth->rt_oif = 0; + rth->rt_mark = skb->mark; + rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; rth->dst.input = ip_forward; rth->dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); - rth->rt_flags = flags; - *result = rth; err = 0; cleanup: @@ -2150,9 +2138,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; if (res.type == RTN_LOCAL) { - err = fib_validate_source(saddr, daddr, tos, + err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag, skb->mark); + dev, &spec_dst, &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2176,8 +2164,8 @@ brd_input: if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag, skb->mark); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2188,36 +2176,42 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); + rth = rt_dst_alloc(net->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; + rth->dst.input= ip_local_deliver; rth->dst.output= ip_rt_bug; - rth->rt_genid = rt_genid(net); +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +#endif rth->rt_key_dst = daddr; - rth->rt_dst = daddr; - rth->rt_tos = tos; - rth->rt_mark = skb->mark; rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(net); + rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + rth->rt_key_tos = tos; + rth->rt_dst = daddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_route_iif = dev->ifindex; rth->rt_iif = dev->ifindex; - rth->dst.dev = net->loopback_dev; - dev_hold(rth->dst.dev); + rth->rt_oif = 0; + rth->rt_mark = skb->mark; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->dst.input= ip_local_deliver; - rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); err = 0; @@ -2288,7 +2282,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | (rth->rt_iif ^ iif) | rth->rt_oif | - (rth->rt_tos ^ tos)) == 0 && + (rth->rt_key_tos ^ tos)) == 0 && rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { @@ -2349,12 +2343,12 @@ EXPORT_SYMBOL(ip_route_input_common); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, - const struct flowi4 *oldflp4, - struct net_device *dev_out, + __be32 orig_daddr, __be32 orig_saddr, + int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; - u32 tos = RT_FL_TOS(oldflp4); + u32 tos = RT_FL_TOS(fl4); struct in_device *in_dev; u16 type = res->type; struct rtable *rth; @@ -2381,8 +2375,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fi = NULL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; - if (!ip_check_mc_rcu(in_dev, oldflp4->daddr, oldflp4->saddr, - oldflp4->flowi4_proto)) + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, + fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use * default one, but do not gateway in this case. @@ -2392,29 +2386,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fi = NULL; } - rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + rth = rt_dst_alloc(dev_out, + IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); - rth->rt_key_dst = oldflp4->daddr; - rth->rt_tos = tos; - rth->rt_key_src = oldflp4->saddr; - rth->rt_oif = oldflp4->flowi4_oif; - rth->rt_mark = oldflp4->flowi4_mark; + rth->dst.output = ip_output; + + rth->rt_key_dst = orig_daddr; + rth->rt_key_src = orig_saddr; + rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_flags = flags; + rth->rt_type = type; + rth->rt_key_tos = tos; rth->rt_dst = fl4->daddr; rth->rt_src = fl4->saddr; rth->rt_route_iif = 0; - rth->rt_iif = oldflp4->flowi4_oif ? : dev_out->ifindex; - /* get references to the devices that are to be hold by the routing - cache entry */ - rth->dst.dev = dev_out; - dev_hold(dev_out); + rth->rt_iif = orig_oif ? : dev_out->ifindex; + rth->rt_oif = orig_oif; + rth->rt_mark = fl4->flowi4_mark; rth->rt_gateway = fl4->daddr; rth->rt_spec_dst= fl4->saddr; - - rth->dst.output=ip_output; - rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; RT_CACHE_STAT_INC(out_slow_tot); @@ -2432,7 +2428,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !ipv4_is_local_multicast(oldflp4->daddr)) { + !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } @@ -2440,9 +2436,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, #endif } - rt_set_nexthop(rth, oldflp4, res, fi, type, 0); + rt_set_nexthop(rth, fl4, res, fi, type, 0); - rth->rt_flags = flags; return rth; } @@ -2451,36 +2446,37 @@ static struct rtable *__mkroute_output(const struct fib_result *res, * called with rcu_read_lock(); */ -static struct rtable *ip_route_output_slow(struct net *net, - const struct flowi4 *oldflp4) +static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) { - u32 tos = RT_FL_TOS(oldflp4); - struct flowi4 fl4; - struct fib_result res; - unsigned int flags = 0; struct net_device *dev_out = NULL; + u32 tos = RT_FL_TOS(fl4); + unsigned int flags = 0; + struct fib_result res; struct rtable *rth; + __be32 orig_daddr; + __be32 orig_saddr; + int orig_oif; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - fl4.flowi4_oif = oldflp4->flowi4_oif; - fl4.flowi4_iif = net->loopback_dev->ifindex; - fl4.flowi4_mark = oldflp4->flowi4_mark; - fl4.daddr = oldflp4->daddr; - fl4.saddr = oldflp4->saddr; - fl4.flowi4_tos = tos & IPTOS_RT_MASK; - fl4.flowi4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + orig_daddr = fl4->daddr; + orig_saddr = fl4->saddr; + orig_oif = fl4->flowi4_oif; + + fl4->flowi4_iif = net->loopback_dev->ifindex; + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); rcu_read_lock(); - if (oldflp4->saddr) { + if (fl4->saddr) { rth = ERR_PTR(-EINVAL); - if (ipv4_is_multicast(oldflp4->saddr) || - ipv4_is_lbcast(oldflp4->saddr) || - ipv4_is_zeronet(oldflp4->saddr)) + if (ipv4_is_multicast(fl4->saddr) || + ipv4_is_lbcast(fl4->saddr) || + ipv4_is_zeronet(fl4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2491,11 +2487,11 @@ static struct rtable *ip_route_output_slow(struct net *net, of another iface. --ANK */ - if (oldflp4->flowi4_oif == 0 && - (ipv4_is_multicast(oldflp4->daddr) || - ipv4_is_lbcast(oldflp4->daddr))) { + if (fl4->flowi4_oif == 0 && + (ipv4_is_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = __ip_dev_find(net, oldflp4->saddr, false); + dev_out = __ip_dev_find(net, fl4->saddr, false); if (dev_out == NULL) goto out; @@ -2514,20 +2510,20 @@ static struct rtable *ip_route_output_slow(struct net *net, Luckily, this hack is good workaround. */ - fl4.flowi4_oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; goto make_route; } - if (!(oldflp4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { + if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - if (!__ip_dev_find(net, oldflp4->saddr, false)) + if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } - if (oldflp4->flowi4_oif) { - dev_out = dev_get_by_index_rcu(net, oldflp4->flowi4_oif); + if (fl4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; @@ -2537,37 +2533,37 @@ static struct rtable *ip_route_output_slow(struct net *net, rth = ERR_PTR(-ENETUNREACH); goto out; } - if (ipv4_is_local_multicast(oldflp4->daddr) || - ipv4_is_lbcast(oldflp4->daddr)) { - if (!fl4.saddr) - fl4.saddr = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (ipv4_is_local_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr)) { + if (!fl4->saddr) + fl4->saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); goto make_route; } - if (!fl4.saddr) { - if (ipv4_is_multicast(oldflp4->daddr)) - fl4.saddr = inet_select_addr(dev_out, 0, - fl4.flowi4_scope); - else if (!oldflp4->daddr) - fl4.saddr = inet_select_addr(dev_out, 0, - RT_SCOPE_HOST); + if (fl4->saddr) { + if (ipv4_is_multicast(fl4->daddr)) + fl4->saddr = inet_select_addr(dev_out, 0, + fl4->flowi4_scope); + else if (!fl4->daddr) + fl4->saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_HOST); } } - if (!fl4.daddr) { - fl4.daddr = fl4.saddr; - if (!fl4.daddr) - fl4.daddr = fl4.saddr = htonl(INADDR_LOOPBACK); + if (!fl4->daddr) { + fl4->daddr = fl4->saddr; + if (!fl4->daddr) + fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; - fl4.flowi4_oif = net->loopback_dev->ifindex; + fl4->flowi4_oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(net, &fl4, &res)) { + if (fib_lookup(net, fl4, &res)) { res.fi = NULL; - if (oldflp4->flowi4_oif) { + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2586,9 +2582,9 @@ static struct rtable *ip_route_output_slow(struct net *net, likely IPv6, but we do not. */ - if (fl4.saddr == 0) - fl4.saddr = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (fl4->saddr == 0) + fl4->saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } @@ -2597,42 +2593,45 @@ static struct rtable *ip_route_output_slow(struct net *net, } if (res.type == RTN_LOCAL) { - if (!fl4.saddr) { + if (!fl4->saddr) { if (res.fi->fib_prefsrc) - fl4.saddr = res.fi->fib_prefsrc; + fl4->saddr = res.fi->fib_prefsrc; else - fl4.saddr = fl4.daddr; + fl4->saddr = fl4->daddr; } dev_out = net->loopback_dev; - fl4.flowi4_oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4.flowi4_oif == 0) + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif) + if (!res.prefixlen && + res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4->flowi4_oif) fib_select_default(&res); - if (!fl4.saddr) - fl4.saddr = FIB_RES_PREFSRC(net, res); + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, res); dev_out = FIB_RES_DEV(res); - fl4.flowi4_oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; make_route: - rth = __mkroute_output(&res, &fl4, oldflp4, dev_out, flags); + rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, + dev_out, flags); if (!IS_ERR(rth)) { unsigned int hash; - hash = rt_hash(oldflp4->daddr, oldflp4->saddr, oldflp4->flowi4_oif, + hash = rt_hash(orig_daddr, orig_saddr, orig_oif, rt_genid(dev_net(dev_out))); - rth = rt_intern_hash(hash, rth, NULL, oldflp4->flowi4_oif); + rth = rt_intern_hash(hash, rth, NULL, orig_oif); } out: @@ -2640,7 +2639,7 @@ out: return rth; } -struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) { struct rtable *rth; unsigned int hash; @@ -2658,13 +2657,17 @@ struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4) rt_is_output_route(rth) && rth->rt_oif == flp4->flowi4_oif && rth->rt_mark == flp4->flowi4_mark && - !((rth->rt_tos ^ flp4->flowi4_tos) & + !((rth->rt_key_tos ^ flp4->flowi4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); + if (!flp4->saddr) + flp4->saddr = rth->rt_src; + if (!flp4->daddr) + flp4->daddr = rth->rt_dst; return rth; } RT_CACHE_STAT_INC(out_hlist_search); @@ -2709,7 +2712,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, 1); + struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); struct rtable *ort = (struct rtable *) dst_orig; if (rt) { @@ -2726,7 +2729,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_key_dst = ort->rt_key_dst; rt->rt_key_src = ort->rt_key_src; - rt->rt_tos = ort->rt_tos; + rt->rt_key_tos = ort->rt_key_tos; rt->rt_route_iif = ort->rt_route_iif; rt->rt_iif = ort->rt_iif; rt->rt_oif = ort->rt_oif; @@ -2762,15 +2765,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (IS_ERR(rt)) return rt; - if (flp4->flowi4_proto) { - if (!flp4->saddr) - flp4->saddr = rt->rt_src; - if (!flp4->daddr) - flp4->daddr = rt->rt_dst; + if (flp4->flowi4_proto) rt = (struct rtable *) xfrm_lookup(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); - } return rt; } @@ -2783,7 +2781,8 @@ static int rt_fill_info(struct net *net, struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; - long expires; + long expires = 0; + const struct inet_peer *peer = rt->peer; u32 id = 0, ts = 0, tsage = 0, error; nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); @@ -2794,7 +2793,7 @@ static int rt_fill_info(struct net *net, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->rt_tos; + r->rtm_tos = rt->rt_key_tos; r->rtm_table = RT_TABLE_MAIN; NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); r->rtm_type = rt->rt_type; @@ -2831,15 +2830,16 @@ static int rt_fill_info(struct net *net, NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); error = rt->dst.error; - expires = (rt->peer && rt->peer->pmtu_expires) ? - rt->peer->pmtu_expires - jiffies : 0; - if (rt->peer) { + if (peer) { inet_peer_refcheck(rt->peer); - id = atomic_read(&rt->peer->ip_id_count) & 0xffff; - if (rt->peer->tcp_ts_stamp) { - ts = rt->peer->tcp_ts; - tsage = get_seconds() - rt->peer->tcp_ts_stamp; + id = atomic_read(&peer->ip_id_count) & 0xffff; + if (peer->tcp_ts_stamp) { + ts = peer->tcp_ts; + tsage = get_seconds() - peer->tcp_ts_stamp; } + expires = ACCESS_ONCE(peer->pmtu_expires); + if (expires) + expires -= jiffies; } if (rt_is_input_route(rt)) { @@ -2848,7 +2848,9 @@ static int rt_fill_info(struct net *net, if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, r, nowait); + int err = ipmr_get_route(net, skb, + rt->rt_src, rt->rt_dst, + r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 8b44c6d2a79..26461492a84 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -321,10 +321,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * the ACK carries the same options again (see RFC1122 4.2.3.8) */ if (opt && opt->optlen) { - int opt_size = sizeof(struct ip_options) + opt->optlen; + int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; ireq->opt = kmalloc(opt_size, GFP_ATOMIC); - if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) { + if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) { kfree(ireq->opt); ireq->opt = NULL; } @@ -345,17 +345,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * no easy way to do this. */ { - struct flowi4 fl4 = { - .flowi4_mark = sk->sk_mark, - .daddr = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .saddr = ireq->loc_addr, - .flowi4_tos = RT_CONN_FLAGS(sk), - .flowi4_proto = IPPROTO_TCP, - .flowi4_flags = inet_sk_flowi_flags(sk), - .fl4_sport = th->dest, - .fl4_dport = th->source, - }; + struct flowi4 fl4; + + flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + RT_SCOPE_UNIVERSE, IPPROTO_TCP, + inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 321e6e84dbc..57d0752e239 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -13,6 +13,7 @@ #include <linux/seqlock.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/nsproxy.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -21,6 +22,7 @@ #include <net/udp.h> #include <net/cipso_ipv4.h> #include <net/inet_frag.h> +#include <net/ping.h> static int zero; static int tcp_retr1_max = 255; @@ -30,6 +32,8 @@ static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; static int ip_ttl_min = 1; static int ip_ttl_max = 255; +static int ip_ping_group_range_min[] = { 0, 0 }; +static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(int range[2]) @@ -68,6 +72,53 @@ static int ipv4_local_port_range(ctl_table *table, int write, return ret; } + +void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) +{ + gid_t *data = table->data; + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = data[0]; + *high = data[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} + +/* Update system visible IP port range */ +static void set_ping_group_range(struct ctl_table *table, int range[2]) +{ + gid_t *data = table->data; + write_seqlock(&sysctl_local_ports.lock); + data[0] = range[0]; + data[1] = range[1]; + write_sequnlock(&sysctl_local_ports.lock); +} + +/* Validate changes from /proc interface. */ +static int ipv4_ping_group_range(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + gid_t range[2]; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_ping_group_range_min, + .extra2 = &ip_ping_group_range_max, + }; + + inet_get_ping_group_range_table(table, range, range + 1); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + set_ping_group_range(table, range); + + return ret; +} + static int proc_tcp_congestion_control(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -677,6 +728,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ping_group_range", + .data = &init_net.ipv4.sysctl_ping_group_range, + .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), + .mode = 0644, + .proc_handler = ipv4_ping_group_range, + }, { } }; @@ -711,8 +769,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) &net->ipv4.sysctl_icmp_ratemask; table[6].data = &net->ipv4.sysctl_rt_cache_rebuild_count; + table[7].data = + &net->ipv4.sysctl_ping_group_range; + } + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. + */ + net->ipv4.sysctl_ping_group_range[0] = 1; + net->ipv4.sysctl_ping_group_range[1] = 0; + net->ipv4.sysctl_rt_cache_rebuild_count = 4; net->ipv4.ipv4_hdr = register_net_sysctl_table(net, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b22d4501054..46febcacb72 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -999,7 +999,8 @@ new_segment: /* We have some space in skb head. Superb! */ if (copy > skb_tailroom(skb)) copy = skb_tailroom(skb); - if ((err = skb_add_data(skb, from, copy)) != 0) + err = skb_add_data_nocache(sk, skb, from, copy); + if (err) goto do_fault; } else { int merge = 0; @@ -1042,8 +1043,8 @@ new_segment: /* Time to copy data. We are close to * the end! */ - err = skb_copy_to_page(sk, from, skb, page, - off, copy); + err = skb_copy_to_page_nocache(sk, from, skb, + page, off, copy); if (err) { /* If this page was new, give it to the * socket so it does not get leaked. @@ -3219,7 +3220,7 @@ __setup("thash_entries=", set_thash_entries); void __init tcp_init(void) { struct sk_buff *skb = NULL; - unsigned long nr_pages, limit; + unsigned long limit; int i, max_share, cnt; unsigned long jiffy = jiffies; @@ -3276,13 +3277,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - /* Set the pressure threshold to be a fraction of global memory that - * is up to 1/2 at 256 MB, decreasing toward zero with the amount of - * memory, with a floor of 128 pages. - */ - nr_pages = totalram_pages - totalhigh_pages; - limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; sysctl_tcp_mem[1] = limit; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f7e6c2c2d2b..708dc203b03 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -146,13 +146,15 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique); /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { + struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; __be16 orig_sport, orig_dport; - struct rtable *rt; __be32 daddr, nexthop; + struct flowi4 *fl4; + struct rtable *rt; int err; + struct ip_options_rcu *inet_opt; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; @@ -161,15 +163,18 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; - if (inet->opt && inet->opt->srr) { + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; - nexthop = inet->opt->faddr; + nexthop = inet_opt->opt.faddr; } orig_sport = inet->inet_sport; orig_dport = usin->sin_port; - rt = ip_route_connect(nexthop, inet->inet_saddr, + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, orig_dport, sk, true); @@ -185,11 +190,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -ENETUNREACH; } - if (!inet->opt || !inet->opt->srr) - daddr = rt->rt_dst; + if (!inet_opt || !inet_opt->opt.srr) + daddr = fl4->daddr; if (!inet->inet_saddr) - inet->inet_saddr = rt->rt_src; + inet->inet_saddr = fl4->saddr; inet->inet_rcv_saddr = inet->inet_saddr; if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { @@ -200,8 +205,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (tcp_death_row.sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { - struct inet_peer *peer = rt_get_peer(rt); + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { + struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); /* * VJ's idea. We save last timestamp seen from * the destination in peer table, when entering state @@ -221,8 +226,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; - if (inet->opt) - inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; + if (inet_opt) + inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; @@ -236,8 +241,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; - rt = ip_route_newports(rt, IPPROTO_TCP, - orig_sport, orig_dport, + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -279,7 +283,7 @@ EXPORT_SYMBOL(tcp_v4_connect); /* * This routine does path mtu discovery as defined in RFC1191. */ -static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) +static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -341,7 +345,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { - struct iphdr *iph = (struct iphdr *)icmp_skb->data; + const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); struct inet_connection_sock *icsk; struct tcp_sock *tp; @@ -647,7 +651,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; net = dev_net(skb_dst(skb)->dev); - ip_send_reply(net->ipv4.tcp_sock, skb, + ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); @@ -722,7 +726,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; - ip_send_reply(net->ipv4.tcp_sock, skb, + ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); @@ -765,11 +769,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); + struct flowi4 fl4; int err = -1; struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, rvp); @@ -820,17 +825,18 @@ static void syn_flood_warning(const struct sk_buff *skb) /* * Save and compile IPv4 options into the request_sock if needed. */ -static struct ip_options *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) { - struct ip_options *opt = &(IPCB(skb)->opt); - struct ip_options *dopt = NULL; + const struct ip_options *opt = &(IPCB(skb)->opt); + struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { - int opt_size = optlength(opt); + int opt_size = sizeof(*dopt) + opt->optlen; + dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt) { - if (ip_options_echo(dopt, skb)) { + if (ip_options_echo(&dopt->opt, skb)) { kfree(dopt); dopt = NULL; } @@ -1333,6 +1339,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { struct inet_peer *peer = NULL; + struct flowi4 fl4; /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering @@ -1345,9 +1352,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, req)) != NULL && - (peer = rt_get_peer((struct rtable *)dst)) != NULL && - peer->daddr.addr.a4 == saddr) { + (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && + fl4.daddr == saddr && + (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > @@ -1411,19 +1418,16 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct ip_options_rcu *inet_opt; if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) - goto exit; - newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; - sk_setup_caps(newsk, dst); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); @@ -1431,15 +1435,21 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->inet_daddr = ireq->rmt_addr; newinet->inet_rcv_saddr = ireq->loc_addr; newinet->inet_saddr = ireq->loc_addr; - newinet->opt = ireq->opt; + inet_opt = ireq->opt; + rcu_assign_pointer(newinet->inet_opt, inet_opt); ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newinet->opt) - inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; + if (inet_opt) + inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; + if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) + goto put_and_exit; + + sk_setup_caps(newsk, dst); + tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); @@ -1467,10 +1477,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif - if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); - goto exit; - } + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; __inet_hash_nolisten(newsk, NULL); return newsk; @@ -1482,6 +1490,9 @@ exit_nonewsk: exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; +put_and_exit: + sock_put(newsk); + goto exit; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); @@ -1578,6 +1589,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; if (nsk != sk) { + sock_rps_save_rxhash(nsk, skb->rxhash); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1764,12 +1776,13 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) struct inet_sock *inet = inet_sk(sk); struct inet_peer *peer; - if (!rt || rt->rt_dst != inet->inet_daddr) { + if (!rt || + inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { peer = inet_getpeer_v4(inet->inet_daddr, 1); *release_it = true; } else { if (!rt->peer) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, inet->inet_daddr, 1); peer = rt->peer; *release_it = false; } @@ -2359,7 +2372,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, int ttd = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", i, ireq->loc_addr, ntohs(inet_sk(sk)->inet_sport), @@ -2414,7 +2427,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", + "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, rx_queue, @@ -2449,7 +2462,7 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw, srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, atomic_read(&tw->tw_refcnt), tw, len); @@ -2527,7 +2540,7 @@ void tcp4_proc_exit(void) struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct iphdr *iph = skb_gro_network_header(skb); + const struct iphdr *iph = skb_gro_network_header(skb); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -2548,7 +2561,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) int tcp4_gro_complete(struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 17388c7f49c..882e0b0964d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -899,7 +899,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb); + err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); if (likely(err <= 0)) return err; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f87a8eb76f3..198f75b7bdd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -578,7 +578,7 @@ found: void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; @@ -706,12 +706,11 @@ static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) } } -static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport) +static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct udphdr *uh; - struct rtable *rt = (struct rtable *)skb_dst(skb); int err = 0; int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); @@ -723,7 +722,7 @@ static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport) */ uh = udp_hdr(skb); uh->source = inet->inet_sport; - uh->dest = dport; + uh->dest = fl4->fl4_dport; uh->len = htons(len); uh->check = 0; @@ -737,14 +736,14 @@ static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp4_hwcsum(skb, rt->rt_src, daddr); + udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; } else csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ - uh->check = csum_tcpudp_magic(rt->rt_src, daddr, len, + uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; @@ -774,11 +773,11 @@ static int udp_push_pending_frames(struct sock *sk) struct sk_buff *skb; int err = 0; - skb = ip_finish_skb(sk); + skb = ip_finish_skb(sk, fl4); if (!skb) goto out; - err = udp_send_skb(skb, fl4->daddr, fl4->fl4_dport); + err = udp_send_skb(skb, fl4); out: up->len = 0; @@ -791,6 +790,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); + struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; @@ -804,6 +804,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; + struct ip_options_data opt_copy; if (len > 0xFFFF) return -EMSGSIZE; @@ -820,6 +821,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; + fl4 = &inet->cork.fl.u.ip4; if (up->pending) { /* * There are pending frames. @@ -877,22 +879,32 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, free = 1; connected = 0; } - if (!ipc.opt) - ipc.opt = inet->opt; + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } saddr = ipc.addr; ipc.addr = faddr = daddr; - if (ipc.opt && ipc.opt->srr) { + if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) return -EINVAL; - faddr = ipc.opt->faddr; + faddr = ipc.opt->opt.faddr; connected = 0; } tos = RT_TOS(inet->tos); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || - (ipc.opt && ipc.opt->is_strictroute)) { + (ipc.opt && ipc.opt->opt.is_strictroute)) { tos |= RTO_ONLINK; connected = 0; } @@ -909,22 +921,16 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { - struct flowi4 fl4 = { - .flowi4_oif = ipc.oif, - .flowi4_mark = sk->sk_mark, - .daddr = faddr, - .saddr = saddr, - .flowi4_tos = tos, - .flowi4_proto = sk->sk_protocol, - .flowi4_flags = (inet_sk_flowi_flags(sk) | - FLOWI_FLAG_CAN_SLEEP), - .fl4_sport = inet->inet_sport, - .fl4_dport = dport, - }; struct net *net = sock_net(sk); - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); + fl4 = &fl4_stack; + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + faddr, saddr, dport, inet->inet_sport); + + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; @@ -945,18 +951,18 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto do_confirm; back_from_confirm: - saddr = rt->rt_src; + saddr = fl4->saddr; if (!ipc.addr) - daddr = ipc.addr = rt->rt_dst; + daddr = ipc.addr = fl4->daddr; /* Lockless fast path for the non-corking case. */ if (!corkreq) { - skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen, + skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); if (skb && !IS_ERR(skb)) - err = udp_send_skb(skb, daddr, dport); + err = udp_send_skb(skb, fl4); goto out; } @@ -982,9 +988,9 @@ back_from_confirm: do_append_data: up->len += ulen; - err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, - sizeof(struct udphdr), &ipc, &rt, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, &rt, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) @@ -1024,6 +1030,7 @@ EXPORT_SYMBOL(udp_sendmsg); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); int ret; @@ -1048,7 +1055,8 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, return -EINVAL; } - ret = ip_append_page(sk, page, offset, size, flags); + ret = ip_append_page(sk, &inet->cork.fl.u.ip4, + page, offset, size, flags); if (ret == -EOPNOTSUPP) { release_sock(sk); return sock_no_sendpage(sk->sk_socket, page, offset, @@ -1241,6 +1249,9 @@ csum_copy_err: if (noblock) return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; goto try_again; } @@ -2082,7 +2093,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), @@ -2198,16 +2209,10 @@ void __init udp_table_init(struct udp_table *table, const char *name) void __init udp_init(void) { - unsigned long nr_pages, limit; + unsigned long limit; udp_table_init(&udp_table, "UDP"); - /* Set the pressure threshold up by the same strategy of TCP. It is a - * fraction of global memory that is up to 1/2 at 256 MB, decreasing - * toward zero with the amount of memory, with a floor of 128 pages. - */ - nr_pages = totalram_pages - totalhigh_pages; - limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 2d51840e53a..327a617d594 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -32,7 +32,12 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) dst = skb_dst(skb); mtu = dst_mtu(dst); if (skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + if (skb->sk) + ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, + inet_sk(skb->sk)->inet_dport, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); ret = -EMSGSIZE; } out: diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d20a05e970d..981e43eaf70 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -18,38 +18,46 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr) +static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, + int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) { - struct flowi4 fl4 = { - .daddr = daddr->a4, - .flowi4_tos = tos, - }; struct rtable *rt; + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = daddr->a4; + fl4->flowi4_tos = tos; if (saddr) - fl4.saddr = saddr->a4; + fl4->saddr = saddr->a4; - rt = __ip_route_output_key(net, &fl4); + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; return ERR_CAST(rt); } +static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct flowi4 fl4; + + return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr); +} + static int xfrm4_get_saddr(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; - struct rtable *rt; + struct flowi4 fl4; - dst = xfrm4_dst_lookup(net, 0, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; - rt = (struct rtable *)dst; - saddr->a4 = rt->rt_src; + saddr->a4 = fl4.saddr; dst_release(dst); return 0; } @@ -73,7 +81,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, rt->rt_key_dst = fl4->daddr; rt->rt_key_src = fl4->saddr; - rt->rt_tos = fl4->flowi4_tos; + rt->rt_key_tos = fl4->flowi4_tos; rt->rt_route_iif = fl4->flowi4_iif; rt->rt_iif = fl4->flowi4_iif; rt->rt_oif = fl4->flowi4_oif; @@ -102,7 +110,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, static void _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) { - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 805d63ef434..d9ac0a0058b 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -55,7 +55,7 @@ xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, int xfrm4_extract_header(struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = iph->id; |