From b6c6712a42ca3f9fa7f4a3d7c40e3a9dd1fd9e03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Apr 2010 23:03:29 +0000 Subject: net: sk_dst_cache RCUification With latest CONFIG_PROVE_RCU stuff, I felt more comfortable to make this work. sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock) This rwlock is readlocked for a very small amount of time, and dst entries are already freed after RCU grace period. This calls for RCU again :) This patch converts sk_dst_lock to a spinlock, and use RCU for readers. __sk_dst_get() is supposed to be called with rcu_read_lock() or if socket locked by user, so use appropriate rcu_dereference_check() condition (rcu_read_lock_held() || sock_owned_by_user(sk)) This patch avoids two atomic ops per tx packet on UDP connected sockets, for example, and permits sk_dst_lock to be much less dirtied. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'net/ipv6/ipv6_sockglue.c') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 33f60fca7aa..1160400e9db 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -114,9 +114,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, } opt = xchg(&inet6_sk(sk)->opt, opt); } else { - write_lock(&sk->sk_dst_lock); + spin_lock(&sk->sk_dst_lock); opt = xchg(&inet6_sk(sk)->opt, opt); - write_unlock(&sk->sk_dst_lock); + spin_unlock(&sk->sk_dst_lock); } sk_dst_reset(sk); @@ -971,14 +971,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_MTU: { struct dst_entry *dst; + val = 0; - lock_sock(sk); - dst = sk_dst_get(sk); - if (dst) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) val = dst_mtu(dst); - dst_release(dst); - } - release_sock(sk); + rcu_read_unlock(); if (!val) return -ENOTCONN; break; @@ -1066,12 +1065,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, else val = np->mcast_hops; - dst = sk_dst_get(sk); - if (dst) { - if (val < 0) + if (val < 0) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) val = ip6_dst_hoplimit(dst); - dst_release(dst); + rcu_read_unlock(); } + if (val < 0) val = sock_net(sk)->ipv6.devconf_all->hop_limit; break; -- cgit v1.2.3-70-g09d2 From e802af9cabb011f09b9c19a82faef3dd315f27eb Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 22 Apr 2010 15:24:53 -0700 Subject: IPv6: Generic TTL Security Mechanism (final version) This patch adds IPv6 support for RFC5082 Generalized TTL Security Mechanism. Not to users of mapped address; the IPV6 and IPV4 socket options are seperate. The server does have to deal with both IPv4 and IPv6 socket options and the client has to handle the different for each family. On client: int ttl = 255; getaddrinfo(argv[1], argv[2], &hint, &result); for (rp = result; rp != NULL; rp = rp->ai_next) { s = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); if (s < 0) continue; if (rp->ai_family == AF_INET) { setsockopt(s, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)); } else if (rp->ai_family == AF_INET6) { setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl))) } if (connect(s, rp->ai_addr, rp->ai_addrlen) == 0) { ... On server: int minttl = 255 - maxhops; getaddrinfo(NULL, port, &hints, &result); for (rp = result; rp != NULL; rp = rp->ai_next) { s = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); if (s < 0) continue; if (rp->ai_family == AF_INET6) setsockopt(s, IPPROTO_IPV6, IPV6_MINHOPCOUNT, &minttl, sizeof(minttl)); setsockopt(s, IPPROTO_IP, IP_MINTTL, &minttl, sizeof(minttl)); if (bind(s, rp->ai_addr, rp->ai_addrlen) == 0) break ... Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/in6.h | 3 +++ include/linux/ipv6.h | 1 + net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ net/ipv6/tcp_ipv6.c | 14 +++++++++++++- 4 files changed, 29 insertions(+), 1 deletion(-) (limited to 'net/ipv6/ipv6_sockglue.c') diff --git a/include/linux/in6.h b/include/linux/in6.h index bd55c6e46b2..9b90cb296eb 100644 --- a/include/linux/in6.h +++ b/include/linux/in6.h @@ -265,6 +265,9 @@ struct in6_flowlabel_req { #define IPV6_PREFER_SRC_CGA 0x0008 #define IPV6_PREFER_SRC_NONCGA 0x0800 +/* RFC5082: Generalized Ttl Security Mechanism */ +#define IPV6_MINHOPCOUNT 73 + /* * Multicast Routing: * see include/linux/mroute6.h. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e0cc9a7db2b..1bdbebf08d1 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -348,6 +348,7 @@ struct ipv6_pinfo { * 010: prefer public address * 100: prefer care-of address */ + __u8 min_hopcount; __u8 tclass; __u32 dst_cookie; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 1160400e9db..92295ad3487 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -767,6 +767,14 @@ pref_skip_coa: break; } + case IPV6_MINHOPCOUNT: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + np->min_hopcount = val; + retv = 0; + break; } release_sock(sk); @@ -1116,6 +1124,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val |= IPV6_PREFER_SRC_HOME; break; + case IPV6_MINHOPCOUNT: + val = np->min_hopcount; + break; + default: return -ENOPROTOOPT; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1ababbb4113..6603511e367 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -353,6 +353,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk->sk_state == TCP_CLOSE) goto out; + if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && @@ -1678,6 +1683,7 @@ ipv6_pktoptions: static int tcp_v6_rcv(struct sk_buff *skb) { struct tcphdr *th; + struct ipv6hdr *hdr; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); @@ -1704,12 +1710,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) goto bad_packet; th = tcp_hdr(skb); + hdr = ipv6_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(ipv6_hdr(skb)); + TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); @@ -1720,6 +1727,11 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; -- cgit v1.2.3-70-g09d2 From 793b14731686595a741d9f47726ad8b9a235385a Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Fri, 23 Apr 2010 11:26:07 +0000 Subject: IPv6: data structure changes for new socket options Add underlying data structure changes and basic setsockopt() and getsockopt() support for IPV6_RECVPATHMTU, IPV6_PATHMTU, and IPV6_DONTFRAG. IPV6_PATHMTU is actually fully functional at this point. Signed-off-by: Brian Haley Signed-off-by: David S. Miller --- include/linux/in6.h | 2 +- include/linux/ipv6.h | 13 ++++++++++--- net/ipv6/ipv6_sockglue.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 4 deletions(-) (limited to 'net/ipv6/ipv6_sockglue.c') diff --git a/include/linux/in6.h b/include/linux/in6.h index 9b90cb296eb..c4bf46f764b 100644 --- a/include/linux/in6.h +++ b/include/linux/in6.h @@ -221,10 +221,10 @@ struct in6_flowlabel_req { #define IPV6_RTHDR 57 #define IPV6_RECVDSTOPTS 58 #define IPV6_DSTOPTS 59 -#if 0 /* not yet */ #define IPV6_RECVPATHMTU 60 #define IPV6_PATHMTU 61 #define IPV6_DONTFRAG 62 +#if 0 /* not yet */ #define IPV6_USE_MIN_MTU 63 #endif diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1bdbebf08d1..1976942cf6f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -21,6 +21,10 @@ struct in6_pktinfo { int ipi6_ifindex; }; +struct ip6_mtuinfo { + struct sockaddr_in6 ip6m_addr; + __u32 ip6m_mtu; +}; struct in6_ifreq { struct in6_addr ifr6_addr; @@ -334,22 +338,25 @@ struct ipv6_pinfo { dstopts:1, odstopts:1, rxflow:1, - rxtclass:1; + rxtclass:1, + rxpmtu:1; } bits; __u16 all; } rxopt; /* sockopt flags */ - __u8 recverr:1, + __u16 recverr:1, sndflow:1, pmtudisc:2, ipv6only:1, - srcprefs:3; /* 001: prefer temporary address + srcprefs:3, /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ + dontfrag:1; __u8 min_hopcount; __u8 tclass; + __u8 padding; __u32 dst_cookie; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 92295ad3487..2bf9eda7278 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -337,6 +337,13 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_RECVPATHMTU: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxpmtu = valbool; + retv = 0; + break; + case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: @@ -773,6 +780,9 @@ pref_skip_coa: if (val < 0 || val > 255) goto e_inval; np->min_hopcount = val; + break; + case IPV6_DONTFRAG: + np->dontfrag = valbool; retv = 0; break; } @@ -1063,6 +1073,38 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rxopt.bits.rxflow; break; + case IPV6_RECVPATHMTU: + val = np->rxopt.bits.rxpmtu; + break; + + case IPV6_PATHMTU: + { + struct dst_entry *dst; + struct ip6_mtuinfo mtuinfo; + + if (len < sizeof(mtuinfo)) + return -EINVAL; + + len = sizeof(mtuinfo); + memset(&mtuinfo, 0, sizeof(mtuinfo)); + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + mtuinfo.ip6m_mtu = dst_mtu(dst); + rcu_read_unlock(); + if (!mtuinfo.ip6m_mtu) + return -ENOTCONN; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &mtuinfo, len)) + return -EFAULT; + + return 0; + break; + } + case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { @@ -1128,6 +1170,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->min_hopcount; break; + case IPV6_DONTFRAG: + val = np->dontfrag; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3-70-g09d2 From 13b52cd44670e3359055e9918d0e766d89836425 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Fri, 23 Apr 2010 11:26:08 +0000 Subject: IPv6: Add dontfrag argument to relevant functions Add dontfrag argument to relevant functions for IPV6_DONTFRAG support, as well as allowing the value to be passed-in via ancillary cmsg data. Signed-off-by: Brian Haley Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 ++- include/net/transp_v6.h | 3 ++- net/ipv6/datagram.c | 21 ++++++++++++++++++++- net/ipv6/icmp.c | 5 +++-- net/ipv6/ip6_flowlabel.c | 3 ++- net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 3 ++- net/ipv6/raw.c | 9 +++++++-- net/ipv6/udp.c | 9 +++++++-- 9 files changed, 46 insertions(+), 12 deletions(-) (limited to 'net/ipv6/ipv6_sockglue.c') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b1d8db90b21..7ab6323e631 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -503,7 +503,8 @@ extern int ip6_append_data(struct sock *sk, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt, - unsigned int flags); + unsigned int flags, + int dontfrag); extern int ip6_push_pending_frames(struct sock *sk); diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index d65381cad0f..42a0eb68b7b 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -44,7 +44,8 @@ extern int datagram_send_ctl(struct net *net, struct msghdr *msg, struct flowi *fl, struct ipv6_txoptions *opt, - int *hlimit, int *tclass); + int *hlimit, int *tclass, + int *dontfrag); #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 622dc7939a1..f5076d349b1 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -497,7 +497,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) int datagram_send_ctl(struct net *net, struct msghdr *msg, struct flowi *fl, struct ipv6_txoptions *opt, - int *hlimit, int *tclass) + int *hlimit, int *tclass, int *dontfrag) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -735,6 +735,25 @@ int datagram_send_ctl(struct net *net, err = 0; *tclass = tc; + break; + } + + case IPV6_DONTFRAG: + { + int df; + + err = -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + goto exit_f; + } + + df = *(int *)CMSG_DATA(cmsg); + if (df < 0 || df > 1) + goto exit_f; + + err = 0; + *dontfrag = df; + break; } default: diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 12d2fa42657..ce799298255 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -481,7 +481,7 @@ route_done: len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl, (struct rt6_info*)dst, - MSG_DONTWAIT); + MSG_DONTWAIT, np->dontfrag); if (err) { ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); ip6_flush_pending_frames(sk); @@ -561,7 +561,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb) err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl, - (struct rt6_info*)dst, MSG_DONTWAIT); + (struct rt6_info*)dst, MSG_DONTWAIT, + np->dontfrag); if (err) { ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 14e23216eb2..13654686aea 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -360,7 +360,8 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, msg.msg_control = (void*)(fl->opt+1); flowi.oif = 0; - err = datagram_send_ctl(net, &msg, &flowi, fl->opt, &junk, &junk); + err = datagram_send_ctl(net, &msg, &flowi, fl->opt, &junk, + &junk, &junk); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 263d4cf5a8d..54d43dd1f08 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1105,7 +1105,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl, - struct rt6_info *rt, unsigned int flags) + struct rt6_info *rt, unsigned int flags, int dontfrag) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2bf9eda7278..bd43f0152c2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -458,7 +458,8 @@ sticky_done: msg.msg_controllen = optlen; msg.msg_control = (void*)(opt+1); - retv = datagram_send_ctl(net, &msg, &fl, opt, &junk, &junk); + retv = datagram_send_ctl(net, &msg, &fl, opt, &junk, &junk, + &junk); if (retv) goto done; update: diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8763b1a0814..44a84ea9b3e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -733,6 +733,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, int addr_len = msg->msg_namelen; int hlimit = -1; int tclass = -1; + int dontfrag = -1; u16 proto; int err; @@ -811,7 +812,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); - err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass); + err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, + &tclass, &dontfrag); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -880,6 +882,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (tclass < 0) tclass = np->tclass; + if (dontfrag < 0) + dontfrag = np->dontfrag; + if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -890,7 +895,7 @@ back_from_confirm: lock_sock(sk); err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, hlimit, tclass, opt, &fl, (struct rt6_info*)dst, - msg->msg_flags); + msg->msg_flags, dontfrag); if (err) ip6_flush_pending_frames(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 92bf9033e24..39e3665d946 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -919,6 +919,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, int ulen = len; int hlimit = -1; int tclass = -1; + int dontfrag = -1; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; int connected = 0; @@ -1049,7 +1050,8 @@ do_udp_sendmsg: memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); - err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass); + err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, + &tclass, &dontfrag); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1120,6 +1122,9 @@ do_udp_sendmsg: if (tclass < 0) tclass = np->tclass; + if (dontfrag < 0) + dontfrag = np->dontfrag; + if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: @@ -1143,7 +1148,7 @@ do_append_data: err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), hlimit, tclass, opt, &fl, (struct rt6_info*)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) -- cgit v1.2.3-70-g09d2