From e3192690a3c889767d1161b228374f4926d92af0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 3 Jun 2012 17:41:40 +0000 Subject: net: Remove casts to same type Adding casts of objects to the same type is unnecessary and confusing for a human reader. For example, this cast: int y; int *p = (int *)&y; I used the coccinelle script below to find and remove these unnecessary casts. I manually removed the conversions this script produces of casts with __force and __user. @@ type T; T *p; @@ - (T *)p + p Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/af_inet.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c8f7aee587d..e4e8e00a2c9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -553,7 +553,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); + return sk->sk_prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); -- cgit v1.2.3-70-g09d2 From f9242b6b28d61295f2bf7e8adfb1060b382e5381 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jun 2012 18:56:21 -0700 Subject: inet: Sanitize inet{,6} protocol demux. Don't pretend that inet_protos[] and inet6_protos[] are hashes, thay are just a straight arrays. Remove all unnecessary hash masking. Document MAX_INET_PROTOS. Use RAW_HTABLE_SIZE when appropriate. Reported-by: Ben Hutchings Signed-off-by: David S. Miller --- include/net/protocol.h | 7 +++++-- net/ipv4/af_inet.c | 26 ++++++++++++-------------- net/ipv4/icmp.c | 9 ++++----- net/ipv4/ip_input.c | 5 ++--- net/ipv4/protocol.c | 8 +++----- net/ipv6/icmp.c | 7 ++----- net/ipv6/ip6_input.c | 9 +++------ net/ipv6/protocol.c | 8 +++----- net/ipv6/raw.c | 4 ++-- 9 files changed, 36 insertions(+), 47 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/net/protocol.h b/include/net/protocol.h index 875f4895b03..a1b1b530c33 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -29,8 +29,11 @@ #include #endif -#define MAX_INET_PROTOS 256 /* Must be a power of 2 */ - +/* This is one larger than the largest protocol value that can be + * found in an ipv4 or ipv6 header. Since in both cases the protocol + * value is presented in a __u8, this is defined to be 256. + */ +#define MAX_INET_PROTOS 256 /* This is used to register protocols. */ struct net_protocol { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e4e8e00a2c9..85a3b176313 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -242,20 +242,18 @@ void build_ehash_secret(void) } EXPORT_SYMBOL(build_ehash_secret); -static inline int inet_netns_ok(struct net *net, int protocol) +static inline int inet_netns_ok(struct net *net, __u8 protocol) { - int hash; const struct net_protocol *ipprot; if (net_eq(net, &init_net)) return 1; - hash = protocol & (MAX_INET_PROTOS - 1); - ipprot = rcu_dereference(inet_protos[hash]); - - if (ipprot == NULL) + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot == NULL) { /* raw IP is OK */ return 1; + } return ipprot->netns_ok; } @@ -1216,8 +1214,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); static int inet_gso_send_check(struct sk_buff *skb) { - const struct iphdr *iph; const struct net_protocol *ops; + const struct iphdr *iph; int proto; int ihl; int err = -EINVAL; @@ -1236,7 +1234,7 @@ static int inet_gso_send_check(struct sk_buff *skb) __skb_pull(skb, ihl); skb_reset_transport_header(skb); iph = ip_hdr(skb); - proto = iph->protocol & (MAX_INET_PROTOS - 1); + proto = iph->protocol; err = -EPROTONOSUPPORT; rcu_read_lock(); @@ -1253,8 +1251,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); - struct iphdr *iph; const struct net_protocol *ops; + struct iphdr *iph; int proto; int ihl; int id; @@ -1286,7 +1284,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb_reset_transport_header(skb); iph = ip_hdr(skb); id = ntohs(iph->id); - proto = iph->protocol & (MAX_INET_PROTOS - 1); + proto = iph->protocol; segs = ERR_PTR(-EPROTONOSUPPORT); rcu_read_lock(); @@ -1340,7 +1338,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, goto out; } - proto = iph->protocol & (MAX_INET_PROTOS - 1); + proto = iph->protocol; rcu_read_lock(); ops = rcu_dereference(inet_protos[proto]); @@ -1398,11 +1396,11 @@ out: static int inet_gro_complete(struct sk_buff *skb) { - const struct net_protocol *ops; + __be16 newlen = htons(skb->len - skb_network_offset(skb)); struct iphdr *iph = ip_hdr(skb); - int proto = iph->protocol & (MAX_INET_PROTOS - 1); + const struct net_protocol *ops; + int proto = iph->protocol; int err = -ENOSYS; - __be16 newlen = htons(skb->len - skb_network_offset(skb)); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e1caa1abe5d..49a74cc79dc 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -637,12 +637,12 @@ EXPORT_SYMBOL(icmp_send); static void icmp_unreach(struct sk_buff *skb) { + const struct net_protocol *ipprot; const struct iphdr *iph; struct icmphdr *icmph; - int hash, protocol; - const struct net_protocol *ipprot; - u32 info = 0; struct net *net; + u32 info = 0; + int protocol; net = dev_net(skb_dst(skb)->dev); @@ -731,9 +731,8 @@ static void icmp_unreach(struct sk_buff *skb) */ raw_icmp_error(skb, protocol, info); - hash = protocol & (MAX_INET_PROTOS - 1); rcu_read_lock(); - ipprot = rcu_dereference(inet_protos[hash]); + ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); rcu_read_unlock(); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 8590144ca33..c4fe1d27113 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb) rcu_read_lock(); { int protocol = ip_hdr(skb)->protocol; - int hash, raw; const struct net_protocol *ipprot; + int raw; resubmit: raw = raw_local_deliver(skb, protocol); - hash = protocol & (MAX_INET_PROTOS - 1); - ipprot = rcu_dereference(inet_protos[hash]); + ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot != NULL) { int ret; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9ae5c01cd0b..8918eff1426 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash = protocol & (MAX_INET_PROTOS - 1); - - return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol); int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); + int ret; - ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5247d5c211f..c7da1422cbd 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -600,9 +600,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { const struct inet6_protocol *ipprot; int inner_offset; - int hash; - u8 nexthdr; __be16 frag_off; + u8 nexthdr; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return; @@ -629,10 +628,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) --ANK (980726) */ - hash = nexthdr & (MAX_INET_PROTOS - 1); - rcu_read_lock(); - ipprot = rcu_dereference(inet6_protos[hash]); + ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, NULL, type, code, inner_offset, info); rcu_read_unlock(); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 21a15dfe4a9..5ab923e51af 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -168,13 +168,12 @@ drop: static int ip6_input_finish(struct sk_buff *skb) { + struct net *net = dev_net(skb_dst(skb)->dev); const struct inet6_protocol *ipprot; + struct inet6_dev *idev; unsigned int nhoff; int nexthdr; bool raw; - u8 hash; - struct inet6_dev *idev; - struct net *net = dev_net(skb_dst(skb)->dev); /* * Parse extension headers @@ -189,9 +188,7 @@ resubmit: nexthdr = skb_network_header(skb)[nhoff]; raw = raw6_local_deliver(skb, nexthdr); - - hash = nexthdr & (MAX_INET_PROTOS - 1); - if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { + if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) { int ret; if (ipprot->flags & INET6_PROTO_FINAL) { diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index 9a7978fdc02..053082dfc93 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -29,9 +29,7 @@ const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) { - int hash = protocol & (MAX_INET_PROTOS - 1); - - return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet6_add_protocol); @@ -42,9 +40,9 @@ EXPORT_SYMBOL(inet6_add_protocol); int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); + int ret; - ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash], + ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol], prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 43b0042f15f..b5c1dcb2773 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -165,7 +165,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; - hash = nexthdr & (MAX_INET_PROTOS - 1); + hash = nexthdr & (RAW_HTABLE_SIZE - 1); read_lock(&raw_v6_hashinfo.lock); sk = sk_head(&raw_v6_hashinfo.ht[hash]); @@ -229,7 +229,7 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { struct sock *raw_sk; - raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]); + raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]); if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) raw_sk = NULL; -- cgit v1.2.3-70-g09d2 From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jun 2012 21:22:05 -0700 Subject: ipv4: Early TCP socket demux. Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then it's identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ++-- include/net/protocol.h | 1 + include/net/sock.h | 2 ++ include/net/tcp.h | 1 + net/core/sock.c | 5 +++++ net/ipv4/af_inet.c | 18 +++++++++-------- net/ipv4/ip_input.c | 39 ++++++++++++++++++++++++------------ net/ipv4/tcp_input.c | 16 ++++++++++++++- net/ipv4/tcp_ipv4.c | 46 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 2 ++ 10 files changed, 110 insertions(+), 24 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 808fc5f76b0..54be0287eb9 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const __be16 sport, const __be16 dport) { - struct sock *sk; + struct sock *sk = skb_steal_sock(skb); const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) + if (sk) return sk; else return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, diff --git a/include/net/protocol.h b/include/net/protocol.h index a1b1b530c33..967b926cbfb 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,6 +37,7 @@ /* This is used to register protocols. */ struct net_protocol { + int (*early_demux)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index 4a452169956..87b424ae750 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -319,6 +319,7 @@ struct sock { unsigned long sk_flags; struct dst_entry *sk_dst_cache; spinlock_t sk_dst_lock; + struct dst_entry *sk_rx_dst; atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; @@ -1426,6 +1427,7 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk, gfp_t priority); extern void sock_wfree(struct sk_buff *skb); extern void sock_rfree(struct sk_buff *skb); +extern void sock_edemux(struct sk_buff *skb); extern int sock_setsockopt(struct socket *sock, int level, int op, char __user *optval, diff --git a/include/net/tcp.h b/include/net/tcp.h index 9332f342259..6660ffc4963 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); +extern int tcp_v4_early_demux(struct sk_buff *skb); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 9e5b71fda6e..929bdcc2383 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_edemux(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); int sock_i_uid(struct sock *sk) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 85a3b176313..07a02f6e969 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(sk->sk_rx_dst); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = { #endif static const struct net_protocol tcp_protocol = { - .handler = tcp_v4_rcv, - .err_handler = tcp_v4_err, - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, - .gro_receive = tcp4_gro_receive, - .gro_complete = tcp4_gro_complete, - .no_policy = 1, - .netns_ok = 1, + .early_demux = tcp_v4_early_demux, + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .gso_send_check = tcp_v4_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + .no_policy = 1, + .netns_ok = 1, }; static const struct net_protocol udp_protocol = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c4fe1d27113..93b092c9a39 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (skb_dst(skb) == NULL) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); - if (unlikely(err)) { - if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INADDRERRORS); - else if (err == -ENETUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INNOROUTES); - else if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); - goto drop; + const struct net_protocol *ipprot; + int protocol = iph->protocol; + int err; + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + err = -ENOENT; + if (ipprot && ipprot->early_demux) + err = ipprot->early_demux(skb); + rcu_read_unlock(); + + if (err) { + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); + if (unlikely(err)) { + if (err == -EHOSTUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INNOROUTES); + else if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); + goto drop; + } } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8..8416f8a68e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); int res; + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (unlikely(dst->obsolete)) { + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + } + if (unlikely(sk->sk_rx_dst == NULL)) + sk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* * Header prediction. * The code loosely follows the one in the famous @@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); - if (skb != NULL) + if (skb != NULL) { + sk->sk_rx_dst = dst_clone(skb_dst(skb)); security_inet_conn_established(sk, skb); + } /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fda2ca17135..13857df1dae 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1671,6 +1671,52 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); +int tcp_v4_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct tcphdr *th; + struct sock *sk; + int err; + + err = -ENOENT; + if (skb->pkt_type != PACKET_HOST) + goto out_err; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) + goto out_err; + + iph = ip_hdr(skb); + th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); + + if (th->doff < sizeof(struct tcphdr) / 4) + goto out_err; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) + goto out_err; + + sk = __inet_lookup_established(net, &tcp_hashinfo, + iph->saddr, th->source, + iph->daddr, th->dest, + skb->dev->ifindex); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + if (dst) + dst = dst_check(dst, 0); + if (dst) { + skb_dst_set_noref(skb, dst); + err = 0; + } + } + } + +out_err: + return err; +} + /* * From tcp_input.c */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb015317c9f..72b7c63b1a3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + newsk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to * copy any s_data_payload stored at the original socket. -- cgit v1.2.3-70-g09d2 From 783237e8daf13481ee234997cbbbb823872ac388 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:07 +0000 Subject: net-tcp: Fast Open client - sending SYN-data This patch implements sending SYN-data in tcp_connect(). The data is from tcp_sendmsg() with flag MSG_FASTOPEN (implemented in a later patch). The length of the cookie in tcp_fastopen_req, init'd to 0, controls the type of the SYN. If the cookie is not cached (len==0), the host sends data-less SYN with Fast Open cookie request option to solicit a cookie from the remote. If cookie is not available (len > 0), the host sends a SYN-data with Fast Open cookie option. If cookie length is negative, the SYN will not include any Fast Open option (for fall back operations). To deal with middleboxes that may drop SYN with data or experimental TCP option, the SYN-data is only sent once. SYN retransmits do not include data or Fast Open options. The connection will fall back to regular TCP handshake. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/snmp.h | 1 + include/linux/tcp.h | 6 ++- include/net/tcp.h | 9 ++++ net/ipv4/af_inet.c | 10 ++++- net/ipv4/proc.c | 1 + net/ipv4/tcp_output.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 130 insertions(+), 12 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index e5fcbd079e4..00bc189cb39 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -238,6 +238,7 @@ enum LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */ LINUX_MIB_TCPCHALLENGEACK, /* TCPChallengeACK */ LINUX_MIB_TCPSYNCHALLENGE, /* TCPSYNChallenge */ + LINUX_MIB_TCPFASTOPENACTIVE, /* TCPFastOpenActive */ __LINUX_MIB_MAX }; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 12948f54383..1edf96afab4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -386,7 +386,8 @@ struct tcp_sock { unused : 1; u8 repair_queue; u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ - early_retrans_delayed:1; /* Delayed ER timer installed */ + early_retrans_delayed:1, /* Delayed ER timer installed */ + syn_fastopen:1; /* SYN includes Fast Open option */ /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ @@ -500,6 +501,9 @@ struct tcp_sock { struct tcp_md5sig_info __rcu *md5sig_info; #endif +/* TCP fastopen related information */ + struct tcp_fastopen_request *fastopen_req; + /* When the cookie options are generated and exchanged, then this * object holds a reference to them (cookie_values->kref). Also * contains related tcp_cookie_transactions fields. diff --git a/include/net/tcp.h b/include/net/tcp.h index e601da19736..867557b4244 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1289,6 +1289,15 @@ extern int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key); +struct tcp_fastopen_request { + /* Fast Open cookie. Size 0 means a cookie request */ + struct tcp_fastopen_cookie cookie; + struct msghdr *data; /* data in MSG_FASTOPEN */ + u16 copied; /* queued in tcp_connect() */ +}; + +void tcp_free_fastopen_req(struct tcp_sock *tp); + /* write queue abstraction */ static inline void tcp_write_queue_purge(struct sock *sk) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 07a02f6e969..edc414625be 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -556,11 +556,12 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet_dgram_connect); -static long inet_wait_for_connect(struct sock *sk, long timeo) +static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -576,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); + sk->sk_write_pending -= writebias; return timeo; } @@ -634,8 +636,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int writebias = (sk->sk_protocol == IPPROTO_TCP) && + tcp_sk(sk)->fastopen_req && + tcp_sk(sk)->fastopen_req->data ? 1 : 0; + /* Error code is set above */ - if (!timeo || !inet_wait_for_connect(sk, timeo)) + if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5240b2ea6..957acd12250 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -262,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), + SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4849be76ccd..88693281da4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -596,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? tcp_cookie_size_check(cvp->cookie_desired) : 0; + struct tcp_fastopen_request *fastopen = tp->fastopen_req; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); @@ -636,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, remaining -= TCPOLEN_SACKPERM_ALIGNED; } + if (fastopen && fastopen->cookie.len >= 0) { + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; + need = (need + 3) & ~3U; /* Align to 32 bits */ + if (remaining >= need) { + opts->options |= OPTION_FAST_OPEN_COOKIE; + opts->fastopen_cookie = &fastopen->cookie; + remaining -= need; + tp->syn_fastopen = 1; + } + } /* Note that timestamps are required by the specification. * * Odd numbers of bytes are prohibited by the specification, ensuring @@ -2824,6 +2835,96 @@ void tcp_connect_init(struct sock *sk) tcp_clear_retrans(tp); } +static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + tcb->end_seq += skb->len; + skb_header_release(skb); + __tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + tp->write_seq = tcb->end_seq; + tp->packets_out += tcp_skb_pcount(skb); +} + +/* Build and send a SYN with data and (cached) Fast Open cookie. However, + * queue a data-only packet after the regular SYN, such that regular SYNs + * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges + * only the SYN sequence, the data are retransmitted in the first ACK. + * If cookie is not cached or other error occurs, falls back to send a + * regular SYN with Fast Open cookie request option. + */ +static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_fastopen_request *fo = tp->fastopen_req; + int space, i, err = 0, iovlen = fo->data->msg_iovlen; + struct sk_buff *syn_data = NULL, *data; + + tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie); + if (fo->cookie.len <= 0) + goto fallback; + + /* MSS for SYN-data is based on cached MSS and bounded by PMTU and + * user-MSS. Reserve maximum option space for middleboxes that add + * private TCP options. The cost is reduced data space in SYN :( + */ + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + MAX_TCP_OPTION_SPACE; + + syn_data = skb_copy_expand(syn, skb_headroom(syn), space, + sk->sk_allocation); + if (syn_data == NULL) + goto fallback; + + for (i = 0; i < iovlen && syn_data->len < space; ++i) { + struct iovec *iov = &fo->data->msg_iov[i]; + unsigned char __user *from = iov->iov_base; + int len = iov->iov_len; + + if (syn_data->len + len > space) + len = space - syn_data->len; + else if (i + 1 == iovlen) + /* No more data pending in inet_wait_for_connect() */ + fo->data = NULL; + + if (skb_add_data(syn_data, from, len)) + goto fallback; + } + + /* Queue a data-only packet after the regular SYN for retransmission */ + data = pskb_copy(syn_data, sk->sk_allocation); + if (data == NULL) + goto fallback; + TCP_SKB_CB(data)->seq++; + TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; + TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); + tcp_connect_queue_skb(sk, data); + fo->copied = data->len; + + if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + goto done; + } + syn_data = NULL; + +fallback: + /* Send a regular SYN with Fast Open cookie request option */ + if (fo->cookie.len > 0) + fo->cookie.len = 0; + err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); + if (err) + tp->syn_fastopen = 0; + kfree_skb(syn_data); +done: + fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ + return err; +} + /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { @@ -2841,17 +2942,13 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); - /* Send it off. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tp->retrans_stamp = TCP_SKB_CB(buff)->when; - skb_header_release(buff); - __tcp_add_write_queue_tail(sk, buff); - sk->sk_wmem_queued += buff->truesize; - sk_mem_charge(sk, buff->truesize); - tp->packets_out += tcp_skb_pcount(buff); - err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); + /* Send off SYN; include data in Fast Open. */ + err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : + tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; -- cgit v1.2.3-70-g09d2 From cf60af03ca4e71134206809ea892e49b92a88896 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:09 +0000 Subject: net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN) sendmsg() (or sendto()) with MSG_FASTOPEN is a combo of connect(2) and write(2). The application should replace connect() with it to send data in the opening SYN packet. For blocking socket, sendmsg() blocks until all the data are buffered locally and the handshake is completed like connect() call. It returns similar errno like connect() if the TCP handshake fails. For non-blocking socket, it returns the number of bytes queued (and transmitted in the SYN-data packet) if cookie is available. If cookie is not available, it transmits a data-less SYN packet with Fast Open cookie request option and returns -EINPROGRESS like connect(). Using MSG_FASTOPEN on connecting or connected socket will result in simlar errno like repeating connect() calls. Therefore the application should only use this flag on new sockets. The buffer size of sendmsg() is independent of the MSS of the connection. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 11 ++++++ include/linux/socket.h | 1 + include/net/inet_common.h | 6 ++-- include/net/tcp.h | 3 ++ net/ipv4/af_inet.c | 19 ++++++++--- net/ipv4/tcp.c | 61 +++++++++++++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 3 ++ 7 files changed, 92 insertions(+), 12 deletions(-) (limited to 'net/ipv4/af_inet.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e1e021594cf..03964e08818 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -468,6 +468,17 @@ tcp_syncookies - BOOLEAN SYN flood warnings in logs not being really flooded, your server is seriously misconfigured. +tcp_fastopen - INTEGER + Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data + in the opening SYN packet. To use this feature, the client application + must not use connect(). Instead, it should use sendmsg() or sendto() + with MSG_FASTOPEN flag which performs a TCP handshake automatically. + + The values (bitmap) are: + 1: Enables sending data in the opening SYN on the client + + Default: 0 + tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt will be retransmitted. Should not be higher than 255. Default value diff --git a/include/linux/socket.h b/include/linux/socket.h index 25d6322fb63..ba7b2e817cf 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -268,6 +268,7 @@ struct ucred { #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_EOF MSG_FIN +#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exit for file descriptor received through SCM_RIGHTS */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 22fac9892b1..234008782c8 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -14,9 +14,11 @@ struct sockaddr; struct socket; extern int inet_release(struct socket *sock); -extern int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, +extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); -extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, +extern int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags); +extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); extern int inet_accept(struct socket *sock, struct socket *newsock, int flags); extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock, diff --git a/include/net/tcp.h b/include/net/tcp.h index 867557b4244..c0258100d70 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -212,6 +212,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); /* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */ #define TCP_INIT_CWND 10 +/* Bit Flags for sysctl_tcp_fastopen */ +#define TFO_CLIENT_ENABLE 1 + extern struct inet_timewait_death_row tcp_death_row; /* sysctl variables for tcp */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index edc414625be..fe4582ca969 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -585,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) +int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) { struct sock *sk = sock->sk; int err; @@ -595,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; - lock_sock(sk); - if (uaddr->sa_family == AF_UNSPEC) { err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; @@ -663,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; err = 0; out: - release_sock(sk); return err; sock_error: @@ -673,6 +670,18 @@ sock_error: sock->state = SS_DISCONNECTING; goto out; } +EXPORT_SYMBOL(__inet_stream_connect); + +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + int err; + + lock_sock(sock->sk); + err = __inet_stream_connect(sock, uaddr, addr_len, flags); + release_sock(sock->sk); + return err; +} EXPORT_SYMBOL(inet_stream_connect); /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4252cd8f39f..581ecf02c6b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -982,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg) return tmp; } +void tcp_free_fastopen_req(struct tcp_sock *tp) +{ + if (tp->fastopen_req != NULL) { + kfree(tp->fastopen_req); + tp->fastopen_req = NULL; + } +} + +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err, flags; + + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + return -EOPNOTSUPP; + if (tp->fastopen_req != NULL) + return -EALREADY; /* Another Fast Open is in progress */ + + tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), + sk->sk_allocation); + if (unlikely(tp->fastopen_req == NULL)) + return -ENOBUFS; + tp->fastopen_req->data = msg; + + flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; + err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, flags); + *size = tp->fastopen_req->copied; + tcp_free_fastopen_req(tp); + return err; +} + int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int iovlen, flags, err, copied; - int mss_now = 0, size_goal; + int iovlen, flags, err, copied = 0; + int mss_now = 0, size_goal, copied_syn = 0, offset = 0; bool sg; long timeo; lock_sock(sk); flags = msg->msg_flags; + if (flags & MSG_FASTOPEN) { + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); + if (err == -EINPROGRESS && copied_syn > 0) + goto out; + else if (err) + goto out_err; + offset = copied_syn; + } + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_err; + goto do_error; if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { @@ -1037,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, unsigned char __user *from = iov->iov_base; iov++; + if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ + if (offset >= seglen) { + offset -= seglen; + continue; + } + seglen -= offset; + from += offset; + offset = 0; + } while (seglen > 0) { int copy = 0; @@ -1199,7 +1250,7 @@ out: if (copied && likely(!tp->repair)) tcp_push(sk, flags, mss_now, tp->nonagle); release_sock(sk); - return copied; + return copied + copied_syn; do_fault: if (!skb->len) { @@ -1212,7 +1263,7 @@ do_fault: } do_error: - if (copied) + if (copied + copied_syn) goto out; out_err: err = sk_stream_error(sk, flags, err); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 01aa77a9702..1d8b75a5898 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1952,6 +1952,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tp->cookie_values = NULL; } + /* If socket is aborted during connect operation */ + tcp_free_fastopen_req(tp); + sk_sockets_allocated_dec(sk); sock_release_memcg(sk); } -- cgit v1.2.3-70-g09d2