From 4aea39c11c610e411768649fdc04777903ebfe07 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Jun 2012 20:33:21 +0000 Subject: tcp: tcp_make_synack() consumes dst parameter tcp_make_synack() clones the dst, and callers release it. We can avoid two atomic operations per SYNACK if tcp_make_synack() consumes dst instead of cloning it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c8d28c433b2..3d9c1a4b881 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -848,7 +848,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, err = net_xmit_eval(err); } - dst_release(dst); return err; } -- cgit v1.2.3-70-g09d2 From 54db0cc2ba0d38166acc2d6bae21721405305537 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Fri, 8 Jun 2012 01:21:40 +0000 Subject: inetpeer: add parameter net for inet_getpeer_v4,v6 add struct net as a parameter of inet_getpeer_v[4,6], use net to replace &init_net. and modify some places to provide net for inet_getpeer_v[4,6] Signed-off-by: Gao feng Signed-off-by: David S. Miller --- include/net/inetpeer.h | 12 ++++++++---- net/ipv4/inet_fragment.c | 2 +- net/ipv4/ip_fragment.c | 6 +++++- net/ipv4/route.c | 8 +++++--- net/ipv4/tcp_ipv4.c | 6 ++++-- net/ipv6/route.c | 3 ++- net/ipv6/tcp_ipv6.c | 6 ++++-- 7 files changed, 29 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index fef9dfab5d4..20e67db69ac 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -79,22 +79,26 @@ struct inet_peer *inet_getpeer(struct net *net, const struct inetpeer_addr *daddr, int create); -static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create) +static inline struct inet_peer *inet_getpeer_v4(struct net *net, + __be32 v4daddr, + int create) { struct inetpeer_addr daddr; daddr.addr.a4 = v4daddr; daddr.family = AF_INET; - return inet_getpeer(&init_net, &daddr, create); + return inet_getpeer(net, &daddr, create); } -static inline struct inet_peer *inet_getpeer_v6(const struct in6_addr *v6daddr, int create) +static inline struct inet_peer *inet_getpeer_v6(struct net *net, + const struct in6_addr *v6daddr, + int create) { struct inetpeer_addr daddr; *(struct in6_addr *)daddr.addr.a6 = *v6daddr; daddr.family = AF_INET6; - return inet_getpeer(&init_net, &daddr, create); + return inet_getpeer(net, &daddr, create); } /* can be called from BH context or outside */ diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ff2a51b6d0..85190e69297 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, if (q == NULL) return NULL; + q->net = nf; f->constructor(q, arg); atomic_add(f->qsize, &nf->mem); setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - q->net = nf; return q; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9dbd3dd6022..22c6bab9717 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); + struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, + frags); + struct net *net = container_of(ipv4, struct net, ipv4); + struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; @@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) qp->daddr = arg->iph->daddr; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net, arg->iph->saddr, 1) : NULL; } static __inline__ void ip4_frag_free(struct inet_frag_queue *q) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 006c21cc232..2c9f73f3b7c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1328,9 +1328,10 @@ static u32 rt_peer_genid(void) void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) { + struct net *net = dev_net(rt->dst.dev); struct inet_peer *peer; - peer = inet_getpeer_v4(daddr, create); + peer = inet_getpeer_v4(net, daddr, create); if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); @@ -1694,7 +1695,7 @@ unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, unsigned short est_mtu = 0; struct inet_peer *peer; - peer = inet_getpeer_v4(iph->daddr, 1); + peer = inet_getpeer_v4(net, iph->daddr, 1); if (peer) { unsigned short mtu = new_mtu; @@ -1935,6 +1936,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, struct fib_info *fi) { + struct net *net = dev_net(rt->dst.dev); struct inet_peer *peer; int create = 0; @@ -1944,7 +1946,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) create = 1; - rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); + rt->peer = peer = inet_getpeer_v4(net, rt->rt_dst, create); if (peer) { rt->rt_peer_genid = rt_peer_genid(); if (inet_metrics_new(peer)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3d9c1a4b881..f485b451f92 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1824,11 +1824,12 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) { struct rtable *rt = (struct rtable *) __sk_dst_get(sk); struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct inet_peer *peer; if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { - peer = inet_getpeer_v4(inet->inet_daddr, 1); + peer = inet_getpeer_v4(net, inet->inet_daddr, 1); *release_it = true; } else { if (!rt->peer) @@ -1844,8 +1845,9 @@ EXPORT_SYMBOL(tcp_v4_get_peer); void *tcp_v4_tw_get_peer(struct sock *sk) { const struct inet_timewait_sock *tw = inet_twsk(sk); + struct net *net = sock_net(sk); - return inet_getpeer_v4(tw->tw_daddr, 1); + return inet_getpeer_v4(net, tw->tw_daddr, 1); } EXPORT_SYMBOL(tcp_v4_tw_get_peer); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 999a982ad3f..4eca0130cce 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -306,9 +306,10 @@ static u32 rt6_peer_genid(void) void rt6_bind_peer(struct rt6_info *rt, int create) { + struct net *net = dev_net(rt->dst.dev); struct inet_peer *peer; - peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create); + peer = inet_getpeer_v6(net, &rt->rt6i_dst.addr, create); if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL) inet_putpeer(peer); else diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 80758255556..1a9cdd09f11 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1736,11 +1736,12 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) { struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); struct inet_peer *peer; if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) { - peer = inet_getpeer_v6(&np->daddr, 1); + peer = inet_getpeer_v6(net, &np->daddr, 1); *release_it = true; } else { if (!rt->rt6i_peer) @@ -1756,11 +1757,12 @@ static void *tcp_v6_tw_get_peer(struct sock *sk) { const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); const struct inet_timewait_sock *tw = inet_twsk(sk); + struct net *net = sock_net(sk); if (tw->tw_family == AF_INET) return tcp_v4_tw_get_peer(sk); - return inet_getpeer_v6(&tw6->tw_v6_daddr, 1); + return inet_getpeer_v6(net, &tw6->tw_v6_daddr, 1); } static struct timewait_sock_ops tcp6_timewait_sock_ops = { -- cgit v1.2.3-70-g09d2 From fbfe95a42e90b3dd079cc9019ba7d7700feee0f6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 8 Jun 2012 23:24:18 -0700 Subject: inet: Create and use rt{,6}_get_peer_create(). There's a lot of places that open-code rt{,6}_get_peer() only because they want to set 'create' to one. So add an rt{,6}_get_peer_create() for their sake. There were also a few spots open-coding plain rt{,6}_get_peer() and those are transformed here as well. Signed-off-by: David S. Miller --- include/net/ip6_route.h | 17 +++++++++++++---- include/net/route.h | 14 ++++++++++++-- net/ipv4/icmp.c | 5 ++--- net/ipv4/route.c | 35 +++++++++-------------------------- net/ipv4/tcp_ipv4.c | 4 +--- net/ipv6/icmp.c | 6 +++--- net/ipv6/ip6_output.c | 11 ++++------- net/ipv6/ndisc.c | 6 +++--- net/ipv6/route.c | 5 +---- net/ipv6/tcp_ipv6.c | 4 +--- 10 files changed, 49 insertions(+), 58 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 37c1a1ed82c..73d75028812 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -53,18 +53,27 @@ static inline unsigned int rt6_flags2srcprefs(int flags) return (flags >> 3) & 7; } -extern void rt6_bind_peer(struct rt6_info *rt, - int create); +extern void rt6_bind_peer(struct rt6_info *rt, int create); -static inline struct inet_peer *rt6_get_peer(struct rt6_info *rt) +static inline struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) { if (rt->rt6i_peer) return rt->rt6i_peer; - rt6_bind_peer(rt, 0); + rt6_bind_peer(rt, create); return rt->rt6i_peer; } +static inline struct inet_peer *rt6_get_peer(struct rt6_info *rt) +{ + return __rt6_get_peer(rt, 0); +} + +static inline struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) +{ + return __rt6_get_peer(rt, 1); +} + extern void ip6_route_input(struct sk_buff *skb); extern struct dst_entry * ip6_route_output(struct net *net, diff --git a/include/net/route.h b/include/net/route.h index ed2b78e2375..433fc6c1d40 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -296,15 +296,25 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create); -static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr) +static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create) { if (rt->peer) return rt->peer; - rt_bind_peer(rt, daddr, 0); + rt_bind_peer(rt, daddr, create); return rt->peer; } +static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr) +{ + return __rt_get_peer(rt, daddr, 0); +} + +static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr) +{ + return __rt_get_peer(rt, daddr, 1); +} + static inline int inet_iif(const struct sk_buff *skb) { return skb_rtable(skb)->rt_iif; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c75efbdc71c..0c78ef1e5dd 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -253,9 +253,8 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, /* Limit if icmp type is enabled in ratemask. */ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - if (!rt->peer) - rt_bind_peer(rt, fl4->daddr, 1); - rc = inet_peer_xrlim_allow(rt->peer, + struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr); + rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); } out: diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2c9f73f3b7c..7a4d724765e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -162,10 +162,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) struct inet_peer *peer; u32 *p = NULL; - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - - peer = rt->peer; + peer = rt_get_peer_create(rt, rt->rt_dst); if (peer) { u32 *old_p = __DST_METRICS_PTR(old); unsigned long prev, new; @@ -1364,14 +1361,13 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) struct rtable *rt = (struct rtable *) dst; if (rt && !(rt->dst.flags & DST_NOPEER)) { - if (rt->peer == NULL) - rt_bind_peer(rt, rt->rt_dst, 1); + struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst); /* If peer is attached to destination, it is never detached, so that we need not to grab a lock to dereference it. */ - if (rt->peer) { - iph->id = htons(inet_getid(rt->peer, more)); + if (peer) { + iph->id = htons(inet_getid(peer, more)); return; } } else if (!rt) @@ -1481,10 +1477,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->rt_gateway != old_gw) continue; - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - - peer = rt->peer; + peer = rt_get_peer_create(rt, rt->rt_dst); if (peer) { if (peer->redirect_learned.a4 != new_gw) { peer->redirect_learned.a4 = new_gw; @@ -1579,9 +1572,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; + peer = rt_get_peer_create(rt, rt->rt_dst); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); return; @@ -1646,9 +1637,7 @@ static int ip_error(struct sk_buff *skb) break; } - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; + peer = rt_get_peer_create(rt, rt->rt_dst); send = true; if (peer) { @@ -1754,9 +1743,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) dst_confirm(dst); - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; + peer = rt_get_peer_create(rt, rt->rt_dst); if (peer) { unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); @@ -1782,12 +1769,8 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) static void ipv4_validate_peer(struct rtable *rt) { if (rt->rt_peer_genid != rt_peer_genid()) { - struct inet_peer *peer; - - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 0); + struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst); - peer = rt->peer; if (peer) { check_peer_pmtu(&rt->dst, peer); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f485b451f92..833e8d96a63 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1832,9 +1832,7 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) peer = inet_getpeer_v4(net, inet->inet_daddr, 1); *release_it = true; } else { - if (!rt->peer) - rt_bind_peer(rt, inet->inet_daddr, 1); - peer = rt->peer; + peer = rt_get_peer_create(rt, inet->inet_daddr); *release_it = false; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 091a2971c7b..ed89bba745a 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -188,14 +188,14 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; + struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); - res = inet_peer_xrlim_allow(rt->rt6i_peer, tmo); + peer = rt6_get_peer_create(rt); + res = inet_peer_xrlim_allow(peer, tmo); } dst_release(dst); return res; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 17b8c67998b..62fcf3e48ac 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -463,6 +463,7 @@ int ip6_forward(struct sk_buff *skb) */ if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; + struct inet_peer *peer; struct rt6_info *rt; /* @@ -476,13 +477,12 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); + peer = rt6_get_peer_create(rt); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ - if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) + if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); } else { int addrtype = ipv6_addr_type(&hdr->saddr); @@ -602,11 +602,8 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) int old, new; if (rt && !(rt->dst.flags & DST_NOPEER)) { - struct inet_peer *peer; + struct inet_peer *peer = rt6_get_peer_create(rt); - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); - peer = rt->rt6i_peer; if (peer) { fhdr->identification = htonl(inet_getid(peer, 0)); return; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 54f62d3b8dd..69a6330dea9 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1472,6 +1472,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + struct inet_peer *peer; struct sk_buff *buff; struct icmp6hdr *icmph; struct in6_addr saddr_buf; @@ -1518,9 +1519,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); - if (!inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) + peer = rt6_get_peer_create(rt); + if (!inet_peer_xrlim_allow(peer, 1*HZ)) goto release; if (dev->addr_len) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4eca0130cce..8a986be4aed 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -99,10 +99,7 @@ static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) if (!(rt->dst.flags & DST_HOST)) return NULL; - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); - - peer = rt->rt6i_peer; + peer = rt6_get_peer_create(rt); if (peer) { u32 *old_p = __DST_METRICS_PTR(old); unsigned long prev, new; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1a9cdd09f11..218433cb992 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1744,9 +1744,7 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) peer = inet_getpeer_v6(net, &np->daddr, 1); *release_it = true; } else { - if (!rt->rt6i_peer) - rt6_bind_peer(rt, 1); - peer = rt->rt6i_peer; + peer = rt6_get_peer_create(rt); *release_it = false; } -- cgit v1.2.3-70-g09d2 From 4670fd819e7f47392c7c6fc6168ea2857c66d163 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 9 Jun 2012 01:25:47 -0700 Subject: tcp: Get rid of inetpeer special cases. The get_peer method TCP uses is full of special cases that make no sense accommodating, and it also gets in the way of doing more reasonable things here. First of all, if the socket doesn't have a usable cached route, there is no sense in trying to optimize timewait recycling. Likewise for the case where we have IP options, such as SRR enabled, that make the IP header destination address (and thus the destination address of the route key) differ from that of the connection's destination address. Just return a NULL peer in these cases, and thus we're also able to get rid of the clumsy inetpeer release logic. Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- include/net/tcp.h | 2 +- net/ipv4/tcp_ipv4.c | 21 ++++++++------------- net/ipv4/tcp_minisocks.c | 5 +---- net/ipv6/tcp_ipv6.c | 21 ++++++++------------- 5 files changed, 19 insertions(+), 32 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7d83f90f203..e1b7734c456 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -43,7 +43,7 @@ struct inet_connection_sock_af_ops { struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); - struct inet_peer *(*get_peer)(struct sock *sk, bool *release_it); + struct inet_peer *(*get_peer)(struct sock *sk); u16 net_header_len; u16 net_frag_header_len; u16 sockaddr_len; diff --git a/include/net/tcp.h b/include/net/tcp.h index e79aa48d9fc..42459186603 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -327,7 +327,7 @@ extern void tcp_shutdown (struct sock *sk, int how); extern int tcp_v4_rcv(struct sk_buff *skb); -extern struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it); +extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); extern void *tcp_v4_tw_get_peer(struct sock *sk); extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 833e8d96a63..77f049d00db 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1820,23 +1820,18 @@ do_time_wait: goto discard_it; } -struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) +struct inet_peer *tcp_v4_get_peer(struct sock *sk) { struct rtable *rt = (struct rtable *) __sk_dst_get(sk); struct inet_sock *inet = inet_sk(sk); - struct net *net = sock_net(sk); - struct inet_peer *peer; - - if (!rt || - inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { - peer = inet_getpeer_v4(net, inet->inet_daddr, 1); - *release_it = true; - } else { - peer = rt_get_peer_create(rt, inet->inet_daddr); - *release_it = false; - } - return peer; + /* If we don't have a valid cached route, or we're doing IP + * options which make the IPv4 header destination address + * different from our peer's, do not bother with this. + */ + if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr) + return NULL; + return rt_get_peer_create(rt, inet->inet_daddr); } EXPORT_SYMBOL(tcp_v4_get_peer); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b85d9fe7d66..fef9dbf3af0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -60,9 +60,8 @@ static bool tcp_remember_stamp(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct inet_peer *peer; - bool release_it; - peer = icsk->icsk_af_ops->get_peer(sk, &release_it); + peer = icsk->icsk_af_ops->get_peer(sk); if (peer) { if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && @@ -70,8 +69,6 @@ static bool tcp_remember_stamp(struct sock *sk) peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; peer->tcp_ts = tp->rx_opt.ts_recent; } - if (release_it) - inet_putpeer(peer); return true; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 218433cb992..b5ecf37b61a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1732,23 +1732,18 @@ do_time_wait: goto discard_it; } -static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) +static struct inet_peer *tcp_v6_get_peer(struct sock *sk) { struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct net *net = sock_net(sk); - struct inet_peer *peer; - - if (!rt || - !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) { - peer = inet_getpeer_v6(net, &np->daddr, 1); - *release_it = true; - } else { - peer = rt6_get_peer_create(rt); - *release_it = false; - } - return peer; + /* If we don't have a valid cached route, or we're doing IP + * options which make the IPv6 header destination address + * different from our peer's, do not bother with this. + */ + if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) + return NULL; + return rt6_get_peer_create(rt); } static void *tcp_v6_tw_get_peer(struct sock *sk) -- cgit v1.2.3-70-g09d2 From 2397849baa7c44c242e5d5142d5d16d1e7ed53d0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 9 Jun 2012 14:56:12 -0700 Subject: [PATCH] tcp: Cache inetpeer in timewait socket, and only when necessary. Since it's guarenteed that we will access the inetpeer if we're trying to do timewait recycling and TCP options were enabled on the connection, just cache the peer in the timewait socket. In the future, inetpeer lookups will be context dependent (per routing realm), and this helps facilitate that as well. Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 1 - include/net/timewait_sock.h | 8 -------- net/ipv4/tcp_ipv4.c | 10 ---------- net/ipv4/tcp_minisocks.c | 27 ++++++++++++++++++++------- net/ipv6/tcp_ipv6.c | 13 ------------- 6 files changed, 22 insertions(+), 40 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4c5b6328337..23e8234f75a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -506,8 +506,9 @@ struct tcp_timewait_sock { u32 tw_rcv_wnd; u32 tw_ts_recent; long tw_ts_recent_stamp; + struct inet_peer *tw_peer; #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *tw_md5_key; + struct tcp_md5sig_key *tw_md5_key; #endif /* Few sockets in timewait have cookies; in that case, then this * object holds a reference to them (tw_cookie_values->kref). diff --git a/include/net/tcp.h b/include/net/tcp.h index 42459186603..9332f342259 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -328,7 +328,6 @@ extern void tcp_shutdown (struct sock *sk, int how); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); -extern void *tcp_v4_tw_get_peer(struct sock *sk); extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 8d6689cb2c6..68f0ecad6c6 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -22,7 +22,6 @@ struct timewait_sock_ops { int (*twsk_unique)(struct sock *sk, struct sock *sktw, void *twp); void (*twsk_destructor)(struct sock *sk); - void *(*twsk_getpeer)(struct sock *sk); }; static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -41,11 +40,4 @@ static inline void twsk_destructor(struct sock *sk) sk->sk_prot->twsk_prot->twsk_destructor(sk); } -static inline void *twsk_getpeer(struct sock *sk) -{ - if (sk->sk_prot->twsk_prot->twsk_getpeer) - return sk->sk_prot->twsk_prot->twsk_getpeer(sk); - return NULL; -} - #endif /* _TIMEWAIT_SOCK_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77f049d00db..fda2ca17135 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1835,20 +1835,10 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk) } EXPORT_SYMBOL(tcp_v4_get_peer); -void *tcp_v4_tw_get_peer(struct sock *sk) -{ - const struct inet_timewait_sock *tw = inet_twsk(sk); - struct net *net = sock_net(sk); - - return inet_getpeer_v4(net, tw->tw_daddr, 1); -} -EXPORT_SYMBOL(tcp_v4_tw_get_peer); - static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, - .twsk_getpeer = tcp_v4_tw_get_peer, }; const struct inet_connection_sock_af_ops ipv4_specific = { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fef9dbf3af0..cb015317c9f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -77,20 +77,19 @@ static bool tcp_remember_stamp(struct sock *sk) static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) { + const struct tcp_timewait_sock *tcptw; struct sock *sk = (struct sock *) tw; struct inet_peer *peer; - peer = twsk_getpeer(sk); + tcptw = tcp_twsk(sk); + peer = tcptw->tw_peer; if (peer) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; peer->tcp_ts = tcptw->tw_ts_recent; } - inet_putpeer(peer); return true; } return false; @@ -314,9 +313,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); bool recycle_ok = false; + bool recycle_on = false; - if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) { recycle_ok = tcp_remember_stamp(sk); + recycle_on = true; + } if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); @@ -324,8 +326,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + struct inet_sock *inet = inet_sk(sk); + struct inet_peer *peer = NULL; - tw->tw_transparent = inet_sk(sk)->transparent; + tw->tw_transparent = inet->transparent; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; @@ -347,6 +351,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } #endif + if (recycle_on) + peer = icsk->icsk_af_ops->get_peer(sk); + tcptw->tw_peer = peer; + if (peer) + atomic_inc(&peer->refcnt); + #ifdef CONFIG_TCP_MD5SIG /* * The timewait bucket does not have the key DB from the @@ -398,8 +408,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) void tcp_twsk_destructor(struct sock *sk) { -#ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); + + if (twsk->tw_peer) + inet_putpeer(twsk->tw_peer); +#ifdef CONFIG_TCP_MD5SIG if (twsk->tw_md5_key) { tcp_free_md5sig_pool(); kfree_rcu(twsk->tw_md5_key, rcu); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b5ecf37b61a..f91b0bfd12d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1746,23 +1746,10 @@ static struct inet_peer *tcp_v6_get_peer(struct sock *sk) return rt6_get_peer_create(rt); } -static void *tcp_v6_tw_get_peer(struct sock *sk) -{ - const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); - const struct inet_timewait_sock *tw = inet_twsk(sk); - struct net *net = sock_net(sk); - - if (tw->tw_family == AF_INET) - return tcp_v4_tw_get_peer(sk); - - return inet_getpeer_v6(net, &tw6->tw_v6_daddr, 1); -} - static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, - .twsk_getpeer = tcp_v6_tw_get_peer, }; static const struct inet_connection_sock_af_ops ipv6_specific = { -- cgit v1.2.3-70-g09d2 From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jun 2012 21:22:05 -0700 Subject: ipv4: Early TCP socket demux. Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then it's identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ++-- include/net/protocol.h | 1 + include/net/sock.h | 2 ++ include/net/tcp.h | 1 + net/core/sock.c | 5 +++++ net/ipv4/af_inet.c | 18 +++++++++-------- net/ipv4/ip_input.c | 39 ++++++++++++++++++++++++------------ net/ipv4/tcp_input.c | 16 ++++++++++++++- net/ipv4/tcp_ipv4.c | 46 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 2 ++ 10 files changed, 110 insertions(+), 24 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 808fc5f76b0..54be0287eb9 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const __be16 sport, const __be16 dport) { - struct sock *sk; + struct sock *sk = skb_steal_sock(skb); const struct iphdr *iph = ip_hdr(skb); - if (unlikely(sk = skb_steal_sock(skb))) + if (sk) return sk; else return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, diff --git a/include/net/protocol.h b/include/net/protocol.h index a1b1b530c33..967b926cbfb 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,6 +37,7 @@ /* This is used to register protocols. */ struct net_protocol { + int (*early_demux)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index 4a452169956..87b424ae750 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -319,6 +319,7 @@ struct sock { unsigned long sk_flags; struct dst_entry *sk_dst_cache; spinlock_t sk_dst_lock; + struct dst_entry *sk_rx_dst; atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; @@ -1426,6 +1427,7 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk, gfp_t priority); extern void sock_wfree(struct sk_buff *skb); extern void sock_rfree(struct sk_buff *skb); +extern void sock_edemux(struct sk_buff *skb); extern int sock_setsockopt(struct socket *sock, int level, int op, char __user *optval, diff --git a/include/net/tcp.h b/include/net/tcp.h index 9332f342259..6660ffc4963 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); +extern int tcp_v4_early_demux(struct sk_buff *skb); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 9e5b71fda6e..929bdcc2383 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_edemux(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); int sock_i_uid(struct sock *sk) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 85a3b176313..07a02f6e969 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(sk->sk_rx_dst); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = { #endif static const struct net_protocol tcp_protocol = { - .handler = tcp_v4_rcv, - .err_handler = tcp_v4_err, - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_tso_segment, - .gro_receive = tcp4_gro_receive, - .gro_complete = tcp4_gro_complete, - .no_policy = 1, - .netns_ok = 1, + .early_demux = tcp_v4_early_demux, + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .gso_send_check = tcp_v4_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + .no_policy = 1, + .netns_ok = 1, }; static const struct net_protocol udp_protocol = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c4fe1d27113..93b092c9a39 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (skb_dst(skb) == NULL) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); - if (unlikely(err)) { - if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INADDRERRORS); - else if (err == -ENETUNREACH) - IP_INC_STATS_BH(dev_net(skb->dev), - IPSTATS_MIB_INNOROUTES); - else if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); - goto drop; + const struct net_protocol *ipprot; + int protocol = iph->protocol; + int err; + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + err = -ENOENT; + if (ipprot && ipprot->early_demux) + err = ipprot->early_demux(skb); + rcu_read_unlock(); + + if (err) { + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); + if (unlikely(err)) { + if (err == -EHOSTUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INNOROUTES); + else if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); + goto drop; + } } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b224eb8bce8..8416f8a68e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); int res; + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (unlikely(dst->obsolete)) { + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + } + if (unlikely(sk->sk_rx_dst == NULL)) + sk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* * Header prediction. * The code loosely follows the one in the famous @@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); - if (skb != NULL) + if (skb != NULL) { + sk->sk_rx_dst = dst_clone(skb_dst(skb)); security_inet_conn_established(sk, skb); + } /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fda2ca17135..13857df1dae 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1671,6 +1671,52 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); +int tcp_v4_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct iphdr *iph; + const struct tcphdr *th; + struct sock *sk; + int err; + + err = -ENOENT; + if (skb->pkt_type != PACKET_HOST) + goto out_err; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) + goto out_err; + + iph = ip_hdr(skb); + th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); + + if (th->doff < sizeof(struct tcphdr) / 4) + goto out_err; + + if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) + goto out_err; + + sk = __inet_lookup_established(net, &tcp_hashinfo, + iph->saddr, th->source, + iph->daddr, th->dest, + skb->dev->ifindex); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + if (dst) + dst = dst_check(dst, 0); + if (dst) { + skb_dst_set_noref(skb, dst); + err = 0; + } + } + } + +out_err: + return err; +} + /* * From tcp_input.c */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb015317c9f..72b7c63b1a3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + newsk->sk_rx_dst = dst_clone(skb_dst(skb)); + /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to * copy any s_data_payload stored at the original socket. -- cgit v1.2.3-70-g09d2 From fd62e09b946522ec3578412826a81bead06fadf7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 Jun 2012 14:58:10 -0700 Subject: tcp: Validate route interface in early demux. Otherwise we might violate reverse path filtering. Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 13857df1dae..21e22a00481 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1676,6 +1676,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; + struct net_device *dev; struct sock *sk; int err; @@ -1695,10 +1696,11 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) goto out_err; + dev = skb->dev; sk = __inet_lookup_established(net, &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, - skb->dev->ifindex); + dev->ifindex); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; @@ -1707,8 +1709,12 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { - skb_dst_set_noref(skb, dst); - err = 0; + struct rtable *rt = (struct rtable *) dst; + + if (rt->rt_iif == dev->ifindex) { + skb_dst_set_noref(skb, dst); + err = 0; + } } } } -- cgit v1.2.3-70-g09d2 From 7586eceb0abc0ea1c2b023e3e5d4dfd4ff40930a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Jun 2012 05:02:19 +0000 Subject: ipv4: tcp: dont cache output dst for syncookies Don't cache output dst for syncookies, as this adds pressure on IP route cache and rcu subsystem for no gain. Signed-off-by: Eric Dumazet Cc: Hans Schillstrom Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/flow.h | 1 + include/net/inet_connection_sock.h | 3 ++- net/dccp/ipv4.c | 2 +- net/ipv4/inet_connection_sock.c | 8 ++++++-- net/ipv4/route.c | 5 ++++- net/ipv4/tcp_ipv4.c | 12 +++++++----- 6 files changed, 21 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/flow.h b/include/net/flow.h index 6c469dbdb91..bd524f59856 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -22,6 +22,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_PRECOW_METRICS 0x02 #define FLOWI_FLAG_CAN_SLEEP 0x04 +#define FLOWI_FLAG_RT_NOCACHE 0x08 __u32 flowic_secid; }; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index e1b7734c456..af3c743a40e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -251,7 +251,8 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum); extern struct dst_entry* inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req); + const struct request_sock *req, + bool nocache); extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk, struct sock *newsk, const struct request_sock *req); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 07f5579ca75..3eb76b5f221 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -504,7 +504,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, struct dst_entry *dst; struct flowi4 fl4; - dst = inet_csk_route_req(sk, &fl4, req); + dst = inet_csk_route_req(sk, &fl4, req, false); if (dst == NULL) goto out; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f9ee7417f6a..034ddbe42ad 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -368,17 +368,21 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req) + const struct request_sock *req, + bool nocache) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options_rcu *opt = inet_rsk(req)->opt; struct net *net = sock_net(sk); + int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS; + if (nocache) + flags |= FLOWI_FLAG_RT_NOCACHE; flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS, + flags, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a91f6d33804..8d62d85e68d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1156,7 +1156,7 @@ restart: candp = NULL; now = jiffies; - if (!rt_caching(dev_net(rt->dst.dev))) { + if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) { /* * If we're not caching, just tell the caller we * were successful and don't touch the route. The @@ -2582,6 +2582,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rt_set_nexthop(rth, fl4, res, fi, type, 0); + if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE) + rth->dst.flags |= DST_NOCACHE; + return rth; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 21e22a00481..b52934f5334 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -825,7 +825,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct request_values *rvp, - u16 queue_mapping) + u16 queue_mapping, + bool nocache) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -833,7 +834,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, rvp); @@ -855,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - return tcp_v4_send_synack(sk, NULL, req, rvp, 0); + return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); } /* @@ -1388,7 +1389,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && + (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && fl4.daddr == saddr && (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { inet_peer_refcheck(peer); @@ -1424,7 +1425,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (tcp_v4_send_synack(sk, dst, req, (struct request_values *)&tmp_ext, - skb_get_queue_mapping(skb)) || + skb_get_queue_mapping(skb), + want_cookie) || want_cookie) goto drop_and_free; -- cgit v1.2.3-70-g09d2 From 7011d0851b80a1a229acfda37ce08aad903b12d1 Mon Sep 17 00:00:00 2001 From: Vijay Subramanian Date: Sat, 23 Jun 2012 17:38:10 +0000 Subject: tcp: Fix bug in tcp socket early demux The dest port for the call to __inet_lookup_established() in TCP early demux code is passed with the wrong endian-ness. This causes the lookup to fail leading to early demux not being used. Signed-off-by: Vijay Subramanian Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b52934f5334..1781dc650b9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1701,7 +1701,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) dev = skb->dev; sk = __inet_lookup_established(net, &tcp_hashinfo, iph->saddr, th->source, - iph->daddr, th->dest, + iph->daddr, ntohs(th->dest), dev->ifindex); if (sk) { skb->sk = sk; -- cgit v1.2.3-70-g09d2 From c074da2810c118b3812f32d6754bd9ead2f169e7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jun 2012 23:14:15 +0000 Subject: ipv4: tcp: dont cache unconfirmed intput dst DDOS synflood attacks hit badly IP route cache. On typical machines, this cache is allowed to hold up to 8 Millions dst entries, 256 bytes for each, for a total of 2GB of memory. rt_garbage_collect() triggers and tries to cleanup things. Eventually route cache is disabled but machine is under fire and might OOM and crash. This patch exploits the new TCP early demux, to set a nocache boolean in case incoming TCP frame is for a not yet ESTABLISHED or TIMEWAIT socket. This 'nocache' boolean is then used in case dst entry is not found in route cache, to create an unhashed dst entry (DST_NOCACHE) SYN-cookie-ACK sent use a similar mechanism (ipv4: tcp: dont cache output dst for syncookies), so after this patch, a machine is able to absorb a DDOS synflood attack without polluting its IP route cache. Signed-off-by: Eric Dumazet Cc: Hans Schillstrom Signed-off-by: David S. Miller --- include/net/protocol.h | 2 +- include/net/route.h | 8 ++++---- include/net/tcp.h | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_input.c | 5 +++-- net/ipv4/route.c | 8 +++++--- net/ipv4/tcp_ipv4.c | 4 +++- net/ipv4/xfrm4_input.c | 2 +- 9 files changed, 20 insertions(+), 15 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/protocol.h b/include/net/protocol.h index 967b926cbfb..7cfc8f76914 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,7 +37,7 @@ /* This is used to register protocols. */ struct net_protocol { - int (*early_demux)(struct sk_buff *skb); + int (*early_demux)(struct sk_buff *skb, bool *nocache); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/route.h b/include/net/route.h index 47eb25ac1f7..6361f933577 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 } extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool noref); + u8 tos, struct net_device *devin, bool noref, bool nocache); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin) { - return ip_route_input_common(skb, dst, src, tos, devin, false); + return ip_route_input_common(skb, dst, src, tos, devin, false, false); } static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) + u8 tos, struct net_device *devin, bool nocache) { - return ip_route_input_common(skb, dst, src, tos, devin, true); + return ip_route_input_common(skb, dst, src, tos, devin, true, nocache); } extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, diff --git a/include/net/tcp.h b/include/net/tcp.h index 6660ffc4963..917ed2e55e8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); -extern int tcp_v4_early_demux(struct sk_buff *skb); +extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 2e560f0c757..6a979594436 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { + ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8d07c973409..978d55f256e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg) skb_dst_drop(head); iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + iph->tos, head->dev, false); if (err) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2a39204de5b..7be54c8dcbe 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -326,6 +326,7 @@ static int ip_rcv_finish(struct sk_buff *skb) */ if (skb_dst(skb) == NULL) { int err = -ENOENT; + bool nocache = false; if (sysctl_ip_early_demux) { const struct net_protocol *ipprot; @@ -334,13 +335,13 @@ static int ip_rcv_finish(struct sk_buff *skb) rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->early_demux) - err = ipprot->early_demux(skb); + err = ipprot->early_demux(skb, &nocache); rcu_read_unlock(); } if (err) { err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + iph->tos, skb->dev, nocache); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 81533e3a23d..fdc7900f9d7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb, */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + u8 tos, struct net_device *dev, bool nocache) { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2353,6 +2353,8 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } + if (nocache) + rth->dst.flags |= DST_NOCACHE; hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); err = 0; @@ -2395,7 +2397,7 @@ martian_source_keep_err: } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool noref) + u8 tos, struct net_device *dev, bool noref, bool nocache) { struct rtable *rth; unsigned int hash; @@ -2471,7 +2473,7 @@ skip_cache: rcu_read_unlock(); return -EINVAL; } - res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache); rcu_read_unlock(); return res; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1781dc650b9..33aabd4fc20 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1673,7 +1673,7 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -int tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; @@ -1719,6 +1719,8 @@ int tcp_v4_early_demux(struct sk_buff *skb) } } } + } else { + *no_dst_cache = true; } out_err: diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216d..eee636b191b 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + iph->tos, skb->dev, false)) goto drop; } return dst_input(skb); -- cgit v1.2.3-70-g09d2 From c10237e077cef50e925f052e49f3b4fead9d71f9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 Jun 2012 17:05:06 -0700 Subject: Revert "ipv4: tcp: dont cache unconfirmed intput dst" This reverts commit c074da2810c118b3812f32d6754bd9ead2f169e7. This change has several unwanted side effects: 1) Sockets will cache the DST_NOCACHE route in sk->sk_rx_dst and we'll thus never create a real cached route. 2) All TCP traffic will use DST_NOCACHE and never use the routing cache at all. Signed-off-by: David S. Miller --- include/net/protocol.h | 2 +- include/net/route.h | 8 ++++---- include/net/tcp.h | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_input.c | 5 ++--- net/ipv4/route.c | 8 +++----- net/ipv4/tcp_ipv4.c | 4 +--- net/ipv4/xfrm4_input.c | 2 +- 9 files changed, 15 insertions(+), 20 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/protocol.h b/include/net/protocol.h index 7cfc8f76914..967b926cbfb 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,7 +37,7 @@ /* This is used to register protocols. */ struct net_protocol { - int (*early_demux)(struct sk_buff *skb, bool *nocache); + int (*early_demux)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/route.h b/include/net/route.h index 6361f933577..47eb25ac1f7 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 } extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool noref, bool nocache); + u8 tos, struct net_device *devin, bool noref); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin) { - return ip_route_input_common(skb, dst, src, tos, devin, false, false); + return ip_route_input_common(skb, dst, src, tos, devin, false); } static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool nocache) + u8 tos, struct net_device *devin) { - return ip_route_input_common(skb, dst, src, tos, devin, true, nocache); + return ip_route_input_common(skb, dst, src, tos, devin, true); } extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, diff --git a/include/net/tcp.h b/include/net/tcp.h index 917ed2e55e8..6660ffc4963 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); -extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache); +extern int tcp_v4_early_demux(struct sk_buff *skb); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6a979594436..2e560f0c757 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) { + ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 978d55f256e..8d07c973409 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg) skb_dst_drop(head); iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev, false); + iph->tos, head->dev); if (err) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7be54c8dcbe..2a39204de5b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -326,7 +326,6 @@ static int ip_rcv_finish(struct sk_buff *skb) */ if (skb_dst(skb) == NULL) { int err = -ENOENT; - bool nocache = false; if (sysctl_ip_early_demux) { const struct net_protocol *ipprot; @@ -335,13 +334,13 @@ static int ip_rcv_finish(struct sk_buff *skb) rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->early_demux) - err = ipprot->early_demux(skb, &nocache); + err = ipprot->early_demux(skb); rcu_read_unlock(); } if (err) { err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev, nocache); + iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fdc7900f9d7..81533e3a23d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb, */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool nocache) + u8 tos, struct net_device *dev) { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2353,8 +2353,6 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - if (nocache) - rth->dst.flags |= DST_NOCACHE; hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); err = 0; @@ -2397,7 +2395,7 @@ martian_source_keep_err: } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool noref, bool nocache) + u8 tos, struct net_device *dev, bool noref) { struct rtable *rth; unsigned int hash; @@ -2473,7 +2471,7 @@ skip_cache: rcu_read_unlock(); return -EINVAL; } - res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); rcu_read_unlock(); return res; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 33aabd4fc20..1781dc650b9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1673,7 +1673,7 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache) +int tcp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; @@ -1719,8 +1719,6 @@ int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache) } } } - } else { - *no_dst_cache = true; } out_err: diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index eee636b191b..06814b6216d 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev, false)) + iph->tos, skb->dev)) goto drop; } return dst_input(skb); -- cgit v1.2.3-70-g09d2 From 160eb5a6b14ca2eab5c598bdbbb24c24624bad34 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 Jun 2012 22:01:22 -0700 Subject: ipv4: Kill early demux method return value. It's completely unnecessary. Signed-off-by: David S. Miller --- include/net/protocol.h | 2 +- include/net/tcp.h | 2 +- net/ipv4/ip_input.c | 42 +++++++++++++++++++----------------------- net/ipv4/tcp_ipv4.c | 19 ++++++------------- 4 files changed, 27 insertions(+), 38 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/protocol.h b/include/net/protocol.h index 967b926cbfb..057f2d31556 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,7 +37,7 @@ /* This is used to register protocols. */ struct net_protocol { - int (*early_demux)(struct sk_buff *skb); + void (*early_demux)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/tcp.h b/include/net/tcp.h index 6660ffc4963..53fb7d81417 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); -extern int tcp_v4_early_demux(struct sk_buff *skb); +extern void tcp_v4_early_demux(struct sk_buff *skb); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2a39204de5b..b27d4440f52 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -320,33 +320,29 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + if (sysctl_ip_early_demux && !skb_dst(skb)) { + const struct net_protocol *ipprot; + int protocol = iph->protocol; + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot && ipprot->early_demux) + ipprot->early_demux(skb); + rcu_read_unlock(); + } + /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb_dst(skb) == NULL) { - int err = -ENOENT; - - if (sysctl_ip_early_demux) { - const struct net_protocol *ipprot; - int protocol = iph->protocol; - - rcu_read_lock(); - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->early_demux) - err = ipprot->early_demux(skb); - rcu_read_unlock(); - } - - if (err) { - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); - if (unlikely(err)) { - if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); - goto drop; - } + if (!skb_dst(skb)) { + int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); + if (unlikely(err)) { + if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); + goto drop; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1781dc650b9..b4ae1c199f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1673,30 +1673,28 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -int tcp_v4_early_demux(struct sk_buff *skb) +void tcp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct net_device *dev; struct sock *sk; - int err; - err = -ENOENT; if (skb->pkt_type != PACKET_HOST) - goto out_err; + return; if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) - goto out_err; + return; iph = ip_hdr(skb); th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); if (th->doff < sizeof(struct tcphdr) / 4) - goto out_err; + return; if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) - goto out_err; + return; dev = skb->dev; sk = __inet_lookup_established(net, &tcp_hashinfo, @@ -1713,16 +1711,11 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (dst) { struct rtable *rt = (struct rtable *) dst; - if (rt->rt_iif == dev->ifindex) { + if (rt->rt_iif == dev->ifindex) skb_dst_set_noref(skb, dst); - err = 0; - } } } } - -out_err: - return err; } /* -- cgit v1.2.3-70-g09d2 From 70e7341673a47fb1525cfc7d6651cc98b5348928 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 28 Jun 2012 03:21:41 -0700 Subject: ipv4: Show that ip_send_reply() is purely unicast routine. Rename it to ip_send_unicast_reply() and add explicit 'saddr' argument. This removed one of the few users of rt->rt_spec_dst. Signed-off-by: David S. Miller --- include/net/ip.h | 5 +++-- net/ipv4/ip_output.c | 9 +++++---- net/ipv4/tcp_ipv4.c | 8 ++++---- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/ip.h b/include/net/ip.h index 50841bd6f10..ec5cfde85e9 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -158,8 +158,9 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, - const struct ip_reply_arg *arg, unsigned int len); +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, + __be32 saddr, const struct ip_reply_arg *arg, + unsigned int len); struct ipv4_config { int log_martians; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0f3185a662c..2630900e480 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1459,13 +1459,14 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. ICMP should use this function too. + * Used to send TCP resets so far. * * Should run single threaded per socket because it uses the sock * structure to pass arguments. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, - const struct ip_reply_arg *arg, unsigned int len) +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, + __be32 saddr, const struct ip_reply_arg *arg, + unsigned int len) { struct inet_sock *inet = inet_sk(sk); struct ip_options_data replyopts; @@ -1491,7 +1492,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, sk->sk_protocol, ip_reply_arg_flowi_flags(arg), - daddr, rt->rt_spec_dst, + daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b4ae1c199f3..64568fa21d0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -698,8 +698,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, - &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -781,8 +781,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, - &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } -- cgit v1.2.3-70-g09d2 From ab92bb2f679d66c7e12a6b1c0cdd76fe308f6546 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 9 Jul 2012 16:19:30 -0700 Subject: tcp: Abstract back handling peer aliveness test into helper function. Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_metrics.c | 10 ++++++++++ net/ipv6/tcp_ipv6.c | 2 +- 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 98ca797001a..5478356ea8c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -389,6 +389,7 @@ extern void tcp_enter_loss(struct sock *sk, int how); extern void tcp_clear_retrans(struct tcp_sock *tp); extern void tcp_update_metrics(struct sock *sk); extern void tcp_init_metrics(struct sock *sk); +extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); extern void tcp_disable_fack(struct tcp_sock *tp); extern void tcp_close(struct sock *sk, long timeout); extern void tcp_init_sock(struct sock *sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 64568fa21d0..e9312a8f33a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1405,7 +1405,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && - (!dst || !dst_metric(dst, RTAX_RTT))) { + !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 2793ecf928d..9afe703c85c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1,7 +1,9 @@ +#include #include #include #include +#include #include #include #include @@ -190,3 +192,11 @@ reset: tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; } + +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) +{ + if (!dst) + return false; + return dst_metric(dst, RTAX_RTT) ? true : false; +} +EXPORT_SYMBOL_GPL(tcp_peer_is_proven); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6cc67ed6c2e..75d179555c2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1177,7 +1177,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && - (!dst || !dst_metric(dst, RTAX_RTT))) { + !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. -- cgit v1.2.3-70-g09d2 From 81166dd6fa8eb780b2132d32fbc77eb6ac04e44e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 03:14:24 -0700 Subject: tcp: Move timestamps from inetpeer to metrics cache. With help from Lin Ming. Signed-off-by: David S. Miller --- include/net/inetpeer.h | 4 +- include/net/tcp.h | 5 +- net/ipv4/inetpeer.c | 1 - net/ipv4/route.c | 8 +-- net/ipv4/tcp_ipv4.c | 30 ++--------- net/ipv4/tcp_metrics.c | 136 +++++++++++++++++++++++++++++++++++++++++++++-- net/ipv4/tcp_minisocks.c | 46 ---------------- net/ipv6/route.c | 13 +---- net/ipv6/tcp_ipv6.c | 33 ++---------- 9 files changed, 149 insertions(+), 127 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index c27c8f10ebd..1119f6f6cdb 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -46,15 +46,13 @@ struct inet_peer { }; /* * Once inet_peer is queued for deletion (refcnt == -1), following fields - * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp + * are not available: rid, ip_id_count * We can share memory with rcu_head to help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ atomic_t ip_id_count; /* IP ID for the next packet */ - __u32 tcp_ts; - __u32 tcp_ts_stamp; }; struct rcu_head rcu; struct inet_peer *gc_next; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0900d63d162..3618fefae04 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -390,7 +390,10 @@ extern void tcp_clear_retrans(struct tcp_sock *tp); extern void tcp_update_metrics(struct sock *sk); extern void tcp_init_metrics(struct sock *sk); extern void tcp_metrics_init(void); -extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); +extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check); +extern bool tcp_remember_stamp(struct sock *sk); +extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw); +extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst); extern void tcp_disable_fack(struct tcp_sock *tp); extern void tcp_close(struct sock *sk, long timeout); extern void tcp_init_sock(struct sock *sk); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index da90a8cab61..f457bcb4135 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -508,7 +508,6 @@ relookup: (daddr->family == AF_INET) ? secure_ip_id(daddr->addr.a4) : secure_ipv6_id(daddr->addr.a6)); - p->tcp_ts_stamp = 0; p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->rate_last = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d02c91177d3..78d81543766 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2846,7 +2846,7 @@ static int rt_fill_info(struct net *net, struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; - u32 id = 0, ts = 0, tsage = 0, error; + u32 id = 0, error; nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); if (nlh == NULL) @@ -2903,10 +2903,6 @@ static int rt_fill_info(struct net *net, const struct inet_peer *peer = rt_peer_ptr(rt); inet_peer_refcheck(peer); id = atomic_read(&peer->ip_id_count) & 0xffff; - if (peer->tcp_ts_stamp) { - ts = peer->tcp_ts; - tsage = get_seconds() - peer->tcp_ts_stamp; - } expires = ACCESS_ONCE(peer->pmtu_expires); if (expires) { if (time_before(jiffies, expires)) @@ -2942,7 +2938,7 @@ static int rt_fill_info(struct net *net, goto nla_put_failure; } - if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, + if (rtnl_put_cacheinfo(skb, &rt->dst, id, 0, 0, expires, error) < 0) goto nla_put_failure; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e9312a8f33a..d406bf7f37d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (tcp_death_row.sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { - struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); - /* - * VJ's idea. We save last timestamp seen from - * the destination in peer table, when entering state - * TIME-WAIT * and initialize rx_opt.ts_recent from it, - * when trying new connection. - */ - if (peer) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; - } - } - } + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) + tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; @@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) isn = cookie_v4_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { - struct inet_peer *peer = NULL; struct flowi4 fl4; /* VJ's idea. We save last timestamp seen @@ -1390,12 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && - fl4.daddr == saddr && - (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && - (s32)(peer->tcp_ts - req->ts_recent) > - TCP_PAWS_WINDOW) { + fl4.daddr == saddr) { + if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - (!peer || !peer->tcp_ts_stamp) && - !tcp_peer_is_proven(req, dst)) { + !tcp_peer_is_proven(req, dst, false)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 56223bab251..1fd83d3118f 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -34,6 +34,8 @@ struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; struct inetpeer_addr tcpm_addr; unsigned long tcpm_stamp; + u32 tcpm_ts; + u32 tcpm_ts_stamp; u32 tcpm_lock; u32 tcpm_vals[TCP_METRIC_MAX]; }; @@ -114,6 +116,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); + tm->tcpm_ts = 0; + tm->tcpm_ts_stamp = 0; } static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, @@ -230,6 +234,45 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return tm; } +static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) +{ + struct inet6_timewait_sock *tw6; + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net; + + addr.family = tw->tw_family; + switch (addr.family) { + case AF_INET: + addr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) addr.addr.a4; + break; + case AF_INET6: + tw6 = inet6_twsk((struct sock *)tw); + *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; + hash = ((__force unsigned int) addr.addr.a6[0] ^ + (__force unsigned int) addr.addr.a6[1] ^ + (__force unsigned int) addr.addr.a6[2] ^ + (__force unsigned int) addr.addr.a6[3]); + break; + default: + return NULL; + } + + hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); + + net = twsk_net(tw); + hash &= net->ipv4.tcp_metrics_hash_mask; + + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, &addr)) + break; + } + return tm; +} + static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct dst_entry *dst, bool create) @@ -496,7 +539,7 @@ reset: tp->snd_cwnd_stamp = tcp_time_stamp; } -bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) { struct tcp_metrics_block *tm; bool ret; @@ -506,16 +549,99 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) rcu_read_lock(); tm = __tcp_get_metrics_req(req, dst); - if (tm && tcp_metric_get(tm, TCP_METRIC_RTT)) - ret = true; - else - ret = false; + if (paws_check) { + if (tm && + (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && + (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) + ret = false; + else + ret = true; + } else { + if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) + ret = true; + else + ret = false; + } rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(tcp_peer_is_proven); +void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) +{ + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_sock *tp = tcp_sk(sk); + + if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; + tp->rx_opt.ts_recent = tm->tcpm_ts; + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); + +/* VJ's idea. Save last timestamp seen from this destination and hold + * it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter + * synchronized state. + */ +bool tcp_remember_stamp(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + bool ret = false; + + if (dst) { + struct tcp_metrics_block *tm; + + rcu_read_lock(); + tm = tcp_get_metrics(sk, dst, true); + if (tm) { + struct tcp_sock *tp = tcp_sk(sk); + + if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && + tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; + tm->tcpm_ts = tp->rx_opt.ts_recent; + } + ret = true; + } + rcu_read_unlock(); + } + return ret; +} + +bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) +{ + struct tcp_metrics_block *tm; + bool ret = false; + + rcu_read_lock(); + tm = __tcp_get_metrics_tw(tw); + if (tw) { + const struct tcp_timewait_sock *tcptw; + struct sock *sk = (struct sock *) tw; + + tcptw = tcp_twsk(sk); + if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && + tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; + tm->tcpm_ts = tcptw->tw_ts_recent; + } + ret = true; + } + rcu_read_unlock(); + + return ret; +} + static unsigned long tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 72b7c63b1a3..a51aa534dab 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -49,52 +49,6 @@ struct inet_timewait_death_row tcp_death_row = { }; EXPORT_SYMBOL_GPL(tcp_death_row); -/* VJ's idea. Save last timestamp seen from this destination - * and hold it at least for normal timewait interval to use for duplicate - * segment detection in subsequent connections, before they enter synchronized - * state. - */ - -static bool tcp_remember_stamp(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct inet_peer *peer; - - peer = icsk->icsk_af_ops->get_peer(sk); - if (peer) { - if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; - peer->tcp_ts = tp->rx_opt.ts_recent; - } - return true; - } - - return false; -} - -static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) -{ - const struct tcp_timewait_sock *tcptw; - struct sock *sk = (struct sock *) tw; - struct inet_peer *peer; - - tcptw = tcp_twsk(sk); - peer = tcptw->tw_peer; - if (peer) { - if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; - peer->tcp_ts = tcptw->tw_ts_recent; - } - return true; - } - return false; -} - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6cc6c881f54..0c068475378 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2348,13 +2348,11 @@ static int rt6_fill_node(struct net *net, int iif, int type, u32 pid, u32 seq, int prefix, int nowait, unsigned int flags) { - const struct inet_peer *peer; struct rtmsg *rtm; struct nlmsghdr *nlh; long expires; u32 table; struct neighbour *n; - u32 ts, tsage; if (prefix) { /* user wants prefix routes only */ if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { @@ -2473,16 +2471,7 @@ static int rt6_fill_node(struct net *net, else expires = INT_MAX; - peer = NULL; - if (rt6_has_peer(rt)) - peer = rt6_peer_ptr(rt); - ts = tsage = 0; - if (peer && peer->tcp_ts_stamp) { - ts = peer->tcp_ts; - tsage = get_seconds() - peer->tcp_ts_stamp; - } - - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage, + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires, rt->dst.error) < 0) goto nla_put_failure; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 75d179555c2..9e96b5f21d2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -277,22 +277,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, rt = (struct rt6_info *) dst; if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && - ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) { - struct inet_peer *peer = rt6_get_peer(rt); - /* - * VJ's idea. We save last timestamp seen from - * the destination in peer table, when entering state - * TIME-WAIT * and initialize rx_opt.ts_recent from it, - * when trying new connection. - */ - if (peer) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; - } - } - } + ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) + tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; if (np->opt) @@ -1134,8 +1120,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) treq->iif = inet6_iif(skb); if (!isn) { - struct inet_peer *peer = NULL; - if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { @@ -1160,14 +1144,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL && - (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL && - ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6, - &treq->rmt_addr)) { - inet_peer_refcheck(peer); - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && - (s32)(peer->tcp_ts - req->ts_recent) > - TCP_PAWS_WINDOW) { + (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { + if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } @@ -1176,8 +1154,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && - (!peer || !peer->tcp_ts_stamp) && - !tcp_peer_is_proven(req, dst)) { + !tcp_peer_is_proven(req, dst, false)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. -- cgit v1.2.3-70-g09d2 From 16d1839907e695387654901995f9286b65fbbc6a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 03:32:59 -0700 Subject: inet: Remove ->get_peer() method. No longer used. Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 1 - net/ipv4/tcp_ipv4.c | 16 ---------------- net/ipv6/tcp_ipv6.c | 16 ---------------- 3 files changed, 33 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index af3c743a40e..291e7cee14e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -43,7 +43,6 @@ struct inet_connection_sock_af_ops { struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); - struct inet_peer *(*get_peer)(struct sock *sk); u16 net_header_len; u16 net_frag_header_len; u16 sockaddr_len; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d406bf7f37d..ddefd39ac0c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1847,21 +1847,6 @@ do_time_wait: goto discard_it; } -struct inet_peer *tcp_v4_get_peer(struct sock *sk) -{ - struct rtable *rt = (struct rtable *) __sk_dst_get(sk); - struct inet_sock *inet = inet_sk(sk); - - /* If we don't have a valid cached route, or we're doing IP - * options which make the IPv4 header destination address - * different from our peer's, do not bother with this. - */ - if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr) - return NULL; - return rt_get_peer_create(rt, inet->inet_daddr); -} -EXPORT_SYMBOL(tcp_v4_get_peer); - static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), .twsk_unique = tcp_twsk_unique, @@ -1874,7 +1859,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, - .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9e96b5f21d2..61175cb2478 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1689,20 +1689,6 @@ do_time_wait: goto discard_it; } -static struct inet_peer *tcp_v6_get_peer(struct sock *sk) -{ - struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - - /* If we don't have a valid cached route, or we're doing IP - * options which make the IPv6 header destination address - * different from our peer's, do not bother with this. - */ - if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) - return NULL; - return rt6_get_peer_create(rt); -} - static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, @@ -1715,7 +1701,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .get_peer = tcp_v6_get_peer, .net_header_len = sizeof(struct ipv6hdr), .net_frag_header_len = sizeof(struct frag_hdr), .setsockopt = ipv6_setsockopt, @@ -1747,7 +1732,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, -- cgit v1.2.3-70-g09d2 From 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Jul 2012 05:50:31 +0000 Subject: tcp: TCP Small Queues This introduce TSQ (TCP Small Queues) TSQ goal is to reduce number of TCP packets in xmit queues (qdisc & device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem. sk->sk_wmem_alloc not allowed to grow above a given limit, allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a given time. TSO packets are sized/capped to half the limit, so that we have two TSO packets in flight, allowing better bandwidth use. As a side effect, setting the limit to 40000 automatically reduces the standard gso max limit (65536) to 40000/2 : It can help to reduce latencies of high prio packets, having smaller TSO packets. This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send following frames when skb_orphan() [2] is called for the already queued skbs. Results on my dev machines (tg3/ixgbe nics) are really impressive, using standard pfifo_fast, and with or without TSO/GSO. Without reduction of nominal bandwidth, we have reduction of buffering per bulk sender : < 1ms on Gbit (instead of 50ms with TSO) < 8ms on 100Mbit (instead of 132 ms) I no longer have 4 MBytes backlogged in qdisc by a single netperf session, and both side socket autotuning no longer use 4 Mbytes. As skb destructor cannot restart xmit itself ( as qdisc lock might be taken at this point ), we delegate the work to a tasklet. We use one tasklest per cpu for performance reasons. If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag. This flag is tested in a new protocol method called from release_sock(), to eventually send new segments. [1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable [2] skb_orphan() is usually called at TX completion time, but some drivers call it in their start_xmit() handler. These drivers should at least use BQL, or else a single TCP session can still fill the whole NIC TX ring, since TSQ will have no effect. Signed-off-by: Eric Dumazet Cc: Dave Taht Cc: Tom Herbert Cc: Matt Mathis Cc: Yuchung Cheng Cc: Nandita Dukkipati Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 14 +++ include/linux/tcp.h | 9 ++ include/net/sock.h | 2 + include/net/tcp.h | 4 + net/core/sock.c | 4 + net/ipv4/sysctl_net_ipv4.c | 7 ++ net/ipv4/tcp.c | 6 ++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 154 ++++++++++++++++++++++++++++++++- net/ipv6/tcp_ipv6.c | 1 + 11 files changed, 202 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 47b6c79e9b0..e20c17a7d34 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -551,6 +551,20 @@ tcp_thin_dupack - BOOLEAN Documentation/networking/tcp-thin.txt Default: 0 +tcp_limit_output_bytes - INTEGER + Controls TCP Small Queue limit per tcp socket. + TCP bulk sender tends to increase packets in flight until it + gets losses notifications. With SNDBUF autotuning, this can + result in a large amount of packets queued in qdisc/device + on the local machine, hurting latency of other flows, for + typical pfifo_fast qdiscs. + tcp_limit_output_bytes limits the number of bytes on qdisc + or device to reduce artificial RTT/cwnd and reduce bufferbloat. + Note: For GSO/TSO enabled flows, we try to have at least two + packets in flight. Reducing tcp_limit_output_bytes might also + reduce the size of individual GSO packet (64KB being the max) + Default: 131072 + UDP variables: udp_mem - vector of 3 INTEGERs: min, pressure, max diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2de9cf46f9f..1888169e07c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -339,6 +339,9 @@ struct tcp_sock { u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ + unsigned long tsq_flags; + /* Data for direct copy to user */ struct { struct sk_buff_head prequeue; @@ -494,6 +497,12 @@ struct tcp_sock { struct tcp_cookie_values *cookie_values; }; +enum tsq_flags { + TSQ_THROTTLED, + TSQ_QUEUED, + TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */ +}; + static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; diff --git a/include/net/sock.h b/include/net/sock.h index dcb54a0793e..88de092df50 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -858,6 +858,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + void (*release_cb)(struct sock *sk); + /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 3618fefae04..439984b9af4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; +extern int sysctl_tcp_limit_output_bytes; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -321,6 +322,8 @@ extern struct proto tcp_prot; extern void tcp_init_mem(struct net *net); +extern void tcp_tasklet_init(void); + extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); @@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); extern int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +extern void tcp_release_cb(struct sock *sk); extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); diff --git a/net/core/sock.c b/net/core/sock.c index 929bdcc2383..24039ac1242 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); + + if (sk->sk_prot->release_cb) + sk->sk_prot->release_cb(sk); + sk->sk_lock.owned = 0; if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 12aa0c5867c..70730f7aeaf 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d902da96d15..4252cd8f39f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk) skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); + INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; @@ -796,6 +797,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); + /* TSQ : try to have two TSO segments in flight */ + xmit_size_goal = min_t(u32, xmit_size_goal, + sysctl_tcp_limit_output_bytes >> 1); + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); /* We try hard to avoid divides here */ @@ -3574,4 +3579,5 @@ void __init tcp_init(void) tcp_secret_primary = &tcp_secret_one; tcp_secret_retiring = &tcp_secret_two; tcp_secret_secondary = &tcp_secret_two; + tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ddefd39ac0c..01545a3fc0f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2588,6 +2588,7 @@ struct proto tcp_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 65608863fde..c66f2ede160 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -424,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, treq->snt_isn + 1 + tcp_s_data_size(oldtp); tcp_prequeue_init(newtp); + INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c465d3e51e2..03854abfd9d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; + /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) @@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb return size; } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. + */ +struct tsq_tasklet { + struct tasklet_struct tasklet; + struct list_head head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +/* + * One tasklest per cpu tries to send more skbs. + * We run in tasklet context but need to disable irqs when + * transfering tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ + struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; + LIST_HEAD(list); + unsigned long flags; + struct list_head *q, *n; + struct tcp_sock *tp; + struct sock *sk; + + local_irq_save(flags); + list_splice_init(&tsq->head, &list); + local_irq_restore(flags); + + list_for_each_safe(q, n, &list) { + tp = list_entry(q, struct tcp_sock, tsq_node); + list_del(&tp->tsq_node); + + sk = (struct sock *)tp; + bh_lock_sock(sk); + + if (!sock_owned_by_user(sk)) { + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | + TCPF_CLOSING | TCPF_CLOSE_WAIT)) + tcp_write_xmit(sk, + tcp_current_mss(sk), + 0, 0, + GFP_ATOMIC); + } else { + /* defer the work to tcp_release_cb() */ + set_bit(TSQ_OWNED, &tp->tsq_flags); + } + bh_unlock_sock(sk); + + clear_bit(TSQ_QUEUED, &tp->tsq_flags); + sk_free(sk); + } +} + +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. + */ +void tcp_release_cb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) { + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | + TCPF_CLOSING | TCPF_CLOSE_WAIT)) + tcp_write_xmit(sk, + tcp_current_mss(sk), + 0, 0, + GFP_ATOMIC); + } +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + + INIT_LIST_HEAD(&tsq->head); + tasklet_init(&tsq->tasklet, + tcp_tasklet_func, + (unsigned long)tsq); + } +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We cant xmit new skbs from this context, as we might already + * hold qdisc lock. + */ +void tcp_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct tcp_sock *tp = tcp_sk(sk); + + if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && + !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { + unsigned long flags; + struct tsq_tasklet *tsq; + + /* Keep a ref on socket. + * This last ref will be released in tcp_tasklet_func() + */ + atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = &__get_cpu_var(tsq_tasklet); + list_add(&tp->tsq_node, &tsq->head); + tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + } else { + sock_wfree(skb); + } +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); - skb_set_owner_w(skb, sk); + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? + tcp_wfree : sock_wfree; + atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ th = tcp_hdr(skb); @@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; + tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); @@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } + /* TSQ : sk_wmem_alloc accounts skb truesize, + * including skb overhead. But thats OK. + */ + if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + break; + } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 61175cb2478..70458a9cd83 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1970,6 +1970,7 @@ struct proto tcpv6_prot = { .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.3-70-g09d2 From 55be7a9c6074f749d617a7fc1914c9a23505438c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 11 Jul 2012 21:27:49 -0700 Subject: ipv4: Add redirect support to all protocol icmp error handlers. Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 11 +++++++++++ net/ipv4/ah4.c | 18 +++++++++++++----- net/ipv4/esp4.c | 18 +++++++++++++----- net/ipv4/ip_gre.c | 9 ++++++++- net/ipv4/ipcomp.c | 18 +++++++++++++----- net/ipv4/ipip.c | 9 +++++++++ net/ipv4/ping.c | 1 + net/ipv4/raw.c | 2 ++ net/ipv4/tcp_ipv4.c | 11 +++++++++++ net/ipv4/udp.c | 3 +++ net/ipv4/xfrm4_policy.c | 10 ++++++++++ net/sctp/input.c | 16 ++++++++++++++++ 12 files changed, 110 insertions(+), 16 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3eb76b5f221..8f41a319085 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -195,6 +195,14 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, } /* else let the usual retransmit timer handle it */ } +static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + + if (dst && dst->ops->redirect) + dst->ops->redirect(dst, skb); +} + /* * This routine is called by the ICMP module when it gets some sort of error * condition. If err < 0 then the socket should be closed and the error @@ -259,6 +267,9 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) } switch (type) { + case ICMP_REDIRECT: + dccp_do_redirect(skb, sk); + goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 916d5ecaf6c..a0d8392491c 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -398,17 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info) struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return; - pr_debug("pmtu discovery on SA AH/%08x/%08x\n", - ntohl(ah->spi), ntohl(iph->daddr)); - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 7b95b49a36c..b61e9deb7c7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -484,17 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info) struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", - ntohl(esph->spi), ntohl(iph->daddr)); - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 594cec35ac4..0c3123566d7 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -528,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return; break; + + case ICMP_REDIRECT: + break; } rcu_read_lock(); @@ -543,7 +546,11 @@ static void ipgre_err(struct sk_buff *skb, u32 info) t->parms.link, 0, IPPROTO_GRE, 0); goto out; } - + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, + IPPROTO_GRE, 0); + goto out; + } if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) goto out; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index b91375482d8..d3ab47e19a8 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -31,18 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; - if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || - icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return; + case ICMP_REDIRECT: + break; + default: return; + } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", - spi, &iph->daddr); - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); + else + ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); xfrm_state_put(x); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 715338a1b20..c2d0e6d8baa 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -360,6 +360,8 @@ static int ipip_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return 0; break; + case ICMP_REDIRECT: + break; } err = -ENOENT; @@ -376,6 +378,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) goto out; } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, + IPPROTO_IPIP, 0); + err = 0; + goto out; + } + if (t->parms.iph.daddr == 0) goto out; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 340fcf29a96..6232d476f37 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -387,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info) break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ + ipv4_sk_redirect(skb, sk); err = EREMOTEIO; break; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 659ddfb1094..ff0f071969e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -218,6 +218,8 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); + else if (type == ICMP_REDIRECT) + ipv4_sk_redirect(skb, sk); /* Report error on raw socket, if: 1. User requested ip_recverr. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 01545a3fc0f..087a8488843 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -321,6 +321,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) } /* else let the usual retransmit timer handle it */ } +static void do_redirect(struct sk_buff *skb, struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + + if (dst && dst->ops->redirect) + dst->ops->redirect(dst, skb); +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -394,6 +402,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (type) { + case ICMP_REDIRECT: + do_redirect(icmp_skb, sk); + goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ee37d47d472..b4c3582a991 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -630,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) err = icmp_err_convert[code].errno; } break; + case ICMP_REDIRECT: + ipv4_sk_redirect(skb, sk); + break; } /* diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 87d3fcc302d..258ebd7b268 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -202,6 +202,15 @@ static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) path->ops->update_pmtu(path, mtu); } +static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + if (path->ops->redirect) + path->ops->redirect(path, skb); +} + static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -225,6 +234,7 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, diff --git a/net/sctp/input.c b/net/sctp/input.c index 80564fe0302..9fb4247f9a9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -423,6 +423,18 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } +static void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, + struct sk_buff *skb) +{ + struct dst_entry *dst; + + if (!t) + return; + dst = sctp_transport_dst_check(t); + if (dst && dst->ops->redirect) + dst->ops->redirect(dst, skb); +} + /* * SCTP Implementer's Guide, 2.37 ICMP handling procedures * @@ -628,6 +640,10 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) err = EHOSTUNREACH; break; + case ICMP_REDIRECT: + sctp_icmp_redirect(sk, transport, skb); + err = 0; + break; default: goto out_unlock; } -- cgit v1.2.3-70-g09d2 From 1ed5c48f231cd00eac0b3d2350ac61e3c825063e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Jul 2012 00:41:25 -0700 Subject: net: Remove checks for dst_ops->redirect being NULL. No longer necessary. Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/xfrm4_policy.c | 3 +-- net/ipv6/ip6_tunnel.c | 6 ++---- net/ipv6/tcp_ipv6.c | 2 +- net/sctp/input.c | 2 +- 7 files changed, 8 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8f41a319085..129ed8f7413 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -199,7 +199,7 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) { struct dst_entry *dst = __sk_dst_check(sk, 0); - if (dst && dst->ops->redirect) + if (dst) dst->ops->redirect(dst, skb); } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index b4d7d28ce6d..090c0800ce0 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -133,7 +133,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - if (dst && dst->ops->redirect) + if (dst) dst->ops->redirect(dst, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 087a8488843..7a0062cb4ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -325,7 +325,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) { struct dst_entry *dst = __sk_dst_check(sk, 0); - if (dst && dst->ops->redirect) + if (dst) dst->ops->redirect(dst, skb); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 258ebd7b268..737131cef37 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -207,8 +207,7 @@ static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb) struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - if (path->ops->redirect) - path->ops->redirect(path, skb); + path->ops->redirect(path, skb); } static void xfrm4_dst_destroy(struct dst_entry *dst) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 0b5b60ec6f4..61d10659729 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -611,10 +611,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info); } - if (rel_type == ICMP_REDIRECT) { - if (skb_dst(skb2)->ops->redirect) - skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2); - } + if (rel_type == ICMP_REDIRECT) + skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7249e4bb9b8..3071f377145 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -366,7 +366,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - if (dst && dst->ops->redirect) + if (dst) dst->ops->redirect(dst,skb); } diff --git a/net/sctp/input.c b/net/sctp/input.c index 5943b7d77dd..f050d45faa9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -431,7 +431,7 @@ void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, if (!t) return; dst = sctp_transport_dst_check(t); - if (dst && dst->ops->redirect) + if (dst) dst->ops->redirect(dst, skb); } -- cgit v1.2.3-70-g09d2 From 80d0a69fc57715dc9080c0567df1ed911b78abea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 16 Jul 2012 03:28:06 -0700 Subject: ipv4: Add helper inet_csk_update_pmtu(). This abstracts away the call to dst_ops->update_pmtu() so that we can transparently handle the fact that, in the future, the dst itself can be invalidated by the PMTU update (when we have non-host routes cached in sockets). So we try to rebuild the socket cached route after the method invocation if necessary. This isn't used by SCTP because it needs to cache dsts per-transport, and thus will need it's own local version of this helper. Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 ++ net/dccp/ipv4.c | 11 ++------- net/ipv4/inet_connection_sock.c | 46 ++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 11 ++------- 4 files changed, 52 insertions(+), 18 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 291e7cee14e..2cf44b4ed2e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -337,4 +337,6 @@ extern int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); extern int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); + +extern struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 129ed8f7413..683902fcc8e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -161,17 +161,10 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, if (sk->sk_state == DCCP_LISTEN) return; - /* We don't check in the destentry if pmtu discovery is forbidden - * on this route. We just assume that no packet_to_big packets - * are send back when pmtu discovery is not active. - * There is a small race when the user changes this flag in the - * route, but I think that's acceptable. - */ - if ((dst = __sk_dst_check(sk, 0)) == NULL) + dst = inet_csk_update_pmtu(sk, mtu); + if (!dst) return; - dst->ops->update_pmtu(dst, mtu); - /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 76825be3b64..200d2180937 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -803,3 +803,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); #endif + +static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; + struct rtable *rt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + fl4 = &fl->u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_protocol, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + if (IS_ERR(rt)) + rt = NULL; + if (rt) + sk_setup_caps(sk, &rt->dst); + rcu_read_unlock(); + + return &rt->dst; +} + +struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct dst_entry *dst = __sk_dst_check(sk, 0); + struct inet_sock *inet = inet_sk(sk); + + if (!dst) { + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); + if (!dst) + goto out; + } + dst->ops->update_pmtu(dst, mtu); + + dst = __sk_dst_check(sk, 0); + if (!dst) + dst = inet_csk_rebuild_route(sk, &inet->cork.fl); +out: + return dst; +} +EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7a0062cb4ed..b8e7e059540 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -289,17 +289,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) if (sk->sk_state == TCP_LISTEN) return; - /* We don't check in the destentry if pmtu discovery is forbidden - * on this route. We just assume that no packet_to_big packets - * are send back when pmtu discovery is not active. - * There is a small race when the user changes this flag in the - * route, but I think that's acceptable. - */ - if ((dst = __sk_dst_check(sk, 0)) == NULL) + dst = inet_csk_update_pmtu(sk, mtu); + if (!dst) return; - dst->ops->update_pmtu(dst, mtu); - /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. */ -- cgit v1.2.3-70-g09d2 From 6700c2709c08d74ae2c3c29b84a30da012dbc7f1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 03:29:28 -0700 Subject: net: Pass optional SKB and SK arguments to dst_ops->{update_pmtu,redirect}() This will be used so that we can compose a full flow key. Even though we have a route in this context, we need more. In the future the routes will be without destination address, source address, etc. keying. One ipv4 route will cover entire subnets, etc. In this environment we have to have a way to possess persistent storage for redirects and PMTU information. This persistent storage will exist in the FIB tables, and that's why we'll need to be able to rebuild a full lookup flow key here. Using that flow key will do a fib_lookup() and create/update the persistent entry. Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 +- include/net/dst_ops.h | 6 ++++-- net/bridge/br_netfilter.c | 6 ++++-- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/decnet/dn_route.c | 12 ++++++++---- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/ip_gre.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/route.c | 21 +++++++++++++-------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/xfrm4_policy.c | 10 ++++++---- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_tunnel.c | 6 +++--- net/ipv6/route.c | 21 +++++++++++++-------- net/ipv6/sit.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/xfrm6_policy.c | 10 ++++++---- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- net/sctp/input.c | 2 +- net/sctp/transport.c | 2 +- 21 files changed, 71 insertions(+), 49 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 014504d8e43..1ca732201f3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1397,7 +1397,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, int e = skb_queue_empty(&priv->cm.skb_queue); if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); skb_queue_tail(&priv->cm.skb_queue, skb); if (e) diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 085931fa7ce..d079fc61c12 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -24,8 +24,10 @@ struct dst_ops { struct net_device *dev, int how); struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); - void (*update_pmtu)(struct dst_entry *dst, u32 mtu); - void (*redirect)(struct dst_entry *dst, struct sk_buff *skb); + void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); + void (*redirect)(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); int (*local_out)(struct sk_buff *skb); struct neighbour * (*neigh_lookup)(const struct dst_entry *dst, struct sk_buff *skb, diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 81f76c402cf..68e8f364bbf 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -111,11 +111,13 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) pppoe_proto(skb) == htons(PPP_IPV6) && \ brnf_filter_pppoe_tagged) -static void fake_update_pmtu(struct dst_entry *dst, u32 mtu) +static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { } -static void fake_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void fake_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 683902fcc8e..ab4f44c9bb2 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -193,7 +193,7 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) struct dst_entry *dst = __sk_dst_check(sk, 0); if (dst) - dst->ops->redirect(dst, skb); + dst->ops->redirect(dst, sk, skb); } /* diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 3ee0342e1ce..56840b249f3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -134,7 +134,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) - dst->ops->redirect(dst, skb); + dst->ops->redirect(dst, sk, skb); } if (type == ICMPV6_PKT_TOOBIG) { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e9c4e2e864c..47de90d8fe9 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -117,8 +117,10 @@ static void dn_dst_destroy(struct dst_entry *); static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); -static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu); -static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb); +static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb , u32 mtu); +static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); @@ -266,7 +268,8 @@ static int dn_dst_gc(struct dst_ops *ops) * We update both the mtu and the advertised mss (i.e. the segment size we * advertise to the other end). */ -static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) +static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct dn_route *rt = (struct dn_route *) dst; struct neighbour *n = rt->n; @@ -294,7 +297,8 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) } } -static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 200d2180937..3ea465286a3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -840,7 +840,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) if (!dst) goto out; } - dst->ops->update_pmtu(dst, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu); dst = __sk_dst_check(sk, 0); if (!dst) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0c3123566d7..42c44b1403c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -833,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { df |= (old_iph->frag_off&htons(IP_DF)); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c2d0e6d8baa..2c2c35bace7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -519,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index aad21819316..b35d3bfc66c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -148,8 +148,10 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb); +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); static int rt_garbage_collect(struct dst_ops *ops); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -1273,7 +1275,7 @@ static void rt_del(unsigned int hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } -static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; @@ -1506,7 +1508,8 @@ out: kfree_skb(skb); return 0; } -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct rtable *rt = (struct rtable *) dst; @@ -1531,7 +1534,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, iph->daddr, iph->saddr, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { - ip_rt_update_pmtu(&rt->dst, mtu); + ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu); ip_rt_put(rt); } } @@ -1559,7 +1562,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net, protocol, flow_flags, iph->daddr, iph->saddr, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { - ip_do_redirect(&rt->dst, skb); + ip_do_redirect(&rt->dst, NULL, skb); ip_rt_put(rt); } } @@ -2587,11 +2590,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) return mtu ? : dst->dev->mtu; } -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { } -static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b8e7e059540..d9caf5c07aa 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -319,7 +319,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) struct dst_entry *dst = __sk_dst_check(sk, 0); if (dst) - dst->ops->redirect(dst, skb); + dst->ops->redirect(dst, sk, skb); } /* diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 737131cef37..fcf7678bc00 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -194,20 +194,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); } -static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, mtu); + path->ops->update_pmtu(path, sk, skb, mtu); } -static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->redirect(path, skb); + path->ops->redirect(path, sk, skb); } static void xfrm4_dst_destroy(struct dst_entry *dst) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 62539a4b2dc..4a0c4d2d8b0 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -269,7 +269,7 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) if (IS_ERR(dst)) return NULL; - dst->ops->update_pmtu(dst, mtu); + dst->ops->update_pmtu(dst, sk, NULL, mtu); return inet6_csk_route_socket(sk); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 61d10659729..db328466796 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -609,10 +609,10 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info); + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info); } if (rel_type == ICMP_REDIRECT) - skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2); + skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -952,7 +952,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu) { *pmtu = mtu; err = -EMSGSIZE; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2a4c8d48977..31af1ed6c1d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -78,8 +78,10 @@ static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); -static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb); +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -187,11 +189,13 @@ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) return mtu ? : dst->dev->mtu; } -static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { } -static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } @@ -1071,7 +1075,8 @@ static void ip6_link_failure(struct sk_buff *skb) } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct rt6_info *rt6 = (struct rt6_info*)dst; @@ -1108,7 +1113,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - ip6_rt_update_pmtu(dst, ntohl(mtu)); + ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); @@ -1136,7 +1141,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - rt6_do_redirect(dst, skb); + rt6_do_redirect(dst, NULL, skb); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_redirect); @@ -1639,7 +1644,7 @@ static int ip6_route_del(struct fib6_config *cfg) return err; } -static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct netevent_redirect netevent; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index fbf1622fdee..3bd1bfc01f8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -807,7 +807,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (tunnel->parms.iph.daddr && skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ecdf241cad0..c9dabdd832d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -367,7 +367,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) - dst->ops->redirect(dst,skb); + dst->ops->redirect(dst, sk, skb); } if (type == ICMPV6_PKT_TOOBIG) { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index f5a9cb8257b..ef39812107b 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -207,20 +207,22 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops) return dst_entries_get_fast(ops) > ops->gc_thresh * 2; } -static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, mtu); + path->ops->update_pmtu(path, sk, skb, mtu); } -static void xfrm6_redirect(struct dst_entry *dst, struct sk_buff *skb) +static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->redirect(path, skb); + path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 71d6ecb6592..65b616ae171 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -797,7 +797,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_put; } if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); df |= (old_iph->frag_off & htons(IP_DF)); @@ -913,7 +913,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error_put; } if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && !skb_is_gso(skb)) { diff --git a/net/sctp/input.c b/net/sctp/input.c index a67bc31f49f..c201b26879a 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -432,7 +432,7 @@ void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, return; dst = sctp_transport_dst_check(t); if (dst) - dst->ops->redirect(dst, skb); + dst->ops->redirect(dst, sk, skb); } /* diff --git a/net/sctp/transport.c b/net/sctp/transport.c index e69e1a2175a..a6b7ee9ce28 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -249,7 +249,7 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); if (dst) { - dst->ops->update_pmtu(dst, pmtu); + dst->ops->update_pmtu(dst, sk, NULL, pmtu); dst = sctp_transport_dst_check(t); if (!dst) -- cgit v1.2.3-70-g09d2 From be9f4a44e7d41cee50ddb5f038fc2391cbbb4046 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Jul 2012 07:34:03 +0000 Subject: ipv4: tcp: remove per net tcp_sock tcp_v4_send_reset() and tcp_v4_send_ack() use a single socket per network namespace. This leads to bad behavior on multiqueue NICS, because many cpus contend for the socket lock and once socket lock is acquired, extra false sharing on various socket fields slow down the operations. To better resist to attacks, we use a percpu socket. Each cpu can run without contention, using appropriate memory (local node) Additional features : 1) We also mirror the queue_mapping of the incoming skb, so that answers use the same queue if possible. 2) Setting SOCK_USE_WRITE_QUEUE socket flag speedup sock_wfree() 3) We now limit the number of in-flight RST/ACK [1] packets per cpu, instead of per namespace, and we honor the sysctl_wmem_default limit dynamically. (Prior to this patch, sysctl_wmem_default value was copied at boot time, so any further change would not affect tcp_sock limit) [1] These packets are only generated when no socket was matched for the incoming packet. Reported-by: Bill Sommerfeld Signed-off-by: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- include/net/netns/ipv4.h | 1 - net/ipv4/ip_output.c | 50 +++++++++++++++++++++++++++++++----------------- net/ipv4/tcp_ipv4.c | 8 +++----- 4 files changed, 36 insertions(+), 25 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/ip.h b/include/net/ip.h index ec5cfde85e9..bd5e444a19c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -158,7 +158,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2e089a99d60..d909c7fc3da 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -38,7 +38,6 @@ struct netns_ipv4 { struct sock *fibnl; struct sock **icmp_sk; - struct sock *tcp_sock; struct inet_peer_base *peers; struct tcpm_hash_bucket *tcp_metrics_hash; unsigned int tcp_metrics_hash_mask; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cc52679790b..c528f841ca4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1463,20 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. + * Used to send some TCP resets/acks so far. * - * Should run single threaded per socket because it uses the sock - * structure to pass arguments. + * Use a fake percpu inet socket to avoid false sharing and contention. */ -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { + .sk = { + .__sk_common = { + .skc_refcnt = ATOMIC_INIT(1), + }, + .sk_wmem_alloc = ATOMIC_INIT(1), + .sk_allocation = GFP_ATOMIC, + .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), + }, + .pmtudisc = IP_PMTUDISC_WANT, +}; + +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len) { - struct inet_sock *inet = inet_sk(sk); struct ip_options_data replyopts; struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct sk_buff *nskb; + struct sock *sk; + struct inet_sock *inet; if (ip_options_echo(&replyopts.opt.opt, skb)) return; @@ -1494,38 +1507,39 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, flowi4_init_output(&fl4, arg->bound_dev_if, 0, RT_TOS(arg->tos), - RT_SCOPE_UNIVERSE, sk->sk_protocol, + RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); + rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; - /* And let IP do all the hard work. + inet = &get_cpu_var(unicast_sock); - This chunk is not reenterable, hence spinlock. - Note that it uses the fact, that this function is called - with locally disabled BH and that sk cannot be already spinlocked. - */ - bh_lock_sock(sk); inet->tos = arg->tos; + sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; + sock_net_set(sk, net); + __skb_queue_head_init(&sk->sk_write_queue); + sk->sk_sndbuf = sysctl_wmem_default; ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + nskb = skb_peek(&sk->sk_write_queue); + if (nskb) { if (arg->csumoffset >= 0) - *((__sum16 *)skb_transport_header(skb) + - arg->csumoffset) = csum_fold(csum_add(skb->csum, + *((__sum16 *)skb_transport_header(nskb) + + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); - skb->ip_summed = CHECKSUM_NONE; + nskb->ip_summed = CHECKSUM_NONE; + skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } - bh_unlock_sock(sk); + put_cpu_var(unicast_sock); ip_rt_put(rt); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9caf5c07aa..d7d2fa50f07 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -688,7 +688,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, + ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); @@ -771,7 +771,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, + ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); @@ -2624,13 +2624,11 @@ EXPORT_SYMBOL(tcp_prot); static int __net_init tcp_sk_init(struct net *net) { - return inet_ctl_sock_create(&net->ipv4.tcp_sock, - PF_INET, SOCK_RAW, IPPROTO_TCP, net); + return 0; } static void __net_exit tcp_sk_exit(struct net *net) { - inet_ctl_sock_destroy(net->ipv4.tcp_sock); } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) -- cgit v1.2.3-70-g09d2 From 2100c8d2d9db23c0a09901a782bb4e3b21bee298 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:05 +0000 Subject: net-tcp: Fast Open base This patch impelements the common code for both the client and server. 1. TCP Fast Open option processing. Since Fast Open does not have an option number assigned by IANA yet, it shares the experiment option code 254 by implementing draft-ietf-tcpm-experimental-options with a 16 bits magic number 0xF989. This enables global experiments without clashing the scarce(2) experimental options available for TCP. When the draft status becomes standard (maybe), the client should switch to the new option number assigned while the server supports both numbers for transistion. 2. The new sysctl tcp_fastopen 3. A place holder init function Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 10 ++++++++++ include/net/tcp.h | 9 ++++++++- net/ipv4/Makefile | 2 +- net/ipv4/syncookies.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp_fastopen.c | 11 +++++++++++ net/ipv4/tcp_input.c | 26 ++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 4 ++-- net/ipv4/tcp_output.c | 25 +++++++++++++++++++++---- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 12 files changed, 86 insertions(+), 16 deletions(-) create mode 100644 net/ipv4/tcp_fastopen.c (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1888169e07c..12948f54383 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -243,6 +243,16 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb) return (tcp_hdr(skb)->doff - 5) * 4; } +/* TCP Fast Open */ +#define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ +#define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ + +/* TCP Fast Open Cookie as stored in memory */ +struct tcp_fastopen_cookie { + s8 len; + u8 val[TCP_FASTOPEN_COOKIE_MAX]; +}; + /* This defines a selective acknowledgement block. */ struct tcp_sack_block_wire { __be32 start_seq; diff --git a/include/net/tcp.h b/include/net/tcp.h index 85c5090bfe2..5aed3718fde 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -170,6 +170,11 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_COOKIE 253 /* Cookie extension (experimental) */ +#define TCPOPT_EXP 254 /* Experimental */ +/* Magic number to be after the option value for sharing TCP + * experimental options. See draft-ietf-tcpm-experimental-options-00.txt + */ +#define TCPOPT_FASTOPEN_MAGIC 0xF989 /* * TCP option lengths @@ -180,6 +185,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 +#define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_COOKIE_BASE 2 /* Cookie-less header extension */ #define TCPOLEN_COOKIE_PAIR 3 /* Cookie pair header extension */ #define TCPOLEN_COOKIE_MIN (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN) @@ -222,6 +228,7 @@ extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_syncookies; +extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; @@ -418,7 +425,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, const u8 **hvpp, - int estab); + int estab, struct tcp_fastopen_cookie *foc); extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index a677d804e53..ae2ccf2890e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o tcp_metrics.o \ + tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eab2a7fb15d..650e1528e1e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) goto out; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3f6a1e762e9..5840c325572 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -366,6 +366,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, #endif + { + .procname = "tcp_fastopen", + .data = &sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "tcp_tw_recycle", .data = &tcp_death_row.sysctl_tw_recycle, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c new file mode 100644 index 00000000000..a7f729c409d --- /dev/null +++ b/net/ipv4/tcp_fastopen.c @@ -0,0 +1,11 @@ +#include +#include + +int sysctl_tcp_fastopen; + +static int __init tcp_fastopen_init(void) +{ + return 0; +} + +late_initcall(tcp_fastopen_init); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd49f1b7a5..a06bb8959e7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3732,7 +3732,8 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, - const u8 **hvpp, int estab) + const u8 **hvpp, int estab, + struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); @@ -3839,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o break; } break; - } + case TCPOPT_EXP: + /* Fast Open option shares code 254 using a + * 16 bits magic number. It's valid only in + * SYN or SYN-ACK with an even size. + */ + if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || + get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || + foc == NULL || !th->syn || (opsize & 1)) + break; + foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; + if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && + foc->len <= TCP_FASTOPEN_COOKIE_MAX) + memcpy(foc->val, ptr + 2, foc->len); + else if (foc->len != 0) + foc->len = -1; + break; + + } ptr += opsize-2; length -= opsize; } @@ -3882,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, if (tcp_parse_aligned_timestamp(tp, th)) return true; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); return true; } @@ -5637,7 +5655,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_cookie_values *cvp = tp->cookie_values; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL); if (th->ack) { /* rfc793: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d7d2fa50f07..01aa77a9702 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1307,7 +1307,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c66f2ede160..5912ac3fd24 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -97,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; @@ -534,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 15a7c7bc3e5..4849be76ccd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_COOKIE_EXTENSION (1 << 4) +#define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { - u8 options; /* bit field of OPTION_* */ + u16 options; /* bit field of OPTION_* */ + u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ - u16 mss; /* 0 to disable */ - __u32 tsval, tsecr; /* need to include OPTION_TS */ __u8 *hash_location; /* temporary pointer, overloaded */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; /* The sysctl int routines are generic, so check consistency here. @@ -442,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired) static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { - u8 options = opts->options; /* mungable copy */ + u16 options = opts->options; /* mungable copy */ /* Having both authentication and cookies for security is redundant, * and there's certainly not enough room. Instead, the cookie-less @@ -564,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, tp->rx_opt.dsack = 0; } + + if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { + struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + + *ptr++ = htonl((TCPOPT_EXP << 24) | + ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | + TCPOPT_FASTOPEN_MAGIC); + + memcpy(ptr, foc->val, foc->len); + if ((foc->len & 3) == 2) { + u8 *align = ((u8 *)ptr) + foc->len; + align[0] = align[1] = TCPOPT_NOP; + } + ptr += (foc->len + 3) >> 2; + } } /* Compute TCP options for SYN packets. This is not the final diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7bf3cc427c2..bb46061c813 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -177,7 +177,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) goto out; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c9dabdd832d..0302ec3fecf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1033,7 +1033,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && -- cgit v1.2.3-70-g09d2 From cf60af03ca4e71134206809ea892e49b92a88896 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:09 +0000 Subject: net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN) sendmsg() (or sendto()) with MSG_FASTOPEN is a combo of connect(2) and write(2). The application should replace connect() with it to send data in the opening SYN packet. For blocking socket, sendmsg() blocks until all the data are buffered locally and the handshake is completed like connect() call. It returns similar errno like connect() if the TCP handshake fails. For non-blocking socket, it returns the number of bytes queued (and transmitted in the SYN-data packet) if cookie is available. If cookie is not available, it transmits a data-less SYN packet with Fast Open cookie request option and returns -EINPROGRESS like connect(). Using MSG_FASTOPEN on connecting or connected socket will result in simlar errno like repeating connect() calls. Therefore the application should only use this flag on new sockets. The buffer size of sendmsg() is independent of the MSS of the connection. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 11 ++++++ include/linux/socket.h | 1 + include/net/inet_common.h | 6 ++-- include/net/tcp.h | 3 ++ net/ipv4/af_inet.c | 19 ++++++++--- net/ipv4/tcp.c | 61 +++++++++++++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 3 ++ 7 files changed, 92 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e1e021594cf..03964e08818 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -468,6 +468,17 @@ tcp_syncookies - BOOLEAN SYN flood warnings in logs not being really flooded, your server is seriously misconfigured. +tcp_fastopen - INTEGER + Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data + in the opening SYN packet. To use this feature, the client application + must not use connect(). Instead, it should use sendmsg() or sendto() + with MSG_FASTOPEN flag which performs a TCP handshake automatically. + + The values (bitmap) are: + 1: Enables sending data in the opening SYN on the client + + Default: 0 + tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt will be retransmitted. Should not be higher than 255. Default value diff --git a/include/linux/socket.h b/include/linux/socket.h index 25d6322fb63..ba7b2e817cf 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -268,6 +268,7 @@ struct ucred { #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_EOF MSG_FIN +#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exit for file descriptor received through SCM_RIGHTS */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 22fac9892b1..234008782c8 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -14,9 +14,11 @@ struct sockaddr; struct socket; extern int inet_release(struct socket *sock); -extern int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, +extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); -extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, +extern int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags); +extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); extern int inet_accept(struct socket *sock, struct socket *newsock, int flags); extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock, diff --git a/include/net/tcp.h b/include/net/tcp.h index 867557b4244..c0258100d70 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -212,6 +212,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); /* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */ #define TCP_INIT_CWND 10 +/* Bit Flags for sysctl_tcp_fastopen */ +#define TFO_CLIENT_ENABLE 1 + extern struct inet_timewait_death_row tcp_death_row; /* sysctl variables for tcp */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index edc414625be..fe4582ca969 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -585,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) +int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) { struct sock *sk = sock->sk; int err; @@ -595,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; - lock_sock(sk); - if (uaddr->sa_family == AF_UNSPEC) { err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; @@ -663,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; err = 0; out: - release_sock(sk); return err; sock_error: @@ -673,6 +670,18 @@ sock_error: sock->state = SS_DISCONNECTING; goto out; } +EXPORT_SYMBOL(__inet_stream_connect); + +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + int err; + + lock_sock(sock->sk); + err = __inet_stream_connect(sock, uaddr, addr_len, flags); + release_sock(sock->sk); + return err; +} EXPORT_SYMBOL(inet_stream_connect); /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4252cd8f39f..581ecf02c6b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -982,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg) return tmp; } +void tcp_free_fastopen_req(struct tcp_sock *tp) +{ + if (tp->fastopen_req != NULL) { + kfree(tp->fastopen_req); + tp->fastopen_req = NULL; + } +} + +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err, flags; + + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + return -EOPNOTSUPP; + if (tp->fastopen_req != NULL) + return -EALREADY; /* Another Fast Open is in progress */ + + tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), + sk->sk_allocation); + if (unlikely(tp->fastopen_req == NULL)) + return -ENOBUFS; + tp->fastopen_req->data = msg; + + flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; + err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, flags); + *size = tp->fastopen_req->copied; + tcp_free_fastopen_req(tp); + return err; +} + int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int iovlen, flags, err, copied; - int mss_now = 0, size_goal; + int iovlen, flags, err, copied = 0; + int mss_now = 0, size_goal, copied_syn = 0, offset = 0; bool sg; long timeo; lock_sock(sk); flags = msg->msg_flags; + if (flags & MSG_FASTOPEN) { + err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); + if (err == -EINPROGRESS && copied_syn > 0) + goto out; + else if (err) + goto out_err; + offset = copied_syn; + } + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_err; + goto do_error; if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { @@ -1037,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, unsigned char __user *from = iov->iov_base; iov++; + if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ + if (offset >= seglen) { + offset -= seglen; + continue; + } + seglen -= offset; + from += offset; + offset = 0; + } while (seglen > 0) { int copy = 0; @@ -1199,7 +1250,7 @@ out: if (copied && likely(!tp->repair)) tcp_push(sk, flags, mss_now, tp->nonagle); release_sock(sk); - return copied; + return copied + copied_syn; do_fault: if (!skb->len) { @@ -1212,7 +1263,7 @@ do_fault: } do_error: - if (copied) + if (copied + copied_syn) goto out; out_err: err = sk_stream_error(sk, flags, err); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 01aa77a9702..1d8b75a5898 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1952,6 +1952,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tp->cookie_values = NULL; } + /* If socket is aborted during connect operation */ + tcp_free_fastopen_req(tp); + sk_sockets_allocated_dec(sk); sock_release_memcg(sk); } -- cgit v1.2.3-70-g09d2 From ba3f7f04ef2b19aace38f855aedd17fe43035d50 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 17 Jul 2012 14:02:46 -0700 Subject: ipv4: Kill FLOWI_FLAG_RT_NOCACHE and associated code. Signed-off-by: David S. Miller --- include/net/flow.h | 1 - include/net/inet_connection_sock.h | 3 +-- net/dccp/ipv4.c | 2 +- net/ipv4/inet_connection_sock.c | 5 +---- net/ipv4/route.c | 3 --- net/ipv4/tcp_ipv4.c | 4 ++-- 6 files changed, 5 insertions(+), 13 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/flow.h b/include/net/flow.h index ce9cb7656b4..e1dd5082ec7 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -21,7 +21,6 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_CAN_SLEEP 0x02 -#define FLOWI_FLAG_RT_NOCACHE 0x04 __u32 flowic_secid; }; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 2cf44b4ed2e..5ee66f517b4 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -250,8 +250,7 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum); extern struct dst_entry* inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req, - bool nocache); + const struct request_sock *req); extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk, struct sock *newsk, const struct request_sock *req); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index ab4f44c9bb2..25428d0c50c 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -508,7 +508,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, struct dst_entry *dst; struct flowi4 fl4; - dst = inet_csk_route_req(sk, &fl4, req, false); + dst = inet_csk_route_req(sk, &fl4, req); if (dst == NULL) goto out; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0a290d719bc..db0cf17c00f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -368,8 +368,7 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, - const struct request_sock *req, - bool nocache) + const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); @@ -377,8 +376,6 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct net *net = sock_net(sk); int flags = inet_sk_flowi_flags(sk); - if (nocache) - flags |= FLOWI_FLAG_RT_NOCACHE; flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 97cca8a03d9..7e1c0ed0ef7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1836,9 +1836,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE) - rth->dst.flags |= DST_NOCACHE; - return rth; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1d8b75a5898..59110caeb07 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -824,7 +824,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, rvp); @@ -1378,7 +1378,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && + (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && fl4.daddr == saddr) { if (!tcp_peer_is_proven(req, dst, true)) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); -- cgit v1.2.3-70-g09d2 From 563d34d05786263893ba4a1042eb9b9374127cf5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Jul 2012 09:48:52 +0200 Subject: tcp: dont drop MTU reduction indications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ICMP messages generated in output path if frame length is bigger than mtu are actually lost because socket is owned by user (doing the xmit) One example is the ipgre_tunnel_xmit() calling icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); We had a similar case fixed in commit a34a101e1e6 (ipv6: disable GSO on sockets hitting dst_allfrag). Problem of such fix is that it relied on retransmit timers, so short tcp sessions paid a too big latency increase price. This patch uses the tcp_release_cb() infrastructure so that MTU reduction messages (ICMP messages) are not lost, and no extra delay is added in TCP transmits. Reported-by: Maciej Żenczykowski Diagnosed-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Nandita Dukkipati Cc: Tom Herbert Cc: Tore Anderson Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 ++++++ include/net/sock.h | 1 + net/ipv4/tcp_ipv4.c | 19 +++++++++++++++---- net/ipv4/tcp_output.c | 6 +++++- net/ipv6/tcp_ipv6.c | 40 ++++++++++++++++++++++++---------------- 5 files changed, 51 insertions(+), 21 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2761856987b..eb125a4c30b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -493,6 +493,9 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG + * while socket was owned by user. + */ #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -518,6 +521,9 @@ enum tsq_flags { TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call + * tcp_v{4|6}_mtu_reduced() + */ }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) diff --git a/include/net/sock.h b/include/net/sock.h index 88de092df50..e067f8c18f8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -859,6 +859,7 @@ struct proto { struct sk_buff *skb); void (*release_cb)(struct sock *sk); + void (*mtu_reduced)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 59110caeb07..bc5432e3c77 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -275,12 +275,15 @@ failure: EXPORT_SYMBOL(tcp_v4_connect); /* - * This routine does path mtu discovery as defined in RFC1191. + * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. + * It can be called through tcp_release_cb() if socket was owned by user + * at the time tcp_v4_err() was called to handle ICMP message. */ -static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) +static void tcp_v4_mtu_reduced(struct sock *sk) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); + u32 mtu = tcp_sk(sk)->mtu_info; /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through @@ -373,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. + * We do take care of PMTU discovery (RFC1191) special case : + * we can receive locally generated ICMP messages while socket is held. */ - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(sk) && + type != ICMP_DEST_UNREACH && + code != ICMP_FRAG_NEEDED) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) @@ -409,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + tp->mtu_info = info; if (!sock_owned_by_user(sk)) - do_pmtu_discovery(sk, iph, info); + tcp_v4_mtu_reduced(sk); + else + set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); goto out; } @@ -2596,6 +2606,7 @@ struct proto tcp_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 950aebfd996..33cd065cfbd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -885,7 +885,8 @@ static void tcp_tasklet_func(unsigned long data) #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ (1UL << TCP_WRITE_TIMER_DEFERRED) | \ - (1UL << TCP_DELACK_TIMER_DEFERRED)) + (1UL << TCP_DELACK_TIMER_DEFERRED) | \ + (1UL << TCP_MTU_REDUCED_DEFERRED)) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket @@ -914,6 +915,9 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) tcp_delack_timer_handler(sk); + + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) + sk->sk_prot->mtu_reduced(sk); } EXPORT_SYMBOL(tcp_release_cb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0302ec3fecf..f49476e2d88 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -315,6 +315,23 @@ failure: return err; } +static void tcp_v6_mtu_reduced(struct sock *sk) +{ + struct dst_entry *dst; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + return; + + dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); + if (!dst) + return; + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } +} + static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { @@ -342,7 +359,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } bh_lock_sock(sk); - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) @@ -371,21 +388,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) - goto out; - - dst = inet6_csk_update_pmtu(sk, ntohl(info)); - if (!dst) - goto out; - - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { - tcp_sync_mss(sk, dst_mtu(dst)); - tcp_simple_retransmit(sk); - } + tp->mtu_info = ntohl(info); + if (!sock_owned_by_user(sk)) + tcp_v6_mtu_reduced(sk); + else + set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); goto out; } @@ -1949,6 +1956,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, -- cgit v1.2.3-70-g09d2 From 92101b3b2e3178087127709a556b091dae314e9e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Jul 2012 16:29:00 -0700 Subject: ipv4: Prepare for change of rt->rt_iif encoding. Use inet_iif() consistently, and for TCP record the input interface of cached RX dst in inet sock. rt->rt_iif is going to be encoded differently, so that we can legitimately cache input routes in the FIB info more aggressively. When the input interface is "use SKB device index" the rt->rt_iif will be set to zero. This forces us to move the TCP RX dst cache installation into the ipv4 specific code, and as well it should since doing the route caching for ipv6 is pointless at the moment since it is not inspected in the ipv6 input paths yet. Also, remove the unlikely on dst->obsolete, all ipv4 dsts have obsolete set to a non-zero value to force invocation of the check callback. Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + net/dccp/ipv4.c | 2 +- net/ipv4/icmp.c | 2 +- net/ipv4/ip_sockglue.c | 5 ++--- net/ipv4/route.c | 2 +- net/ipv4/tcp_input.c | 12 ------------ net/ipv4/tcp_ipv4.c | 24 ++++++++++++++++++------ net/sched/cls_route.c | 2 +- net/sched/em_meta.c | 2 +- net/sctp/protocol.c | 2 +- 10 files changed, 27 insertions(+), 27 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 924d7b98ab6..613cfa40167 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -172,6 +172,7 @@ struct inet_sock { int uc_index; int mc_index; __be32 mc_addr; + int rx_dst_ifindex; struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 25428d0c50c..176ecdba4a2 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -481,7 +481,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, struct rtable *rt; const struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { - .flowi4_oif = skb_rtable(skb)->rt_iif, + .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, .flowi4_tos = RT_CONN_FLAGS(sk), diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f2a06beffbd..f2eccd53174 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -571,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, rt->rt_iif); + dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index de29f46f68b..5eea4a81104 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1027,10 +1027,9 @@ e_inval: void ipv4_pktinfo_prepare(struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - const struct rtable *rt = skb_rtable(skb); - if (rt) { - pktinfo->ipi_ifindex = rt->rt_iif; + if (skb_rtable(skb)) { + pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 34017be87c8..f6be7811939 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -848,7 +848,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (log_martians && peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", - &ip_hdr(skb)->saddr, rt->rt_iif, + &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &rt->rt_gateway); #endif } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 21d7f8f3a7a..3e07a64ca44 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5391,18 +5391,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; - if (unlikely(dst->obsolete)) { - if (dst->ops->check(dst, 0) == NULL) { - dst_release(dst); - sk->sk_rx_dst = NULL; - } - } - } - if (unlikely(sk->sk_rx_dst == NULL)) - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - /* * Header prediction. * The code loosely follows the one in the famous diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bc5432e3c77..3e30548ac32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1618,6 +1618,20 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb); + if (sk->sk_rx_dst) { + struct dst_entry *dst = sk->sk_rx_dst; + if (dst->ops->check(dst, 0) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + if (unlikely(sk->sk_rx_dst == NULL)) { + struct inet_sock *icsk = inet_sk(sk); + struct rtable *rt = skb_rtable(skb); + + sk->sk_rx_dst = dst_clone(&rt->dst); + icsk->rx_dst_ifindex = inet_iif(skb); + } if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1700,14 +1714,12 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb->destructor = sock_edemux; if (sk->sk_state != TCP_TIME_WAIT) { struct dst_entry *dst = sk->sk_rx_dst; + struct inet_sock *icsk = inet_sk(sk); if (dst) dst = dst_check(dst, 0); - if (dst) { - struct rtable *rt = (struct rtable *) dst; - - if (rt->rt_iif == dev->ifindex) - skb_dst_set_noref(skb, dst); - } + if (dst && + icsk->rx_dst_ifindex == dev->ifindex) + skb_dst_set_noref(skb, dst); } } } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 36fec422740..44f405cb9aa 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -143,7 +143,7 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (head == NULL) goto old_method; - iif = ((struct rtable *)dst)->rt_iif; + iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 4790c696cbc..4ab6e332557 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -264,7 +264,7 @@ META_COLLECTOR(int_rtiif) if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else - dst->value = skb_rtable(skb)->rt_iif; + dst->value = inet_iif(skb); } /************************************************************************** diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9c90811d113..1f89c4e6964 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -568,7 +568,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, /* What interface did this skb arrive on? */ static int sctp_v4_skb_iif(const struct sk_buff *skb) { - return skb_rtable(skb)->rt_iif; + return inet_iif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ -- cgit v1.2.3-70-g09d2