From c074da2810c118b3812f32d6754bd9ead2f169e7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jun 2012 23:14:15 +0000 Subject: ipv4: tcp: dont cache unconfirmed intput dst DDOS synflood attacks hit badly IP route cache. On typical machines, this cache is allowed to hold up to 8 Millions dst entries, 256 bytes for each, for a total of 2GB of memory. rt_garbage_collect() triggers and tries to cleanup things. Eventually route cache is disabled but machine is under fire and might OOM and crash. This patch exploits the new TCP early demux, to set a nocache boolean in case incoming TCP frame is for a not yet ESTABLISHED or TIMEWAIT socket. This 'nocache' boolean is then used in case dst entry is not found in route cache, to create an unhashed dst entry (DST_NOCACHE) SYN-cookie-ACK sent use a similar mechanism (ipv4: tcp: dont cache output dst for syncookies), so after this patch, a machine is able to absorb a DDOS synflood attack without polluting its IP route cache. Signed-off-by: Eric Dumazet Cc: Hans Schillstrom Signed-off-by: David S. Miller --- net/ipv4/xfrm4_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/xfrm4_input.c') diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216d..eee636b191b 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + iph->tos, skb->dev, false)) goto drop; } return dst_input(skb); -- cgit v1.2.3-70-g09d2 From c10237e077cef50e925f052e49f3b4fead9d71f9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 27 Jun 2012 17:05:06 -0700 Subject: Revert "ipv4: tcp: dont cache unconfirmed intput dst" This reverts commit c074da2810c118b3812f32d6754bd9ead2f169e7. This change has several unwanted side effects: 1) Sockets will cache the DST_NOCACHE route in sk->sk_rx_dst and we'll thus never create a real cached route. 2) All TCP traffic will use DST_NOCACHE and never use the routing cache at all. Signed-off-by: David S. Miller --- include/net/protocol.h | 2 +- include/net/route.h | 8 ++++---- include/net/tcp.h | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_input.c | 5 ++--- net/ipv4/route.c | 8 +++----- net/ipv4/tcp_ipv4.c | 4 +--- net/ipv4/xfrm4_input.c | 2 +- 9 files changed, 15 insertions(+), 20 deletions(-) (limited to 'net/ipv4/xfrm4_input.c') diff --git a/include/net/protocol.h b/include/net/protocol.h index 7cfc8f76914..967b926cbfb 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -37,7 +37,7 @@ /* This is used to register protocols. */ struct net_protocol { - int (*early_demux)(struct sk_buff *skb, bool *nocache); + int (*early_demux)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); int (*gso_send_check)(struct sk_buff *skb); diff --git a/include/net/route.h b/include/net/route.h index 6361f933577..47eb25ac1f7 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -201,18 +201,18 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 } extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool noref, bool nocache); + u8 tos, struct net_device *devin, bool noref); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin) { - return ip_route_input_common(skb, dst, src, tos, devin, false, false); + return ip_route_input_common(skb, dst, src, tos, devin, false); } static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool nocache) + u8 tos, struct net_device *devin) { - return ip_route_input_common(skb, dst, src, tos, devin, true, nocache); + return ip_route_input_common(skb, dst, src, tos, devin, true); } extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, diff --git a/include/net/tcp.h b/include/net/tcp.h index 917ed2e55e8..6660ffc4963 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -325,7 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); -extern int tcp_v4_early_demux(struct sk_buff *skb, bool *nocache); +extern int tcp_v4_early_demux(struct sk_buff *skb); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6a979594436..2e560f0c757 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input_noref(skb, tip, sip, 0, dev, false) == 0) { + ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 978d55f256e..8d07c973409 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -259,7 +259,7 @@ static void ip_expire(unsigned long arg) skb_dst_drop(head); iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev, false); + iph->tos, head->dev); if (err) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7be54c8dcbe..2a39204de5b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -326,7 +326,6 @@ static int ip_rcv_finish(struct sk_buff *skb) */ if (skb_dst(skb) == NULL) { int err = -ENOENT; - bool nocache = false; if (sysctl_ip_early_demux) { const struct net_protocol *ipprot; @@ -335,13 +334,13 @@ static int ip_rcv_finish(struct sk_buff *skb) rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->early_demux) - err = ipprot->early_demux(skb, &nocache); + err = ipprot->early_demux(skb); rcu_read_unlock(); } if (err) { err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev, nocache); + iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fdc7900f9d7..81533e3a23d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2214,7 +2214,7 @@ static int ip_mkroute_input(struct sk_buff *skb, */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool nocache) + u8 tos, struct net_device *dev) { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2353,8 +2353,6 @@ local_input: rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - if (nocache) - rth->dst.flags |= DST_NOCACHE; hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); err = 0; @@ -2397,7 +2395,7 @@ martian_source_keep_err: } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool noref, bool nocache) + u8 tos, struct net_device *dev, bool noref) { struct rtable *rth; unsigned int hash; @@ -2473,7 +2471,7 @@ skip_cache: rcu_read_unlock(); return -EINVAL; } - res = ip_route_input_slow(skb, daddr, saddr, tos, dev, nocache); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); rcu_read_unlock(); return res; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 33aabd4fc20..1781dc650b9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1673,7 +1673,7 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache) +int tcp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; @@ -1719,8 +1719,6 @@ int tcp_v4_early_demux(struct sk_buff *skb, bool *no_dst_cache) } } } - } else { - *no_dst_cache = true; } out_err: diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index eee636b191b..06814b6216d 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev, false)) + iph->tos, skb->dev)) goto drop; } return dst_input(skb); -- cgit v1.2.3-70-g09d2 From 38a424e4657462fe9f8b76f01a0e879abde99ab4 Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 1 Jul 2012 02:02:53 +0000 Subject: ipv4: Kill ip_route_input_noref(). The "noref" argument to ip_route_input_common() is now always ignored because we do not cache routes, and in that case we must always grab a reference to the resulting 'dst'. Signed-off-by: David S. Miller --- include/net/route.h | 16 ++-------------- net/ipv4/arp.c | 2 +- net/ipv4/ip_fragment.c | 4 ++-- net/ipv4/ip_input.c | 4 ++-- net/ipv4/route.c | 6 +++--- net/ipv4/xfrm4_input.c | 4 ++-- 6 files changed, 12 insertions(+), 24 deletions(-) (limited to 'net/ipv4/xfrm4_input.c') diff --git a/include/net/route.h b/include/net/route.h index 5dcfeb621e0..5c86c4773b2 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -160,20 +160,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, bool noref); - -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) -{ - return ip_route_input_common(skb, dst, src, tos, devin, false); -} - -static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) -{ - return ip_route_input_common(skb, dst, src, tos, devin, true); -} +extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin); extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u32 mark, u8 protocol, int flow_flags); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 2e560f0c757..c38293f3816 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -828,7 +828,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { + ip_route_input(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8d07c973409..7ad88e5e711 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg) /* skb dst is stale, drop it, and perform route lookup again */ skb_dst_drop(head); iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + err = ip_route_input(head, iph->daddr, iph->saddr, + iph->tos, head->dev); if (err) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b27d4440f52..4ebc6feee25 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + int err = ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6d6146d31f2..55eb4634ed6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1620,8 +1620,8 @@ martian_source_keep_err: goto out; } -int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool noref) +int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { int res; @@ -1664,7 +1664,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_unlock(); return res; } -EXPORT_SYMBOL(ip_route_input_common); +EXPORT_SYMBOL(ip_route_input); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216d..58d23a57250 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) if (skb_dst(skb) == NULL) { const struct iphdr *iph = ip_hdr(skb); - if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) goto drop; } return dst_input(skb); -- cgit v1.2.3-70-g09d2