diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/inetpeer.c | 81 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 10 | ||||
-rw-r--r-- | net/ipv4/ping.c | 1 | ||||
-rw-r--r-- | net/ipv4/route.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 5 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 23 | ||||
-rw-r--r-- | net/ipv4/xfrm4_mode_beet.c | 5 | ||||
-rw-r--r-- | net/ipv4/xfrm4_mode_tunnel.c | 6 |
8 files changed, 109 insertions, 34 deletions
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bf4a9c4808e..d4d61b694fa 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> +#include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> @@ -66,6 +67,11 @@ static struct kmem_cache *peer_cachep __read_mostly; +static LIST_HEAD(gc_list); +static const int gc_delay = 60 * HZ; +static struct delayed_work gc_work; +static DEFINE_SPINLOCK(gc_lock); + #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) @@ -102,6 +108,50 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ +static void inetpeer_gc_worker(struct work_struct *work) +{ + struct inet_peer *p, *n; + LIST_HEAD(list); + + spin_lock_bh(&gc_lock); + list_replace_init(&gc_list, &list); + spin_unlock_bh(&gc_lock); + + if (list_empty(&list)) + return; + + list_for_each_entry_safe(p, n, &list, gc_list) { + + if(need_resched()) + cond_resched(); + + if (p->avl_left != peer_avl_empty) { + list_add_tail(&p->avl_left->gc_list, &list); + p->avl_left = peer_avl_empty; + } + + if (p->avl_right != peer_avl_empty) { + list_add_tail(&p->avl_right->gc_list, &list); + p->avl_right = peer_avl_empty; + } + + n = list_entry(p->gc_list.next, struct inet_peer, gc_list); + + if (!atomic_read(&p->refcnt)) { + list_del(&p->gc_list); + kmem_cache_free(peer_cachep, p); + } + } + + if (list_empty(&list)) + return; + + spin_lock_bh(&gc_lock); + list_splice(&list, &gc_list); + spin_unlock_bh(&gc_lock); + + schedule_delayed_work(&gc_work, gc_delay); +} /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) @@ -126,6 +176,7 @@ void __init inet_initpeers(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker); } static int addr_compare(const struct inetpeer_addr *a, @@ -447,9 +498,8 @@ relookup: p->rate_last = 0; p->pmtu_expires = 0; p->pmtu_orig = 0; - p->redirect_genid = 0; memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); - + INIT_LIST_HEAD(&p->gc_list); /* Link the node. */ link_to_pool(p, base); @@ -509,3 +559,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); + +void inetpeer_invalidate_tree(int family) +{ + struct inet_peer *old, *new, *prev; + struct inet_peer_base *base = family_to_base(family); + + write_seqlock_bh(&base->lock); + + old = base->root; + if (old == peer_avl_empty_rcu) + goto out; + + new = peer_avl_empty_rcu; + + prev = cmpxchg(&base->root, old, new); + if (prev == old) { + base->total = 0; + spin_lock(&gc_lock); + list_add_tail(&prev->gc_list, &gc_list); + spin_unlock(&gc_lock); + schedule_delayed_work(&gc_work, gc_delay); + } + +out: + write_sequnlock_bh(&base->lock); +} +EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6b3ca5ba445..38673d2860e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -65,7 +65,7 @@ it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), and silently drop packet when it expires. It is a good - solution, but it supposes maintaing new variable in ALL + solution, but it supposes maintaining new variable in ALL skb, even if no tunneling is used. Current solution: xmit_recursion breaks dead loops. This is a percpu @@ -91,14 +91,14 @@ One of them is to parse packet trying to detect inner encapsulation made by our node. It is difficult or even impossible, especially, - taking into account fragmentation. TO be short, tt is not solution at all. + taking into account fragmentation. TO be short, ttl is not solution at all. Current solution: The solution was UNEXPECTEDLY SIMPLE. We force DF flag on tunnels with preconfigured hop limit, that is ALL. :-) Well, it does not remove the problem completely, but exponential growth of network traffic is changed to linear (branches, that exceed pmtu are pruned) and tunnel mtu - fastly degrades to value <68, where looping stops. + rapidly degrades to value <68, where looping stops. Yes, it is not good if there exists a router in the loop, which does not force DF, even when encapsulating packets have DF set. But it is not our problem! Nobody could accuse us, we made @@ -457,8 +457,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info) GRE tunnels with enabled checksum. Tell them "thank you". Well, I wonder, rfc1812 was written by Cisco employee, - what the hell these idiots break standrads established - by themself??? + what the hell these idiots break standards established + by themselves??? */ const struct iphdr *iph = (const struct iphdr *)skb->data; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index aea5a199c37..b072386cee2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -630,6 +630,7 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + err = -EOPNOTSUPP; if (flags & MSG_OOB) goto out; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bcacf54e541..01977479617 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -132,7 +132,6 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; -static int redirect_genid; static struct delayed_work expires_work; static unsigned long expires_ljiffies; @@ -937,7 +936,7 @@ static void rt_cache_invalidate(struct net *net) get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); - redirect_genid++; + inetpeer_invalidate_tree(AF_INET); } /* @@ -1485,10 +1484,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, peer = rt->peer; if (peer) { - if (peer->redirect_learned.a4 != new_gw || - peer->redirect_genid != redirect_genid) { + if (peer->redirect_learned.a4 != new_gw) { peer->redirect_learned.a4 = new_gw; - peer->redirect_genid = redirect_genid; atomic_inc(&__rt_peer_genid); } check_peer_redir(&rt->dst, peer); @@ -1793,8 +1790,6 @@ static void ipv4_validate_peer(struct rtable *rt) if (peer) { check_peer_pmtu(&rt->dst, peer); - if (peer->redirect_genid != redirect_genid) - peer->redirect_learned.a4 = 0; if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) check_peer_redir(&rt->dst, peer); @@ -1958,8 +1953,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, peer->metrics, false); check_peer_pmtu(&rt->dst, peer); - if (peer->redirect_genid != redirect_genid) - peer->redirect_learned.a4 = 0; + if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 37755ccc0e9..22ef5f9fd2f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3240,7 +3240,8 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3283,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53c8ce4046b..b5e315f1364 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1403,8 +1403,16 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); - /* Adjust hint for FACK. Non-FACK is handled in tcp_sacktag_one(). */ - if (tcp_is_fack(tp) && (skb == tp->lost_skb_hint)) + /* Adjust counters and hints for the newly sacked sequence + * range but discard the return value since prev is already + * marked. We must tag the range first because the seq + * advancement below implicitly advances + * tcp_highest_sack_seq() when skb is highest_sack. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); + + if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; TCP_SKB_CB(prev)->end_seq += shifted; @@ -1430,12 +1438,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_type = 0; } - /* Adjust counters and hints for the newly sacked sequence range but - * discard the return value since prev is already marked. - */ - tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount); - /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1583,6 +1585,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } } + /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ + if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) + goto fallback; + if (!skb_shift(prev, skb, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) @@ -2567,6 +2573,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (cnt > packets) { if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 63418185f52..e3db3f91511 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); - - memmove(skb->data - skb->mac_len, skb_mac_header(skb), - skb->mac_len); - skb_set_mac_header(skb, -skb->mac_len); + skb_mac_header_rebuild(skb); xfrm4_beet_make_header(skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 534972e114a..ed4bf11ef9f 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -66,7 +66,6 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - const unsigned char *old_mac; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -84,10 +83,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: |