Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                        |    8
-rw-r--r--  net/ipv4/af_inet.c                                      |    2
-rw-r--r--  net/ipv4/esp4.c                                         |    4
-rw-r--r--  net/ipv4/fib_trie.c                                     |    7
-rw-r--r--  net/ipv4/icmp.c                                         |   24
-rw-r--r--  net/ipv4/inet_fragment.c                                |    3
-rw-r--r--  net/ipv4/inet_timewait_sock.c                           |    1
-rw-r--r--  net/ipv4/ip_forward.c                                   |    2
-rw-r--r--  net/ipv4/ip_fragment.c                                  |    2
-rw-r--r--  net/ipv4/ip_sockglue.c                                  |    4
-rw-r--r--  net/ipv4/ipconfig.c                                     |   11
-rw-r--r--  net/ipv4/netfilter/ip_queue.c                           |    8
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c                      |    9
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c                         |    5
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c  |    7
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c                        |    2
-rw-r--r--  net/ipv4/tcp.c                                          |    4
-rw-r--r--  net/ipv4/tcp_input.c                                    |  143
-rw-r--r--  net/ipv4/tcp_output.c                                   |   17
-rw-r--r--  net/ipv4/udp.c                                          |    4
-rw-r--r--  net/ipv4/xfrm4_mode_beet.c                              |   11
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c                            |    2
-rw-r--r--  net/ipv4/xfrm4_output.c                                 |    2
-rw-r--r--  net/ipv4/xfrm4_state.c                                  |    2
24 files changed, 181 insertions, 103 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9c7e5ffb223..4670683b468 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -160,7 +160,7 @@ config IP_PNP_DHCP If unsure, say Y. Note that if you want to use DHCP, a DHCP server must be operating on your network. Read - <file:Documentation/nfsroot.txt> for details. + <file:Documentation/filesystems/nfsroot.txt> for details. config IP_PNP_BOOTP bool "IP: BOOTP support" @@ -175,7 +175,7 @@ config IP_PNP_BOOTP does BOOTP itself, providing all necessary information on the kernel command line, you can say N here. If unsure, say Y. Note that if you want to use BOOTP, a BOOTP server must be operating on your network. - Read <file:Documentation/nfsroot.txt> for details. + Read <file:Documentation/filesystems/nfsroot.txt> for details. config IP_PNP_RARP bool "IP: RARP support" @@ -187,8 +187,8 @@ config IP_PNP_RARP discovered automatically at boot time using the RARP protocol (an older protocol which is being obsoleted by BOOTP and DHCP), say Y here. Note that if you want to use RARP, a RARP server must be - operating on your network. Read <file:Documentation/nfsroot.txt> for - details. + operating on your network. Read + <file:Documentation/filesystems/nfsroot.txt> for details. # not yet ready.. # bool ' IP: ARP support' CONFIG_IP_PNP_ARP diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 09ca5293d08..0d109504ed8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -458,7 +458,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && !inet->freebind && - addr->sin_addr.s_addr != INADDR_ANY && + addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 091e6709f83..4e73e5708e7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -168,7 +168,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_encap_tmpl *encap = x->encap; struct udphdr *uh; __be32 *udpdata32; - unsigned int sport, dport; + __be16 sport, dport; int encap_type; spin_lock_bh(&x->lock); @@ -336,7 +336,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct scatterlist *asg; int err = -EINVAL; - if (!pskb_may_pull(skb, sizeof(*esph))) + if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) goto out; if (elen <= 0) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1ff446d0fa8..f6cdc012eec 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -177,10 +177,13 @@ static inline struct tnode *node_parent_rcu(struct node *node) return rcu_dereference(ret); } +/* Same as rcu_assign_pointer + * but that macro() assumes that value is a pointer. 
+ */ static inline void node_set_parent(struct node *node, struct tnode *ptr) { - rcu_assign_pointer(node->parent, - (unsigned long)ptr | NODE_TYPE(node)); + smp_wmb(); + node->parent = (unsigned long)ptr | NODE_TYPE(node); } static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a13c074dac0..40508babad8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -591,7 +591,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) - goto out_unlock; + goto relookup_failed; if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) err = __ip_route_output_key(net, &rt2, &fl); @@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) fl2.fl4_dst = fl.fl4_src; if (ip_route_output_key(net, &rt2, &fl2)) - goto out_unlock; + goto relookup_failed; /* Ugh! */ odst = skb_in->dst; @@ -614,21 +614,23 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } if (err) - goto out_unlock; + goto relookup_failed; err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL, XFRM_LOOKUP_ICMP); - if (err == -ENOENT) { + switch (err) { + case 0: + dst_release(&rt->u.dst); + rt = rt2; + break; + case -EPERM: + goto ende; + default: +relookup_failed: if (!rt) goto out_unlock; - goto route_done; + break; } - - dst_release(&rt->u.dst); - rt = rt2; - - if (err) - goto out_unlock; } route_done: diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 724d69aed03..a0a3c78cb5e 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -86,7 +86,10 @@ EXPORT_SYMBOL(inet_frags_fini); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) { nf->low_thresh = 0; + + local_bh_disable(); inet_frag_evictor(nf, f); + local_bh_enable(); } EXPORT_SYMBOL(inet_frags_exit_net); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 876169f3a52..717c411a5c6 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -124,6 +124,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_prot = sk->sk_prot_creator; + tw->tw_net = sk->sk_net; atomic_set(&tw->tw_refcnt, 1); inet_twsk_dead_node_init(tw); __module_get(tw->tw_prot->owner); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0b3b328d82d..a4506c8cfef 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->u.dst) && + if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a2e92f9709d..3b2e5adca83 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -568,7 +568,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); - net = skb->dev->nd_net; + net = skb->dev ? skb->dev->nd_net : skb->dst->dev->nd_net; /* Start by cleaning up the memory. 
*/ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) ip_evictor(net); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index de0572c8885..c2921d01e92 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -583,7 +583,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } if (!mreq.imr_ifindex) { - if (mreq.imr_address.s_addr == INADDR_ANY) { + if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { inet->mc_index = 0; inet->mc_addr = 0; err = 0; @@ -1132,7 +1132,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, } release_sock(sk); - if (len < sizeof(int) && len > 0 && val>=0 && val<255) { + if (len < sizeof(int) && len > 0 && val>=0 && val<=255) { unsigned char ucval = (unsigned char)val; len = 1; if (put_user(len, optlen)) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 5dd938579ee..4824fe8996b 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -103,6 +103,7 @@ - '3' from resolv.h */ #define NONE __constant_htonl(INADDR_NONE) +#define ANY __constant_htonl(INADDR_ANY) /* * Public IP configuration @@ -1410,7 +1411,7 @@ late_initcall(ip_auto_config); /* * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel - * command line parameter. See Documentation/nfsroot.txt. + * command line parameter. See Documentation/filesystems/nfsroot.txt. */ static int __init ic_proto_name(char *name) { @@ -1479,19 +1480,19 @@ static int __init ip_auto_config_setup(char *addrs) DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); switch (num) { case 0: - if ((ic_myaddr = in_aton(ip)) == INADDR_ANY) + if ((ic_myaddr = in_aton(ip)) == ANY) ic_myaddr = NONE; break; case 1: - if ((ic_servaddr = in_aton(ip)) == INADDR_ANY) + if ((ic_servaddr = in_aton(ip)) == ANY) ic_servaddr = NONE; break; case 2: - if ((ic_gateway = in_aton(ip)) == INADDR_ANY) + if ((ic_gateway = in_aton(ip)) == ANY) ic_gateway = NONE; break; case 3: - if ((ic_netmask = in_aton(ip)) == INADDR_ANY) + if ((ic_netmask = in_aton(ip)) == ANY) ic_netmask = NONE; break; case 4: diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index fe05da41d6b..4dc162894cb 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -588,11 +588,9 @@ static int __init ip_queue_init(void) } #ifdef CONFIG_PROC_FS - proc = create_proc_entry(IPQ_PROC_FS_NAME, 0, init_net.proc_net); - if (proc) { - proc->owner = THIS_MODULE; - proc->proc_fops = &ip_queue_proc_fops; - } else { + proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, + &ip_queue_proc_fops); + if (!proc) { printk(KERN_ERR "ip_queue: failed to create proc entry\n"); goto cleanup_ipqnl; } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index c6cf84c7761..a12dd329e20 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -82,8 +82,8 @@ clusterip_config_put(struct clusterip_config *c) static inline void clusterip_config_entry_put(struct clusterip_config *c) { + write_lock_bh(&clusterip_lock); if (atomic_dec_and_test(&c->entries)) { - write_lock_bh(&clusterip_lock); list_del(&c->list); write_unlock_bh(&clusterip_lock); @@ -96,7 +96,9 @@ clusterip_config_entry_put(struct clusterip_config *c) #ifdef CONFIG_PROC_FS remove_proc_entry(c->pde->name, c->pde->parent); #endif + return; } + write_unlock_bh(&clusterip_lock); } static struct clusterip_config * @@ -167,14 +169,13 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, __be32 ip, /* create proc dir entry */ sprintf(buffer, 
"%u.%u.%u.%u", NIPQUAD(ip)); - c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, - clusterip_procdir); + c->pde = proc_create(buffer, S_IWUSR|S_IRUSR, + clusterip_procdir, &clusterip_proc_fops); if (!c->pde) { kfree(c); return NULL; } } - c->pde->proc_fops = &clusterip_proc_fops; c->pde->data = c; #endif diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 68cbe3ca01c..50e06690eb5 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -252,6 +252,8 @@ recent_mt_check(const char *tablename, const void *ip, if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && (info->seconds || info->hit_count)) return false; + if (info->hit_count > ip_pkt_list_tot) + return false; if (info->name[0] == '\0' || strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) return false; @@ -274,12 +276,11 @@ recent_mt_check(const char *tablename, const void *ip, for (i = 0; i < ip_list_hash_size; i++) INIT_LIST_HEAD(&t->iphash[i]); #ifdef CONFIG_PROC_FS - t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir); + t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); if (t->proc == NULL) { kfree(t); goto out; } - t->proc->proc_fops = &recent_fops; t->proc->uid = ip_list_uid; t->proc->gid = ip_list_gid; t->proc->data = t; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 089252e82c0..f500b0fdaef 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -395,13 +395,10 @@ int __init nf_conntrack_ipv4_compat_init(void) if (!proc_exp) goto err2; - proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, init_net.proc_net_stat); + proc_stat = proc_create("ip_conntrack", S_IRUGO, + init_net.proc_net_stat, &ct_cpu_seq_fops); if (!proc_stat) goto err3; - - proc_stat->proc_fops = &ct_cpu_seq_fops; - proc_stat->owner = THIS_MODULE; - return 0; err3: diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 0d5fa3a54d0..36b4e3bb056 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -629,6 +629,8 @@ static int __init nf_nat_init(void) size_t i; int ret; + need_ipv4_conntrack(); + ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 071e83a894a..39b629ac240 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -735,7 +735,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -981,7 +981,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7facdb0f696..bbb7d88a16b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1625,13 +1625,11 @@ out: return flag; } -/* If we receive more dupacks than we expected counting segments - * in assumption of absent reordering, interpret this as reordering. - * The only another reason could be bug in receiver TCP. +/* Limits sacked_out so that sum with lost_out isn't ever larger than + * packets_out. Returns zero if sacked_out adjustement wasn't necessary. 
*/ -static void tcp_check_reno_reordering(struct sock *sk, const int addend) +int tcp_limit_reno_sacked(struct tcp_sock *tp) { - struct tcp_sock *tp = tcp_sk(sk); u32 holes; holes = max(tp->lost_out, 1U); @@ -1639,8 +1637,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(sk, tp->packets_out + addend, 0); + return 1; } + return 0; +} + +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. + */ +static void tcp_check_reno_reordering(struct sock *sk, const int addend) +{ + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_limit_reno_sacked(tp)) + tcp_update_reordering(sk, tp->packets_out + addend, 0); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1681,11 +1691,16 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; if (!sysctl_tcp_frto) return 0; + /* MTU probe and F-RTO won't really play nicely along currently */ + if (icsk->icsk_mtup.probe_size) + return 0; + if (IsSackFrto()) return 1; @@ -2134,11 +2149,13 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ -static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit) +static void tcp_mark_head_lost(struct sock *sk, int packets) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt; + int cnt, oldcnt; + int err; + unsigned int mss; BUG_TRAP(packets <= tp->packets_out); if (tp->lost_skb_hint) { @@ -2157,13 +2174,25 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit) tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; + if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + break; + + oldcnt = cnt; if (tcp_is_fack(tp) || tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); - if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) || - after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) - break; + if (cnt > packets) { + if (tcp_is_sack(tp) || (oldcnt >= packets)) + break; + + mss = skb_shinfo(skb)->gso_size; + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + if (err < 0) + break; + cnt = packets; + } + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -2180,17 +2209,17 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) struct tcp_sock *tp = tcp_sk(sk); if (tcp_is_reno(tp)) { - tcp_mark_head_lost(sk, 1, fast_rexmit); + tcp_mark_head_lost(sk, 1); } else if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, lost, fast_rexmit); + tcp_mark_head_lost(sk, lost); } else { int sacked_upto = tp->sacked_out - tp->reordering; - if (sacked_upto < 0) - sacked_upto = 0; - tcp_mark_head_lost(sk, sacked_upto, fast_rexmit); + if (sacked_upto < fast_rexmit) + sacked_upto = fast_rexmit; + tcp_mark_head_lost(sk, sacked_upto); } /* New heuristics: it is possible only after we switched @@ -2524,7 +2553,7 @@ static void tcp_fastretrans_alert(struct 
sock *sk, int pkts_acked, int flag) before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); + tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -2586,6 +2615,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) case TCP_CA_Loss: if (flag & FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; + if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) + tcp_reset_reno_sack(tp); if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); @@ -3810,8 +3841,28 @@ static void tcp_ofo_queue(struct sock *sk) } } +static int tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); +static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +{ + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + !sk_rmem_schedule(sk, size)) { + + if (tcp_prune_queue(sk) < 0) + return -1; + + if (!sk_rmem_schedule(sk, size)) { + if (!tcp_prune_ofo_queue(sk)) + return -1; + + if (!sk_rmem_schedule(sk, size)) + return -1; + } + } + return 0; +} + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); @@ -3861,12 +3912,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: if (eaten < 0 && - (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, skb->truesize))) { - if (tcp_prune_queue(sk) < 0 || - !sk_rmem_schedule(sk, skb->truesize)) - goto drop; - } + tcp_try_rmem_schedule(sk, skb->truesize)) + goto drop; + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); } @@ -3935,12 +3983,8 @@ drop: TCP_ECN_check_ce(tp, skb); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, skb->truesize)) { - if (tcp_prune_queue(sk) < 0 || - !sk_rmem_schedule(sk, skb->truesize)) - goto drop; - } + if (tcp_try_rmem_schedule(sk, skb->truesize)) + goto drop; /* Disable header prediction. */ tp->pred_flags = 0; @@ -4167,6 +4211,32 @@ static void tcp_collapse_ofo_queue(struct sock *sk) } } +/* + * Purge the out-of-order queue. + * Return true if queue was pruned. + */ +static int tcp_prune_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int res = 0; + + if (!skb_queue_empty(&tp->out_of_order_queue)) { + NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED); + __skb_queue_purge(&tp->out_of_order_queue); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + sk_mem_reclaim(sk); + res = 1; + } + return res; +} + /* Reduce allocated memory if we can, trying to get * the socket within its memory limits again. * @@ -4200,20 +4270,7 @@ static int tcp_prune_queue(struct sock *sk) /* Collapsing did not help, destructive actions follow. * This must not ever occur. */ - /* First, purge the out_of_order queue. */ - if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED); - __skb_queue_purge(&tp->out_of_order_queue); - - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. 
- */ - if (tcp_is_sack(tp)) - tcp_sack_reset(&tp->rx_opt); - sk_mem_reclaim(sk); - } + tcp_prune_ofo_queue(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ed750f9ceb0..d29ef79c00c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -255,7 +255,7 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ - new_win = cur_win; + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; @@ -1035,6 +1035,13 @@ static void tcp_cwnd_validate(struct sock *sk) * introducing MSS oddities to segment boundaries. In rare cases where * mss_now != mss_cache, we will request caller to create a small skb * per input skb which could be mostly avoided here (if desired). + * + * We explicitly want to create a request for splitting write queue tail + * to a small skb for Nagle purposes while avoiding unnecessary modulos, + * thus all the complexity (cwnd_len is always MSS multiple which we + * return whenever allowed by the other factors). Basically we need the + * modulo only when the receiver window alone is the limiting factor or + * when we would be allowed to send the split-due-to-Nagle skb fully. */ static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) @@ -1048,10 +1055,11 @@ static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) return cwnd_len; - if (skb == tcp_write_queue_tail(sk) && cwnd_len <= skb->len) + needed = min(skb->len, window); + + if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed) return cwnd_len; - needed = min(skb->len, window); return needed - needed % mss_now; } @@ -1800,6 +1808,9 @@ void tcp_simple_retransmit(struct sock *sk) if (!lost) return; + if (tcp_is_reno(tp)) + tcp_limit_reno_sacked(tp); + tcp_verify_left_out(tp); /* Don't muck with the congestion window here. diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7ea1b67b6de..1704c1474ea 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1556,14 +1556,14 @@ static void *udp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(udp_hash_lock) { read_lock(&udp_hash_lock); - return *pos ? udp_get_idx(seq, *pos-1) : (void *)1; + return *pos ? 
udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; - if (v == (void *)1) + if (v == SEQ_START_TOKEN) sk = udp_get_idx(seq, 0); else sk = udp_get_next(seq, v); diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index b47030ba162..9c798abce73 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -39,13 +39,11 @@ static void xfrm4_beet_make_header(struct sk_buff *skb) static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) { struct ip_beet_phdr *ph; - struct iphdr *iph, *top_iph; + struct iphdr *top_iph; int hdrlen, optlen; - iph = ip_hdr(skb); - hdrlen = 0; - optlen = iph->ihl * 4 - sizeof(*iph); + optlen = XFRM_MODE_SKB_CB(skb)->optlen; if (unlikely(optlen)) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); @@ -53,11 +51,12 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) hdrlen); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + sizeof(*iph); + skb->transport_header = skb->network_header + sizeof(*top_iph); xfrm4_beet_make_header(skb); - ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen); + ph = (struct ip_beet_phdr *) + __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen); top_iph = ip_hdr(skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 8dee617ee90..584e6d74e3a 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -41,7 +41,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ihl = 5; top_iph->version = 4; - top_iph->protocol = x->inner_mode->afinfo->proto; + top_iph->protocol = xfrm_af2proto(skb->dst->ops->family); /* DS disclosed */ top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index d5a58a81802..8c3180adddb 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -56,7 +56,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - err = x->inner_mode->afinfo->extract_output(x, skb); + err = xfrm_inner_extract_output(x, skb); if (err) return err; diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index fdeebe68a37..07735ed280d 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -52,10 +52,12 @@ int xfrm4_extract_header(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = iph->id; XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; XFRM_MODE_SKB_CB(skb)->tos = iph->tos; XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; + XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); |
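
A note on the INADDR_ANY comparisons above (af_inet.c, ip_sockglue.c and the new ANY macro in ipconfig.c): sin_addr.s_addr and friends are __be32, i.e. network byte order, so they are compared against htonl() of the host-order constant. INADDR_ANY is all zeroes, so behaviour does not change; the point is byte-order/type consistency (and keeping sparse's endianness checks quiet), and avoiding the raw-constant pattern being copied for non-zero addresses where it really matters. The userspace program below is only a sketch of that distinction, not kernel code.

/* Userspace-only sketch; build with: cc -o byteorder_demo byteorder_demo.c */
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int main(void)
{
	/* inet_addr() returns the address in network byte order, just like
	 * sin_addr.s_addr or mreq.imr_address.s_addr in the hunks above. */
	in_addr_t loop = inet_addr("127.0.0.1");

	/* INADDR_ANY is 0.0.0.0, so htonl() cannot change it... */
	printf("INADDR_ANY             = %#x\n", (unsigned)INADDR_ANY);
	printf("htonl(INADDR_ANY)      = %#x\n", (unsigned)htonl(INADDR_ANY));

	/* ...but for a non-zero constant the raw host-order compare is
	 * wrong on little-endian machines. */
	printf("loop (network order)   = %#x\n", (unsigned)loop);
	printf("INADDR_LOOPBACK        = %#x\n", (unsigned)INADDR_LOOPBACK);
	printf("raw compare matches?     %s\n",
	       loop == INADDR_LOOPBACK ? "yes" : "no");
	printf("htonl() compare matches? %s\n",
	       loop == htonl(INADDR_LOOPBACK) ? "yes" : "no");
	return 0;
}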
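
The fib_trie.c hunk replaces rcu_assign_pointer() with smp_wmb() plus a plain store because, as the comment added there says, node->parent is not a plain pointer: it is an unsigned long that carries the node type in its low bit alongside the parent address. The userspace sketch below illustrates only that tagging scheme; the names (tag_node, TAG_TNODE, ...) are invented for the example, and the write barrier itself has no portable userspace equivalent.

/* Build with: cc -o tagptr_demo tagptr_demo.c */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define TAG_LEAF  0UL
#define TAG_TNODE 1UL
#define TAG_MASK  1UL

struct tag_node {
	unsigned long parent;	/* pointer | type bit, like struct node */
	int key;
};

static void tag_set_parent(struct tag_node *n, struct tag_node *p,
			   unsigned long type)
{
	/* In the kernel this store is preceded by smp_wmb() so the parent's
	 * contents are visible before the new ->parent value; plain C has
	 * no equivalent, so this is illustration only. */
	n->parent = (unsigned long)p | type;
}

static struct tag_node *tag_get_parent(const struct tag_node *n)
{
	return (struct tag_node *)(n->parent & ~TAG_MASK);
}

int main(void)
{
	struct tag_node *root = calloc(1, sizeof(*root));
	struct tag_node *leaf = calloc(1, sizeof(*leaf));

	/* Heap allocations are aligned, so the low bit is free for a tag. */
	assert(((unsigned long)root & TAG_MASK) == 0);

	tag_set_parent(leaf, root, TAG_TNODE);
	printf("parent recovered ok: %s, type bit: %lu\n",
	       tag_get_parent(leaf) == root ? "yes" : "no",
	       leaf->parent & TAG_MASK);

	free(leaf);
	free(root);
	return 0;
}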
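
The ipt_CLUSTERIP.c hunk moves write_lock_bh(&clusterip_lock) in front of atomic_dec_and_test(&c->entries), so that dropping the last reference and unlinking the config from the list happen under the same lock that lookups take before grabbing a new reference. Below is a rough userspace rendering of that pattern, with a pthread mutex standing in for the rwlock/bh handling and an invented entry type; it is a sketch of the locking rule, not the kernel code.

/* Build with: cc -pthread -o refcnt_demo refcnt_demo.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int refcnt;		/* atomic_t in the kernel */
	int on_list;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void entry_put(struct entry *e)
{
	pthread_mutex_lock(&list_lock);	/* taken before the dec, as in the fix */
	if (--e->refcnt == 0) {
		e->on_list = 0;		/* list_del() under the lock */
		pthread_mutex_unlock(&list_lock);
		free(e);
		return;
	}
	pthread_mutex_unlock(&list_lock);
}

static struct entry *entry_get(struct entry *e)
{
	/* Lookup path: a new reference is only taken under list_lock,
	 * which is what makes the put side above race-free. */
	pthread_mutex_lock(&list_lock);
	if (e && e->on_list)
		e->refcnt++;
	else
		e = NULL;
	pthread_mutex_unlock(&list_lock);
	return e;
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e));

	e->refcnt = 1;
	e->on_list = 1;
	if (entry_get(e))
		printf("got second reference, refcnt=%d\n", e->refcnt);
	entry_put(e);		/* drops to 1, entry stays */
	entry_put(e);		/* drops to 0, unlinks and frees */
	return 0;
}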
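
Several hunks (ip_queue.c, ipt_CLUSTERIP.c, ipt_recent.c, nf_conntrack_l3proto_ipv4_compat.c) convert create_proc_entry() plus a later proc_fops assignment into a single proc_create() call, which removes the window in which the entry is visible with no file operations; the udp.c hunk likewise swaps the magic (void *)1 for the canonical SEQ_START_TOKEN. The sketch below shows the combined pattern against the 2.6.25-era API; it is a kernel-module-style illustration with made-up names, not something that builds as a userspace program.

/* Hypothetical module; builds only against a 2.6.25-era kernel tree. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>

static int demo_values[] = { 1, 2, 3 };

static void *demo_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* First call: hand back the token so ->show() prints a header. */
	if (*pos == 0)
		return SEQ_START_TOKEN;
	return *pos <= (loff_t)ARRAY_SIZE(demo_values) ?
		&demo_values[*pos - 1] : NULL;
}

static void *demo_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return *pos <= (loff_t)ARRAY_SIZE(demo_values) ?
		&demo_values[*pos - 1] : NULL;
}

static void demo_seq_stop(struct seq_file *seq, void *v)
{
}

static int demo_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "value\n");
	else
		seq_printf(seq, "%d\n", *(int *)v);
	return 0;
}

static const struct seq_operations demo_seq_ops = {
	.start = demo_seq_start,
	.next  = demo_seq_next,
	.stop  = demo_seq_stop,
	.show  = demo_seq_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_seq_ops);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init demo_init(void)
{
	/* One call replaces create_proc_entry() + proc_fops assignment. */
	if (!proc_create("procfs_demo", 0444, init_net.proc_net, &demo_fops))
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("procfs_demo", init_net.proc_net);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");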
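
In tcp_input.c the sacked_out clamp is pulled out of tcp_check_reno_reordering() into tcp_limit_reno_sacked() so that tcp_simple_retransmit() (see the tcp_output.c hunk) can reuse it: on a SACKless connection the dupack-derived sacked_out plus lost_out must never exceed packets_out. The program below models only that arithmetic with a stand-in structure; the extra cap of holes at packets_out is a defensive assumption here, not something visible in the context above.

/* Build with: cc -o reno_clamp_demo reno_clamp_demo.c */
#include <stdio.h>

struct mini_tp {			/* stand-in, not struct tcp_sock */
	unsigned int packets_out;
	unsigned int sacked_out;	/* dupack count on SACKless links */
	unsigned int lost_out;
};

/* Returns 1 if sacked_out had to be reduced; the caller in the hunk,
 * tcp_check_reno_reordering(), treats that as evidence of reordering. */
static int limit_reno_sacked(struct mini_tp *tp)
{
	unsigned int holes = tp->lost_out > 1 ? tp->lost_out : 1;

	if (holes > tp->packets_out)	/* defensive cap, assumed here */
		holes = tp->packets_out;
	if (tp->sacked_out + holes > tp->packets_out) {
		tp->sacked_out = tp->packets_out - holes;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct mini_tp tp = { .packets_out = 10, .sacked_out = 12, .lost_out = 3 };
	int clamped = limit_reno_sacked(&tp);

	printf("clamped=%d -> sacked_out=%u (+ lost_out=%u <= packets_out=%u)\n",
	       clamped, tp.sacked_out, tp.lost_out, tp.packets_out);
	return 0;
}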
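
The tcp_select_window() hunk advertises ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale) instead of cur_win on the do-not-shrink path because the header field carries win >> rcv_wscale; truncating an unaligned value would silently shrink the window that was already offered. The arithmetic can be checked in userspace (ALIGN() is re-created locally for the illustration):

/* Build with: cc -o wscale_demo wscale_demo.c */
#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int cur_win = 65531;	/* not a multiple of 1 << wscale */
	unsigned int wscale = 7;	/* 128-byte granularity */
	unsigned int gran = 1U << wscale;

	unsigned int truncated = (cur_win >> wscale) << wscale;
	unsigned int aligned = ALIGN(cur_win, gran);

	printf("cur_win=%u  on the wire (before fix)=%u  (shrank by %u)\n",
	       cur_win, truncated, cur_win - truncated);
	printf("cur_win=%u  on the wire (after fix) =%u  (never shrinks)\n",
	       cur_win, (aligned >> wscale) << wscale);
	return 0;
}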