Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                       |   8
-rw-r--r--  net/ipv4/af_inet.c                                     |   2
-rw-r--r--  net/ipv4/esp4.c                                        |   4
-rw-r--r--  net/ipv4/fib_trie.c                                    |   7
-rw-r--r--  net/ipv4/icmp.c                                        |  24
-rw-r--r--  net/ipv4/inet_fragment.c                               |   3
-rw-r--r--  net/ipv4/inet_timewait_sock.c                          |   1
-rw-r--r--  net/ipv4/ip_forward.c                                  |   2
-rw-r--r--  net/ipv4/ip_fragment.c                                 |   2
-rw-r--r--  net/ipv4/ip_sockglue.c                                 |   4
-rw-r--r--  net/ipv4/ipconfig.c                                    |  11
-rw-r--r--  net/ipv4/netfilter/ip_queue.c                          |   8
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c                     |   9
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c                        |   5
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c |   7
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c                       |   2
-rw-r--r--  net/ipv4/tcp.c                                         |   4
-rw-r--r--  net/ipv4/tcp_input.c                                   | 143
-rw-r--r--  net/ipv4/tcp_output.c                                  |  17
-rw-r--r--  net/ipv4/udp.c                                         |   4
-rw-r--r--  net/ipv4/xfrm4_mode_beet.c                             |  11
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c                           |   2
-rw-r--r--  net/ipv4/xfrm4_output.c                                |   2
-rw-r--r--  net/ipv4/xfrm4_state.c                                 |   2
24 files changed, 181 insertions, 103 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9c7e5ffb223..4670683b468 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -160,7 +160,7 @@ config IP_PNP_DHCP
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
must be operating on your network. Read
- <file:Documentation/nfsroot.txt> for details.
+ <file:Documentation/filesystems/nfsroot.txt> for details.
config IP_PNP_BOOTP
bool "IP: BOOTP support"
@@ -175,7 +175,7 @@ config IP_PNP_BOOTP
does BOOTP itself, providing all necessary information on the kernel
command line, you can say N here. If unsure, say Y. Note that if you
want to use BOOTP, a BOOTP server must be operating on your network.
- Read <file:Documentation/nfsroot.txt> for details.
+ Read <file:Documentation/filesystems/nfsroot.txt> for details.
config IP_PNP_RARP
bool "IP: RARP support"
@@ -187,8 +187,8 @@ config IP_PNP_RARP
discovered automatically at boot time using the RARP protocol (an
older protocol which is being obsoleted by BOOTP and DHCP), say Y
here. Note that if you want to use RARP, a RARP server must be
- operating on your network. Read <file:Documentation/nfsroot.txt> for
- details.
+ operating on your network. Read
+ <file:Documentation/filesystems/nfsroot.txt> for details.
# not yet ready..
# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 09ca5293d08..0d109504ed8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -458,7 +458,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
err = -EADDRNOTAVAIL;
if (!sysctl_ip_nonlocal_bind &&
!inet->freebind &&
- addr->sin_addr.s_addr != INADDR_ANY &&
+ addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
chk_addr_ret != RTN_BROADCAST)
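
The inet_bind() hunk above is an endianness-annotation fix: sin_addr.s_addr is a big-endian __be32, so it should be compared against htonl(INADDR_ANY). Because INADDR_ANY is zero the generated code is unchanged; the gain is that sparse's byte-order checking stays clean. A minimal userspace sketch of the idiom (the struct and values here are illustrative, not kernel code):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

int main(void)
{
	/* s_addr is network byte order, so compare it against
	 * htonl(INADDR_ANY). INADDR_ANY happens to be 0, so this is
	 * the same machine code either way, but it is endian-correct
	 * by construction and keeps sparse's __be32 checks quiet. */
	struct in_addr a = { .s_addr = htonl(INADDR_ANY) };

	if (a.s_addr == htonl(INADDR_ANY))
		printf("wildcard bind address\n");
	return 0;
}
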
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 091e6709f83..4e73e5708e7 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -168,7 +168,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct xfrm_encap_tmpl *encap = x->encap;
struct udphdr *uh;
__be32 *udpdata32;
- unsigned int sport, dport;
+ __be16 sport, dport;
int encap_type;
spin_lock_bh(&x->lock);
@@ -336,7 +336,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct scatterlist *asg;
int err = -EINVAL;
- if (!pskb_may_pull(skb, sizeof(*esph)))
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
goto out;
if (elen <= 0)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1ff446d0fa8..f6cdc012eec 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -177,10 +177,13 @@ static inline struct tnode *node_parent_rcu(struct node *node)
return rcu_dereference(ret);
}
+/* Same as rcu_assign_pointer(),
+ * but that macro assumes the value is a pointer.
+ */
static inline void node_set_parent(struct node *node, struct tnode *ptr)
{
- rcu_assign_pointer(node->parent,
- (unsigned long)ptr | NODE_TYPE(node));
+ smp_wmb();
+ node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
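
The fib_trie hunk open-codes rcu_assign_pointer() because node->parent is a tagged unsigned long rather than a plain pointer: the write barrier must still precede the store so that a reader who sees the new parent also sees its initialized contents. A userspace sketch of the same publish pattern, with __sync_synchronize() standing in for smp_wmb() and hypothetical types:

#include <stdint.h>

struct tnode;	/* opaque: only the tagged word matters here */

struct node {
	uintptr_t parent;	/* pointer | low-bit type tag */
};

static inline void node_set_parent_sketch(struct node *node,
					  struct tnode *ptr, uintptr_t tag)
{
	/* Publish order: initialise *ptr first, issue the write
	 * barrier, then store the tagged word. Readers pairing this
	 * with a read barrier never see a parent before its contents. */
	__sync_synchronize();	/* userspace stand-in for smp_wmb() */
	node->parent = (uintptr_t)ptr | tag;
}
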
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a13c074dac0..40508babad8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -591,7 +591,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
- goto out_unlock;
+ goto relookup_failed;
if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
err = __ip_route_output_key(net, &rt2, &fl);
@@ -601,7 +601,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
fl2.fl4_dst = fl.fl4_src;
if (ip_route_output_key(net, &rt2, &fl2))
- goto out_unlock;
+ goto relookup_failed;
/* Ugh! */
odst = skb_in->dst;
@@ -614,21 +614,23 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
if (err)
- goto out_unlock;
+ goto relookup_failed;
err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL,
XFRM_LOOKUP_ICMP);
- if (err == -ENOENT) {
+ switch (err) {
+ case 0:
+ dst_release(&rt->u.dst);
+ rt = rt2;
+ break;
+ case -EPERM:
+ goto ende;
+ default:
+relookup_failed:
if (!rt)
goto out_unlock;
- goto route_done;
+ break;
}
-
- dst_release(&rt->u.dst);
- rt = rt2;
-
- if (err)
- goto out_unlock;
}
route_done:
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 724d69aed03..a0a3c78cb5e 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -86,7 +86,10 @@ EXPORT_SYMBOL(inet_frags_fini);
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
nf->low_thresh = 0;
+
+ local_bh_disable();
inet_frag_evictor(nf, f);
+ local_bh_enable();
}
EXPORT_SYMBOL(inet_frags_exit_net);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 876169f3a52..717c411a5c6 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -124,6 +124,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
tw->tw_prot = sk->sk_prot_creator;
+ tw->tw_net = sk->sk_net;
atomic_set(&tw->tw_refcnt, 1);
inet_twsk_dead_node_init(tw);
__module_get(tw->tw_prot->owner);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 0b3b328d82d..a4506c8cfef 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb)
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto sr_failed;
- if (unlikely(skb->len > dst_mtu(&rt->u.dst) &&
+ if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index a2e92f9709d..3b2e5adca83 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -568,7 +568,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
- net = skb->dev->nd_net;
+ net = skb->dev ? skb->dev->nd_net : skb->dst->dev->nd_net;
/* Start by cleaning up the memory. */
if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
ip_evictor(net);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index de0572c8885..c2921d01e92 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -583,7 +583,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
if (!mreq.imr_ifindex) {
- if (mreq.imr_address.s_addr == INADDR_ANY) {
+ if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
inet->mc_index = 0;
inet->mc_addr = 0;
err = 0;
@@ -1132,7 +1132,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
}
release_sock(sk);
- if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
+ if (len < sizeof(int) && len > 0 && val>=0 && val<=255) {
unsigned char ucval = (unsigned char)val;
len = 1;
if (put_user(len, optlen))
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 5dd938579ee..4824fe8996b 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -103,6 +103,7 @@
- '3' from resolv.h */
#define NONE __constant_htonl(INADDR_NONE)
+#define ANY __constant_htonl(INADDR_ANY)
/*
* Public IP configuration
@@ -1410,7 +1411,7 @@ late_initcall(ip_auto_config);
/*
* Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
- * command line parameter. See Documentation/nfsroot.txt.
+ * command line parameter. See Documentation/filesystems/nfsroot.txt.
*/
static int __init ic_proto_name(char *name)
{
@@ -1479,19 +1480,19 @@ static int __init ip_auto_config_setup(char *addrs)
DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
switch (num) {
case 0:
- if ((ic_myaddr = in_aton(ip)) == INADDR_ANY)
+ if ((ic_myaddr = in_aton(ip)) == ANY)
ic_myaddr = NONE;
break;
case 1:
- if ((ic_servaddr = in_aton(ip)) == INADDR_ANY)
+ if ((ic_servaddr = in_aton(ip)) == ANY)
ic_servaddr = NONE;
break;
case 2:
- if ((ic_gateway = in_aton(ip)) == INADDR_ANY)
+ if ((ic_gateway = in_aton(ip)) == ANY)
ic_gateway = NONE;
break;
case 3:
- if ((ic_netmask = in_aton(ip)) == INADDR_ANY)
+ if ((ic_netmask = in_aton(ip)) == ANY)
ic_netmask = NONE;
break;
case 4:
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index fe05da41d6b..4dc162894cb 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -588,11 +588,9 @@ static int __init ip_queue_init(void)
}
#ifdef CONFIG_PROC_FS
- proc = create_proc_entry(IPQ_PROC_FS_NAME, 0, init_net.proc_net);
- if (proc) {
- proc->owner = THIS_MODULE;
- proc->proc_fops = &ip_queue_proc_fops;
- } else {
+ proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
+ &ip_queue_proc_fops);
+ if (!proc) {
printk(KERN_ERR "ip_queue: failed to create proc entry\n");
goto cleanup_ipqnl;
}
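
ip_queue is the first of several conversions in this diff (ipt_CLUSTERIP, ipt_recent, and the conntrack compat code follow the same shape) from create_proc_entry() plus a late proc_fops assignment to a single proc_create() call, which closes the window where the entry is visible without its file operations. A sketch of the pattern as a minimal module; the entry name and example_fops are hypothetical:

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <net/net_namespace.h>

/* "example" and example_fops are hypothetical stand-ins. */
static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	/* .open, .read, .llseek, .release as the entry requires */
};

static int __init example_init(void)
{
	struct proc_dir_entry *pde;

	/* One call replaces create_proc_entry() followed by a separate
	 * "pde->proc_fops = ..." store, so the entry can never be
	 * opened before its file operations are in place. */
	pde = proc_create("example", S_IRUGO, init_net.proc_net,
			  &example_fops);
	return pde ? 0 : -ENOMEM;
}
module_init(example_init);
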
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index c6cf84c7761..a12dd329e20 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -82,8 +82,8 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
+ write_lock_bh(&clusterip_lock);
if (atomic_dec_and_test(&c->entries)) {
- write_lock_bh(&clusterip_lock);
list_del(&c->list);
write_unlock_bh(&clusterip_lock);
@@ -96,7 +96,9 @@ clusterip_config_entry_put(struct clusterip_config *c)
#ifdef CONFIG_PROC_FS
remove_proc_entry(c->pde->name, c->pde->parent);
#endif
+ return;
}
+ write_unlock_bh(&clusterip_lock);
}
static struct clusterip_config *
@@ -167,14 +169,13 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, __be32 ip,
/* create proc dir entry */
sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
- c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR,
- clusterip_procdir);
+ c->pde = proc_create(buffer, S_IWUSR|S_IRUSR,
+ clusterip_procdir, &clusterip_proc_fops);
if (!c->pde) {
kfree(c);
return NULL;
}
}
- c->pde->proc_fops = &clusterip_proc_fops;
c->pde->data = c;
#endif
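
The clusterip_config_entry_put() hunk widens the lock so that the final reference drop and the list_del() happen atomically with respect to lookups; taking the lock only after atomic_dec_and_test() leaves a window in which a concurrent lookup can revive an entry that is about to be unlinked and freed. A userspace sketch of the closed race, using a mutex and a plain counter as stand-ins:

#include <pthread.h>
#include <stdlib.h>

/* The struct, the plain int refcount, and the mutex are stand-ins for
 * clusterip_config, its atomic_t "entries", and clusterip_lock. */
struct entry {
	int refs;
	struct entry *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void entry_put(struct entry *e, struct entry **head)
{
	/* Lock BEFORE the final decrement: if the lock were taken only
	 * once the count hit zero, a concurrent lookup could re-take a
	 * reference to an entry that is about to be unlinked. */
	pthread_mutex_lock(&list_lock);
	if (--e->refs == 0) {
		*head = e->next;	/* unlink while still locked */
		pthread_mutex_unlock(&list_lock);
		free(e);		/* nothing can find it now */
		return;
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct entry *head = calloc(1, sizeof(*head));

	if (!head)
		return 1;
	head->refs = 1;
	entry_put(head, &head);
	return head == NULL ? 0 : 1;
}
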
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 68cbe3ca01c..50e06690eb5 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -252,6 +252,8 @@ recent_mt_check(const char *tablename, const void *ip,
if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
(info->seconds || info->hit_count))
return false;
+ if (info->hit_count > ip_pkt_list_tot)
+ return false;
if (info->name[0] == '\0' ||
strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
return false;
@@ -274,12 +276,11 @@ recent_mt_check(const char *tablename, const void *ip,
for (i = 0; i < ip_list_hash_size; i++)
INIT_LIST_HEAD(&t->iphash[i]);
#ifdef CONFIG_PROC_FS
- t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir);
+ t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops);
if (t->proc == NULL) {
kfree(t);
goto out;
}
- t->proc->proc_fops = &recent_fops;
t->proc->uid = ip_list_uid;
t->proc->gid = ip_list_gid;
t->proc->data = t;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 089252e82c0..f500b0fdaef 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -395,13 +395,10 @@ int __init nf_conntrack_ipv4_compat_init(void)
if (!proc_exp)
goto err2;
- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, init_net.proc_net_stat);
+ proc_stat = proc_create("ip_conntrack", S_IRUGO,
+ init_net.proc_net_stat, &ct_cpu_seq_fops);
if (!proc_stat)
goto err3;
-
- proc_stat->proc_fops = &ct_cpu_seq_fops;
- proc_stat->owner = THIS_MODULE;
-
return 0;
err3:
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 0d5fa3a54d0..36b4e3bb056 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -629,6 +629,8 @@ static int __init nf_nat_init(void)
size_t i;
int ret;
+ need_ipv4_conntrack();
+
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 071e83a894a..39b629ac240 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -735,7 +735,7 @@ new_segment:
if (!(psize -= copy))
goto out;
- if (skb->len < mss_now || (flags & MSG_OOB))
+ if (skb->len < size_goal || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
@@ -981,7 +981,7 @@ new_segment:
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
- if (skb->len < mss_now || (flags & MSG_OOB))
+ if (skb->len < size_goal || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7facdb0f696..bbb7d88a16b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1625,13 +1625,11 @@ out:
return flag;
}
-/* If we receive more dupacks than we expected counting segments
- * under the assumption of no reordering, interpret this as reordering.
- * The only other reason could be a bug in the receiver TCP.
+/* Limits sacked_out so that sum with lost_out isn't ever larger than
+ * packets_out. Returns zero if sacked_out adjustment wasn't necessary.
*/
-static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+int tcp_limit_reno_sacked(struct tcp_sock *tp)
{
- struct tcp_sock *tp = tcp_sk(sk);
u32 holes;
holes = max(tp->lost_out, 1U);
@@ -1639,8 +1637,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
- tcp_update_reordering(sk, tp->packets_out + addend, 0);
+ return 1;
}
+ return 0;
+}
+
+/* If we receive more dupacks than we expected counting segments
+ * under the assumption of no reordering, interpret this as reordering.
+ * The only other reason could be a bug in the receiver TCP.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tcp_limit_reno_sacked(tp))
+ tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
@@ -1681,11 +1691,16 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
int tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
if (!sysctl_tcp_frto)
return 0;
+ /* MTU probe and F-RTO won't really play nicely along currently */
+ if (icsk->icsk_mtup.probe_size)
+ return 0;
+
if (IsSackFrto())
return 1;
@@ -2134,11 +2149,13 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
/* Mark head of queue up as lost. With RFC3517 SACK, the packet count
 * is against sacked "cnt", otherwise it's against facked "cnt"
*/
-static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit)
+static void tcp_mark_head_lost(struct sock *sk, int packets)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt;
+ int cnt, oldcnt;
+ int err;
+ unsigned int mss;
BUG_TRAP(packets <= tp->packets_out);
if (tp->lost_skb_hint) {
@@ -2157,13 +2174,25 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit)
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+ break;
+
+ oldcnt = cnt;
if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
cnt += tcp_skb_pcount(skb);
- if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) ||
- after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
- break;
+ if (cnt > packets) {
+ if (tcp_is_sack(tp) || (oldcnt >= packets))
+ break;
+
+ mss = skb_shinfo(skb)->gso_size;
+ err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
+ if (err < 0)
+ break;
+ cnt = packets;
+ }
+
if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -2180,17 +2209,17 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1, fast_rexmit);
+ tcp_mark_head_lost(sk, 1);
} else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
- tcp_mark_head_lost(sk, lost, fast_rexmit);
+ tcp_mark_head_lost(sk, lost);
} else {
int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto < 0)
- sacked_upto = 0;
- tcp_mark_head_lost(sk, sacked_upto, fast_rexmit);
+ if (sacked_upto < fast_rexmit)
+ sacked_upto = fast_rexmit;
+ tcp_mark_head_lost(sk, sacked_upto);
}
/* New heuristics: it is possible only after we switched
@@ -2524,7 +2553,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
+ tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
}
@@ -2586,6 +2615,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
case TCP_CA_Loss:
if (flag & FLAG_DATA_ACKED)
icsk->icsk_retransmits = 0;
+ if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
if (!tcp_try_undo_loss(sk)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
@@ -3810,8 +3841,28 @@ static void tcp_ofo_queue(struct sock *sk)
}
}
+static int tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
+static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ !sk_rmem_schedule(sk, size)) {
+
+ if (tcp_prune_queue(sk) < 0)
+ return -1;
+
+ if (!sk_rmem_schedule(sk, size)) {
+ if (!tcp_prune_ofo_queue(sk))
+ return -1;
+
+ if (!sk_rmem_schedule(sk, size))
+ return -1;
+ }
+ }
+ return 0;
+}
+
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
@@ -3861,12 +3912,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
if (eaten < 0 &&
- (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_rmem_schedule(sk, skb->truesize))) {
- if (tcp_prune_queue(sk) < 0 ||
- !sk_rmem_schedule(sk, skb->truesize))
- goto drop;
- }
+ tcp_try_rmem_schedule(sk, skb->truesize))
+ goto drop;
+
skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
@@ -3935,12 +3983,8 @@ drop:
TCP_ECN_check_ce(tp, skb);
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_rmem_schedule(sk, skb->truesize)) {
- if (tcp_prune_queue(sk) < 0 ||
- !sk_rmem_schedule(sk, skb->truesize))
- goto drop;
- }
+ if (tcp_try_rmem_schedule(sk, skb->truesize))
+ goto drop;
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -4167,6 +4211,32 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
}
}
+/*
+ * Purge the out-of-order queue.
+ * Return true if queue was pruned.
+ */
+static int tcp_prune_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int res = 0;
+
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ sk_mem_reclaim(sk);
+ res = 1;
+ }
+ return res;
+}
+
/* Reduce allocated memory if we can, trying to get
* the socket within its memory limits again.
*
@@ -4200,20 +4270,7 @@ static int tcp_prune_queue(struct sock *sk)
/* Collapsing did not help, destructive actions follow.
* This must not ever occur. */
- /* First, purge the out_of_order queue. */
- if (!skb_queue_empty(&tp->out_of_order_queue)) {
- NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
- __skb_queue_purge(&tp->out_of_order_queue);
-
- /* Reset SACK state. A conforming SACK implementation will
- * do the same at a timeout based retransmit. When a connection
- * is in a sad state like this, we care only about integrity
- * of the connection not performance.
- */
- if (tcp_is_sack(tp))
- tcp_sack_reset(&tp->rx_opt);
- sk_mem_reclaim(sk);
- }
+ tcp_prune_ofo_queue(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
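
Several tcp_input.c hunks above consolidate the receive-memory checks into tcp_try_rmem_schedule(), which escalates from charging the budget, to gentle pruning, to purging the out-of-order queue via the new tcp_prune_ofo_queue(). A userspace sketch of that escalation order; the byte budget and helper bodies are toy stand-ins:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy receive-memory budget; all names and numbers are stand-ins. */
static size_t budget = 4096, used = 3900, ofo = 600;

static bool charge(size_t size)			/* sk_rmem_schedule() */
{
	if (used + size > budget)
		return false;
	used += size;
	return true;
}

static bool prune_gentle(void)			/* tcp_prune_queue() */
{
	size_t reclaim = used / 8;

	used -= reclaim;
	return reclaim > 0;
}

static bool prune_ofo(void)			/* tcp_prune_ofo_queue() */
{
	if (!ofo)
		return false;
	used -= ofo;
	ofo = 0;
	return true;
}

static int try_rmem_schedule(size_t size)
{
	if (charge(size))
		return 0;
	if (!prune_gentle())
		return -1;		/* collapsing failed: give up */
	if (charge(size))
		return 0;
	if (!prune_ofo())
		return -1;		/* nothing left to purge */
	return charge(size) ? 0 : -1;
}

int main(void)
{
	printf("schedule 300 bytes: %d\n", try_rmem_schedule(300));
	return 0;
}
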
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ed750f9ceb0..d29ef79c00c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -255,7 +255,7 @@ static u16 tcp_select_window(struct sock *sk)
*
* Relax Will Robinson.
*/
- new_win = cur_win;
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
}
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
@@ -1035,6 +1035,13 @@ static void tcp_cwnd_validate(struct sock *sk)
* introducing MSS oddities to segment boundaries. In rare cases where
* mss_now != mss_cache, we will request caller to create a small skb
* per input skb which could be mostly avoided here (if desired).
+ *
+ * We explicitly want to create a request for splitting write queue tail
+ * to a small skb for Nagle purposes while avoiding unnecessary modulos,
+ * thus all the complexity (cwnd_len is always an MSS multiple which we
+ * return whenever allowed by the other factors). Basically we need the
+ * modulo only when the receiver window alone is the limiting factor or
+ * when we would be allowed to send the split-due-to-Nagle skb fully.
*/
static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
unsigned int mss_now, unsigned int cwnd)
@@ -1048,10 +1055,11 @@ static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
return cwnd_len;
- if (skb == tcp_write_queue_tail(sk) && cwnd_len <= skb->len)
+ needed = min(skb->len, window);
+
+ if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed)
return cwnd_len;
- needed = min(skb->len, window);
return needed - needed % mss_now;
}
@@ -1800,6 +1808,9 @@ void tcp_simple_retransmit(struct sock *sk)
if (!lost)
return;
+ if (tcp_is_reno(tp))
+ tcp_limit_reno_sacked(tp);
+
tcp_verify_left_out(tp);
/* Don't muck with the congestion window here.
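
The tcp_select_window() hunk rounds an otherwise-unchanged window up to the window-scale granularity: the header advertises window >> rcv_wscale, so a value that is not a multiple of (1 << rcv_wscale) silently shrinks when truncated, which TCP is not supposed to do. A userspace sketch of the arithmetic, re-creating ALIGN() (values are illustrative):

#include <stdio.h>

/* ALIGN() re-created for userspace; power-of-two alignment only. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int wscale = 7;		/* granularity: 128 bytes */
	unsigned int cur_win = 65000;

	/* The header carries window >> wscale, so truncation shrinks
	 * a window that is not already a multiple of 1 << wscale... */
	unsigned int truncated = (cur_win >> wscale) << wscale;

	/* ...while rounding up never advertises less than cur_win. */
	unsigned int aligned = ALIGN(cur_win, 1U << wscale);

	printf("cur=%u truncated=%u aligned=%u\n",
	       cur_win, truncated, aligned);	/* 65000 64896 65024 */
	return 0;
}
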
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7ea1b67b6de..1704c1474ea 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1556,14 +1556,14 @@ static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(udp_hash_lock)
{
read_lock(&udp_hash_lock);
- return *pos ? udp_get_idx(seq, *pos-1) : (void *)1;
+ return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct sock *sk;
- if (v == (void *)1)
+ if (v == SEQ_START_TOKEN)
sk = udp_get_idx(seq, 0);
else
sk = udp_get_next(seq, v);
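
The udp.c hunk replaces a magic (void *)1 cursor with the standard SEQ_START_TOKEN marker from seq_file. A sketch of the idiom; example_get_idx() and the header text are hypothetical:

#include <linux/seq_file.h>

/* example_get_idx() and the header text are hypothetical. */
static void *example_get_idx(struct seq_file *seq, loff_t pos)
{
	return NULL;	/* a real iterator walks its table to entry "pos" */
}

static void *example_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* SEQ_START_TOKEN replaces the old magic (void *)1, so ->show()
	 * can portably detect "emit the header line first". */
	return *pos ? example_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static int example_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "header line\n");
		return 0;
	}
	/* format one real entry from v here */
	return 0;
}
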
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index b47030ba162..9c798abce73 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -39,13 +39,11 @@ static void xfrm4_beet_make_header(struct sk_buff *skb)
static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
{
struct ip_beet_phdr *ph;
- struct iphdr *iph, *top_iph;
+ struct iphdr *top_iph;
int hdrlen, optlen;
- iph = ip_hdr(skb);
-
hdrlen = 0;
- optlen = iph->ihl * 4 - sizeof(*iph);
+ optlen = XFRM_MODE_SKB_CB(skb)->optlen;
if (unlikely(optlen))
hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
@@ -53,11 +51,12 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
hdrlen);
skb->mac_header = skb->network_header +
offsetof(struct iphdr, protocol);
- skb->transport_header = skb->network_header + sizeof(*iph);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
xfrm4_beet_make_header(skb);
- ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen);
+ ph = (struct ip_beet_phdr *)
+ __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
top_iph = ip_hdr(skb);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 8dee617ee90..584e6d74e3a 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -41,7 +41,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->ihl = 5;
top_iph->version = 4;
- top_iph->protocol = x->inner_mode->afinfo->proto;
+ top_iph->protocol = xfrm_af2proto(skb->dst->ops->family);
/* DS disclosed */
top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos,
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index d5a58a81802..8c3180adddb 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -56,7 +56,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
- err = x->inner_mode->afinfo->extract_output(x, skb);
+ err = xfrm_inner_extract_output(x, skb);
if (err)
return err;
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index fdeebe68a37..07735ed280d 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -52,10 +52,12 @@ int xfrm4_extract_header(struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
+ XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
XFRM_MODE_SKB_CB(skb)->id = iph->id;
XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
+ XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph);
memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
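
The xfrm4 hunks share one theme: xfrm4_extract_header() now caches ihl and optlen in the per-packet XFRM_MODE_SKB_CB so that output modes such as BEET no longer re-parse the inner IP header, which may already have been pulled by the time they run. A userspace sketch of the caching step, with hypothetical structures:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for struct iphdr and XFRM_MODE_SKB_CB(). */
struct ip_header {
	uint8_t ihl;		/* header length in 32-bit words */
};

struct mode_cb {
	uint8_t  ihl;		/* cached: base header length, bytes */
	uint16_t optlen;	/* cached: option bytes beyond the base */
};

static void extract_header(const struct ip_header *iph, struct mode_cb *cb)
{
	cb->ihl    = 20;			/* sizeof(struct iphdr) */
	cb->optlen = iph->ihl * 4 - cb->ihl;	/* options only */
}

int main(void)
{
	struct ip_header iph = { .ihl = 7 };
	struct mode_cb cb;

	extract_header(&iph, &cb);
	printf("optlen=%u\n", cb.optlen);	/* 7*4 - 20 = 8 */
	return 0;
}
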