summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c144
-rw-r--r--net/ipv4/devinet.c9
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/gre_demux.c24
-rw-r--r--net/ipv4/icmp.c23
-rw-r--r--net/ipv4/igmp.c12
-rw-r--r--net/ipv4/inet_connection_sock.c19
-rw-r--r--net/ipv4/inet_hashtables.c6
-rw-r--r--net/ipv4/inetpeer.c2
-rw-r--r--net/ipv4/ip_forward.c54
-rw-r--r--net/ipv4/ip_fragment.c5
-rw-r--r--net/ipv4/ip_gre.c6
-rw-r--r--net/ipv4/ip_options.c6
-rw-r--r--net/ipv4/ip_output.c66
-rw-r--r--net/ipv4/ip_tunnel.c15
-rw-r--r--net/ipv4/ip_vti.c5
-rw-r--r--net/ipv4/ipip.c1
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c5
-rw-r--r--net/ipv4/ping.c6
-rw-r--r--net/ipv4/proc.c24
-rw-r--r--net/ipv4/route.c9
-rw-r--r--net/ipv4/syncookies.c3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c87
-rw-r--r--net/ipv4/tcp_bic.c5
-rw-r--r--net/ipv4/tcp_cong.c24
-rw-r--r--net/ipv4/tcp_cubic.c7
-rw-r--r--net/ipv4/tcp_fastopen.c219
-rw-r--r--net/ipv4/tcp_highspeed.c4
-rw-r--r--net/ipv4/tcp_htcp.c4
-rw-r--r--net/ipv4/tcp_hybla.c7
-rw-r--r--net/ipv4/tcp_illinois.c5
-rw-r--r--net/ipv4/tcp_input.c9
-rw-r--r--net/ipv4/tcp_ipv4.c303
-rw-r--r--net/ipv4/tcp_lp.c5
-rw-r--r--net/ipv4/tcp_minisocks.c31
-rw-r--r--net/ipv4/tcp_output.c65
-rw-r--r--net/ipv4/tcp_scalable.c5
-rw-r--r--net/ipv4/tcp_vegas.c7
-rw-r--r--net/ipv4/tcp_veno.c9
-rw-r--r--net/ipv4/tcp_yeah.c5
-rw-r--r--net/ipv4/udp.c49
-rw-r--r--net/ipv4/udplite.c1
-rw-r--r--net/ipv4/xfrm4_output.c34
-rw-r--r--net/ipv4/xfrm4_protocol.c19
45 files changed, 650 insertions, 702 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8c54870db79..0e9bb08a91e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -254,7 +254,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
- char answer_no_check;
int try_loading_module = 0;
int err;
@@ -312,7 +311,6 @@ lookup_protocol:
sock->ops = answer->ops;
answer_prot = answer->prot;
- answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
@@ -324,7 +322,6 @@ lookup_protocol:
goto out;
err = 0;
- sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
@@ -1002,7 +999,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
@@ -1012,7 +1008,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
@@ -1021,7 +1016,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
@@ -1030,7 +1024,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
@@ -1476,22 +1469,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
- int i, j;
+ int i;
- for_each_possible_cpu(i) {
- for (j = 0; j < SNMP_ARRAY_SZ; j++)
- res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
- }
+ for_each_possible_cpu(i)
+ res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
#if BITS_PER_LONG==32
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
u64 res = 0;
int cpu;
@@ -1502,7 +1493,7 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
u64 v;
unsigned int start;
- bhptr = per_cpu_ptr(mib[0], cpu);
+ bhptr = per_cpu_ptr(mib, cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
start = u64_stats_fetch_begin_irq(syncp);
@@ -1516,25 +1507,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
EXPORT_SYMBOL_GPL(snmp_fold_field64);
#endif
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
-{
- BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize, align);
- if (!ptr[0])
- return -ENOMEM;
-
-#if SNMP_ARRAY_SZ == 2
- ptr[1] = __alloc_percpu(mibsize, align);
- if (!ptr[1]) {
- free_percpu(ptr[0]);
- ptr[0] = NULL;
- return -ENOMEM;
- }
-#endif
- return 0;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_init);
-
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1570,40 +1542,30 @@ static __net_init int ipv4_mib_init_net(struct net *net)
{
int i;
- if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
- sizeof(struct tcp_mib),
- __alignof__(struct tcp_mib)) < 0)
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
goto err_tcp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
- sizeof(struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
goto err_ip_mib;
for_each_possible_cpu(i) {
struct ipstats_mib *af_inet_stats;
- af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[0], i);
- u64_stats_init(&af_inet_stats->syncp);
-#if SNMP_ARRAY_SZ == 2
- af_inet_stats = per_cpu_ptr(net->mib.ip_statistics[1], i);
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
u64_stats_init(&af_inet_stats->syncp);
-#endif
}
- if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
- sizeof(struct linux_mib),
- __alignof__(struct linux_mib)) < 0)
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
goto err_net_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
goto err_udp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
goto err_udplite_mib;
- if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
- sizeof(struct icmp_mib),
- __alignof__(struct icmp_mib)) < 0)
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
goto err_icmp_mib;
net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
GFP_KERNEL);
@@ -1614,17 +1576,17 @@ static __net_init int ipv4_mib_init_net(struct net *net)
return 0;
err_icmpmsg_mib:
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+ free_percpu(net->mib.icmp_statistics);
err_icmp_mib:
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+ free_percpu(net->mib.udplite_statistics);
err_udplite_mib:
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+ free_percpu(net->mib.udp_statistics);
err_udp_mib:
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
+ free_percpu(net->mib.net_statistics);
err_net_mib:
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+ free_percpu(net->mib.ip_statistics);
err_ip_mib:
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ free_percpu(net->mib.tcp_statistics);
err_tcp_mib:
return -ENOMEM;
}
@@ -1632,12 +1594,12 @@ err_tcp_mib:
static __net_exit void ipv4_mib_exit_net(struct net *net)
{
kfree(net->mib.icmpmsg_statistics);
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1650,6 +1612,39 @@ static int __init init_ipv4_mibs(void)
return register_pernet_subsys(&ipv4_mib_ops);
}
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ seqlock_init(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = 32768;
+ net->ipv4.ip_local_ports.range[1] = 61000;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+ return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+ .exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
static int ipv4_proc_init(void);
/*
@@ -1703,13 +1698,9 @@ static int __init inet_init(void)
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
- sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
- if (!sysctl_local_reserved_ports)
- goto out;
-
rc = proto_register(&tcp_prot, 1);
if (rc)
- goto out_free_reserved_ports;
+ goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
@@ -1794,6 +1785,9 @@ static int __init inet_init(void)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
+
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
@@ -1816,8 +1810,6 @@ out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
-out_free_reserved_ports:
- kfree(sysctl_local_reserved_ports);
goto out;
}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index bdbf68bb2e2..e9449376b58 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -106,7 +106,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
-static DEFINE_SPINLOCK(inet_addr_hash_lock);
static u32 inet_addr_hash(struct net *net, __be32 addr)
{
@@ -119,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
{
u32 hash = inet_addr_hash(net, ifa->ifa_local);
- spin_lock(&inet_addr_hash_lock);
+ ASSERT_RTNL();
hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
- spin_unlock(&inet_addr_hash_lock);
}
static void inet_hash_remove(struct in_ifaddr *ifa)
{
- spin_lock(&inet_addr_hash_lock);
+ ASSERT_RTNL();
hlist_del_init_rcu(&ifa->hash);
- spin_unlock(&inet_addr_hash_lock);
}
/**
@@ -830,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
ifa_existing = find_matching_ifa(ifa);
if (!ifa_existing) {
/* It would be best to check for !NLM_F_CREATE here but
- * userspace alreay relies on not having to provide this.
+ * userspace already relies on not having to provide this.
*/
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8a043f03c88..b10cd43a472 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -821,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (fi == NULL)
goto failure;
+ fib_info_cnt++;
if (cfg->fc_mx) {
fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
if (!fi->fib_metrics)
goto failure;
} else
fi->fib_metrics = (u32 *) dst_default_metrics;
- fib_info_cnt++;
fi->fib_net = hold_net(net);
fi->fib_protocol = cfg->fc_protocol;
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 250be7421ab..fbfd829f404 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -93,28 +93,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
}
EXPORT_SYMBOL_GPL(gre_build_header);
-static __sum16 check_checksum(struct sk_buff *skb)
-{
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- csum = csum_fold(skb->csum);
-
- if (!csum)
- break;
- /* Fall through. */
-
- case CHECKSUM_NONE:
- skb->csum = 0;
- csum = __skb_checksum_complete(skb);
- skb->ip_summed = CHECKSUM_COMPLETE;
- break;
- }
-
- return csum;
-}
-
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
@@ -141,7 +119,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
- if (check_checksum(skb)) {
+ if (skb_checksum_simple_validate(skb)) {
*csum_err = true;
return -EINVAL;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0134663fdbc..79c3d947a48 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct sock *sk;
struct inet_sock *inet;
__be32 daddr, saddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
@@ -349,6 +350,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
icmp_param->data.icmph.checksum = 0;
inet->tos = ip_hdr(skb)->tos;
+ sk->sk_mark = mark;
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
ipc.opt = NULL;
@@ -364,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
fl4.saddr = saddr;
+ fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -382,7 +385,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
const struct iphdr *iph,
- __be32 saddr, u8 tos,
+ __be32 saddr, u8 tos, u32 mark,
int type, int code,
struct icmp_bxm *param)
{
@@ -394,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->daddr = (param->replyopts.opt.opt.srr ?
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
@@ -491,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
struct flowi4 fl4;
__be32 saddr;
u8 tos;
+ u32 mark;
struct net *net;
struct sock *sk;
@@ -592,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
IPTOS_PREC_INTERNETCONTROL) :
iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -608,13 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
icmp_param->skb = skb_in;
icmp_param->offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
+ sk->sk_mark = mark;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param->replyopts.opt;
ipc.tx_flags = 0;
ipc.ttl = 0;
ipc.tos = -1;
- rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
type, code, icmp_param);
if (IS_ERR(rt))
goto out_unlock;
@@ -908,16 +915,8 @@ int icmp_rcv(struct sk_buff *skb)
ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto csum_error;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto csum_error;
if (!pskb_pull(skb, sizeof(*icmph)))
goto error;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 97e4d1655d2..17d34e3c2ac 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
goto drop;
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto drop;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto drop;
ih = igmp_hdr(skb);
switch (ih->type) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0d1e2cb877e..14d02ea905b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,19 +29,16 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
-unsigned long *sysctl_local_reserved_ports;
-EXPORT_SYMBOL(sysctl_local_reserved_ports);
-
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
do {
- seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
- *low = net->ipv4.sysctl_local_ports.range[0];
- *high = net->ipv4.sysctl_local_ports.range[1];
- } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
+ *low = net->ipv4.ip_local_ports.range[0];
+ *high = net->ipv4.ip_local_ports.range[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);
@@ -113,7 +110,7 @@ again:
smallest_size = -1;
do {
- if (inet_is_reserved_local_port(rover))
+ if (inet_is_local_reserved_port(net, rover))
goto next_nolock;
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
@@ -408,7 +405,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
struct net *net = sock_net(sk);
int flags = inet_sk_flowi_flags(sk);
- flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol,
flags,
@@ -445,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
rcu_read_lock();
opt = rcu_dereference(newinet->inet_opt);
- flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
@@ -680,6 +677,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8b9cf279450..43116e8c8e1 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -274,7 +274,7 @@ struct sock *__inet_lookup_established(struct net *net,
const __be32 daddr, const u16 hnum,
const int dif)
{
- INET_ADDR_COOKIE(acookie, saddr, daddr)
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
struct sock *sk;
const struct hlist_nulls_node *node;
@@ -327,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
__be32 daddr = inet->inet_rcv_saddr;
__be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
struct net *net = sock_net(sk);
unsigned int hash = inet_ehashfn(net, daddr, lport,
@@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
- if (inet_is_reserved_local_port(port))
+ if (inet_is_local_reserved_port(net, port))
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 48f42446511..c98cf141f4e 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -120,7 +120,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
static void inetpeer_gc_worker(struct work_struct *work)
{
struct inet_peer *p, *n, *c;
- LIST_HEAD(list);
+ struct list_head list;
spin_lock_bh(&gc_lock);
list_replace_init(&gc_list, &list);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index be8abe73bb9..3a83ce5efa8 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -42,12 +42,12 @@
static bool ip_may_fragment(const struct sk_buff *skb)
{
return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
- !skb->local_df;
+ skb->ignore_df;
}
static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
- if (skb->len <= mtu || skb->local_df)
+ if (skb->len <= mtu)
return false;
if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
@@ -56,53 +56,6 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
return true;
}
-static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb)
-{
- unsigned int mtu;
-
- if (skb->local_df || !skb_is_gso(skb))
- return false;
-
- mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true);
-
- /* if seglen > mtu, do software segmentation for IP fragmentation on
- * output. DF bit cannot be set since ip_forward would have sent
- * icmp error.
- */
- return skb_gso_network_seglen(skb) > mtu;
-}
-
-/* called if GSO skb needs to be fragmented on forward */
-static int ip_forward_finish_gso(struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
- netdev_features_t features;
- struct sk_buff *segs;
- int ret = 0;
-
- features = netif_skb_dev_features(skb, dst->dev);
- segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
- if (IS_ERR(segs)) {
- kfree_skb(skb);
- return -ENOMEM;
- }
-
- consume_skb(skb);
-
- do {
- struct sk_buff *nskb = segs->next;
- int err;
-
- segs->next = NULL;
- err = dst_output(segs);
-
- if (err && ret == 0)
- ret = err;
- segs = nskb;
- } while (segs);
-
- return ret;
-}
static int ip_forward_finish(struct sk_buff *skb)
{
@@ -114,9 +67,6 @@ static int ip_forward_finish(struct sk_buff *skb)
if (unlikely(opt->optlen))
ip_forward_options(skb);
- if (ip_gso_exceeds_dst_mtu(skb))
- return ip_forward_finish_gso(skb);
-
return dst_output(skb);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index c10a3ce5cbf..ed32313e307 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -232,8 +232,9 @@ static void ip_expire(unsigned long arg)
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (qp->user == IP_DEFRAG_AF_PACKET ||
- (qp->user == IP_DEFRAG_CONNTRACK_IN &&
- skb_rtable(head)->rt_type != RTN_LOCAL))
+ ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
+ (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL)))
goto out_rcu_unlock;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 94213c89156..c5a557a06a3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -410,7 +410,7 @@ static int ipgre_open(struct net_device *dev)
struct flowi4 fl4;
struct rtable *rt;
- rt = ip_route_output_gre(dev_net(dev), &fl4,
+ rt = ip_route_output_gre(t->net, &fl4,
t->parms.iph.daddr,
t->parms.iph.saddr,
t->parms.o_key,
@@ -434,7 +434,7 @@ static int ipgre_close(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
- in_dev = inetdev_by_index(dev_net(dev), t->mlink);
+ in_dev = inetdev_by_index(t->net, t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
}
@@ -478,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev)
dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
- dev->features |= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
+ dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index f4ab72e19af..5e7aecea05c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -364,7 +364,7 @@ int ip_options_compile(struct net *net,
}
if (optptr[2] <= optlen) {
unsigned char *timeptr = NULL;
- if (optptr[2]+3 > optptr[1]) {
+ if (optptr[2]+3 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
@@ -376,7 +376,7 @@ int ip_options_compile(struct net *net,
optptr[2] += 4;
break;
case IPOPT_TS_TSANDADDR:
- if (optptr[2]+7 > optptr[1]) {
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
@@ -390,7 +390,7 @@ int ip_options_compile(struct net *net,
optptr[2] += 8;
break;
case IPOPT_TS_PRESPEC:
- if (optptr[2]+7 > optptr[1]) {
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1cbeba5edff..6e231ab58d6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb)
return -EINVAL;
}
+static int ip_finish_output_gso(struct sk_buff *skb)
+{
+ netdev_features_t features;
+ struct sk_buff *segs;
+ int ret = 0;
+
+ /* common case: locally created skb or seglen is <= mtu */
+ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
+ skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+ return ip_finish_output2(skb);
+
+ /* Slowpath - GSO segment length is exceeding the dst MTU.
+ *
+ * This can happen in two cases:
+ * 1) TCP GRO packet, DF bit not set
+ * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
+ * from host network stack.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = ip_fragment(segs, ip_finish_output2);
+
+ if (err && ret == 0)
+ ret = err;
+ segs = nskb;
+ } while (segs);
+
+ return ret;
+}
+
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
@@ -220,10 +262,13 @@ static int ip_finish_output(struct sk_buff *skb)
return dst_output(skb);
}
#endif
- if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
+ if (skb_is_gso(skb))
+ return ip_finish_output_gso(skb);
+
+ if (skb->len > ip_skb_dst_mtu(skb))
return ip_fragment(skb, ip_finish_output2);
- else
- return ip_finish_output2(skb);
+
+ return ip_finish_output2(skb);
}
int ip_mc_output(struct sock *sk, struct sk_buff *skb)
@@ -370,7 +415,7 @@ packet_routed:
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
@@ -456,7 +501,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
(IPCB(skb)->frag_max_size &&
IPCB(skb)->frag_max_size > mtu))) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -821,7 +866,7 @@ static int __ip_append_data(struct sock *sk,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
if (cork->length + length > maxnonfragsize - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1144,7 +1189,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
if (cork->length + size > maxnonfragsize - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1305,10 +1350,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
* to fragment the frame generated here. No matter, what transforms
* how transforms change size of the packet, it will come out.
*/
- skb->local_df = ip_sk_local_df(sk);
+ skb->ignore_df = ip_sk_ignore_df(sk);
/* DF bit is set when we want to see DF on outgoing frames.
- * If local_df is set too, we still allow to fragment this frame
+ * If ignore_df is set too, we still allow to fragment this frame
* locally. */
if (inet->pmtudisc == IP_PMTUDISC_DO ||
inet->pmtudisc == IP_PMTUDISC_PROBE ||
@@ -1501,7 +1546,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
daddr = replyopts.opt.opt.faddr;
}
- flowi4_init_output(&fl4, arg->bound_dev_if, 0,
+ flowi4_init_output(&fl4, arg->bound_dev_if,
+ IP4_REPLY_MARK(net, skb->mark),
RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index fa5b7519765..86a00bd6684 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -395,11 +395,10 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
struct ip_tunnel_net *itn,
struct ip_tunnel_parm *parms)
{
- struct ip_tunnel *nt, *fbt;
+ struct ip_tunnel *nt;
struct net_device *dev;
BUG_ON(!itn->fb_tunnel_dev);
- fbt = netdev_priv(itn->fb_tunnel_dev);
dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
if (IS_ERR(dev))
return ERR_CAST(dev);
@@ -442,6 +441,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tunnel->i_seqno = ntohl(tpi->seq) + 1;
}
+ skb_reset_network_header(skb);
+
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
if (log_ecn_error)
@@ -538,9 +539,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
int err;
- bool connected = true;
+ bool connected;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ connected = (tunnel->parms.iph.daddr != 0);
dst = tnl_params->daddr;
if (dst == 0) {
@@ -753,10 +755,8 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
if (!t && (cmd == SIOCADDTUNNEL)) {
t = ip_tunnel_create(net, itn, p);
- if (IS_ERR(t)) {
- err = PTR_ERR(t);
- break;
- }
+ err = PTR_ERR_OR_ZERO(t);
+ break;
}
if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
if (t != NULL) {
@@ -880,6 +880,7 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
*/
if (!IS_ERR(itn->fb_tunnel_dev)) {
itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
}
rtnl_unlock();
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index afcee51b90e..13ef00f1e17 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -239,6 +239,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
static int vti4_err(struct sk_buff *skb, u32 info)
{
__be32 spi;
+ __u32 mark;
struct xfrm_state *x;
struct ip_tunnel *tunnel;
struct ip_esp_hdr *esph;
@@ -254,6 +255,8 @@ static int vti4_err(struct sk_buff *skb, u32 info)
if (!tunnel)
return -1;
+ mark = be32_to_cpu(tunnel->parms.o_key);
+
switch (protocol) {
case IPPROTO_ESP:
esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
@@ -281,7 +284,7 @@ static int vti4_err(struct sk_buff *skb, u32 info)
return 0;
}
- x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
spi, protocol, AF_INET);
if (!x)
return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 812b1835146..4bc508f0db9 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -486,4 +486,5 @@ static void __exit ipip_fini(void)
module_init(ipip_init);
module_exit(ipip_fini);
MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ipip");
MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d84dc8d4c91..2bc9cc47f24 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -484,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->type = ARPHRD_PIMREG;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
dev->flags = IFF_NOARP;
- dev->netdev_ops = &reg_vif_netdev_ops,
+ dev->netdev_ops = &reg_vif_netdev_ops;
dev->destructor = free_netdev;
dev->features |= NETIF_F_NETNS_LOCAL;
}
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 12e13bd82b5..b8f6381c7d0 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,7 +22,6 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-/* Returns new sk_buff, or NULL */
static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
{
int err;
@@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
err = ip_defrag(skb, user);
local_bh_enable();
- if (!err)
+ if (!err) {
ip_send_check(ip_hdr(skb));
+ skb->ignore_df = 1;
+ }
return err;
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8210964a9f1..044a0ddf6a7 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -236,15 +236,15 @@ exit:
static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
kgid_t *high)
{
- kgid_t *data = net->ipv4.sysctl_ping_group_range;
+ kgid_t *data = net->ipv4.ping_group_range.range;
unsigned int seq;
do {
- seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
*low = data[0];
*high = data[1];
- } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
+ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ad737fad6d8..ae0af9386f7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -345,15 +345,15 @@ static void icmp_put(struct seq_file *seq)
for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
seq_printf(seq, "\nIcmp: %lu %lu %lu",
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + icmpmibmap[i].index));
seq_printf(seq, " %lu %lu",
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
@@ -379,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
seq_printf(seq, " %llu",
- snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipstats_list[i].entry,
offsetof(struct ipstats_mib, syncp)));
@@ -395,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld",
- snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+ snmp_fold_field(net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
else
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+ snmp_fold_field(net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
}
@@ -410,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdp:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.udp_statistics,
+ snmp_fold_field(net->mib.udp_statistics,
snmp4_udp_list[i].entry));
/* the UDP and UDP-Lite MIBs are the same */
@@ -421,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdpLite:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
+ snmp_fold_field(net->mib.udplite_statistics,
snmp4_udp_list[i].entry));
seq_putc(seq, '\n');
@@ -458,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nTcpExt:");
for (i = 0; snmp4_net_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.net_statistics,
+ snmp_fold_field(net->mib.net_statistics,
snmp4_net_list[i].entry));
seq_puts(seq, "\nIpExt:");
@@ -468,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nIpExt:");
for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
seq_printf(seq, " %llu",
- snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipextstats_list[i].entry,
offsetof(struct ipstats_mib, syncp)));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index db1e0da871f..4154eb76b0a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -993,6 +993,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
struct flowi4 fl4;
struct rtable *rt;
+ if (!mark)
+ mark = IP4_REPLY_MARK(net, skb->mark);
+
__build_flow_key(&fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
@@ -1010,6 +1013,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ if (!fl4.flowi4_mark)
+ fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
+
rt = __ip_route_output_key(sock_net(sk), &fl4);
if (!IS_ERR(rt)) {
__ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1519,7 +1526,7 @@ static int __mkroute_input(struct sk_buff *skb,
struct in_device *out_dev;
unsigned int flags = 0;
bool do_cache;
- u32 itag;
+ u32 itag = 0;
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f2ed13c2125..c86624b36a6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -303,6 +303,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->ir_rmt_port = th->source;
ireq->ir_loc_addr = ip_hdr(skb)->daddr;
ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ ireq->ir_mark = inet_request_mark(sk, skb);
ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
@@ -339,7 +340,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
* hasn't changed since we received the original syn, but I see
* no easy way to do this.
*/
- flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
(opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44eba052b43..79a007c5255 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,10 +45,10 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
- write_seqlock(&net->ipv4.sysctl_local_ports.lock);
- net->ipv4.sysctl_local_ports.range[0] = range[0];
- net->ipv4.sysctl_local_ports.range[1] = range[1];
- write_sequnlock(&net->ipv4.sysctl_local_ports.lock);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = range[0];
+ net->ipv4.ip_local_ports.range[1] = range[1];
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -57,7 +57,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
size_t *lenp, loff_t *ppos)
{
struct net *net =
- container_of(table->data, struct net, ipv4.sysctl_local_ports.range);
+ container_of(table->data, struct net, ipv4.ip_local_ports.range);
int ret;
int range[2];
struct ctl_table tmp = {
@@ -87,14 +87,14 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low
{
kgid_t *data = table->data;
struct net *net =
- container_of(table->data, struct net, ipv4.sysctl_ping_group_range);
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
unsigned int seq;
do {
- seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
*low = data[0];
*high = data[1];
- } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq));
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
/* Update system visible IP port range */
@@ -102,11 +102,11 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig
{
kgid_t *data = table->data;
struct net *net =
- container_of(table->data, struct net, ipv4.sysctl_ping_group_range);
- write_seqlock(&net->ipv4.sysctl_local_ports.lock);
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
data[0] = low;
data[1] = high;
- write_sequnlock(&net->ipv4.sysctl_local_ports.lock);
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_local_reserved_ports",
- .data = NULL, /* initialized in sysctl_ipv4_init */
- .maxlen = 65536,
- .mode = 0644,
- .proc_handler = proc_do_large_bitmap,
- },
- {
.procname = "igmp_max_memberships",
.data = &sysctl_igmp_max_memberships,
.maxlen = sizeof(int),
@@ -805,7 +798,7 @@ static struct ctl_table ipv4_net_table[] = {
},
{
.procname = "ping_group_range",
- .data = &init_net.ipv4.sysctl_ping_group_range,
+ .data = &init_net.ipv4.ping_group_range.range,
.maxlen = sizeof(gid_t)*2,
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
@@ -819,12 +812,19 @@ static struct ctl_table ipv4_net_table[] = {
},
{
.procname = "ip_local_port_range",
- .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range),
- .data = &init_net.ipv4.sysctl_local_ports.range,
+ .maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
+ .data = &init_net.ipv4.ip_local_ports.range,
.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
{
+ .procname = "ip_local_reserved_ports",
+ .data = &init_net.ipv4.sysctl_local_reserved_ports,
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
.procname = "ip_no_pmtu_disc",
.data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
.maxlen = sizeof(int),
@@ -838,6 +838,20 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv4.sysctl_fwmark_reflect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fwmark_accept",
+ .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
@@ -858,26 +872,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
table[i].data += (void *)net - (void *)&init_net;
}
- /*
- * Sane defaults - nobody may create ping sockets.
- * Boot scripts should set this to distro-specific group.
- */
- net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
- net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
-
- /*
- * Set defaults for local port range
- */
- seqlock_init(&net->ipv4.sysctl_local_ports.lock);
- net->ipv4.sysctl_local_ports.range[0] = 32768;
- net->ipv4.sysctl_local_ports.range[1] = 61000;
-
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (net->ipv4.ipv4_hdr == NULL)
goto err_reg;
+ net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!net->ipv4.sysctl_local_reserved_ports)
+ goto err_ports;
+
return 0;
+err_ports:
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
err_reg:
if (!net_eq(net, &init_net))
kfree(table);
@@ -889,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
{
struct ctl_table *table;
+ kfree(net->ipv4.sysctl_local_reserved_ports);
table = net->ipv4.ipv4_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
kfree(table);
@@ -902,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
static __init int sysctl_ipv4_init(void)
{
struct ctl_table_header *hdr;
- struct ctl_table *i;
-
- for (i = ipv4_table; i->procname; i++) {
- if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
- i->data = sysctl_local_reserved_ports;
- break;
- }
- }
- if (!i->procname)
- return -EINVAL;
hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
if (hdr == NULL)
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 821846fb0a7..d5de69bc04f 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -140,13 +140,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2b9464c93b8..7b09d8b49fa 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
-/* RFC2861 Check whether we are limited by application or congestion window
- * This is the inverse of cwnd check in tcp_tso_should_defer
- */
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- u32 left;
-
- if (in_flight >= tp->snd_cwnd)
- return true;
-
- left = tp->snd_cwnd - in_flight;
- if (sk_can_gso(sk) &&
- left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
- left < tp->xmit_size_goal_segs)
- return true;
- return left <= tcp_max_tso_deferred_mss(tp);
-}
-EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-
/* Slow start is used when congestion window is no greater than the slow start
* threshold. We base on RFC2581 and also handle stretch ACKs properly.
* We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
@@ -337,11 +317,11 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In "safe" area, increase. */
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 8bf224516ba..a9bd8a4828a 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -304,13 +304,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh) {
@@ -409,7 +408,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ratio += cnt;
- ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT);
+ ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
}
/* Some calls are for duplicates without timetamps */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f195d9316e5..62e48cf84e6 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -72,25 +72,224 @@ error: kfree(ctx);
return err;
}
-/* Computes the fastopen cookie for the IP path.
- * The path is a 128 bits long (pad with zeros for IPv4).
- *
- * The caller must check foc->len to determine if a valid cookie
- * has been generated successfully.
-*/
-void tcp_fastopen_cookie_gen(__be32 src, __be32 dst,
- struct tcp_fastopen_cookie *foc)
+static bool __tcp_fastopen_cookie_gen(const void *path,
+ struct tcp_fastopen_cookie *foc)
{
- __be32 path[4] = { src, dst, 0, 0 };
struct tcp_fastopen_context *ctx;
+ bool ok = false;
tcp_fastopen_init_key_once(true);
rcu_read_lock();
ctx = rcu_dereference(tcp_fastopen_ctx);
if (ctx) {
- crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path);
+ crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ ok = true;
}
rcu_read_unlock();
+ return ok;
+}
+
+/* Generate the fastopen cookie by doing aes128 encryption on both
+ * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
+ * addresses. For the longer IPv6 addresses use CBC-MAC.
+ *
+ * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
+ */
+static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ if (req->rsk_ops->family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(syn);
+
+ __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
+ return __tcp_fastopen_cookie_gen(path, foc);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (req->rsk_ops->family == AF_INET6) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(syn);
+ struct tcp_fastopen_cookie tmp;
+
+ if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+ struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ int i = 4;
+
+ for (i = 0; i < 4; i++)
+ buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
+ return __tcp_fastopen_cookie_gen(buf, foc);
+ }
+ }
+#endif
+ return false;
+}
+
+static bool tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+ struct sock *child;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
+
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ return false;
+
+ spin_lock(&queue->fastopenq->lock);
+ queue->fastopenq->qlen++;
+ spin_unlock(&queue->fastopenq->lock);
+
+ /* Initialize the child socket. Have to fix some values to take
+ * into account the child is a Fast Open socket and is created
+ * only out of the bits carried in the SYN packet.
+ */
+ tp = tcp_sk(child);
+
+ tp->fastopen_rsk = req;
+ /* Do a hold on the listner sk so that if the listener is being
+ * closed, the child that has been accepted can live on and still
+ * access listen_lock.
+ */
+ sock_hold(sk);
+ tcp_rsk(req)->listener = sk;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never
+ * scaled. So correct it appropriately.
+ */
+ tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+ /* Activate the retrans timer so that SYNACK can be retransmitted.
+ * The request socket is not added to the SYN table of the parent
+ * because it's been added to the accept queue directly.
+ */
+ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, child);
+
+ /* Now finish processing the fastopen child socket. */
+ inet_csk(child)->icsk_af_ops->rebuild_header(child);
+ tcp_init_congestion_control(child);
+ tcp_mtup_init(child);
+ tcp_init_metrics(child);
+ tcp_init_buffer_space(child);
+
+ /* Queue the data carried in the SYN packet. We need to first
+ * bump skb's refcnt because the caller will attempt to free it.
+ *
+ * XXX (TFO) - we honor a zero-payload TFO request for now,
+ * (any reason not to?) but no need to queue the skb since
+ * there is no data. How about SYN+FIN?
+ */
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) {
+ skb = skb_get(skb);
+ skb_dst_drop(skb);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+ skb_set_owner_r(skb, child);
+ __skb_queue_tail(&child->sk_receive_queue, skb);
+ tp->syn_data_acked = 1;
+ }
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(child);
+ sock_put(child);
+ WARN_ON(req->sk == NULL);
+ return true;
+}
+EXPORT_SYMBOL(tcp_fastopen_create_child);
+
+static bool tcp_fastopen_queue_check(struct sock *sk)
+{
+ struct fastopen_queue *fastopenq;
+
+ /* Make sure the listener has enabled fastopen, and we don't
+ * exceed the max # of pending TFO requests allowed before trying
+ * to validating the cookie in order to avoid burning CPU cycles
+ * unnecessarily.
+ *
+ * XXX (TFO) - The implication of checking the max_qlen before
+ * processing a cookie request is that clients can't differentiate
+ * between qlen overflow causing Fast Open to be disabled
+ * temporarily vs a server not supporting Fast Open at all.
+ */
+ fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (fastopenq == NULL || fastopenq->max_qlen == 0)
+ return false;
+
+ if (fastopenq->qlen >= fastopenq->max_qlen) {
+ struct request_sock *req1;
+ spin_lock(&fastopenq->lock);
+ req1 = fastopenq->rskq_rst_head;
+ if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+ spin_unlock(&fastopenq->lock);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+ return false;
+ }
+ fastopenq->rskq_rst_head = req1->dl_next;
+ fastopenq->qlen--;
+ spin_unlock(&fastopenq->lock);
+ reqsk_free(req1);
+ }
+ return true;
+}
+
+/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
+ * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
+ * cookie request (foc->len == 0).
+ */
+bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
+{
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+ bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+
+ if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ (syn_data || foc->len >= 0) &&
+ tcp_fastopen_queue_check(sk))) {
+ foc->len = -1;
+ return false;
+ }
+
+ if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+ goto fastopen;
+
+ if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+ foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
+ foc->len == valid_foc.len &&
+ !memcmp(foc->val, valid_foc.val, foc->len)) {
+ /* Cookie is valid. Create a (full) child socket to accept
+ * the data in SYN before returning a SYN-ACK to ack the
+ * data. If we fail to create the socket, fall back and
+ * ack the ISN only but includes the same cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to send
+ * data in SYN_RECV state.
+ */
+fastopen:
+ if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ foc->len = -1;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ return true;
+ }
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), foc->len ?
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
+ LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+ *foc = valid_foc;
+ return false;
}
+EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b9e7bad77c..1c4908280d9 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,12 +109,12 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 4a194acfd92..031361311a8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,12 +227,12 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index a15a799bf76..d8f8f05a495 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -87,8 +87,7 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -101,11 +100,11 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
ca->minrtt_us = tp->srtt_us;
}
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (!ca->hybla_en) {
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 863d105e301..5999b3972e6 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -255,8 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
/*
* Increase window in response to successful acknowledgment.
*/
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
@@ -265,7 +264,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked,
update_params(sk);
/* RFC2861 only increase cwnd if fully utilized */
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In slow start */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6efed134ab6..350b2072f0a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2938,10 +2938,11 @@ static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
}
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
+
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -3364,7 +3365,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
- u32 prior_in_flight;
u32 prior_fackets;
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3397,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= FLAG_SND_UNA_ADVANCED;
prior_fackets = tp->fackets_out;
- prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
@@ -3452,7 +3451,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* Advance cwnd if state allows */
if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked, prior_in_flight);
+ tcp_cong_avoid(sk, ack, acked);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 438f3b95143..77cccda1ad0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -336,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
const int code = icmp_hdr(icmp_skb)->code;
struct sock *sk;
struct sk_buff *skb;
- struct request_sock *req;
- __u32 seq;
+ struct request_sock *fastopen;
+ __u32 seq, snd_una;
__u32 remaining;
int err;
struct net *net = dev_net(icmp_skb->dev);
@@ -378,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk = inet_csk(sk);
tp = tcp_sk(sk);
- req = tp->fastopen_rsk;
seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
- !between(seq, tp->snd_una, tp->snd_nxt) &&
- (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
- /* For a Fast Open socket, allow seq to be snt_isn. */
+ !between(seq, snd_una, tp->snd_nxt)) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -426,11 +426,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
break;
if (seq != tp->snd_una || !icsk->icsk_retransmits ||
- !icsk->icsk_backoff)
+ !icsk->icsk_backoff || fastopen)
break;
- /* XXX (TFO) - revisit the following logic for TFO */
-
if (sock_owned_by_user(sk))
break;
@@ -462,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
}
- /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
- * than following the TCP_SYN_RECV case and closing the socket,
- * we ignore the ICMP error and keep trying like a fully established
- * socket. Is this the right thing to do?
- */
- if (req && req->sk == NULL)
- goto out;
-
switch (sk->sk_state) {
struct request_sock *req, **prev;
case TCP_LISTEN:
@@ -502,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
case TCP_SYN_SENT:
- case TCP_SYN_RECV: /* Cannot happen.
- It can f.e. if SYNs crossed,
- or Fast Open.
- */
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * is already accepted it is treated as a connected one below.
+ */
+ if (fastopen && fastopen->sk == NULL)
+ break;
+
if (!sock_owned_by_user(sk)) {
sk->sk_err = err;
@@ -822,7 +815,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- u16 queue_mapping)
+ u16 queue_mapping,
+ struct tcp_fastopen_cookie *foc)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -833,7 +827,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, NULL);
+ skb = tcp_make_synack(sk, dst, req, foc);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -852,7 +846,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
- int res = tcp_v4_send_synack(sk, NULL, req, 0);
+ int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -1260,187 +1254,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
};
#endif
-static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct tcp_fastopen_cookie *valid_foc)
-{
- bool skip_cookie = false;
- struct fastopen_queue *fastopenq;
-
- if (likely(!fastopen_cookie_present(foc))) {
- /* See include/net/tcp.h for the meaning of these knobs */
- if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
- ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
- (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
- skip_cookie = true; /* no cookie to validate */
- else
- return false;
- }
- fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
- /* A FO option is present; bump the counter. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
-
- /* Make sure the listener has enabled fastopen, and we don't
- * exceed the max # of pending TFO requests allowed before trying
- * to validating the cookie in order to avoid burning CPU cycles
- * unnecessarily.
- *
- * XXX (TFO) - The implication of checking the max_qlen before
- * processing a cookie request is that clients can't differentiate
- * between qlen overflow causing Fast Open to be disabled
- * temporarily vs a server not supporting Fast Open at all.
- */
- if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
- fastopenq == NULL || fastopenq->max_qlen == 0)
- return false;
-
- if (fastopenq->qlen >= fastopenq->max_qlen) {
- struct request_sock *req1;
- spin_lock(&fastopenq->lock);
- req1 = fastopenq->rskq_rst_head;
- if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
- spin_unlock(&fastopenq->lock);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
- /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
- foc->len = -1;
- return false;
- }
- fastopenq->rskq_rst_head = req1->dl_next;
- fastopenq->qlen--;
- spin_unlock(&fastopenq->lock);
- reqsk_free(req1);
- }
- if (skip_cookie) {
- tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- return true;
- }
-
- if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
- if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, valid_foc);
- if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
- memcmp(&foc->val[0], &valid_foc->val[0],
- TCP_FASTOPEN_COOKIE_SIZE) != 0)
- return false;
- valid_foc->len = -1;
- }
- /* Acknowledge the data received from the peer. */
- tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- return true;
- } else if (foc->len == 0) { /* Client requesting a cookie */
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, valid_foc);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENCOOKIEREQD);
- } else {
- /* Client sent a cookie with wrong size. Treat it
- * the same as invalid and return a valid one.
- */
- tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr, valid_foc);
- }
- return false;
-}
-
-static int tcp_v4_conn_req_fastopen(struct sock *sk,
- struct sk_buff *skb,
- struct sk_buff *skb_synack,
- struct request_sock *req)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct sock *child;
- int err;
-
- req->num_retrans = 0;
- req->num_timeout = 0;
- req->sk = NULL;
-
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
- if (child == NULL) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- kfree_skb(skb_synack);
- return -1;
- }
- err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
- ireq->ir_rmt_addr, ireq->opt);
- err = net_xmit_eval(err);
- if (!err)
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
- /* XXX (TFO) - is it ok to ignore error and continue? */
-
- spin_lock(&queue->fastopenq->lock);
- queue->fastopenq->qlen++;
- spin_unlock(&queue->fastopenq->lock);
-
- /* Initialize the child socket. Have to fix some values to take
- * into account the child is a Fast Open socket and is created
- * only out of the bits carried in the SYN packet.
- */
- tp = tcp_sk(child);
-
- tp->fastopen_rsk = req;
- /* Do a hold on the listner sk so that if the listener is being
- * closed, the child that has been accepted can live on and still
- * access listen_lock.
- */
- sock_hold(sk);
- tcp_rsk(req)->listener = sk;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is never
- * scaled. So correct it appropriately.
- */
- tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
-
- /* Activate the retrans timer so that SYNACK can be retransmitted.
- * The request socket is not added to the SYN table of the parent
- * because it's been added to the accept queue directly.
- */
- inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
- TCP_TIMEOUT_INIT, TCP_RTO_MAX);
-
- /* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, child);
-
- /* Now finish processing the fastopen child socket. */
- inet_csk(child)->icsk_af_ops->rebuild_header(child);
- tcp_init_congestion_control(child);
- tcp_mtup_init(child);
- tcp_init_metrics(child);
- tcp_init_buffer_space(child);
-
- /* Queue the data carried in the SYN packet. We need to first
- * bump skb's refcnt because the caller will attempt to free it.
- *
- * XXX (TFO) - we honor a zero-payload TFO request for now.
- * (Any reason not to?)
- */
- if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
- /* Don't queue the skb if there is no payload in SYN.
- * XXX (TFO) - How about SYN+FIN?
- */
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- } else {
- skb = skb_get(skb);
- skb_dst_drop(skb);
- __skb_pull(skb, tcp_hdr(skb)->doff * 4);
- skb_set_owner_r(skb, child);
- __skb_queue_tail(&child->sk_receive_queue, skb);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- tp->syn_data_acked = 1;
- }
- sk->sk_data_ready(sk);
- bh_unlock_sock(child);
- sock_put(child);
- WARN_ON(req->sk == NULL);
- return 0;
-}
-
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_options_received tmp_opt;
@@ -1451,12 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
- bool want_cookie = false;
+ bool want_cookie = false, fastopen;
struct flowi4 fl4;
struct tcp_fastopen_cookie foc = { .len = -1 };
- struct tcp_fastopen_cookie valid_foc = { .len = -1 };
- struct sk_buff *skb_synack;
- int do_fastopen;
+ int err;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1507,6 +1318,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
ireq->ir_rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(skb);
+ ireq->ir_mark = inet_request_mark(sk, skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
@@ -1555,52 +1367,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
isn = tcp_v4_init_sequence(skb);
}
- tcp_rsk(req)->snt_isn = isn;
-
- if (dst == NULL) {
- dst = inet_csk_route_req(sk, &fl4, req);
- if (dst == NULL)
- goto drop_and_free;
- }
- do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
-
- /* We don't call tcp_v4_send_synack() directly because we need
- * to make sure a child socket can be created successfully before
- * sending back synack!
- *
- * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
- * (or better yet, call tcp_send_synack() in the child context
- * directly, but will have to fix bunch of other code first)
- * after syn_recv_sock() except one will need to first fix the
- * latter to remove its dependency on the current implementation
- * of tcp_v4_send_synack()->tcp_select_initial_window().
- */
- skb_synack = tcp_make_synack(sk, dst, req,
- fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
-
- if (skb_synack) {
- __tcp_v4_send_check(skb_synack, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
- } else
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
goto drop_and_free;
- if (likely(!do_fastopen)) {
- int err;
- err = ip_build_and_send_pkt(skb_synack, sk, ireq->ir_loc_addr,
- ireq->ir_rmt_addr, ireq->opt);
- err = net_xmit_eval(err);
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_openreq_init_rwin(req, sk, dst);
+ fastopen = !want_cookie &&
+ tcp_try_fastopen(sk, skb, req, &foc, dst);
+ err = tcp_v4_send_synack(sk, dst, req,
+ skb_get_queue_mapping(skb), &foc);
+ if (!fastopen) {
if (err || want_cookie)
goto drop_and_free;
tcp_rsk(req)->snt_synack = tcp_time_stamp;
tcp_rsk(req)->listener = NULL;
- /* Add the request_sock to the SYN table */
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- if (fastopen_cookie_present(&foc) && foc.len != 0)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
- goto drop_and_free;
+ }
return 0;
@@ -1744,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
return sk;
}
-static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
-
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!tcp_v4_check(skb->len, iph->saddr,
- iph->daddr, skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- return 0;
- }
- }
-
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len, IPPROTO_TCP, 0);
-
- if (skb->len <= 76) {
- return __skb_checksum_complete(skb);
- }
- return 0;
-}
-
-
/* The socket must have it's spinlock held when we get
* here.
*
@@ -1960,7 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
- if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
+
+ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
goto csum_error;
th = tcp_hdr(skb);
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index c9aecae3132..1e70fa8fa79 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk)
* Will only call newReno CA when away from inference.
* From TCP-LP's paper, this will be handled in additive increasement.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
}
/**
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 05c1b155251..e68e0d4af6c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -362,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+void tcp_openreq_init_rwin(struct request_sock *req,
+ struct sock *sk, struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u8 rcv_wscale;
+ int mss = dst_metric_advmss(dst);
+
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+ mss = tp->rx_opt.user_mss;
+
+ /* Set this up on the first call only */
+ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+ req->window_clamp = tcp_full_space(sk);
+
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ ireq->wscale_ok,
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+ ireq->rcv_wscale = rcv_wscale;
+}
+EXPORT_SYMBOL(tcp_openreq_init_rwin);
+
static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
struct request_sock *req)
{
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20847de991e..d463c35db33 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -627,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
if (unlikely(!ireq->tstamp_ok))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- if (foc != NULL) {
+ if (foc != NULL && foc->len >= 0) {
u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
@@ -1402,12 +1402,21 @@ static void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->packets_out >= tp->snd_cwnd) {
+ /* Track the maximum number of outstanding packets in each
+ * window, and remember whether we were cwnd-limited then.
+ */
+ if (!before(tp->snd_una, tp->max_packets_seq) ||
+ tp->packets_out > tp->max_packets_out) {
+ tp->max_packets_out = tp->packets_out;
+ tp->max_packets_seq = tp->snd_nxt;
+ tp->is_cwnd_limited = is_cwnd_limited;
+ }
+
+ if (tcp_is_cwnd_limited(sk)) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1659,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
*
* This algorithm is from John Heffner.
*/
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+ bool *is_cwnd_limited)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1723,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
if (!tp->tso_deferred)
tp->tso_deferred = 1 | (jiffies << 1);
+ if (cong_win < send_win && cong_win < skb->len)
+ *is_cwnd_limited = true;
+
return true;
send_now:
@@ -1883,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
+ bool is_cwnd_limited = false;
sent_pkts = 0;
@@ -1907,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
+ is_cwnd_limited = true;
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
@@ -1923,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
- if (!push_one && tcp_tso_should_defer(sk, skb))
+ if (!push_one &&
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
break;
}
@@ -1990,7 +2006,7 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk);
- tcp_cwnd_validate(sk);
+ tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
@@ -2481,8 +2497,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
- if (likely(!err))
+ if (likely(!err)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+ /* Update global TCP statistics. */
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans++;
+ }
return err;
}
@@ -2492,12 +2514,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int err = __tcp_retransmit_skb(sk, skb);
if (err == 0) {
- /* Update global TCP statistics. */
- TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans++;
-
#if FASTRETRANS_DEBUG > 0
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
net_dbg_ratelimited("retrans_out leaked\n");
@@ -2796,27 +2812,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
mss = tp->rx_opt.user_mss;
- if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
- __u8 rcv_wscale;
- /* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
- /* limit the window selection if the user enforce a smaller rx buffer */
- if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
- req->window_clamp = tcp_full_space(sk);
-
- /* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(tcp_full_space(sk),
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
- ireq->wscale_ok,
- &rcv_wscale,
- dst_metric(dst, RTAX_INITRWND));
- ireq->rcv_wscale = rcv_wscale;
- }
-
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 0ac50836da4..8250949b885 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,12 +15,11 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 48539fff635..9a5e05f27f4 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,14 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
return min(tp->snd_ssthresh, tp->snd_cwnd-1);
}
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now) {
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
@@ -195,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
u32 rtt, diff;
u64 target_cwnd;
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 1b8e28fcd7e..27b9825753d 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,19 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
if (!veno->doing_veno_now) {
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
/* limited by applications */
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* We do the Veno calculations only if we got enough rtt samples */
@@ -134,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked,
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, acked, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
u64 target_cwnd;
u32 rtt;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 5ede0e72794..599b79b8eac 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -69,13 +69,12 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
}
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked,
- u32 in_flight)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4468e1adc09..e07d52b8617 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
do {
if (low <= snum && snum <= high &&
!test_bit(snum >> udptable->log, bitmap) &&
- !inet_is_reserved_local_port(snum))
+ !inet_is_local_reserved_port(net, snum))
goto found;
snum += rand;
} while (snum != first);
@@ -785,7 +785,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
- else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
+ else if (sk->sk_no_check_tx) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
@@ -1495,6 +1495,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {
int ret;
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
ret = encap_rcv(sk, skb);
if (ret <= 0) {
UDP_INC_STATS_BH(sock_net(sk),
@@ -1672,7 +1676,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
int proto)
{
- const struct iphdr *iph;
int err;
UDP_SKB_CB(skb)->partial_cov = 0;
@@ -1684,22 +1687,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return err;
}
- iph = ip_hdr(skb);
- if (uh->check == 0) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- proto, skb->csum))
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
- if (!skb_csum_unnecessary(skb))
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len, proto, 0);
- /* Probably, we should checksum udp header (it should be in cache
- * in any case) and data in tiny packets (< rx copybreak).
- */
-
- return 0;
+ return skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
}
/*
@@ -1886,7 +1875,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
- INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr)
+ INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
rcu_read_lock();
@@ -1979,7 +1968,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
int (*push_pending_frames)(struct sock *))
{
struct udp_sock *up = udp_sk(sk);
- int val;
+ int val, valbool;
int err = 0;
int is_udplite = IS_UDPLITE(sk);
@@ -1989,6 +1978,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
if (get_user(val, (int __user *)optval))
return -EFAULT;
+ valbool = val ? 1 : 0;
+
switch (optname) {
case UDP_CORK:
if (val != 0) {
@@ -2018,6 +2009,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
}
break;
+ case UDP_NO_CHECK6_TX:
+ up->no_check6_tx = valbool;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ up->no_check6_rx = valbool;
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
@@ -2100,6 +2099,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->encap_type;
break;
+ case UDP_NO_CHECK6_TX:
+ val = up->no_check6_tx;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ val = up->no_check6_rx;
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2c46acd4cc3..3b3efbda48e 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = {
.protocol = IPPROTO_UDPLITE,
.prot = &udplite_prot,
.ops = &inet_dgram_ops,
- .no_check = 0, /* must checksum (RFC 3828) */
.flags = INET_PROTOSW_PERMANENT,
};
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 40e701f2e1e..d5f6bd9a210 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
goto out;
- if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
goto out;
mtu = dst_mtu(skb_dst(skb));
@@ -62,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
if (err)
return err;
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
- IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED;
-
- skb->protocol = htons(ETH_P_IP);
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
return x->outer_mode->output2(x, skb);
}
@@ -73,27 +70,34 @@ EXPORT_SYMBOL(xfrm4_prepare_output);
int xfrm4_output_finish(struct sk_buff *skb)
{
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ skb->protocol = htons(ETH_P_IP);
+
+#ifdef CONFIG_NETFILTER
+ IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
+#endif
+
+ return xfrm_output(skb);
+}
+
+static int __xfrm4_output(struct sk_buff *skb)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
#ifdef CONFIG_NETFILTER
- if (!skb_dst(skb)->xfrm) {
+ if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
-
- IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
#endif
- skb->protocol = htons(ETH_P_IP);
- return xfrm_output(skb);
+ return x->outer_mode->afinfo->output_finish(skb);
}
int xfrm4_output(struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
- struct xfrm_state *x = dst->xfrm;
-
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
- NULL, dst->dev,
- x->outer_mode->afinfo->output_finish,
+ NULL, skb_dst(skb)->dev, __xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 7f7b243e813..a2ce0101eaa 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -50,8 +50,12 @@ int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
{
int ret;
struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(protocol);
- for_each_protocol_rcu(*proto_handlers(protocol), handler)
+ if (!head)
+ return 0;
+
+ for_each_protocol_rcu(*head, handler)
if ((ret = handler->cb_handler(skb, err)) <= 0)
return ret;
@@ -64,15 +68,20 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
{
int ret;
struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
XFRM_SPI_SKB_CB(skb)->family = AF_INET;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
- for_each_protocol_rcu(*proto_handlers(nexthdr), handler)
+ if (!head)
+ goto out;
+
+ for_each_protocol_rcu(*head, handler)
if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
return ret;
+out:
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
kfree_skb(skb);
@@ -208,6 +217,9 @@ int xfrm4_protocol_register(struct xfrm4_protocol *handler,
int ret = -EEXIST;
int priority = handler->priority;
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
mutex_lock(&xfrm4_protocol_mutex);
if (!rcu_dereference_protected(*proto_handlers(protocol),
@@ -250,6 +262,9 @@ int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
struct xfrm4_protocol *t;
int ret = -ENOENT;
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
mutex_lock(&xfrm4_protocol_mutex);
for (pprev = proto_handlers(protocol);