author    Dmitry Torokhov <dmitry.torokhov@gmail.com>  2013-05-01 08:47:44 -0700
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>  2013-05-01 08:47:44 -0700
commit    bf61c8840efe60fd8f91446860b63338fb424158 (patch)
tree      7a71832407a4f0d6346db773343f4c3ae2257b19 /net/ipv4
parent    5846115b30f3a881e542c8bfde59a699c1c13740 (diff)
parent    0c6a61657da78098472fd0eb71cc01f2387fa1bb (diff)
Merge branch 'next' into for-linus
Prepare first set of updates for 3.10 merge window.
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 11
-rw-r--r--  net/ipv4/af_inet.c | 135
-rw-r--r--  net/ipv4/ah4.c | 21
-rw-r--r--  net/ipv4/arp.c | 37
-rw-r--r--  net/ipv4/datagram.c | 25
-rw-r--r--  net/ipv4/devinet.c | 419
-rw-r--r--  net/ipv4/esp4.c | 12
-rw-r--r--  net/ipv4/fib_frontend.c | 19
-rw-r--r--  net/ipv4/fib_semantics.c | 25
-rw-r--r--  net/ipv4/fib_trie.c | 51
-rw-r--r--  net/ipv4/gre.c | 118
-rw-r--r--  net/ipv4/icmp.c | 26
-rw-r--r--  net/ipv4/igmp.c | 11
-rw-r--r--  net/ipv4/inet_connection_sock.c | 97
-rw-r--r--  net/ipv4/inet_diag.c | 164
-rw-r--r--  net/ipv4/inet_fragment.c | 49
-rw-r--r--  net/ipv4/inet_hashtables.c | 72
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 7
-rw-r--r--  net/ipv4/ip_fragment.c | 75
-rw-r--r--  net/ipv4/ip_gre.c | 137
-rw-r--r--  net/ipv4/ip_input.c | 13
-rw-r--r--  net/ipv4/ip_options.c | 8
-rw-r--r--  net/ipv4/ip_output.c | 5
-rw-r--r--  net/ipv4/ip_sockglue.c | 42
-rw-r--r--  net/ipv4/ip_vti.c | 31
-rw-r--r--  net/ipv4/ipcomp.c | 8
-rw-r--r--  net/ipv4/ipconfig.c | 16
-rw-r--r--  net/ipv4/ipip.c | 275
-rw-r--r--  net/ipv4/ipmr.c | 271
-rw-r--r--  net/ipv4/netfilter/Kconfig | 4
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 18
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 18
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 11
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 1
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 12
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c | 15
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 92
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 14
-rw-r--r--  net/ipv4/ping.c | 9
-rw-r--r--  net/ipv4/proc.c | 17
-rw-r--r--  net/ipv4/protocol.c | 27
-rw-r--r--  net/ipv4/raw.c | 13
-rw-r--r--  net/ipv4/route.c | 103
-rw-r--r--  net/ipv4/syncookies.c | 9
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 41
-rw-r--r--  net/ipv4/tcp.c | 83
-rw-r--r--  net/ipv4/tcp_cong.c | 47
-rw-r--r--  net/ipv4/tcp_input.c | 119
-rw-r--r--  net/ipv4/tcp_ipv4.c | 87
-rw-r--r--  net/ipv4/tcp_metrics.c | 12
-rw-r--r--  net/ipv4/tcp_minisocks.c | 12
-rw-r--r--  net/ipv4/tcp_output.c | 50
-rw-r--r--  net/ipv4/tcp_probe.c | 6
-rw-r--r--  net/ipv4/tcp_timer.c | 8
-rw-r--r--  net/ipv4/udp.c | 69
-rw-r--r--  net/ipv4/xfrm4_input.c | 2
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 4
-rw-r--r--  net/ipv4/xfrm4_policy.c | 71
58 files changed, 2281 insertions(+), 873 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 5a19aeb8609..7944df76845 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -488,7 +488,6 @@ config TCP_CONG_HTCP
config TCP_CONG_HSTCP
tristate "High Speed TCP"
- depends on EXPERIMENTAL
default n
---help---
Sally Floyd's High Speed TCP (RFC 3649) congestion control.
@@ -499,7 +498,6 @@ config TCP_CONG_HSTCP
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
- depends on EXPERIMENTAL
default n
---help---
TCP-Hybla is a sender-side only change that eliminates penalization of
@@ -509,7 +507,6 @@ config TCP_CONG_HYBLA
config TCP_CONG_VEGAS
tristate "TCP Vegas"
- depends on EXPERIMENTAL
default n
---help---
TCP Vegas is a sender-side only change to TCP that anticipates
@@ -520,7 +517,6 @@ config TCP_CONG_VEGAS
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
- depends on EXPERIMENTAL
default n
---help---
Scalable TCP is a sender-side only change to TCP which uses a
@@ -530,7 +526,6 @@ config TCP_CONG_SCALABLE
config TCP_CONG_LP
tristate "TCP Low Priority"
- depends on EXPERIMENTAL
default n
---help---
TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
@@ -540,7 +535,6 @@ config TCP_CONG_LP
config TCP_CONG_VENO
tristate "TCP Veno"
- depends on EXPERIMENTAL
default n
---help---
TCP Veno is a sender-side only enhancement of TCP to obtain better
@@ -552,7 +546,6 @@ config TCP_CONG_VENO
config TCP_CONG_YEAH
tristate "YeAH TCP"
- depends on EXPERIMENTAL
select TCP_CONG_VEGAS
default n
---help---
@@ -567,7 +560,6 @@ config TCP_CONG_YEAH
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
- depends on EXPERIMENTAL
default n
---help---
TCP-Illinois is a sender-side modification of TCP Reno for
@@ -631,8 +623,7 @@ config DEFAULT_TCP_CONG
default "cubic"
config TCP_MD5SIG
- bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "TCP: MD5 Signature Option support (RFC2385)"
select CRYPTO
select CRYPTO_MD5
---help---
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 766c5965856..68f6a94f766 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -248,8 +248,12 @@ EXPORT_SYMBOL(inet_listen);
u32 inet_ehash_secret __read_mostly;
EXPORT_SYMBOL(inet_ehash_secret);
+u32 ipv6_hash_secret __read_mostly;
+EXPORT_SYMBOL(ipv6_hash_secret);
+
/*
- * inet_ehash_secret must be set exactly once
+ * inet_ehash_secret must be set exactly once, and to a non-null value.
+ * ipv6_hash_secret must be set exactly once.
*/
void build_ehash_secret(void)
{
@@ -259,25 +263,11 @@ void build_ehash_secret(void)
get_random_bytes(&rnd, sizeof(rnd));
} while (rnd == 0);
- cmpxchg(&inet_ehash_secret, 0, rnd);
+ if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
+ get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
}
EXPORT_SYMBOL(build_ehash_secret);
-static inline int inet_netns_ok(struct net *net, __u8 protocol)
-{
- const struct net_protocol *ipprot;
-
- if (net_eq(net, &init_net))
- return 1;
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot == NULL) {
- /* raw IP is OK */
- return 1;
- }
- return ipprot->netns_ok;
-}
-
/*
* Create an inet socket.
*/
@@ -346,11 +336,8 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
- goto out_rcu_unlock;
-
- err = -EAFNOSUPPORT;
- if (!inet_netns_ok(net, protocol))
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
@@ -473,6 +460,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
int err;
@@ -496,7 +484,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
/* Not specified by any standard per se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -516,7 +504,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ if (snum && snum < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
@@ -1251,7 +1240,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
static int inet_gso_send_check(struct sk_buff *skb)
{
- const struct net_protocol *ops;
+ const struct net_offload *ops;
const struct iphdr *iph;
int proto;
int ihl;
@@ -1275,9 +1264,9 @@ static int inet_gso_send_check(struct sk_buff *skb)
err = -EPROTONOSUPPORT;
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_send_check))
- err = ops->gso_send_check(skb);
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_send_check))
+ err = ops->callbacks.gso_send_check(skb);
rcu_read_unlock();
out:
@@ -1288,7 +1277,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
- const struct net_protocol *ops;
+ const struct net_offload *ops;
struct iphdr *iph;
int proto;
int ihl;
@@ -1303,6 +1292,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
0)))
goto out;
@@ -1325,12 +1315,12 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_segment))
- segs = ops->gso_segment(skb, features);
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
rcu_read_unlock();
- if (!segs || IS_ERR(segs))
+ if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
@@ -1342,8 +1332,10 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
offset += (skb->len - skb->mac_len - iph->ihl * 4);
- } else
- iph->id = htons(id++);
+ } else {
+ if (!(iph->frag_off & htons(IP_DF)))
+ iph->id = htons(id++);
+ }
iph->tot_len = htons(skb->len - skb->mac_len);
iph->check = 0;
iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
@@ -1356,7 +1348,7 @@ out:
static struct sk_buff **inet_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
- const struct net_protocol *ops;
+ const struct net_offload *ops;
struct sk_buff **pp = NULL;
struct sk_buff *p;
const struct iphdr *iph;
@@ -1378,8 +1370,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
proto = iph->protocol;
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (!ops || !ops->gro_receive)
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
goto out_unlock;
if (*(u8 *)iph != 0x45)
@@ -1420,7 +1412,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
- pp = ops->gro_receive(head, skb);
+ pp = ops->callbacks.gro_receive(head, skb);
out_unlock:
rcu_read_unlock();
@@ -1435,7 +1427,7 @@ static int inet_gro_complete(struct sk_buff *skb)
{
__be16 newlen = htons(skb->len - skb_network_offset(skb));
struct iphdr *iph = ip_hdr(skb);
- const struct net_protocol *ops;
+ const struct net_offload *ops;
int proto = iph->protocol;
int err = -ENOSYS;
@@ -1443,11 +1435,11 @@ static int inet_gro_complete(struct sk_buff *skb)
iph->tot_len = newlen;
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (WARN_ON(!ops || !ops->gro_complete))
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock;
- err = ops->gro_complete(skb);
+ err = ops->callbacks.gro_complete(skb);
out_unlock:
rcu_read_unlock();
@@ -1558,26 +1550,36 @@ static const struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};
+static const struct net_offload tcp_offload = {
+ .callbacks = {
+ .gso_send_check = tcp_v4_gso_send_check,
+ .gso_segment = tcp_tso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ },
+};
+
static const struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
- .gso_send_check = udp4_ufo_send_check,
- .gso_segment = udp4_ufo_fragment,
.no_policy = 1,
.netns_ok = 1,
};
+static const struct net_offload udp_offload = {
+ .callbacks = {
+ .gso_send_check = udp4_ufo_send_check,
+ .gso_segment = udp4_ufo_fragment,
+ },
+};
+
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
- .err_handler = ping_err,
+ .err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};
@@ -1659,23 +1661,44 @@ static int ipv4_proc_init(void);
* IP protocol layer initialiser
*/
+static struct packet_offload ip_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static int __init ipv4_offload_init(void)
+{
+ /*
+ * Add offloads
+ */
+ if (inet_add_offload(&udp_offload, IPPROTO_UDP) < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (inet_add_offload(&tcp_offload, IPPROTO_TCP) < 0)
+ pr_crit("%s: Cannot add TCP protocol offlaod\n", __func__);
+
+ dev_add_offload(&ip_packet_offload);
+ return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
- .gso_send_check = inet_gso_send_check,
- .gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
};
static int __init inet_init(void)
{
- struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+ BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
if (!sysctl_local_reserved_ports)
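The af_inet.c hunks above move the GSO/GRO handlers out of struct net_protocol and into a separate struct net_offload registered through inet_add_offload(). A minimal sketch of the resulting registration pattern; IPPROTO_FOO and the foo_* handlers are placeholders, not part of this series:

#include <net/protocol.h>

static const struct net_protocol foo_protocol = {
	.handler	= foo_rcv,
	.err_handler	= foo_err,
	.no_policy	= 1,
	.netns_ok	= 1,
};

static const struct net_offload foo_offload = {
	.callbacks = {
		.gso_send_check	= foo_gso_send_check,
		.gso_segment	= foo_gso_segment,
	},
};

static int __init foo_init(void)
{
	/* Receive path and offload path are now registered separately. */
	if (inet_add_protocol(&foo_protocol, IPPROTO_FOO) < 0)
		return -EAGAIN;
	if (inet_add_offload(&foo_offload, IPPROTO_FOO) < 0) {
		inet_del_protocol(&foo_protocol, IPPROTO_FOO);
		return -EAGAIN;
	}
	return 0;
}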
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a0d8392491c..2e7f1948216 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -269,7 +269,11 @@ static void ah_input_done(struct crypto_async_request *base, int err)
skb->network_header += ah_hlen;
memcpy(skb_network_header(skb), work_iph, ihl);
__skb_pull(skb, ah_hlen + ihl);
- skb_set_transport_header(skb, -ihl);
+
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
out:
kfree(AH_SKB_CB(skb)->tmp);
xfrm_input_resume(skb, err);
@@ -317,8 +321,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
/* We are going to _remove_ AH header to keep sockets happy,
* so... Later this can change. */
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto out;
skb->ip_summed = CHECKSUM_NONE;
@@ -381,7 +384,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
skb->network_header += ah_hlen;
memcpy(skb_network_header(skb), work_iph, ihl);
__skb_pull(skb, ah_hlen + ihl);
- skb_set_transport_header(skb, -ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
err = nexthdr;
@@ -413,9 +419,12 @@ static void ah4_err(struct sk_buff *skb, u32 info)
if (!x)
return;
- if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
+ atomic_inc(&flow_cache_genid);
+ rt_genid_bump(net);
+
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
- else
+ } else
ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
xfrm_state_put(x);
}
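skb_unclone() in the ah_input() hunk above collapses the skb_cloned()/pskb_expand_head() pair it replaces. For illustration only, an open-coded equivalent of what the helper does:

#include <linux/skbuff.h>

/* Illustrative sketch of the pattern replaced in ah_input() above:
 * ensure the skb's data is private before modifying it in place. */
static inline int ah_skb_unclone(struct sk_buff *skb, gfp_t pri)
{
	if (skb_cloned(skb))
		return pskb_expand_head(skb, 0, 0, pri);
	return 0;
}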
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 47800459e4c..fea4929f620 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -321,7 +321,7 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
- u8 *dst_ha = NULL;
+ u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
struct net_device *dev = neigh->dev;
__be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
@@ -363,8 +363,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
if (probes < 0) {
if (!(neigh->nud_state & NUD_VALID))
pr_debug("trying to ucast probe in NUD_INVALID\n");
- dst_ha = neigh->ha;
- read_lock_bh(&neigh->lock);
+ neigh_ha_snapshot(dst_ha, neigh, dev);
+ dst_hw = dst_ha;
} else {
probes -= neigh->parms->app_probes;
if (probes < 0) {
@@ -376,9 +376,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
}
arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_ha, dev->dev_addr, NULL);
- if (dst_ha)
- read_unlock_bh(&neigh->lock);
+ dst_hw, dev->dev_addr, NULL);
}
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
@@ -930,24 +928,25 @@ static void parp_redo(struct sk_buff *skb)
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct arphdr *arp;
+ const struct arphdr *arp;
+
+ if (dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK)
+ goto freeskb;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
if (!pskb_may_pull(skb, arp_hdr_len(dev)))
goto freeskb;
arp = arp_hdr(skb);
- if (arp->ar_hln != dev->addr_len ||
- dev->flags & IFF_NOARP ||
- skb->pkt_type == PACKET_OTHERHOST ||
- skb->pkt_type == PACKET_LOOPBACK ||
- arp->ar_pln != 4)
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
goto freeskb;
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (skb == NULL)
- goto out_of_mem;
-
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
@@ -1161,7 +1160,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCDARP:
case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
@@ -1406,14 +1405,14 @@ static const struct file_operations arp_seq_fops = {
static int __net_init arp_net_init(struct net *net)
{
- if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops))
+ if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
return -ENOMEM;
return 0;
}
static void __net_exit arp_net_exit(struct net *net)
{
- proc_net_remove(net, "arp");
+ remove_proc_entry("arp", net->proc_net);
}
static struct pernet_operations arp_net_ops = {
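The proc conversion above (proc_net_fops_create()/proc_net_remove() to proc_create()/remove_proc_entry() on net->proc_net) repeats across several files in this series. A condensed sketch of the new per-net pattern; "foo" and foo_seq_fops are placeholders:

#include <linux/proc_fs.h>
#include <net/net_namespace.h>

/* Per-net proc entries are now created directly under net->proc_net. */
static int __net_init foo_net_init(struct net *net)
{
	if (!proc_create("foo", S_IRUGO, net->proc_net, &foo_seq_fops))
		return -ENOMEM;
	return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
	remove_proc_entry("foo", net->proc_net);
}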
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 424fafbc8cb..b28e863fe0a 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -85,3 +85,28 @@ out:
return err;
}
EXPORT_SYMBOL(ip4_datagram_connect);
+
+void ip4_datagram_release_cb(struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ if (!__sk_dst_get(sk) || __sk_dst_check(sk, 0))
+ return;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+ if (!IS_ERR(rt))
+ __sk_dst_set(sk, &rt->dst);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
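ip4_datagram_release_cb() is intended to run from a protocol's release_cb hook, which release_sock() invokes when the socket lock is dropped. A hedged sketch, not taken from this diff, of how a datagram protocol could wire it up:

/* Sketch: hook the new helper into a proto's release_cb so the cached
 * route is refreshed whenever the socket lock is released. */
static struct proto foo_dgram_prot = {
	.name		= "FOO-DGRAM",
	.connect	= ip4_datagram_connect,
	.release_cb	= ip4_datagram_release_cb,
	/* ... remaining proto ops elided ... */
};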
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 2a6abc163ed..f678507bc82 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -55,6 +55,7 @@
#include <linux/sysctl.h>
#endif
#include <linux/kmod.h>
+#include <linux/netconf.h>
#include <net/arp.h>
#include <net/ip.h>
@@ -62,6 +63,7 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/addrconf.h>
#include "fib_lookup.h"
@@ -92,6 +94,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_ADDRESS] = { .type = NLA_U32 },
[IFA_BROADCAST] = { .type = NLA_U32 },
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -136,10 +139,9 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
u32 hash = inet_addr_hash(net, addr);
struct net_device *result = NULL;
struct in_ifaddr *ifa;
- struct hlist_node *node;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) {
if (ifa->ifa_local == addr) {
struct net_device *dev = ifa->ifa_dev->dev;
@@ -416,6 +418,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
}
+static void check_lifetime(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
+
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid)
{
@@ -461,6 +467,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
inet_hash_insert(dev_net(in_dev->dev), ifa);
+ cancel_delayed_work(&check_lifetime_work);
+ schedule_delayed_work(&check_lifetime_work, 0);
+
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
@@ -572,7 +581,105 @@ errout:
return err;
}
-static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
+#define INFINITY_LIFE_TIME 0xFFFFFFFF
+
+static void check_lifetime(struct work_struct *work)
+{
+ unsigned long now, next, next_sec, next_sched;
+ struct in_ifaddr *ifa;
+ int i;
+
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ rcu_read_lock();
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ struct in_ifaddr **ifap;
+
+ rtnl_lock();
+ for (ifap = &ifa->ifa_dev->ifa_list;
+ *ifap != NULL; ifap = &ifa->ifa_next) {
+ if (*ifap == ifa)
+ inet_del_ifa(ifa->ifa_dev,
+ ifap, 1);
+ }
+ rtnl_unlock();
+ } else if (ifa->ifa_preferred_lft ==
+ INFINITY_LIFE_TIME) {
+ continue;
+ } else if (age >= ifa->ifa_preferred_lft) {
+ if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ, next))
+ next = ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ;
+
+ if (!(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+ } else if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ,
+ next)) {
+ next = ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ now = jiffies;
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+ schedule_delayed_work(&check_lifetime_work, next_sched - now);
+}
+
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+ __u32 prefered_lft)
+{
+ unsigned long timeout;
+
+ ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout))
+ ifa->ifa_valid_lft = timeout;
+ else
+ ifa->ifa_flags |= IFA_F_PERMANENT;
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ ifa->ifa_preferred_lft = timeout;
+ }
+ ifa->ifa_tstamp = jiffies;
+ if (!ifa->ifa_cstamp)
+ ifa->ifa_cstamp = ifa->ifa_tstamp;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
+ __u32 *pvalid_lft, __u32 *pprefered_lft)
{
struct nlattr *tb[IFA_MAX+1];
struct in_ifaddr *ifa;
@@ -632,24 +739,73 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ err = -EINVAL;
+ goto errout;
+ }
+ *pvalid_lft = ci->ifa_valid;
+ *pprefered_lft = ci->ifa_prefered;
+ }
+
return ifa;
errout:
return ERR_PTR(err);
}
+static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap;
+
+ if (!ifa->ifa_local)
+ return NULL;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa) &&
+ ifa1->ifa_local == ifa->ifa_local)
+ return ifa1;
+ }
+ return NULL;
+}
+
static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
struct net *net = sock_net(skb->sk);
struct in_ifaddr *ifa;
+ struct in_ifaddr *ifa_existing;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
ASSERT_RTNL();
- ifa = rtm_to_ifaddr(net, nlh);
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
if (IS_ERR(ifa))
return PTR_ERR(ifa);
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ ifa_existing = find_matching_ifa(ifa);
+ if (!ifa_existing) {
+ /* It would be best to check for !NLM_F_CREATE here but
+ * userspace already relies on not having to provide this.
+ */
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ } else {
+ inet_free_ifa(ifa);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ return -EEXIST;
+
+ set_ifa_lifetime(ifa_existing, valid_lft, prefered_lft);
+ }
+ return 0;
}
/*
@@ -723,7 +879,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSIFFLAGS:
ret = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
break;
case SIOCSIFADDR: /* Set interface address (and family) */
@@ -731,7 +887,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSIFDSTADDR: /* Set the destination address */
case SIOCSIFNETMASK: /* Set the netmask for the interface */
ret = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
ret = -EINVAL;
if (sin->sin_family != AF_INET)
@@ -822,9 +978,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
ifa = inet_alloc_ifa();
- INIT_HLIST_NODE(&ifa->hash);
if (!ifa)
break;
+ INIT_HLIST_NODE(&ifa->hash);
if (colon)
memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
else
@@ -851,6 +1007,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ifa->ifa_prefixlen = 32;
ifa->ifa_mask = inet_make_mask(32);
}
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
ret = inet_set_ifa(dev, ifa);
break;
@@ -1189,6 +1346,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+ INFINITY_LIFE_TIME);
inet_insert_ifa(ifa);
}
}
@@ -1245,11 +1404,30 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(IFNAMSIZ); /* IFA_LABEL */
}
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+ unsigned long tstamp, u32 preferred, u32 valid)
+{
+ struct ifa_cacheinfo ci;
+
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
+ ci.ifa_prefered = preferred;
+ ci.ifa_valid = valid;
+
+ return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
+}
+
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
u32 portid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
+ u32 preferred, valid;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
if (nlh == NULL)
@@ -1258,10 +1436,31 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm = nlmsg_data(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
- ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+ ifm->ifa_flags = ifa->ifa_flags;
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
+ preferred = ifa->ifa_preferred_lft;
+ valid = ifa->ifa_valid_lft;
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - ifa->ifa_tstamp) / HZ;
+
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
if ((ifa->ifa_address &&
nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
(ifa->ifa_local &&
@@ -1269,7 +1468,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
(ifa->ifa_broadcast &&
nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
(ifa->ifa_label[0] &&
- nla_put_string(skb, IFA_LABEL, ifa->ifa_label)))
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+ preferred, valid))
goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -1289,7 +1490,6 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
struct in_device *in_dev;
struct in_ifaddr *ifa;
struct hlist_head *head;
- struct hlist_node *node;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -1299,7 +1499,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
idx = 0;
head = &net->dev_index_head[h];
rcu_read_lock();
- hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
if (h > s_h || idx > s_idx)
@@ -1442,6 +1642,155 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
return 0;
}
+static int inet_netconf_msgsize_devconf(int type)
+{
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+
+ /* type -1 is used for ALL */
+ if (type == -1 || type == NETCONFA_FORWARDING)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_RP_FILTER)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ size += nla_total_size(4);
+
+ return size;
+}
+
+static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+ struct ipv4_devconf *devconf, u32 portid,
+ u32 seq, int event, unsigned int flags,
+ int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_INET;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+ goto nla_put_failure;
+
+ /* type -1 is used for ALL */
+ if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_FORWARDING,
+ IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+ nla_put_s32(skb, NETCONFA_RP_FILTER,
+ IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+ IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
+ struct ipv4_devconf *devconf)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+ RTM_NEWNETCONF, 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+}
+
+static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
+};
+
+static int inet_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh,
+ void *arg)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX+1];
+ struct netconfmsg *ncm;
+ struct sk_buff *skb;
+ struct ipv4_devconf *devconf;
+ struct in_device *in_dev;
+ struct net_device *dev;
+ int ifindex;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+ devconf_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ if (!tb[NETCONFA_IFINDEX])
+ goto errout;
+
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ switch (ifindex) {
+ case NETCONFA_IFINDEX_ALL:
+ devconf = net->ipv4.devconf_all;
+ break;
+ case NETCONFA_IFINDEX_DEFAULT:
+ devconf = net->ipv4.devconf_dflt;
+ break;
+ default:
+ dev = __dev_get_by_index(net, ifindex);
+ if (dev == NULL)
+ goto errout;
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev == NULL)
+ goto errout;
+ devconf = &in_dev->cnf;
+ break;
+ }
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ -1);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+}
+
#ifdef CONFIG_SYSCTL
static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1467,6 +1816,12 @@ static void inet_forward_change(struct net *net)
IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
for_each_netdev(net, dev) {
struct in_device *in_dev;
@@ -1474,8 +1829,11 @@ static void inet_forward_change(struct net *net)
dev_disable_lro(dev);
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
- if (in_dev)
+ if (in_dev) {
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ }
rcu_read_unlock();
}
}
@@ -1501,6 +1859,23 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
+ if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
+ new_value != old_value) {
+ int ifindex;
+
+ if (cnf == net->ipv4.devconf_dflt)
+ ifindex = NETCONFA_IFINDEX_DEFAULT;
+ else if (cnf == net->ipv4.devconf_all)
+ ifindex = NETCONFA_IFINDEX_ALL;
+ else {
+ struct in_device *idev =
+ container_of(cnf, struct in_device,
+ cnf);
+ ifindex = idev->dev->ifindex;
+ }
+ inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
+ ifindex, cnf);
+ }
}
return ret;
@@ -1527,15 +1902,23 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
}
if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
inet_forward_change(net);
- } else if (*valp) {
+ } else {
struct ipv4_devconf *cnf = ctl->extra1;
struct in_device *idev =
container_of(cnf, struct in_device, cnf);
- dev_disable_lro(idev->dev);
+ if (*valp)
+ dev_disable_lro(idev->dev);
+ inet_netconf_notify_devconf(net,
+ NETCONFA_FORWARDING,
+ idev->dev->ifindex,
+ cnf);
}
rtnl_unlock();
rt_cache_flush(net);
- }
+ } else
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
}
return ret;
@@ -1804,10 +2187,14 @@ void __init devinet_init(void)
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
+ schedule_delayed_work(&check_lifetime_work, 0);
+
rtnl_af_register(&inet_af_ops);
rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
+ rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
+ NULL, NULL);
}
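The new RTM_GETNETCONF handler above answers netlink queries with an RTM_NEWNETCONF reply. A hedged userspace sketch, assuming headers that carry the new RTM_GETNETCONF/NETCONFA_* definitions, of querying the "all" pseudo-interface:

#include <linux/netconf.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Build and send an RTM_GETNETCONF request; a real caller would then
 * recv() and parse the RTM_NEWNETCONF reply that
 * inet_netconf_get_devconf() unicasts back. */
static int query_ipv4_netconf_all(void)
{
	struct {
		struct nlmsghdr nh;
		struct netconfmsg ncm;
		char buf[RTA_SPACE(sizeof(int))];
	} req;
	struct rtattr *rta;
	int ifindex = NETCONFA_IFINDEX_ALL;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct netconfmsg));
	req.nh.nlmsg_type = RTM_GETNETCONF;
	req.nh.nlmsg_flags = NLM_F_REQUEST;
	req.ncm.ncm_family = AF_INET;

	/* NETCONFA_IFINDEX selects which devconf block to report. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = NETCONFA_IFINDEX;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	memcpy(RTA_DATA(rta), &ifindex, sizeof(int));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(sizeof(int));

	if (send(fd, &req, req.nh.nlmsg_len, 0) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}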
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b61e9deb7c7..3b4f0cd2e63 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -346,7 +346,10 @@ static int esp_input_done2(struct sk_buff *skb, int err)
pskb_trim(skb, skb->len - alen - padlen - 2);
__skb_pull(skb, hlen);
- skb_set_transport_header(skb, -ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
err = nexthdr[1];
@@ -499,9 +502,12 @@ static void esp4_err(struct sk_buff *skb, u32 info)
if (!x)
return;
- if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
+ atomic_inc(&flow_cache_genid);
+ rt_genid_bump(net);
+
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
- else
+ } else
ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
xfrm_state_put(x);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 825c608826d..eb4bb12b3eb 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -112,7 +112,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
@@ -122,7 +121,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
rcu_read_lock();
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (tb->tb_id == id) {
rcu_read_unlock();
return tb;
@@ -137,13 +136,12 @@ static void fib_flush(struct net *net)
{
int flushed = 0;
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, node, head, tb_hlist)
+ hlist_for_each_entry(tb, head, tb_hlist)
flushed += fib_table_flush(tb);
}
@@ -488,7 +486,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(&rt, arg, sizeof(rt)))
@@ -656,7 +654,6 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
int dumped = 0;
@@ -670,7 +667,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, node, head, tb_hlist) {
+ hlist_for_each_entry(tb, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
@@ -974,7 +971,7 @@ static void nl_fib_input(struct sk_buff *skb)
nl_fib_lookup(frn, tb);
- portid = NETLINK_CB(skb).portid; /* pid of sending process */
+ portid = NETLINK_CB(skb).portid; /* netlink portid */
NETLINK_CB(skb).portid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
@@ -1117,11 +1114,11 @@ static void ip_fib_net_exit(struct net *net)
for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
struct fib_table *tb;
struct hlist_head *head;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
head = &net->ipv4.fib_table_hash[i];
- hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
- hlist_del(node);
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+ hlist_del(&tb->tb_hlist);
fib_table_flush(tb);
fib_free_table(tb);
}
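Many hunks in this file and the ones that follow are mechanical fallout of the hlist_for_each_entry*() signature change, which drops the separate struct hlist_node cursor. In sketch form, with foo/foo_lookup as placeholders:

#include <linux/list.h>
#include <linux/rculist.h>

struct foo {
	struct hlist_node hash;
	int key;
};

/* After the API change the entry pointer itself acts as the cursor. */
static struct foo *foo_lookup(struct hlist_head *head, int key)
{
	struct foo *f;

	hlist_for_each_entry_rcu(f, head, hash) {	/* was (f, node, head, hash) */
		if (f->key == key)
			return f;
	}
	return NULL;
}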
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 71b125cd5db..8f6cb7a87cd 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -298,14 +298,13 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_info *fi;
unsigned int hash;
hash = fib_info_hashfn(nfi);
head = &fib_info_hash[hash];
- hlist_for_each_entry(fi, node, head, fib_hash) {
+ hlist_for_each_entry(fi, head, fib_hash) {
if (!net_eq(fi->fib_net, nfi->fib_net))
continue;
if (fi->fib_nhs != nfi->fib_nhs)
@@ -331,7 +330,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
unsigned int hash;
@@ -339,7 +337,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
hash = fib_devindex_hashfn(dev->ifindex);
head = &fib_info_devhash[hash];
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
if (nh->nh_dev == dev &&
nh->nh_gw == gw &&
!(nh->nh_flags & RTNH_F_DEAD)) {
@@ -721,10 +719,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+ hlist_for_each_entry_safe(fi, n, head, fib_hash) {
struct hlist_head *dest;
unsigned int new_hash;
@@ -739,10 +737,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
for (i = 0; i < old_size; i++) {
struct hlist_head *lhead = &fib_info_laddrhash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
struct hlist_head *ldest;
unsigned int new_hash;
@@ -803,7 +801,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
unsigned int bytes;
if (!new_size)
- new_size = 1;
+ new_size = 16;
bytes = new_size * sizeof(struct hlist_head *);
new_info_hash = fib_info_hash_alloc(bytes);
new_laddrhash = fib_info_hash_alloc(bytes);
@@ -1096,13 +1094,12 @@ int fib_sync_down_addr(struct net *net, __be32 local)
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
- struct hlist_node *node;
struct fib_info *fi;
if (fib_info_laddrhash == NULL || local == 0)
return 0;
- hlist_for_each_entry(fi, node, head, fib_lhash) {
+ hlist_for_each_entry(fi, head, fib_lhash) {
if (!net_eq(fi->fib_net, net))
continue;
if (fi->fib_prefsrc == local) {
@@ -1120,13 +1117,12 @@ int fib_sync_down_dev(struct net_device *dev, int force)
struct fib_info *prev_fi = NULL;
unsigned int hash = fib_devindex_hashfn(dev->ifindex);
struct hlist_head *head = &fib_info_devhash[hash];
- struct hlist_node *node;
struct fib_nh *nh;
if (force)
scope = -1;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int dead;
@@ -1232,7 +1228,6 @@ int fib_sync_up(struct net_device *dev)
struct fib_info *prev_fi;
unsigned int hash;
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
int ret;
@@ -1244,7 +1239,7 @@ int fib_sync_up(struct net_device *dev)
head = &fib_info_devhash[hash];
ret = 0;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int alive;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31d771ca9a7..ff06b7543d9 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -920,10 +920,9 @@ nomem:
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
struct hlist_head *head = &l->list;
- struct hlist_node *node;
struct leaf_info *li;
- hlist_for_each_entry_rcu(li, node, head, hlist)
+ hlist_for_each_entry_rcu(li, head, hlist)
if (li->plen == plen)
return li;
@@ -943,12 +942,11 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen)
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
struct leaf_info *li = NULL, *last = NULL;
- struct hlist_node *node;
if (hlist_empty(head)) {
hlist_add_head_rcu(&new->hlist, head);
} else {
- hlist_for_each_entry(li, node, head, hlist) {
+ hlist_for_each_entry(li, head, hlist) {
if (new->plen > li->plen)
break;
@@ -1354,9 +1352,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
{
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
- struct hlist_node *node;
- hlist_for_each_entry_rcu(li, node, hhead, hlist) {
+ hlist_for_each_entry_rcu(li, hhead, hlist) {
struct fib_alias *fa;
if (l->key != (key & li->mask_plen))
@@ -1740,10 +1737,10 @@ static int trie_flush_leaf(struct leaf *l)
{
int found = 0;
struct hlist_head *lih = &l->list;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
struct leaf_info *li = NULL;
- hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
+ hlist_for_each_entry_safe(li, tmp, lih, hlist) {
found += trie_flush_list(&li->falh);
if (list_empty(&li->falh)) {
@@ -1895,14 +1892,13 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
struct leaf_info *li;
- struct hlist_node *node;
int i, s_i;
s_i = cb->args[4];
i = 0;
/* rcu_read_lock is held by the caller */
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
if (i < s_i) {
i++;
continue;
@@ -2092,14 +2088,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
if (IS_LEAF(n)) {
struct leaf *l = (struct leaf *)n;
struct leaf_info *li;
- struct hlist_node *tmp;
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
+ hlist_for_each_entry_rcu(li, &l->list, hlist)
++s->prefixes;
} else {
const struct tnode *tn = (const struct tnode *) n;
@@ -2200,10 +2195,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- struct hlist_node *node;
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
struct trie *t = (struct trie *) tb->tb_data;
struct trie_stat stat;
@@ -2245,10 +2239,9 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- struct hlist_node *node;
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
struct rt_trie_node *n;
for (n = fib_trie_get_first(iter,
@@ -2298,7 +2291,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* new hash chain */
while (++h < FIB_TABLE_HASHSZ) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
if (n)
goto found;
@@ -2381,13 +2374,12 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
} else {
struct leaf *l = (struct leaf *) n;
struct leaf_info *li;
- struct hlist_node *node;
__be32 val = htonl(l->key);
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
struct fib_alias *fa;
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2532,7 +2524,6 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
{
struct leaf *l = v;
struct leaf_info *li;
- struct hlist_node *node;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2541,7 +2532,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
struct fib_alias *fa;
__be32 mask, prefix;
@@ -2607,31 +2598,31 @@ static const struct file_operations fib_route_fops = {
int __net_init fib_proc_init(struct net *net)
{
- if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
+ if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
goto out1;
- if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
- &fib_triestat_fops))
+ if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+ &fib_triestat_fops))
goto out2;
- if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
+ if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
goto out3;
return 0;
out3:
- proc_net_remove(net, "fib_triestat");
+ remove_proc_entry("fib_triestat", net->proc_net);
out2:
- proc_net_remove(net, "fib_trie");
+ remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
void __net_exit fib_proc_exit(struct net *net)
{
- proc_net_remove(net, "fib_trie");
- proc_net_remove(net, "fib_triestat");
- proc_net_remove(net, "route");
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ remove_proc_entry("route", net->proc_net);
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 42a491055c7..7a4c710c4cd 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -19,6 +19,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
@@ -26,6 +27,11 @@
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static DEFINE_SPINLOCK(gre_proto_lock);
+struct gre_base_hdr {
+ __be16 flags;
+ __be16 protocol;
+};
+#define GRE_HEADER_SECTION 4
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
@@ -112,12 +118,117 @@ static void gre_err(struct sk_buff *skb, u32 info)
rcu_read_unlock();
}
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t enc_features;
+ int ghl = GRE_HEADER_SECTION;
+ struct gre_base_hdr *greh;
+ int mac_len = skb->mac_len;
+ int tnl_hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ if (greh->flags & GRE_KEY)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_SEQ)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_CSUM) {
+ ghl += GRE_HEADER_SECTION;
+ csum = true;
+ } else
+ csum = false;
+
+ /* setup inner skb. */
+ if (greh->protocol == htons(ETH_P_TEB)) {
+ struct ethhdr *eth = eth_hdr(skb);
+ skb->protocol = eth->h_proto;
+ } else {
+ skb->protocol = greh->protocol;
+ }
+
+ skb->encapsulation = 0;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs))
+ goto out;
+
+ skb = segs;
+ tnl_hlen = skb_tnl_header_len(skb);
+ do {
+ __skb_push(skb, ghl);
+ if (csum) {
+ __be32 *pcsum;
+
+ if (skb_has_shared_frag(skb)) {
+ int err;
+
+ err = __skb_linearize(skb);
+ if (err) {
+ kfree_skb(segs);
+ segs = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ greh = (struct gre_base_hdr *)(skb->data);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0));
+ }
+ __skb_push(skb, tnl_hlen - ghl);
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static int gre_gso_send_check(struct sk_buff *skb)
+{
+ if (!skb->encapsulation)
+ return -EINVAL;
+ return 0;
+}
+
static const struct net_protocol net_gre_protocol = {
.handler = gre_rcv,
.err_handler = gre_err,
.netns_ok = 1,
};
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_send_check = gre_gso_send_check,
+ .gso_segment = gre_gso_segment,
+ },
+};
+
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
@@ -127,11 +238,18 @@ static int __init gre_init(void)
return -EAGAIN;
}
+ if (inet_add_offload(&gre_offload, IPPROTO_GRE)) {
+ pr_err("can't add protocol offload\n");
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+ return -EAGAIN;
+ }
+
return 0;
}
static void __exit gre_exit(void)
{
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f2eccd53174..3ac5dff7962 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -257,7 +257,8 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
rc = inet_peer_xrlim_allow(peer,
net->ipv4.sysctl_icmp_ratelimit);
- inet_putpeer(peer);
+ if (peer)
+ inet_putpeer(peer);
}
out:
return rc;
@@ -933,6 +934,29 @@ error:
goto drop;
}
+void icmp_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
+ struct net *net = dev_net(skb->dev);
+
+ /*
+ * Use ping_err to handle all icmp errors except those
+ * triggered by ICMP_ECHOREPLY, which is sent from the kernel.
+ */
+ if (icmph->type != ICMP_ECHOREPLY) {
+ ping_err(skb, info);
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ else if (type == ICMP_REDIRECT)
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+}
+
/*
* This table is the definition of how we handle ICMP.
*/
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 736ab70fd17..d8c232794bc 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2646,24 +2646,25 @@ static int __net_init igmp_net_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops);
+ pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
if (!pde)
goto out_igmp;
- pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+ &igmp_mcf_seq_fops);
if (!pde)
goto out_mcfilter;
return 0;
out_mcfilter:
- proc_net_remove(net, "igmp");
+ remove_proc_entry("igmp", net->proc_net);
out_igmp:
return -ENOMEM;
}
static void __net_exit igmp_net_exit(struct net *net)
{
- proc_net_remove(net, "mcfilter");
- proc_net_remove(net, "igmp");
+ remove_proc_entry("mcfilter", net->proc_net);
+ remove_proc_entry("igmp", net->proc_net);
}
static struct pernet_operations igmp_net_ops = {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d34ce2972c8..786d97aee75 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -57,8 +57,9 @@ int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax)
{
struct sock *sk2;
- struct hlist_node *node;
int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
/*
* Unlike other sk lookup places we do not check
@@ -67,14 +68,17 @@ int inet_csk_bind_conflict(const struct sock *sk,
* one this bucket belongs to.
*/
- sk_for_each_bound(sk2, node, &tb->owners) {
+ sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2))))) {
const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -90,7 +94,7 @@ int inet_csk_bind_conflict(const struct sock *sk,
}
}
}
- return node != NULL;
+ return sk2 != NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
@@ -101,11 +105,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
- struct hlist_node *node;
struct inet_bind_bucket *tb;
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
+ kuid_t uid = sock_i_uid(sk);
local_bh_disable();
if (!snum) {
@@ -123,11 +127,14 @@ again:
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
@@ -174,7 +181,7 @@ have_snum:
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
}
@@ -185,14 +192,18 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) {
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
@@ -212,9 +223,19 @@ tb_not_found:
tb->fastreuse = 1;
else
tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else
+ tb->fastreuseport = 0;
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ tb->fastreuseport = 0;
+ }
success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
@@ -521,20 +542,30 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
int *expire, int *resend)
{
if (!rskq_defer_accept) {
- *expire = req->retrans >= thresh;
+ *expire = req->num_timeout >= thresh;
*resend = 1;
return;
}
- *expire = req->retrans >= thresh &&
- (!inet_rsk(req)->acked || req->retrans >= max_retries);
+ *expire = req->num_timeout >= thresh &&
+ (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
/*
* Do not resend while waiting for data after ACK,
* start to resend on end of deferring period to give
* last chance for data or ACK to create established socket.
*/
*resend = !inet_rsk(req)->acked ||
- req->retrans >= rskq_defer_accept - 1;
+ req->num_timeout >= rskq_defer_accept - 1;
+}
+
+int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+{
+ int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
+
+ if (!err)
+ req->num_retrans++;
+ return err;
}
+EXPORT_SYMBOL(inet_rtx_syn_ack);
void inet_csk_reqsk_queue_prune(struct sock *parent,
const unsigned long interval,
@@ -599,13 +630,14 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
req->rsk_ops->syn_ack_timeout(parent, req);
if (!expire &&
(!resend ||
- !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
+ !inet_rtx_syn_ack(parent, req) ||
inet_rsk(req)->acked)) {
unsigned long timeo;
- if (req->retrans++ == 0)
+ if (req->num_timeout++ == 0)
lopt->qlen_young--;
- timeo = min((timeout << req->retrans), max_rto);
+ timeo = min(timeout << req->num_timeout,
+ max_rto);
req->expires = now + timeo;
reqp = &req->dl_next;
continue;
@@ -699,6 +731,23 @@ void inet_csk_destroy_sock(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
+/* This function allows forcing the closure of a socket after the call to
+ * tcp/dccp_create_openreq_child().
+ */
+void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ /* sk_clone_lock locked the socket and set refcnt to 2 */
+ bh_unlock_sock(sk);
+ sock_put(sk);
+
+	/* The below is needed so that inet_csk_destroy_sock() can be called */
+ sock_set_flag(sk, SOCK_DEAD);
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ inet_sk(sk)->inet_num = 0;
+}
+EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
struct inet_sock *inet = inet_sk(sk);
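
The bind_conflict and fastreuseport logic above is the TCP half of SO_REUSEPORT: sockets with the same effective UID may all bind the same address/port, and the listener lookup (see the inet_hashtables.c hunks below) spreads incoming connections across them. A hedged userspace sketch; the fallback constant is an assumption for older headers:

#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_REUSEPORT
#define SO_REUSEPORT 15	/* common Linux value; verify for your arch */
#endif

static int reuseport_listener(unsigned short port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in a = { .sin_family = AF_INET,
				 .sin_port = htons(port),
				 .sin_addr.s_addr = htonl(INADDR_ANY) };

	/* Must be set before bind(); the uid_eq() checks in
	 * inet_csk_bind_conflict() above then let both sockets share
	 * the port. */
	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	if (bind(fd, (struct sockaddr *)&a, sizeof(a)) || listen(fd, 128))
		return -1;
	return fd;
}

int main(void)
{
	int a = reuseport_listener(8080);
	int b = reuseport_listener(8080);	/* second bind also succeeds */

	printf("fds: %d %d\n", a, b);
	return 0;
}
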
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 535584c00f9..7afa2c3c788 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -44,6 +44,10 @@ struct inet_diag_entry {
u16 dport;
u16 family;
u16 userlocks;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
+ struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
+#endif
};
static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -105,6 +109,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->id.idiag_src[0] = inet->inet_rcv_saddr;
r->id.idiag_dst[0] = inet->inet_daddr;
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
+
/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
* hence this needs to be included regardless of socket family.
*/
@@ -428,25 +435,31 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
break;
}
- if (cond->prefix_len == 0)
- break;
-
if (op->code == INET_DIAG_BC_S_COND)
addr = entry->saddr;
else
addr = entry->daddr;
+ if (cond->family != AF_UNSPEC &&
+ cond->family != entry->family) {
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3,
+ cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
if (bitstring_match(addr, cond->addr,
cond->prefix_len))
break;
- if (entry->family == AF_INET6 &&
- cond->family == AF_INET) {
- if (addr[0] == 0 && addr[1] == 0 &&
- addr[2] == htonl(0xffff) &&
- bitstring_match(addr + 3, cond->addr,
- cond->prefix_len))
- break;
- }
yes = 0;
break;
}
@@ -509,6 +522,55 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
}
+/* Validate an inet_diag_hostcond. */
+static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ int addr_len;
+ struct inet_diag_hostcond *cond;
+
+ /* Check hostcond space. */
+ *min_len += sizeof(struct inet_diag_hostcond);
+ if (len < *min_len)
+ return false;
+ cond = (struct inet_diag_hostcond *)(op + 1);
+
+ /* Check address family and address length. */
+ switch (cond->family) {
+ case AF_UNSPEC:
+ addr_len = 0;
+ break;
+ case AF_INET:
+ addr_len = sizeof(struct in_addr);
+ break;
+ case AF_INET6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+ default:
+ return false;
+ }
+ *min_len += addr_len;
+ if (len < *min_len)
+ return false;
+
+ /* Check prefix length (in bits) vs address length (in bytes). */
+ if (cond->prefix_len > 8 * addr_len)
+ return false;
+
+ return true;
+}
+
+/* Validate a port comparison operator. */
+static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
+ int len, int *min_len)
+{
+ /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
+ *min_len += sizeof(struct inet_diag_bc_op);
+ if (len < *min_len)
+ return false;
+ return true;
+}
+
static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
{
const void *bc = bytecode;
@@ -516,29 +578,39 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
while (len > 0) {
const struct inet_diag_bc_op *op = bc;
+ int min_len = sizeof(struct inet_diag_bc_op);
//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
switch (op->code) {
- case INET_DIAG_BC_AUTO:
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND:
+ if (!valid_hostcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
case INET_DIAG_BC_D_LE:
- case INET_DIAG_BC_JMP:
- if (op->no < 4 || op->no > len + 4 || op->no & 3)
- return -EINVAL;
- if (op->no < len &&
- !valid_cc(bytecode, bytecode_len, len - op->no))
+ if (!valid_port_comparison(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
break;
default:
return -EINVAL;
}
- if (op->yes < 4 || op->yes > len + 4 || op->yes & 3)
+
+ if (op->code != INET_DIAG_BC_NOP) {
+ if (op->no < min_len || op->no > len + 4 || op->no & 3)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ }
+
+ if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
return -EINVAL;
bc += op->yes;
len -= op->yes;
@@ -596,6 +668,36 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
+/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses
+ * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6.
+ */
+static inline void inet_diag_req_addrs(const struct sock *sk,
+ const struct request_sock *req,
+ struct inet_diag_entry *entry)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ if (req->rsk_ops->family == AF_INET6) {
+ entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32;
+ entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32;
+ } else if (req->rsk_ops->family == AF_INET) {
+ ipv6_addr_set_v4mapped(ireq->loc_addr,
+ &entry->saddr_storage);
+ ipv6_addr_set_v4mapped(ireq->rmt_addr,
+ &entry->daddr_storage);
+ entry->saddr = entry->saddr_storage.s6_addr32;
+ entry->daddr = entry->daddr_storage.s6_addr32;
+ }
+ } else
+#endif
+ {
+ entry->saddr = &ireq->loc_addr;
+ entry->daddr = &ireq->rmt_addr;
+ }
+}
+
static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
struct request_sock *req,
struct user_namespace *user_ns,
@@ -617,7 +719,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
r->idiag_family = sk->sk_family;
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
- r->idiag_retrans = req->retrans;
+ r->idiag_retrans = req->num_retrans;
r->id.idiag_if = sk->sk_bound_dev_if;
sock_diag_save_cookie(req, r->id.idiag_cookie);
@@ -637,8 +739,10 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
r->idiag_inode = 0;
#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = inet6_rsk(req)->loc_addr;
- *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
+ struct inet_diag_entry entry;
+ inet_diag_req_addrs(sk, req, &entry);
+ memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr));
+ memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));
}
#endif
@@ -691,18 +795,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
continue;
if (bc) {
- entry.saddr =
-#if IS_ENABLED(CONFIG_IPV6)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->loc_addr.s6_addr32 :
-#endif
- &ireq->loc_addr;
- entry.daddr =
-#if IS_ENABLED(CONFIG_IPV6)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
- &ireq->rmt_addr;
+ inet_diag_req_addrs(sk, req, &entry);
entry.dport = ntohs(ireq->rmt_port);
if (!inet_diag_bc_run(bc, &entry))
@@ -892,13 +985,16 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct inet_diag_req_v2 *r, struct nlattr *bc)
{
const struct inet_diag_handler *handler;
+ int err = 0;
handler = inet_diag_lock_handler(r->sdiag_protocol);
if (!IS_ERR(handler))
handler->dump(skb, cb, r, bc);
+ else
+ err = PTR_ERR(handler);
inet_diag_unlock_handler(handler);
- return skb->len;
+ return err ? : skb->len;
}
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
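
To make the audit rules above concrete: a filter is a flat array of struct inet_diag_bc_op where yes/no are forward byte offsets, and a port condition carries its port in the no field of a follow-on op, exactly as valid_port_comparison() expects. A sketch of a single "dport >= 1024" condition that passes inet_diag_bc_audit() (the offsets are the interesting part):

#include <linux/inet_diag.h>
#include <string.h>

/* Two 4-byte ops, len == 8.  yes == 8 jumps past the end (accept);
 * no == len + 4 == 12 is the reject target.  Both are >= min_len (8),
 * <= len + 4 and 4-byte aligned, so the audit above accepts them. */
static int build_dport_ge(struct inet_diag_bc_op ops[2], unsigned short port)
{
	memset(ops, 0, 2 * sizeof(ops[0]));
	ops[0].code = INET_DIAG_BC_D_GE;
	ops[0].yes  = 2 * sizeof(ops[0]);
	ops[0].no   = 2 * sizeof(ops[0]) + 4;
	ops[1].no   = port;		/* payload op: the port to compare */
	return 2 * sizeof(ops[0]);	/* bytecode length */
}

The resulting bytes travel in an INET_DIAG_REQ_BYTECODE attribute; this is the same layout ss(8) builds for its filter expressions.
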
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4750d2b74d7..245ae078a07 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -33,9 +33,9 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_queue *q;
- struct hlist_node *p, *n;
+ struct hlist_node *n;
- hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) {
+ hlist_for_each_entry_safe(q, n, &f->hash[i], list) {
unsigned int hval = f->hashfn(q);
if (hval != i) {
@@ -73,8 +73,9 @@ EXPORT_SYMBOL(inet_frags_init);
void inet_frags_init_net(struct netns_frags *nf)
{
nf->nqueues = 0;
- atomic_set(&nf->mem, 0);
+ init_frag_mem_limit(nf);
INIT_LIST_HEAD(&nf->lru_list);
+ spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);
@@ -91,6 +92,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
local_bh_disable();
inet_frag_evictor(nf, f, true);
local_bh_enable();
+
+ percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
@@ -98,9 +101,9 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
write_lock(&f->lock);
hlist_del(&fq->list);
- list_del(&fq->lru_list);
fq->net->nqueues--;
write_unlock(&f->lock);
+ inet_frag_lru_del(fq);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -117,12 +120,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
- struct sk_buff *skb, int *work)
+ struct sk_buff *skb)
{
- if (work)
- *work -= skb->truesize;
-
- atomic_sub(skb->truesize, &nf->mem);
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
@@ -133,6 +132,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
{
struct sk_buff *fp;
struct netns_frags *nf;
+ unsigned int sum, sum_truesize = 0;
WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0);
@@ -143,13 +143,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
while (fp) {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(nf, f, fp, work);
+ sum_truesize += fp->truesize;
+ frag_kfree_skb(nf, f, fp);
fp = xp;
}
-
+ sum = sum_truesize + f->qsize;
if (work)
- *work -= f->qsize;
- atomic_sub(f->qsize, &nf->mem);
+ *work -= sum;
+ sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
@@ -164,22 +165,23 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
int work, evicted = 0;
if (!force) {
- if (atomic_read(&nf->mem) <= nf->high_thresh)
+ if (frag_mem_limit(nf) <= nf->high_thresh)
return 0;
}
- work = atomic_read(&nf->mem) - nf->low_thresh;
+ work = frag_mem_limit(nf) - nf->low_thresh;
while (work > 0) {
- read_lock(&f->lock);
+ spin_lock(&nf->lru_lock);
+
if (list_empty(&nf->lru_list)) {
- read_unlock(&f->lock);
+ spin_unlock(&nf->lru_lock);
break;
}
q = list_first_entry(&nf->lru_list,
struct inet_frag_queue, lru_list);
atomic_inc(&q->refcnt);
- read_unlock(&f->lock);
+ spin_unlock(&nf->lru_lock);
spin_lock(&q->lock);
if (!(q->last_in & INET_FRAG_COMPLETE))
@@ -201,7 +203,6 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
{
struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
- struct hlist_node *n;
#endif
unsigned int hash;
@@ -217,7 +218,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
* such entry could be created on other cpu, while we
* promoted read lock to write lock.
*/
- hlist_for_each_entry(qp, n, &f->hash[hash], list) {
+ hlist_for_each_entry(qp, &f->hash[hash], list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
write_unlock(&f->lock);
@@ -233,9 +234,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &f->hash[hash]);
- list_add_tail(&qp->lru_list, &nf->lru_list);
nf->nqueues++;
write_unlock(&f->lock);
+ inet_frag_lru_add(nf, qp);
return qp;
}
@@ -250,7 +251,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
q->net = nf;
f->constructor(q, arg);
- atomic_add(f->qsize, &nf->mem);
+ add_frag_mem_limit(q, f->qsize);
+
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
@@ -275,9 +277,8 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
__releases(&f->lock)
{
struct inet_frag_queue *q;
- struct hlist_node *n;
- hlist_for_each_entry(q, n, &f->hash[hash], list) {
+ hlist_for_each_entry(q, &f->hash[hash], list) {
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
read_unlock(&f->lock);
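
The accounting conversion above trades the per-namespace atomic_t for a percpu_counter (cheap, batched updates from many CPUs) plus a dedicated lru_lock, so LRU churn no longer rides the hash table's rwlock. A rough sketch of what the new helpers are expected to wrap; the real definitions live in include/net/inet_frag.h, so treat this as an approximation:

/* Approximate shape of the mem-limit helpers used in the hunks above. */
static inline int frag_mem_limit(struct netns_frags *nf)
{
	/* Fast, possibly slightly stale read of the distributed counter. */
	return percpu_counter_read(&nf->mem);
}

static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
{
	__percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
}

static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
{
	__percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
}
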
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 7880af97020..6af375afeee 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum;
tb->fastreuse = 0;
+ tb->fastreuseport = 0;
tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
@@ -119,13 +120,12 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
* that the listener socket's icsk_bind_hash is the same
* as that of the child socket. We have to look up or
* create a new bind bucket for the child here. */
- struct hlist_node *node;
- inet_bind_bucket_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), sock_net(sk)) &&
tb->port == port)
break;
}
- if (!node) {
+ if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
sock_net(sk), head, port);
if (!tb) {
@@ -151,16 +151,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
__be32 rcv_saddr = inet->inet_rcv_saddr;
- score = sk->sk_family == PF_INET ? 1 : 0;
+ score = sk->sk_family == PF_INET ? 2 : 1;
if (rcv_saddr) {
if (rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -176,6 +176,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
@@ -183,17 +184,29 @@ struct sock *__inet_lookup_listener(struct net *net,
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore;
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
rcu_read_lock();
begin:
result = NULL;
- hiscore = -1;
+ hiscore = 0;
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
result = sk;
hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (((u64)phash * matches) >> 32 == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
}
}
/*
@@ -237,12 +250,14 @@ struct sock *__inet_lookup_established(struct net *net,
rcu_read_lock();
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
- if (INET_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif)) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
goto begintw;
- if (unlikely(!INET_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif))) {
+ if (unlikely(!INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
sock_put(sk);
goto begin;
}
@@ -260,14 +275,18 @@ begin:
begintw:
/* Must check for a TIME_WAIT'er before going to listener hash. */
sk_nulls_for_each_rcu(sk, node, &head->twchain) {
- if (INET_TW_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif)) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(INET_TW_MATCH(sk, net, acookie,
+ saddr, daddr, ports,
+ dif))) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
sk = NULL;
goto out;
}
- if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif))) {
+ if (unlikely(!INET_TW_MATCH(sk, net, acookie,
+ saddr, daddr, ports,
+ dif))) {
sock_put(sk);
goto begintw;
}
@@ -314,10 +333,12 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
/* Check TIME-WAIT sockets first. */
sk_nulls_for_each(sk2, node, &head->twchain) {
- tw = inet_twsk(sk2);
+ if (sk2->sk_hash != hash)
+ continue;
- if (INET_TW_MATCH(sk2, net, hash, acookie,
- saddr, daddr, ports, dif)) {
+ if (likely(INET_TW_MATCH(sk2, net, acookie,
+ saddr, daddr, ports, dif))) {
+ tw = inet_twsk(sk2);
if (twsk_unique(sk, sk2, twp))
goto unique;
else
@@ -328,8 +349,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
/* And established part... */
sk_nulls_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, net, hash, acookie,
- saddr, daddr, ports, dif))
+ if (sk2->sk_hash != hash)
+ continue;
+ if (likely(INET_MATCH(sk2, net, acookie,
+ saddr, daddr, ports, dif)))
goto not_unique;
}
@@ -469,7 +492,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
int i, remaining, low, high, port;
static u32 hint;
u32 offset = hint + port_offset;
- struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;
inet_get_local_port_range(&low, &high);
@@ -488,10 +510,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* because the established check is already
* unique enough.
*/
- inet_bind_bucket_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), net) &&
tb->port == port) {
- if (tb->fastreuse >= 0)
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
@@ -508,6 +531,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
break;
}
tb->fastreuse = -1;
+ tb->fastreuseport = -1;
goto ok;
next_port:
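
The __inet_lookup_listener() change above balances connections across equally scoring SO_REUSEPORT listeners by reservoir sampling: the k-th tied candidate replaces the current pick with probability about 1/k, driven by a flow hash so the choice is stable per 4-tuple. The core test, annotated in isolation:

/* phash is a 32-bit per-flow hash, matches counts tied listeners so far.
 * (u64)phash * matches stays below 2^32 (so the >> 32 yields 0) for
 * roughly a 1/matches slice of hash values; applied at each k = 1..n,
 * every tied socket ends up selected with probability ~1/n. */
if (((u64)phash * matches) >> 32 == 0)
	result = sk;
phash = next_pseudo_random32(phash);
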
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2784db3155f..1f27c9f4afd 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -216,7 +216,6 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
const int slot)
{
struct inet_timewait_sock *tw;
- struct hlist_node *node;
unsigned int killed;
int ret;
@@ -229,7 +228,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
killed = 0;
ret = 0;
rescan:
- inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
__inet_twsk_del_dead_node(tw);
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
@@ -438,10 +437,10 @@ void inet_twdr_twcal_tick(unsigned long data)
for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
+ struct hlist_node *safe;
struct inet_timewait_sock *tw;
- inet_twsk_for_each_inmate_safe(tw, node, safe,
+ inet_twsk_for_each_inmate_safe(tw, safe,
&twdr->twcal_row[slot]) {
__inet_twsk_del_dead_node(tw);
__inet_twsk_kill(tw, twdr->hashinfo);
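
One pattern recurs through nearly every file in this series and is worth a gloss: the hlist iterators dropped their separate struct hlist_node cursor and now advance the typed pointer itself. Schematically (illustrative fragments, not compilable on their own):

/* Before (3.8 and earlier): a separate cursor, tested after the loop. */
struct hlist_node *pos;
hlist_for_each_entry(tb, pos, &head->chain, node)
	if (tb->port == port)
		break;
found = (pos != NULL);

/* After: the typed pointer is the cursor, and it is left NULL when the
 * loop completes without a break, which is why "return sk2 != NULL"
 * and "if (!tb)" replace the node checks in the hunks above. */
hlist_for_each_entry(tb, &head->chain, node)
	if (tb->port == port)
		break;
found = (tb != NULL);
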
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 448e6854682..b6d30acb600 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -122,7 +122,7 @@ int ip_frag_nqueues(struct net *net)
int ip_frag_mem(struct net *net)
{
- return atomic_read(&net->ipv4.frags.mem);
+ return sum_frag_mem_limit(&net->ipv4.frags);
}
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
@@ -161,13 +161,6 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
qp->user == arg->user;
}
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
-{
- atomic_sub(skb->truesize, &nf->mem);
- kfree_skb(skb);
-}
-
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
@@ -340,6 +333,7 @@ static inline int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
atomic_inc(&qp->q.refcnt);
@@ -349,9 +343,12 @@ static int ip_frag_reinit(struct ipq *qp)
fp = qp->q.fragments;
do {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(qp->q.net, fp);
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
fp = xp;
} while (fp);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
qp->q.last_in = 0;
qp->q.len = 0;
@@ -496,7 +493,8 @@ found:
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- frag_kfree_skb(qp->q.net, free_it);
+ sub_frag_mem_limit(&qp->q, free_it->truesize);
+ kfree_skb(free_it);
}
}
@@ -519,7 +517,7 @@ found:
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- atomic_add(skb->truesize, &qp->q.net->mem);
+ add_frag_mem_limit(&qp->q, skb->truesize);
if (offset == 0)
qp->q.last_in |= INET_FRAG_FIRST_IN;
@@ -531,9 +529,7 @@ found:
qp->q.meat == qp->q.len)
return ip_frag_reasm(qp, prev, dev);
- write_lock(&ip4_frags.lock);
- list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
- write_unlock(&ip4_frags.lock);
+ inet_frag_lru_move(&qp->q);
return -EINPROGRESS;
err:
@@ -594,7 +590,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_oversize;
/* Head of list must not be cloned. */
- if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem;
/* If the first fragment is fragmented itself, we split
@@ -617,7 +613,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- atomic_add(clone->truesize, &qp->q.net->mem);
+ add_frag_mem_limit(&qp->q, clone->truesize);
}
skb_push(head, head->data - skb_network_header(head));
@@ -645,7 +641,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
}
fp = next;
}
- atomic_sub(sum_truesize, &qp->q.net->mem);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
head->next = NULL;
head->dev = dev;
@@ -707,28 +703,27 @@ EXPORT_SYMBOL(ip_defrag);
struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
{
- const struct iphdr *iph;
+ struct iphdr iph;
u32 len;
if (skb->protocol != htons(ETH_P_IP))
return skb;
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ if (!skb_copy_bits(skb, 0, &iph, sizeof(iph)))
return skb;
- iph = ip_hdr(skb);
- if (iph->ihl < 5 || iph->version != 4)
+ if (iph.ihl < 5 || iph.version != 4)
return skb;
- if (!pskb_may_pull(skb, iph->ihl*4))
- return skb;
- iph = ip_hdr(skb);
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < (iph->ihl * 4))
+
+ len = ntohs(iph.tot_len);
+ if (skb->len < len || len < (iph.ihl * 4))
return skb;
- if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_is_fragment(&iph)) {
skb = skb_share_check(skb, GFP_ATOMIC);
if (skb) {
+ if (!pskb_may_pull(skb, iph.ihl*4))
+ return skb;
if (pskb_trim_rcsum(skb, len))
return skb;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
@@ -802,6 +797,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[0].data = &net->ipv4.frags.high_thresh;
table[1].data = &net->ipv4.frags.low_thresh;
table[2].data = &net->ipv4.frags.timeout;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
}
hdr = register_net_sysctl(net, "net/ipv4", table);
@@ -848,14 +847,22 @@ static inline void ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
- /*
- * Fragment cache limits. We will commit 256K at one time. Should we
- * cross that limit we will prune down to 192K. This should cope with
- * even the most extreme cases without allowing an attacker to
- * measurably harm machine performance.
+ /* Fragment cache limits.
+ *
+	 * The fragment memory accounting code (tries to) account for
+ * the real memory usage, by measuring both the size of frag
+ * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+ * and the SKB's truesize.
+ *
+ * A 64K fragment consumes 129736 bytes (44*2944)+200
+ * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+ *
+ * We will commit 4MB at one time. Should we cross that limit
+ * we will prune down to 3MB, making room for approx 8 big 64K
+	 * fragments (8 x ~128KB each).
*/
- net->ipv4.frags.high_thresh = 256 * 1024;
- net->ipv4.frags.low_thresh = 192 * 1024;
+ net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
+ net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival
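
Putting numbers on the new limits: with a worst-case 64K fragment queue costing about 129736 bytes, the 4MB high threshold admits roughly 4194304 / 129736, i.e. about 32 such queues per namespace, and pruning back to 3MB frees about 1MB, which is the "approx 8 big 64K fragments" of headroom the comment mentions (8 x ~128KB is roughly 1MB).
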
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7240f8e2dd4..d0ef0e674ec 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -164,21 +164,6 @@ struct ipgre_net {
#define tunnels_r tunnels[2]
#define tunnels_l tunnels[1]
#define tunnels_wc tunnels[0]
-/*
- * Locking : hash tables are protected by RCU and RTNL
- */
-
-#define for_each_ip_tunnel_rcu(start) \
- for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
-
-/* often modified stats are per cpu, other are shared (netdev->stats) */
-struct pcpu_tstats {
- u64 rx_packets;
- u64 rx_bytes;
- u64 tx_packets;
- u64 tx_bytes;
- struct u64_stats_sync syncp;
-};
static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *tot)
@@ -250,7 +235,7 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
ARPHRD_ETHER : ARPHRD_IPGRE;
int score, cand_score = 4;
- for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
+ for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
if (local != t->parms.iph.saddr ||
remote != t->parms.iph.daddr ||
!(t->dev->flags & IFF_UP))
@@ -277,7 +262,7 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
}
}
- for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
+ for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
if (remote != t->parms.iph.daddr ||
!(t->dev->flags & IFF_UP))
continue;
@@ -303,7 +288,7 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
}
}
- for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
+ for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
if ((local != t->parms.iph.saddr &&
(local != t->parms.iph.daddr ||
!ipv4_is_multicast(local))) ||
@@ -331,7 +316,7 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
}
}
- for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
+ for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
if (t->parms.i_key != key ||
!(t->dev->flags & IFF_UP))
continue;
@@ -750,11 +735,36 @@ drop:
return 0;
}
+static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb)
+{
+ int err;
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
+ return skb;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ tunnel->parms.o_flags&GRE_CSUM) {
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
+ struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct pcpu_tstats *tstats;
- const struct iphdr *old_iph = ip_hdr(skb);
+ const struct iphdr *old_iph;
const struct iphdr *tiph;
struct flowi4 fl4;
u8 tos;
@@ -766,17 +776,32 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
int gre_hlen;
__be32 dst;
int mtu;
+ u8 ttl;
+ int err;
+ int pkt_len;
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))
- goto tx_error;
+ skb = handle_offloads(tunnel, skb);
+ if (IS_ERR(skb)) {
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ if (!skb->encapsulation) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ old_iph = ip_hdr(skb);
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
gre_hlen = 0;
- tiph = (const struct iphdr *)skb->data;
+ if (skb->protocol == htons(ETH_P_IP))
+ tiph = (const struct iphdr *)skb->data;
+ else
+ tiph = &tunnel->parms.iph;
} else {
gre_hlen = tunnel->hlen;
tiph = &tunnel->parms.iph;
@@ -828,9 +853,10 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
goto tx_error;
}
+ ttl = tiph->ttl;
tos = tiph->tos;
- if (tos == 1) {
- tos = 0;
+ if (tos & 0x1) {
+ tos &= ~0x1;
if (skb->protocol == htons(ETH_P_IP))
tos = old_iph->tos;
else if (skb->protocol == htons(ETH_P_IPV6))
@@ -864,7 +890,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if (skb->protocol == htons(ETH_P_IP)) {
df |= (old_iph->frag_off&htons(IP_DF));
- if ((old_iph->frag_off&htons(IP_DF)) &&
+ if (!skb_is_gso(skb) &&
+ (old_iph->frag_off&htons(IP_DF)) &&
mtu < ntohs(old_iph->tot_len)) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
@@ -884,7 +911,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
}
- if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
+ if (!skb_is_gso(skb) &&
+ mtu >= IPV6_MIN_MTU &&
+ mtu < skb->len - tunnel->hlen + gre_hlen) {
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
ip_rt_put(rt);
goto tx_error;
@@ -920,11 +949,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
dev_kfree_skb(skb);
skb = new_skb;
old_iph = ip_hdr(skb);
+		/* Warning: tiph value might point to freed memory */
}
- skb_reset_transport_header(skb);
skb_push(skb, gre_hlen);
skb_reset_network_header(skb);
+ skb_set_transport_header(skb, sizeof(*iph));
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
@@ -943,8 +973,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
iph->daddr = fl4.daddr;
iph->saddr = fl4.saddr;
+ iph->ttl = ttl;
+
+ tunnel_ip_select_ident(skb, old_iph, &rt->dst);
- if ((iph->ttl = tiph->ttl) == 0) {
+ if (ttl == 0) {
if (skb->protocol == htons(ETH_P_IP))
iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
@@ -971,15 +1004,37 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
*ptr = tunnel->parms.o_key;
ptr--;
}
- if (tunnel->parms.o_flags&GRE_CSUM) {
+ /* Skip GRE checksum if skb is getting offloaded. */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE) &&
+ (tunnel->parms.o_flags&GRE_CSUM)) {
+ int offset = skb_transport_offset(skb);
+
+ if (skb_has_shared_frag(skb)) {
+ err = __skb_linearize(skb);
+ if (err)
+ goto tx_error;
+ }
+
*ptr = 0;
- *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset,
+ skb->len - offset,
+ 0));
}
}
nf_reset(skb);
- tstats = this_cpu_ptr(dev->tstats);
- __IPTUNNEL_XMIT(tstats, &dev->stats);
+
+ pkt_len = skb->len - skb_transport_offset(skb);
+ err = ip_local_out(skb);
+ if (likely(net_xmit_eval(err) == 0)) {
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += pkt_len;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ dev->stats.tx_errors++;
+ dev->stats.tx_aborted_errors++;
+ }
return NETDEV_TX_OK;
#if IS_ENABLED(CONFIG_IPV6)
@@ -1049,6 +1104,11 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
mtu = 68;
tunnel->hlen = addend;
+ /* TCP offload with GRE SEQ is not supported. */
+ if (!(tunnel->parms.o_flags & GRE_SEQ)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
return mtu;
}
@@ -1082,7 +1142,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
case SIOCADDTUNNEL:
case SIOCCHGTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
err = -EFAULT;
@@ -1157,7 +1217,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
case SIOCDELTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
if (dev == ign->fb_tunnel_dev) {
@@ -1598,6 +1658,9 @@ static void ipgre_tap_setup(struct net_device *dev)
dev->iflink = 0;
dev->features |= NETIF_F_NETNS_LOCAL;
+
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
}
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
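
One detail of the GRE checksum change above deserves a note: ip_compute_csum() walks only linear data, but with SG/GSO enabled the payload may sit in page frags, so the new code sums with skb_checksum() and folds the result. The same idiom in isolation, assuming (as GRE_CSUM implies) the checksum field sits 4 bytes into the GRE header:

/* Fill the GRE checksum over a possibly non-linear skb.  skb_checksum()
 * walks the head, page frags and frag_list; csum_fold() collapses the
 * 32-bit running sum into the final 16-bit ones-complement checksum. */
static void gre_write_csum(struct sk_buff *skb)
{
	int offset = skb_transport_offset(skb);
	__sum16 *p = (__sum16 *)(skb_transport_header(skb) + 4);

	*p = 0;
	*p = csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
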
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f1395a6fb35..2bdf802e28e 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -208,13 +208,6 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
if (ipprot != NULL) {
int ret;
- if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
- net_info_ratelimited("%s: proto %d isn't netns-ready\n",
- __func__, protocol);
- kfree_skb(skb);
- goto out;
- }
-
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
@@ -235,9 +228,11 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
- } else
+ kfree_skb(skb);
+ } else {
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
+ consume_skb(skb);
+ }
}
}
out:
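
The kfree_skb()/consume_skb() split above is about tracing, not memory: both free the skb, but kfree_skb() fires the drop tracepoint that dropwatch and the skb:kfree_skb perf event count, while consume_skb() records a normal, successful consumption. Locally delivered packets therefore stop showing up as false drops.
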
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1dc01f9793d..310a3647c83 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -409,7 +409,7 @@ int ip_options_compile(struct net *net,
optptr[2] += 8;
break;
default:
- if (!skb && !capable(CAP_NET_RAW)) {
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr + 3;
goto error;
}
@@ -423,7 +423,7 @@ int ip_options_compile(struct net *net,
put_unaligned_be32(midtime, timeptr);
opt->is_changed = 1;
}
- } else {
+ } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
unsigned int overflow = optptr[3]>>4;
if (overflow == 15) {
pp_ptr = optptr + 3;
@@ -445,7 +445,7 @@ int ip_options_compile(struct net *net,
opt->router_alert = optptr - iph;
break;
case IPOPT_CIPSO:
- if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) {
+ if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
pp_ptr = optptr;
goto error;
}
@@ -458,7 +458,7 @@ int ip_options_compile(struct net *net,
case IPOPT_SEC:
case IPOPT_SID:
default:
- if (!skb && !capable(CAP_NET_RAW)) {
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr;
goto error;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6537a408a4f..5e12dca7b3d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -595,6 +595,11 @@ slow_path_clean:
}
slow_path:
+	/* for offloaded checksums, complete the checksum before fragmentation */
+ if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
+ goto fail;
+ iph = ip_hdr(skb);
+
left = skb->len - hlen; /* Space per frame */
ptr = hlen; /* Where to start from */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5eea4a81104..d9c4f113d70 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -457,19 +457,28 @@ static int do_ip_setsockopt(struct sock *sk, int level,
struct inet_sock *inet = inet_sk(sk);
int val = 0, err;
- if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
- (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
- (1<<IP_RETOPTS) | (1<<IP_TOS) |
- (1<<IP_TTL) | (1<<IP_HDRINCL) |
- (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
- (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
- (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
- (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
- optname == IP_UNICAST_IF ||
- optname == IP_MULTICAST_TTL ||
- optname == IP_MULTICAST_ALL ||
- optname == IP_MULTICAST_LOOP ||
- optname == IP_RECVORIGDSTADDR) {
+ switch (optname) {
+ case IP_PKTINFO:
+ case IP_RECVTTL:
+ case IP_RECVOPTS:
+ case IP_RECVTOS:
+ case IP_RETOPTS:
+ case IP_TOS:
+ case IP_TTL:
+ case IP_HDRINCL:
+ case IP_MTU_DISCOVER:
+ case IP_RECVERR:
+ case IP_ROUTER_ALERT:
+ case IP_FREEBIND:
+ case IP_PASSSEC:
+ case IP_TRANSPARENT:
+ case IP_MINTTL:
+ case IP_NODEFRAG:
+ case IP_UNICAST_IF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_ALL:
+ case IP_MULTICAST_LOOP:
+ case IP_RECVORIGDSTADDR:
if (optlen >= sizeof(int)) {
if (get_user(val, (int __user *) optval))
return -EFAULT;
@@ -581,7 +590,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_TTL:
if (optlen < 1)
goto e_inval;
- if (val != -1 && (val < 0 || val > 255))
+ if (val != -1 && (val < 1 || val > 255))
goto e_inval;
inet->uc_ttl = val;
break;
@@ -980,13 +989,14 @@ mc_msf_out:
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
break;
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
case IP_TRANSPARENT:
- if (!!val && !capable(CAP_NET_RAW) && !capable(CAP_NET_ADMIN)) {
+ if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
err = -EPERM;
break;
}
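
The tightened IP_TTL check above means a TTL of 0 is now rejected with EINVAL instead of being stored into uc_ttl; only -1 (reset to the route default) and 1..255 remain valid. A quick userspace probe, as a sketch:

#include <errno.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int ttl = 0;

	if (setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
		printf("ttl=0 rejected, errno=%d (EINVAL=%d)\n", errno, EINVAL);
	return 0;
}
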
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 1831092f999..c3a4233c0ac 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -66,20 +66,6 @@ static void vti_tunnel_setup(struct net_device *dev);
static void vti_dev_free(struct net_device *dev);
static int vti_tunnel_bind_dev(struct net_device *dev);
-/* Locking : hash tables are protected by RCU and RTNL */
-
-#define for_each_ip_tunnel_rcu(start) \
- for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
-
-/* often modified stats are per cpu, other are shared (netdev->stats) */
-struct pcpu_tstats {
- u64 rx_packets;
- u64 rx_bytes;
- u64 tx_packets;
- u64 tx_bytes;
- struct u64_stats_sync syncp;
-};
-
#define VTI_XMIT(stats1, stats2) do { \
int err; \
int pkt_len = skb->len; \
@@ -142,19 +128,19 @@ static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
struct ip_tunnel *t;
struct vti_net *ipn = net_generic(net, vti_net_id);
- for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
+ for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
return t;
- for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
+ for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
return t;
- for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
+ for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
return t;
- for_each_ip_tunnel_rcu(ipn->tunnels_wc[0])
+ for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
if (t && (t->dev->flags&IFF_UP))
return t;
return NULL;
@@ -338,12 +324,17 @@ static int vti_rcv(struct sk_buff *skb)
if (tunnel != NULL) {
struct pcpu_tstats *tstats;
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ return -1;
+
tstats = this_cpu_ptr(tunnel->dev->tstats);
u64_stats_update_begin(&tstats->syncp);
tstats->rx_packets++;
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
+ skb->mark = 0;
+ secpath_reset(skb);
skb->dev = tunnel->dev;
return 1;
}
@@ -497,7 +488,7 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
case SIOCADDTUNNEL:
case SIOCCHGTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
err = -EFAULT;
@@ -562,7 +553,7 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
case SIOCDELTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
if (dev == ipn->fb_tunnel_dev) {
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d3ab47e19a8..f01d1b1aff7 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -47,9 +47,12 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
if (!x)
return;
- if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) {
+ atomic_inc(&flow_cache_genid);
+ rt_genid_bump(net);
+
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
- else
+ } else
ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
xfrm_state_put(x);
}
@@ -160,6 +163,7 @@ static const struct net_protocol ipcomp4_protocol = {
.handler = xfrm4_rcv,
.err_handler = ipcomp4_err,
.no_policy = 1,
+ .netns_ok = 1,
};
static int __init ipcomp4_init(void)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 798358b1071..98cbc687701 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -136,6 +136,8 @@ __be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
__be32 ic_gateway = NONE; /* Gateway IP address */
+__be32 ic_addrservaddr = NONE;	/* IP address of the IP-address server */
+
__be32 ic_servaddr = NONE; /* Boot server IP address */
__be32 root_server_addr = NONE; /* Address of NFS server */
@@ -558,6 +560,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (ic_myaddr == NONE)
ic_myaddr = tip;
ic_servaddr = sip;
+ ic_addrservaddr = sip;
ic_got_reply = IC_RARP;
drop_unlock:
@@ -1068,7 +1071,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
ic_servaddr = server_id;
#ifdef IPCONFIG_DEBUG
printk("DHCP: Offered address %pI4 by server %pI4\n",
- &ic_myaddr, &ic_servaddr);
+ &ic_myaddr, &b->iph.saddr);
#endif
/* The DHCP indicated server address takes
* precedence over the bootp header one if
@@ -1113,6 +1116,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
ic_dev = dev;
ic_myaddr = b->your_ip;
ic_servaddr = b->server_ip;
+ ic_addrservaddr = b->iph.saddr;
if (ic_gateway == NONE && b->relay_ip)
ic_gateway = b->relay_ip;
if (ic_nameservers[0] == NONE)
@@ -1268,7 +1272,7 @@ static int __init ic_dynamic(void)
printk("IP-Config: Got %s answer from %pI4, ",
((ic_got_reply & IC_RARP) ? "RARP"
: (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
- &ic_servaddr);
+ &ic_addrservaddr);
pr_cont("my address is %pI4\n", &ic_myaddr);
return 0;
@@ -1390,7 +1394,7 @@ static int __init ip_auto_config(void)
unsigned int i;
#ifdef CONFIG_PROC_FS
- proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
+ proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1500,8 +1504,10 @@ static int __init ip_auto_config(void)
* Clue in the operator.
*/
pr_info("IP-Config: Complete:\n");
- pr_info(" device=%s, addr=%pI4, mask=%pI4, gw=%pI4\n",
- ic_dev->name, &ic_myaddr, &ic_netmask, &ic_gateway);
+
+ pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
+ ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr,
+ &ic_myaddr, &ic_netmask, &ic_gateway);
pr_info(" host=%s, domain=%s, nis-domain=%s\n",
utsname()->nodename, ic_domain, utsname()->domainname);
pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s",
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e15b45297c0..8f024d41eef 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -138,22 +138,7 @@ struct ipip_net {
static int ipip_tunnel_init(struct net_device *dev);
static void ipip_tunnel_setup(struct net_device *dev);
static void ipip_dev_free(struct net_device *dev);
-
-/*
- * Locking : hash tables are protected by RCU and RTNL
- */
-
-#define for_each_ip_tunnel_rcu(start) \
- for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
-
-/* often modified stats are per cpu, other are shared (netdev->stats) */
-struct pcpu_tstats {
- u64 rx_packets;
- u64 rx_bytes;
- u64 tx_packets;
- u64 tx_bytes;
- struct u64_stats_sync syncp;
-};
+static struct rtnl_link_ops ipip_link_ops __read_mostly;
static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
struct rtnl_link_stats64 *tot)
@@ -197,16 +182,16 @@ static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
struct ip_tunnel *t;
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
+ for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
return t;
- for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
+ for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
return t;
- for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
+ for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
return t;
@@ -264,6 +249,32 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
rcu_assign_pointer(*tp, t);
}
+static int ipip_tunnel_create(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct ipip_net *ipn = net_generic(net, ipip_net_id);
+ int err;
+
+ err = ipip_tunnel_init(dev);
+ if (err < 0)
+ goto out;
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(t->parms.name, dev->name);
+ dev->rtnl_link_ops = &ipip_link_ops;
+
+ dev_hold(dev);
+ ipip_tunnel_link(ipn, t);
+ return 0;
+
+out:
+ return err;
+}
+
static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
struct ip_tunnel_parm *parms, int create)
{
@@ -298,16 +309,9 @@ static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
nt = netdev_priv(dev);
nt->parms = *parms;
- if (ipip_tunnel_init(dev) < 0)
+ if (ipip_tunnel_create(dev) < 0)
goto failed_free;
- if (register_netdevice(dev) < 0)
- goto failed_free;
-
- strcpy(nt->parms.name, dev->name);
-
- dev_hold(dev);
- ipip_tunnel_link(ipn, nt);
return nt;
failed_free:
@@ -463,13 +467,12 @@ drop:
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct pcpu_tstats *tstats;
const struct iphdr *tiph = &tunnel->parms.iph;
u8 tos = tunnel->parms.iph.tos;
__be16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
struct net_device *tdev; /* Device to other host */
- const struct iphdr *old_iph = ip_hdr(skb);
+ const struct iphdr *old_iph;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst = tiph->daddr;
@@ -479,6 +482,12 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (skb->protocol != htons(ETH_P_IP))
goto tx_error;
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb))
+ goto tx_error;
+
+ old_iph = ip_hdr(skb);
+
if (tos & 1)
tos = old_iph->tos;
@@ -586,9 +595,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if ((iph->ttl = tiph->ttl) == 0)
iph->ttl = old_iph->ttl;
- nf_reset(skb);
- tstats = this_cpu_ptr(dev->tstats);
- __IPTUNNEL_XMIT(tstats, &dev->stats);
+ iptunnel_xmit(skb, dev);
return NETDEV_TX_OK;
tx_error_icmp:
@@ -635,6 +642,28 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
dev->iflink = tunnel->parms.link;
}
+static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
+{
+ struct net *net = dev_net(t->dev);
+ struct ipip_net *ipn = net_generic(net, ipip_net_id);
+
+ ipip_tunnel_unlink(ipn, t);
+ synchronize_net();
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
+ memcpy(t->dev->broadcast, &p->iph.daddr, 4);
+ ipip_tunnel_link(ipn, t);
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+ if (t->parms.link != p->link) {
+ t->parms.link = p->link;
+ ipip_tunnel_bind_dev(t->dev);
+ }
+ netdev_state_change(t->dev);
+}
+
static int
ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
@@ -664,7 +693,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
case SIOCADDTUNNEL:
case SIOCCHGTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
err = -EFAULT;
@@ -693,29 +722,13 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
t = netdev_priv(dev);
- ipip_tunnel_unlink(ipn, t);
- synchronize_net();
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipip_tunnel_link(ipn, t);
- netdev_state_change(dev);
}
+
+ ipip_tunnel_update(t, &p);
}
if (t) {
err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- ipip_tunnel_bind_dev(dev);
- netdev_state_change(dev);
- }
- }
if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
err = -EFAULT;
} else
@@ -724,7 +737,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
case SIOCDELTUNNEL:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto done;
if (dev == ipn->fb_tunnel_dev) {
@@ -773,6 +786,11 @@ static void ipip_dev_free(struct net_device *dev)
free_netdev(dev);
}
+#define IPIP_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
+
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
@@ -787,6 +805,9 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->features |= NETIF_F_NETNS_LOCAL;
dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+
+ dev->features |= IPIP_FEATURES;
+ dev->hw_features |= IPIP_FEATURES;
}
static int ipip_tunnel_init(struct net_device *dev)
@@ -829,6 +850,142 @@ static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
return 0;
}
+static void ipip_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.version = 4;
+ parms->iph.protocol = IPPROTO_IPIP;
+ parms->iph.ihl = 5;
+
+ if (!data)
+ return;
+
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
+
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
+
+ if (data[IFLA_IPTUN_TTL]) {
+ parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+ if (parms->iph.ttl)
+ parms->iph.frag_off = htons(IP_DF);
+ }
+
+ if (data[IFLA_IPTUN_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+ if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+}
+
+static int ipip_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *nt;
+
+ nt = netdev_priv(dev);
+ ipip_netlink_parms(data, &nt->parms);
+
+ if (ipip_tunnel_locate(net, &nt->parms, 0))
+ return -EEXIST;
+
+ return ipip_tunnel_create(dev);
+}
+
+static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel *t;
+ struct ip_tunnel_parm p;
+ struct net *net = dev_net(dev);
+ struct ipip_net *ipn = net_generic(net, ipip_net_id);
+
+ if (dev == ipn->fb_tunnel_dev)
+ return -EINVAL;
+
+ ipip_netlink_parms(data, &p);
+
+ if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
+ return -EINVAL;
+
+ t = ipip_tunnel_locate(net, &p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else
+ t = netdev_priv(dev);
+
+ ipip_tunnel_update(t, &p);
+ return 0;
+}
+
+static size_t ipip_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_TOS */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PMTUDISC */
+ nla_total_size(1) +
+ 0;
+}
+
+static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+ nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
+ !!(parm->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
+ [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+};
+
+static struct rtnl_link_ops ipip_link_ops __read_mostly = {
+ .kind = "ipip",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ipip_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipip_tunnel_setup,
+ .newlink = ipip_newlink,
+ .changelink = ipip_changelink,
+ .get_size = ipip_get_size,
+ .fill_info = ipip_fill_info,
+};
+
static struct xfrm_tunnel ipip_handler __read_mostly = {
.handler = ipip_rcv,
.err_handler = ipip_err,
@@ -925,14 +1082,26 @@ static int __init ipip_init(void)
return err;
err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
if (err < 0) {
- unregister_pernet_device(&ipip_net_ops);
pr_info("%s: can't register tunnel\n", __func__);
+ goto xfrm_tunnel_failed;
}
+ err = rtnl_link_register(&ipip_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+out:
return err;
+
+rtnl_link_failed:
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel_failed:
+ unregister_pernet_device(&ipip_net_ops);
+ goto out;
}
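
The init path above is reshaped into the usual reverse-order unwind: a rtnl_link_register() failure now tears down the xfrm handler, which in turn tears down the pernet device. A generic, hypothetical sketch of the pattern, with stand-in names rather than the kernel APIs:

    #include <stdio.h>

    /* Hypothetical stand-ins for the three registrations in ipip_init(). */
    static int  reg_pernet(void) { return 0; }  static void unreg_pernet(void) {}
    static int  reg_xfrm(void)   { return 0; }  static void unreg_xfrm(void)   {}
    static int  reg_rtnl(void)   { return -1; } /* pretend the last step fails */

    static int init(void)
    {
        int err;

        err = reg_pernet();
        if (err)
            goto out;
        err = reg_xfrm();
        if (err)
            goto err_pernet;        /* unwind in reverse registration order */
        err = reg_rtnl();
        if (err)
            goto err_xfrm;
        return 0;

    err_xfrm:
        unreg_xfrm();
    err_pernet:
        unreg_pernet();
    out:
        return err;
    }

    int main(void)
    {
        printf("init() = %d\n", init());  /* -1, with both earlier steps unwound */
        return 0;
    }
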
static void __exit ipip_fini(void)
{
+ rtnl_link_unregister(&ipip_link_ops);
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
pr_info("%s: can't deregister tunnel\n", __func__);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 6168c4dc58b..5f95b3aa579 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -65,6 +65,7 @@
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
+#include <linux/netconf.h>
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
@@ -83,8 +84,8 @@ struct mr_table {
struct vif_device vif_table[MAXVIFS];
int maxvif;
atomic_t cache_resolve_queue_len;
- int mroute_do_assert;
- int mroute_do_pim;
+ bool mroute_do_assert;
+ bool mroute_do_pim;
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
int mroute_reg_vif_num;
#endif
@@ -133,6 +134,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd);
static void mroute_clean_tables(struct mr_table *mrt);
static void ipmr_expire_process(unsigned long arg);
@@ -582,6 +585,9 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
in_dev = __in_dev_get_rtnl(dev);
if (in_dev) {
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(dev_net(dev),
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
ip_rt_multicast_event(in_dev);
}
@@ -665,6 +671,7 @@ static void ipmr_expire_process(unsigned long arg)
}
list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_destroy_unres(mrt, c);
}
@@ -772,6 +779,8 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EADDRNOTAVAIL;
}
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
+ &in_dev->cnf);
ip_rt_multicast_event(in_dev);
/* Fill in the VIF structures */
@@ -819,6 +828,49 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
return NULL;
}
+/* Look for a (*,*,oif) entry */
+static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
+ int vifi)
+{
+ int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache *c;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY) &&
+ c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ return NULL;
+}
+
+/* Look for a (*,G) entry */
+static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
+ __be32 mcastgrp, int vifi)
+{
+ int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache *c, *proxy;
+
+ if (mcastgrp == htonl(INADDR_ANY))
+ goto skip;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == mcastgrp) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt,
+ c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
+
+skip:
+ return ipmr_cache_find_any_parent(mrt, vifi);
+}
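
These helpers give the MFC lookup a fallback order: callers try the exact (S,G) entry first, then a (*,G) entry, then the (*,*) catch-all, where a TTL below 255 on a vif marks that vif as belonging to the entry. A flattened, hypothetical model of that precedence:

    #include <stdint.h>
    #include <stdio.h>

    #define ANY 0u   /* plays the role of htonl(INADDR_ANY) */

    /* Hypothetical flat MFC table; ttl < 255 means the entry covers the vif. */
    struct mfc { uint32_t origin, group; uint8_t ttl; };

    static const struct mfc *mfc_lookup(const struct mfc *tbl, int n,
                                        uint32_t src, uint32_t grp)
    {
        for (int pass = 0; pass < 3; pass++)        /* (S,G), (*,G), (*,*) */
            for (int i = 0; i < n; i++) {
                uint32_t want_src = pass == 0 ? src : ANY;
                uint32_t want_grp = pass == 2 ? ANY : grp;

                if (tbl[i].origin == want_src && tbl[i].group == want_grp &&
                    (pass == 0 || tbl[i].ttl < 255))
                    return &tbl[i];
            }
        return NULL;
    }

    int main(void)
    {
        const struct mfc tbl[] = {
            { ANY, 0xef010101, 1 },   /* (*,G) for 239.1.1.1 */
            { ANY, ANY,        1 },   /* (*,*) catch-all */
        };
        const struct mfc *hit = mfc_lookup(tbl, 2, 0x0a000001, 0xef010101);

        printf("matched the %s entry\n", hit == &tbl[0] ? "(*,G)" : "(*,*)");
        return 0;
    }
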
+
/*
* Allocate a multicast cache entry
*/
@@ -1020,6 +1072,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
atomic_inc(&mrt->cache_resolve_queue_len);
list_add(&c->list, &mrt->mfc_unres_queue);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
@@ -1043,7 +1096,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
* MFC cache manipulation by user space mroute daemon
*/
-static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
int line;
struct mfc_cache *c, *next;
@@ -1052,9 +1105,10 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
list_del_rcu(&c->list);
-
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_cache_free(c);
return 0;
}
@@ -1063,7 +1117,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
}
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
- struct mfcctl *mfc, int mrtsock)
+ struct mfcctl *mfc, int mrtsock, int parent)
{
bool found = false;
int line;
@@ -1076,7 +1130,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
found = true;
break;
}
@@ -1089,10 +1144,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
- if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
+ if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
+ !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
return -EINVAL;
c = ipmr_cache_alloc();
@@ -1131,6 +1188,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
ipmr_cache_resolve(net, mrt, uc, c);
ipmr_cache_free(uc);
}
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
@@ -1159,6 +1217,7 @@ static void mroute_clean_tables(struct mr_table *mrt)
if (c->mfc_flags & MFC_STATIC)
continue;
list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_cache_free(c);
}
}
@@ -1167,6 +1226,7 @@ static void mroute_clean_tables(struct mr_table *mrt)
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_destroy_unres(mrt, c);
}
spin_unlock_bh(&mfc_unres_lock);
@@ -1185,6 +1245,9 @@ static void mrtsock_destruct(struct sock *sk)
ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
mroute_clean_tables(mrt);
}
@@ -1201,29 +1264,30 @@ static void mrtsock_destruct(struct sock *sk)
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
{
- int ret;
+ int ret, parent = 0;
struct vifctl vif;
struct mfcctl mfc;
struct net *net = sock_net(sk);
struct mr_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
if (mrt == NULL)
return -ENOENT;
if (optname != MRT_INIT) {
if (sk != rcu_access_pointer(mrt->mroute_sk) &&
- !capable(CAP_NET_ADMIN))
+ !ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EACCES;
}
switch (optname) {
case MRT_INIT:
- if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->inet_num != IPPROTO_IGMP)
- return -EOPNOTSUPP;
if (optlen != sizeof(int))
- return -ENOPROTOOPT;
+ return -EINVAL;
rtnl_lock();
if (rtnl_dereference(mrt->mroute_sk)) {
@@ -1235,6 +1299,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (ret == 0) {
rcu_assign_pointer(mrt->mroute_sk, sk);
IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
}
rtnl_unlock();
return ret;
@@ -1266,16 +1333,22 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
*/
case MRT_ADD_MFC:
case MRT_DEL_MFC:
+ parent = -1;
+ case MRT_ADD_MFC_PROXY:
+ case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc))
return -EINVAL;
if (copy_from_user(&mfc, optval, sizeof(mfc)))
return -EFAULT;
+ if (parent == 0)
+ parent = mfc.mfcc_parent;
rtnl_lock();
- if (optname == MRT_DEL_MFC)
- ret = ipmr_mfc_delete(mrt, &mfc);
+ if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
+ ret = ipmr_mfc_delete(mrt, &mfc, parent);
else
ret = ipmr_mfc_add(net, mrt, &mfc,
- sk == rtnl_dereference(mrt->mroute_sk));
+ sk == rtnl_dereference(mrt->mroute_sk),
+ parent);
rtnl_unlock();
return ret;
/*
@@ -1284,9 +1357,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
case MRT_ASSERT:
{
int v;
+ if (optlen != sizeof(v))
+ return -EINVAL;
if (get_user(v, (int __user *)optval))
return -EFAULT;
- mrt->mroute_do_assert = (v) ? 1 : 0;
+ mrt->mroute_do_assert = v;
return 0;
}
#ifdef CONFIG_IP_PIMSM
@@ -1294,9 +1369,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
{
int v;
+ if (optlen != sizeof(v))
+ return -EINVAL;
if (get_user(v, (int __user *)optval))
return -EFAULT;
- v = (v) ? 1 : 0;
+ v = !!v;
rtnl_lock();
ret = 0;
@@ -1318,6 +1395,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (get_user(v, (u32 __user *)optval))
return -EFAULT;
+ /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (v != RT_TABLE_DEFAULT && v >= 1000000000)
+ return -EINVAL;
+
rtnl_lock();
ret = 0;
if (sk == rtnl_dereference(mrt->mroute_sk)) {
@@ -1325,7 +1406,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
} else {
if (!ipmr_new_table(net, v))
ret = -ENOMEM;
- raw_sk(sk)->ipmr_table = v;
+ else
+ raw_sk(sk)->ipmr_table = v;
}
rtnl_unlock();
return ret;
@@ -1351,6 +1433,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
struct net *net = sock_net(sk);
struct mr_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
if (mrt == NULL)
return -ENOENT;
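
The new MRT_ADD_MFC_PROXY/MRT_DEL_MFC_PROXY options reuse struct mfcctl but honor mfcc_parent, letting userspace install (*,G) and (*,*) proxy entries. A hedged sketch of adding one; it assumes headers new enough to define the option, a socket that has already issued MRT_INIT, and existing vifs 0 and 1:

    #include <arpa/inet.h>
    #include <linux/mroute.h>   /* struct mfcctl, MRT_*; needs recent headers */
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        /* Sketch only: assumes MRT_INIT was already issued on a socket like
         * this one and that vifs 0 and 1 exist; error handling is trimmed. */
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
        struct mfcctl mfc;

        if (fd < 0)
            return 1;
        memset(&mfc, 0, sizeof(mfc));
        mfc.mfcc_origin.s_addr = htonl(INADDR_ANY);       /* (*,G) entry */
        inet_pton(AF_INET, "239.1.1.1", &mfc.mfcc_mcastgrp);
        mfc.mfcc_parent = 0;                              /* upstream vif */
        mfc.mfcc_ttls[1] = 1;                             /* forward to vif 1 */

        if (setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC_PROXY,
                       &mfc, sizeof(mfc)) < 0)
            perror("MRT_ADD_MFC_PROXY");
        close(fd);
        return 0;
    }
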
@@ -1715,17 +1801,28 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
{
int psend = -1;
int vif, ct;
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
cache->mfc_un.res.bytes += skb->len;
+ if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+ struct mfc_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incoming

+ * interface is part of the static tree.
+ */
+ cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
+
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
if (mrt->vif_table[vif].dev != skb->dev) {
- int true_vifi;
-
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -1742,7 +1839,6 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
}
cache->mfc_un.res.wrong_if++;
- true_vifi = ipmr_find_vif(mrt, skb->dev);
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
@@ -1760,15 +1856,34 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
goto dont_forward;
}
+forward:
mrt->vif_table[vif].pkt_in++;
mrt->vif_table[vif].bytes_in += skb->len;
/*
* Forward the frame
*/
+ if (cache->mfc_origin == htonl(INADDR_ANY) &&
+ cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (true_vifi >= 0 &&
+ true_vifi != cache->mfc_parent &&
+ ip_hdr(skb)->ttl >
+ cache->mfc_un.res.ttls[cache->mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = cache->mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
for (ct = cache->mfc_un.res.maxvif - 1;
ct >= cache->mfc_un.res.minvif; ct--) {
- if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((cache->mfc_origin != htonl(INADDR_ANY) ||
+ ct != true_vifi) &&
+ ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -1779,6 +1894,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
psend = ct;
}
}
+last_forward:
if (psend != -1) {
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -1868,6 +1984,13 @@ int ip_mr_input(struct sk_buff *skb)
/* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (cache == NULL) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
/*
* No usable cache entry
@@ -2020,6 +2143,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ct;
struct rtnexthop *nhp;
struct nlattr *mp_attr;
+ struct rta_mfc_stats mfcs;
/* If cache is unresolved, don't try to parse IIF and OIF */
if (c->mfc_parent >= MAXVIFS)
@@ -2048,6 +2172,12 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
nla_nest_end(skb, mp_attr);
+ mfcs.mfcs_packets = c->mfc_un.res.pkt;
+ mfcs.mfcs_bytes = c->mfc_un.res.bytes;
+ mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ return -EMSGSIZE;
+
rtm->rtm_type = RTN_MULTICAST;
return 1;
}
@@ -2066,7 +2196,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
rcu_read_lock();
cache = ipmr_cache_find(mrt, saddr, daddr);
+ if (cache == NULL && skb->dev) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, daddr, vif);
+ }
if (cache == NULL) {
struct sk_buff *skb2;
struct iphdr *iph;
@@ -2117,12 +2252,13 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
}
static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
- u32 portid, u32 seq, struct mfc_cache *c)
+ u32 portid, u32 seq, struct mfc_cache *c, int cmd)
{
struct nlmsghdr *nlh;
struct rtmsg *rtm;
+ int err;
- nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI);
if (nlh == NULL)
return -EMSGSIZE;
@@ -2136,13 +2272,18 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
goto nla_put_failure;
rtm->rtm_type = RTN_MULTICAST;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
- rtm->rtm_protocol = RTPROT_UNSPEC;
+ if (c->mfc_flags & MFC_STATIC)
+ rtm->rtm_protocol = RTPROT_STATIC;
+ else
+ rtm->rtm_protocol = RTPROT_MROUTED;
rtm->rtm_flags = 0;
if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) ||
nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp))
goto nla_put_failure;
- if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
+ err = __ipmr_fill_mroute(mrt, skb, c, rtm);
+ /* do not break the dump if cache is unresolved */
+ if (err < 0 && err != -ENOENT)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -2152,6 +2293,52 @@ nla_put_failure:
return -EMSGSIZE;
}
+static size_t mroute_msgsize(bool unresolved, int maxvif)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_SRC */
+ + nla_total_size(4) /* RTA_DST */
+ ;
+
+ if (!unresolved)
+ len = len
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(0) /* RTA_MULTIPATH */
+ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+ /* RTA_MFC_STATS */
+ + nla_total_size(sizeof(struct rta_mfc_stats))
+ ;
+
+ return len;
+}
+
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
+ GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd);
+ if (err < 0)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ kfree_skb(skb);
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
+}
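
With mroute_netlink_event() in place, MFC inserts and deletes surface as RTM_NEWROUTE/RTM_DELROUTE messages on the RTNLGRP_IPV4_MROUTE group. A minimal listener sketch; the SOL_NETLINK value is assumed where libc headers lack it:

    #include <linux/netlink.h>
    #include <linux/rtnetlink.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef SOL_NETLINK
    #define SOL_NETLINK 270   /* from <linux/socket.h>, if libc lacks it */
    #endif

    int main(void)
    {
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
        int grp = RTNLGRP_IPV4_MROUTE;
        char buf[4096];

        if (fd < 0 || setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
                                 &grp, sizeof(grp)) < 0) {
            perror("netlink");
            return 1;
        }
        for (;;) {
            ssize_t len = recv(fd, buf, sizeof(buf), 0);
            struct nlmsghdr *nh = (struct nlmsghdr *)buf;

            /* Walk every message in the datagram and name its type. */
            for (; len > 0 && NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len))
                printf("mroute %s\n", nh->nlmsg_type == RTM_NEWROUTE ?
                       "RTM_NEWROUTE" : "RTM_DELROUTE");
        }
    }
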
+
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -2178,13 +2365,29 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
if (ipmr_fill_mroute(mrt, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- mfc) < 0)
+ mfc, RTM_NEWROUTE) < 0)
goto done;
next_entry:
e++;
}
e = s_e = 0;
}
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE) < 0) {
+ spin_unlock_bh(&mfc_unres_lock);
+ goto done;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ e = s_e = 0;
s_h = 0;
next_table:
t++;
@@ -2500,16 +2703,16 @@ static int __net_init ipmr_net_init(struct net *net)
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
- if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
+ if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
goto proc_vif_fail;
- if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
+ if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
goto proc_cache_fail;
#endif
return 0;
#ifdef CONFIG_PROC_FS
proc_cache_fail:
- proc_net_remove(net, "ip_mr_vif");
+ remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
ipmr_rules_exit(net);
#endif
@@ -2520,8 +2723,8 @@ fail:
static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
- proc_net_remove(net, "ip_mr_cache");
- proc_net_remove(net, "ip_mr_vif");
+ remove_proc_entry("ip_mr_cache", net->proc_net);
+ remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_rules_exit(net);
}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d8d6f2a5bf1..ce2d43e1f09 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -241,8 +241,8 @@ config IP_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_CLUSTERIP
- tristate "CLUSTERIP target support (EXPERIMENTAL)"
- depends on IP_NF_MANGLE && EXPERIMENTAL
+ tristate "CLUSTERIP target support"
+ depends on IP_NF_MANGLE
depends on NF_CONNTRACK_IPV4
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 97e61eadf58..7dc6a974359 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -901,7 +901,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -958,7 +958,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
}
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
duprintf("t->private->number = %u\n",
@@ -1001,7 +1001,7 @@ static int __do_replace(struct net *net, const char *name,
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1158,7 +1158,7 @@ static int do_add_counters(struct net *net, const void __user *user,
}
t = xt_find_table_lock(net, NFPROTO_ARP, name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
}
@@ -1533,7 +1533,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1646,7 +1646,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
@@ -1677,7 +1677,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1698,7 +1698,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1722,7 +1722,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 170b1fdd6b7..3efcf87400c 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1090,7 +1090,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1149,7 +1149,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
}
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
duprintf("t->private->number = %u\n", private->number);
if (get.size == private->size)
@@ -1189,7 +1189,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1347,7 +1347,7 @@ do_add_counters(struct net *net, const void __user *user,
}
t = xt_find_table_lock(net, AF_INET, name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
}
@@ -1846,7 +1846,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1931,7 +1931,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
duprintf("t->private->number = %u\n", private->number);
@@ -1961,7 +1961,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1983,7 +1983,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2008,7 +2008,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index fe5daea5214..5852b249054 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -657,10 +657,11 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *ofs)
{
- struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
+ struct clusterip_config *c = PDE(file_inode(file))->data;
#define PROC_WRITELEN 10
char buffer[PROC_WRITELEN+1];
unsigned long nodenum;
+ int rc;
if (size > PROC_WRITELEN)
return -EIO;
@@ -669,11 +670,15 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
buffer[size] = 0;
if (*buffer == '+') {
- nodenum = simple_strtoul(buffer+1, NULL, 10);
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
if (clusterip_add_node(c, nodenum))
return -ENOMEM;
} else if (*buffer == '-') {
- nodenum = simple_strtoul(buffer+1, NULL,10);
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
if (clusterip_del_node(c, nodenum))
return -ENOENT;
} else
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 51f13f8ec72..04b18c1ac34 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -81,6 +81,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
niph->saddr = oiph->daddr;
niph->daddr = oiph->saddr;
+ skb_reset_transport_header(nskb);
tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
memset(tcph, 0, sizeof(*tcph));
tcph->source = oth->dest;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index b5ef3cba225..7d168dcbd13 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -88,10 +88,8 @@ static void ulog_send(unsigned int nlgroupnum)
{
ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
- if (timer_pending(&ub->timer)) {
- pr_debug("ulog_send: timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
+ pr_debug("ulog_send: timer is deleting\n");
+ del_timer(&ub->timer);
if (!ub->skb) {
pr_debug("ulog_send: nothing to send\n");
@@ -426,10 +424,8 @@ static void __exit ulog_tg_exit(void)
/* remove pending timers and free allocated skb's */
for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
ub = &ulog_buffers[i];
- if (timer_pending(&ub->timer)) {
- pr_debug("timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
+ pr_debug("timer is deleting\n");
+ del_timer(&ub->timer);
if (ub->skb) {
kfree_skb(ub->skb);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a82047282db..eeaff7e4acb 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -124,19 +124,28 @@ nf_nat_ipv4_fn(unsigned int hooknum,
ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
if (ret != NF_ACCEPT)
return ret;
- } else
+ } else {
pr_debug("Already setup manip %s for ct %p\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct);
+ if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+ goto oif_changed;
+ }
break;
default:
/* ESTABLISHED */
NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
ctinfo == IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+ goto oif_changed;
}
return nf_nat_packet(ct, ctinfo, hooknum, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
}
static unsigned int
@@ -276,9 +285,7 @@ static int __net_init iptable_nat_net_init(struct net *net)
return -ENOMEM;
net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.nat_table))
- return PTR_ERR(net->ipv4.nat_table);
- return 0;
+ return PTR_RET(net->ipv4.nat_table);
}
static void __net_exit iptable_nat_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index fcdd0c2406e..2820aa18b54 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -100,7 +100,6 @@ static unsigned int ipv4_helper(unsigned int hooknum,
enum ip_conntrack_info ctinfo;
const struct nf_conn_help *help;
const struct nf_conntrack_helper *helper;
- unsigned int ret;
/* This is where we call the helper: as the packet goes out. */
ct = nf_ct_get(skb, &ctinfo);
@@ -116,13 +115,8 @@ static unsigned int ipv4_helper(unsigned int hooknum,
if (!helper)
return NF_ACCEPT;
- ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
- if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
- nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
- "nf_ct_%s: dropping packet", helper->name);
- }
- return ret;
+ return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+ ct, ctinfo);
}
static unsigned int ipv4_confirm(unsigned int hooknum,
@@ -420,54 +414,43 @@ static int ipv4_net_init(struct net *net)
{
int ret = 0;
- ret = nf_conntrack_l4proto_register(net,
- &nf_conntrack_l4proto_tcp4);
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
if (ret < 0) {
- pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
+ pr_err("nf_conntrack_tcp4: pernet registration failed\n");
goto out_tcp;
}
- ret = nf_conntrack_l4proto_register(net,
- &nf_conntrack_l4proto_udp4);
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
if (ret < 0) {
- pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
+ pr_err("nf_conntrack_udp4: pernet registration failed\n");
goto out_udp;
}
- ret = nf_conntrack_l4proto_register(net,
- &nf_conntrack_l4proto_icmp);
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
if (ret < 0) {
- pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
+ pr_err("nf_conntrack_icmp4: pernet registration failed\n");
goto out_icmp;
}
- ret = nf_conntrack_l3proto_register(net,
- &nf_conntrack_l3proto_ipv4);
+ ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
if (ret < 0) {
- pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
+ pr_err("nf_conntrack_ipv4: pernet registration failed\n");
goto out_ipv4;
}
return 0;
out_ipv4:
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
out_icmp:
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
out_udp:
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_tcp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
out_tcp:
return ret;
}
static void ipv4_net_exit(struct net *net)
{
- nf_conntrack_l3proto_unregister(net,
- &nf_conntrack_l3proto_ipv4);
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_icmp);
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_udp4);
- nf_conntrack_l4proto_unregister(net,
- &nf_conntrack_l4proto_tcp4);
+ nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
}
static struct pernet_operations ipv4_net_ops = {
@@ -500,16 +483,49 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
pr_err("nf_conntrack_ipv4: can't register hooks.\n");
goto cleanup_pernet;
}
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
+ goto cleanup_hooks;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
+ goto cleanup_tcp4;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
+ goto cleanup_udp4;
+ }
+
+ ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
+ goto cleanup_icmpv4;
+ }
+
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
ret = nf_conntrack_ipv4_compat_init();
if (ret < 0)
- goto cleanup_hooks;
+ goto cleanup_proto;
#endif
return ret;
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ cleanup_proto:
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+#endif
+ cleanup_icmpv4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ cleanup_udp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ cleanup_tcp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
cleanup_hooks:
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
-#endif
cleanup_pernet:
unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
@@ -523,6 +539,10 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
nf_conntrack_ipv4_compat_fini();
#endif
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 9682b36df38..f2ca1279408 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -417,12 +417,12 @@ static int __net_init ip_conntrack_net_init(struct net *net)
{
struct proc_dir_entry *proc, *proc_exp, *proc_stat;
- proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops);
+ proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
if (!proc)
goto err1;
- proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440,
- &ip_exp_file_ops);
+ proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
+ &ip_exp_file_ops);
if (!proc_exp)
goto err2;
@@ -433,9 +433,9 @@ static int __net_init ip_conntrack_net_init(struct net *net)
return 0;
err3:
- proc_net_remove(net, "ip_conntrack_expect");
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
err2:
- proc_net_remove(net, "ip_conntrack");
+ remove_proc_entry("ip_conntrack", net->proc_net);
err1:
return -ENOMEM;
}
@@ -443,8 +443,8 @@ err1:
static void __net_exit ip_conntrack_net_exit(struct net *net)
{
remove_proc_entry("ip_conntrack", net->proc_net_stat);
- proc_net_remove(net, "ip_conntrack_expect");
- proc_net_remove(net, "ip_conntrack");
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
+ remove_proc_entry("ip_conntrack", net->proc_net);
}
static struct pernet_operations ip_conntrack_net_ops = {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8f3d05424a3..2e91006d607 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -322,8 +322,8 @@ void ping_err(struct sk_buff *skb, u32 info)
struct iphdr *iph = (struct iphdr *)skb->data;
struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
struct inet_sock *inet_sock;
- int type = icmph->type;
- int code = icmph->code;
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
struct net *net = dev_net(skb->dev);
struct sock *sk;
int harderr;
@@ -738,6 +738,7 @@ struct proto ping_prot = {
.recvmsg = ping_recvmsg,
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
.hash = ping_v4_hash,
.unhash = ping_v4_unhash,
.get_port = ping_v4_get_port,
@@ -888,7 +889,7 @@ static int ping_proc_register(struct net *net)
struct proc_dir_entry *p;
int rc = 0;
- p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops);
+ p = proc_create("icmp", S_IRUGO, net->proc_net, &ping_seq_fops);
if (!p)
rc = -ENOMEM;
return rc;
@@ -896,7 +897,7 @@ static int ping_proc_register(struct net *net)
static void ping_proc_unregister(struct net *net)
{
- proc_net_remove(net, "icmp");
+ remove_proc_entry("icmp", net->proc_net);
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8de53e1ddd5..32030a24e77 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -471,28 +471,29 @@ static const struct file_operations netstat_seq_fops = {
static __net_init int ip_proc_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops))
+ if (!proc_create("sockstat", S_IRUGO, net->proc_net,
+ &sockstat_seq_fops))
goto out_sockstat;
- if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops))
+ if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
goto out_netstat;
- if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops))
+ if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
goto out_snmp;
return 0;
out_snmp:
- proc_net_remove(net, "netstat");
+ remove_proc_entry("netstat", net->proc_net);
out_netstat:
- proc_net_remove(net, "sockstat");
+ remove_proc_entry("sockstat", net->proc_net);
out_sockstat:
return -ENOMEM;
}
static __net_exit void ip_proc_exit_net(struct net *net)
{
- proc_net_remove(net, "snmp");
- proc_net_remove(net, "netstat");
- proc_net_remove(net, "sockstat");
+ remove_proc_entry("snmp", net->proc_net);
+ remove_proc_entry("netstat", net->proc_net);
+ remove_proc_entry("sockstat", net->proc_net);
}
static __net_initdata struct pernet_operations ip_proc_ops = {
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 8918eff1426..ce848461acb 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,6 +29,7 @@
#include <net/protocol.h>
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
/*
* Add a protocol handler to the hash tables
@@ -36,11 +37,24 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
+ if (!prot->netns_ok) {
+ pr_err("Protocol %u is not namespace aware, cannot register.\n",
+ protocol);
+ return -EINVAL;
+ }
+
return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);
+int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_offload);
+
/*
* Remove a protocol from the hash tables.
*/
@@ -57,3 +71,16 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
return ret;
}
EXPORT_SYMBOL(inet_del_protocol);
+
+int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_del_offload);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 73d1e4df4bf..dd44e0ab600 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -111,9 +111,7 @@ EXPORT_SYMBOL_GPL(raw_unhash_sk);
static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
- struct hlist_node *node;
-
- sk_for_each_from(sk, node) {
+ sk_for_each_from(sk) {
struct inet_sock *inet = inet_sk(sk);
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
@@ -894,6 +892,7 @@ struct proto raw_prot = {
.recvmsg = raw_recvmsg,
.bind = raw_bind,
.backlog_rcv = raw_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
.hash = raw_hash_sk,
.unhash = raw_unhash_sk,
.obj_size = sizeof(struct raw_sock),
@@ -913,9 +912,7 @@ static struct sock *raw_get_first(struct seq_file *seq)
for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
++state->bucket) {
- struct hlist_node *node;
-
- sk_for_each(sk, node, &state->h->ht[state->bucket])
+ sk_for_each(sk, &state->h->ht[state->bucket])
if (sock_net(sk) == seq_file_net(seq))
goto found;
}
@@ -1049,7 +1046,7 @@ static const struct file_operations raw_seq_fops = {
static __net_init int raw_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops))
+ if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
return -ENOMEM;
return 0;
@@ -1057,7 +1054,7 @@ static __net_init int raw_init_net(struct net *net)
static __net_exit void raw_exit_net(struct net *net)
{
- proc_net_remove(net, "raw");
+ remove_proc_entry("raw", net->proc_net);
}
static __net_initdata struct pernet_operations raw_net_ops = {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a8c651216fa..6e2851464f8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -117,15 +117,11 @@
#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
-static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
-static int ip_rt_gc_interval __read_mostly = 60 * HZ;
-static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
-static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
@@ -384,8 +380,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
- &rt_cache_seq_fops);
+ pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
+ &rt_cache_seq_fops);
if (!pde)
goto err1;
@@ -912,6 +908,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
struct dst_entry *dst = &rt->dst;
struct fib_result res;
+ if (dst_metric_locked(dst, RTAX_MTU))
+ return;
+
if (dst->dev->mtu < mtu)
return;
@@ -962,7 +961,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
-void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
@@ -975,6 +974,53 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
ip_rt_put(rt);
}
}
+
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ struct dst_entry *dst;
+ bool new = false;
+
+ bh_lock_sock(sk);
+ rt = (struct rtable *) __sk_dst_get(sk);
+
+ if (sock_owned_by_user(sk) || !rt) {
+ __ipv4_sk_update_pmtu(skb, sk, mtu);
+ goto out;
+ }
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ if (!__sk_dst_check(sk, 0)) {
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+
+ dst = dst_check(&rt->dst, 0);
+ if (!dst) {
+ if (new)
+ dst_release(&rt->dst);
+
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ if (new)
+ __sk_dst_set(sk, &rt->dst);
+
+out:
+ bh_unlock_sock(sk);
+}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
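
The wrapper above prefers the socket's cached route and only falls back to a fresh lookup when the cache is stale, remembering whether it created the entry so it can publish it afterwards. The shape of that pattern, in a small hypothetical model:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical model of the cached-route update: reuse the cached entry
     * while it is valid, rebuild it when it is not, and only publish a
     * route we created ourselves. */
    struct route { int mtu; bool valid; };

    static struct route *cached;              /* stand-in for the socket dst */

    static struct route *route_lookup(void)
    {
        struct route *rt = malloc(sizeof(*rt));

        rt->mtu = 1500;
        rt->valid = true;
        return rt;
    }

    static void update_pmtu(int mtu)
    {
        struct route *rt = cached;
        bool new = false;

        if (!rt || !rt->valid) {
            rt = route_lookup();              /* cache miss: rebuild */
            new = true;
        }
        if (rt->mtu > mtu)
            rt->mtu = mtu;                    /* apply the learned PMTU */
        if (new) {
            free(cached);
            cached = rt;                      /* publish the fresh route */
        }
    }

    int main(void)
    {
        update_pmtu(1400);
        printf("cached mtu = %d\n", cached->mtu);
        free(cached);
        return 0;
    }
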
void ipv4_redirect(struct sk_buff *skb, struct net *net,
@@ -1120,7 +1166,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
if (!mtu || time_after_eq(jiffies, rt->dst.expires))
mtu = dst_metric_raw(dst, RTAX_MTU);
- if (mtu && rt_is_output_route(rt))
+ if (mtu)
return mtu;
mtu = dst->dev->mtu;
@@ -1785,6 +1831,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
+ do_cache = true;
if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
fi = NULL;
@@ -1793,6 +1840,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
fl4->flowi4_proto))
flags &= ~RTCF_LOCAL;
+ else
+ do_cache = false;
/* If multicast route do not exist use
* default one, but do not gateway in this case.
* Yes, it is hack.
@@ -1802,8 +1851,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
}
fnhe = NULL;
- do_cache = fi != NULL;
- if (fi) {
+ do_cache &= fi != NULL;
+ if (do_cache) {
struct rtable __rcu **prth;
struct fib_nh *nh = &FIB_RES_NH(*res);
@@ -2229,8 +2278,27 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
error = rt->dst.error;
if (rt_is_input_route(rt)) {
- if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
- goto nla_put_failure;
+#ifdef CONFIG_IP_MROUTE
+ if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ error = err;
+ }
+ }
+ } else
+#endif
+ if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+ goto nla_put_failure;
}
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
@@ -2351,6 +2419,11 @@ void ip_rt_multicast_event(struct in_device *in_dev)
}
#ifdef CONFIG_SYSCTL
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
+static int ip_rt_gc_elasticity __read_mostly = 8;
+
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
@@ -2493,6 +2566,10 @@ static __net_init int sysctl_route_net_init(struct net *net)
tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
}
tbl[0].extra1 = net;
@@ -2597,7 +2674,7 @@ int __init ip_rt_init(void)
pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
xfrm_init();
- xfrm4_init(ip_rt_max_size);
+ xfrm4_init();
#endif
rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index ba48e799b03..ef54377fb11 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -232,7 +232,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
*
* return false if we decode an option that should not be.
*/
-bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
+bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
+ struct net *net, bool *ecn_ok)
{
/* echoed timestamp, lowest bits contain options */
u32 options = tcp_opt->rcv_tsecr & TSMASK;
@@ -247,7 +248,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;
*ecn_ok = (options >> 5) & 1;
- if (*ecn_ok && !sysctl_tcp_ecn)
+ if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)
return false;
if (tcp_opt->sack_ok && !sysctl_tcp_sack)
@@ -295,7 +296,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
memset(&tcp_opt, 0, sizeof(tcp_opt));
tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
- if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+ if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
goto out;
ret = NULL;
@@ -340,7 +341,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
}
req->expires = 0UL;
- req->retrans = 0;
+ req->num_retrans = 0;
/*
* We need to lookup the route here to get at the correct
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 63d4eccc674..960fd29d9b8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
#include <net/tcp_memcontrol.h>
static int zero;
+static int one = 1;
static int two = 2;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
@@ -232,8 +233,8 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
return 0;
}
-int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
struct tcp_fastopen_context *ctxt;
@@ -538,13 +539,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_ecn",
- .data = &sysctl_tcp_ecn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_dsack",
.data = &sysctl_tcp_dsack,
.maxlen = sizeof(int),
@@ -556,14 +550,16 @@ static struct ctl_table ipv4_table[] = {
.data = &sysctl_tcp_wmem,
.maxlen = sizeof(sysctl_tcp_wmem),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "tcp_app_win",
@@ -637,13 +633,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_congestion_control,
},
{
- .procname = "tcp_abc",
- .data = &sysctl_tcp_abc,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_mtu_probing",
.data = &sysctl_tcp_mtu_probing,
.maxlen = sizeof(int),
@@ -786,7 +775,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_rmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
+ .extra1 = &one
},
{
.procname = "udp_wmem_min",
@@ -794,7 +783,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_wmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
+ .extra1 = &one
},
{ }
};
@@ -850,6 +839,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = ipv4_ping_group_range,
},
{
+ .procname = "tcp_ecn",
+ .data = &init_net.ipv4.sysctl_tcp_ecn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_mem",
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem),
.mode = 0644,
@@ -882,7 +878,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
&net->ipv4.sysctl_icmp_ratemask;
table[6].data =
&net->ipv4.sysctl_ping_group_range;
+ table[7].data =
+ &net->ipv4.sysctl_tcp_ecn;
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
}
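
tcp_ecn thus moves from the global table into ipv4_net_table, and table[7].data is rewired per net, so each namespace gets its own knob. Reading it through procfs reflects the caller's namespace; a small sketch, using the conventional procfs path:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_ecn", "r");
        int val;

        if (!f) {
            perror("tcp_ecn");
            return 1;
        }
        if (fscanf(f, "%d", &val) == 1)
            printf("tcp_ecn = %d in this namespace\n", val);
        fclose(f);
        return 0;
    }
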
/*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 197c0008503..47e854fcae2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -400,6 +400,8 @@ void tcp_init_sock(struct sock *sk)
tcp_enable_early_retrans(tp);
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ tp->tsoffset = 0;
+
sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space;
@@ -536,13 +538,14 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
struct tcp_sock *tp = tcp_sk(sk);
int answ;
+ bool slow;
switch (cmd) {
case SIOCINQ:
if (sk->sk_state == TCP_LISTEN)
return -EINVAL;
- lock_sock(sk);
+ slow = lock_sock_fast(sk);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else if (sock_flag(sk, SOCK_URGINLINE) ||
@@ -557,7 +560,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
answ--;
} else
answ = tp->urg_seq - tp->copied_seq;
- release_sock(sk);
+ unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
@@ -830,8 +833,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
- size_t psize, int flags)
+static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -858,12 +861,9 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
- while (psize > 0) {
+ while (size > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
- struct page *page = pages[poffset / PAGE_SIZE];
int copy, i;
- int offset = poffset % PAGE_SIZE;
- int size = min_t(size_t, psize, PAGE_SIZE - offset);
bool can_coalesce;
if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
@@ -897,6 +897,7 @@ new_segment:
get_page(page);
skb_fill_page_desc(skb, i, page, offset, copy);
}
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
skb->len += copy;
skb->data_len += copy;
@@ -912,8 +913,8 @@ new_segment:
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
copied += copy;
- poffset += copy;
- if (!(psize -= copy))
+ offset += copy;
+ if (!(size -= copy))
goto out;
if (skb->len < size_goal || (flags & MSG_OOB))
@@ -960,7 +961,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
- res = do_tcp_sendpages(sk, &page, offset, size, flags);
+ res = do_tcp_sendpages(sk, page, offset, size, flags);
release_sock(sk);
return res;
}
@@ -1212,7 +1213,7 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- if (copied && likely(!tp->repair))
+ if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
@@ -1223,7 +1224,7 @@ wait_for_memory:
}
out:
- if (copied && likely(!tp->repair))
+ if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
release_sock(sk);
return copied + copied_syn;
@@ -1408,10 +1409,10 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
return;
last_issued = tp->ucopy.dma_cookie;
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
do {
- if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+ if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
last_issued, &done,
&used) == DMA_SUCCESS) {
/* Safe to free early-copied skbs now */
@@ -1430,12 +1431,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
}
#endif
-static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
u32 offset;
- skb_queue_walk(&sk->sk_receive_queue, skb) {
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
offset = seq - TCP_SKB_CB(skb)->seq;
if (tcp_hdr(skb)->syn)
offset--;
@@ -1443,6 +1444,11 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
*off = offset;
return skb;
}
+ /* This looks weird, but this can happen if TCP collapsing
+ * split a large GRO packet while we released the socket lock
+ * in skb_splice_bits()
+ */
+ sk_eat_skb(sk, skb, false);
}
return NULL;
}
@@ -1484,7 +1490,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
break;
}
used = recv_actor(desc, skb, offset, len);
- if (used < 0) {
+ if (used <= 0) {
if (!copied)
copied = used;
break;
@@ -1493,15 +1499,19 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
copied += used;
offset += used;
}
- /*
- * If recv_actor drops the lock (e.g. TCP splice
+ /* If recv_actor drops the lock (e.g. TCP splice
* receive) the skb pointer might be invalid when
* getting here: tcp_collapse might have deleted it
* while aggregating skbs from the socket queue.
*/
- skb = tcp_recv_skb(sk, seq-1, &offset);
- if (!skb || (offset+1 != skb->len))
+ skb = tcp_recv_skb(sk, seq - 1, &offset);
+ if (!skb)
break;
+ /* TCP coalescing might have appended data to the skb.
+ * Try to splice more frags
+ */
+ if (offset + 1 != skb->len)
+ continue;
}
if (tcp_hdr(skb)->fin) {
sk_eat_skb(sk, skb, false);
@@ -1518,8 +1528,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
- if (copied > 0)
+ if (copied > 0) {
+ tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
+ }
return copied;
}
EXPORT_SYMBOL(tcp_read_sock);
@@ -1742,7 +1754,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
tcp_service_net_dma(sk, true);
tcp_cleanup_rbuf(sk, copied);
} else
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
}
#endif
if (copied >= target) {
@@ -1835,7 +1847,7 @@ do_prequeue:
break;
}
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
if ((offset + used) == skb->len)
copied_early = true;
@@ -2278,7 +2290,6 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
- tp->bytes_acked = 0;
tp->window_clamp = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
@@ -2303,7 +2314,7 @@ void tcp_sock_destruct(struct sock *sk)
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
- return capable(CAP_NET_ADMIN) &&
+ return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
}
@@ -2702,6 +2713,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
err = -EINVAL;
break;
+ case TCP_TIMESTAMP:
+ if (!tp->repair)
+ err = -EPERM;
+ else
+ tp->tsoffset = val - tcp_time_stamp;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -2950,6 +2967,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout);
break;
+ case TCP_TIMESTAMP:
+ val = tcp_time_stamp + tp->tsoffset;
+ break;
default:
return -ENOPROTOOPT;
}
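The TCP_TIMESTAMP option added in the two hunks above exists for checkpoint/restore: getsockopt() reports tcp_time_stamp + tp->tsoffset, and setsockopt() is honoured only while the socket is in repair mode, where it adjusts tsoffset so a restored connection resumes its old timestamp clock. A minimal userspace sketch, assuming a Linux 3.9+ ABI; restore_ts_clock() and its arguments are illustrative names, and the call needs CAP_NET_ADMIN in the socket's namespace:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    #ifndef TCP_REPAIR
    #define TCP_REPAIR    19   /* from linux/tcp.h */
    #endif
    #ifndef TCP_TIMESTAMP
    #define TCP_TIMESTAMP 24   /* from linux/tcp.h, v3.9+ */
    #endif

    /* Restore a saved timestamp clock on a socket being re-created;
     * the socket must be in TCP_CLOSE or TCP_ESTABLISHED state. */
    static int restore_ts_clock(int fd, unsigned int saved_tsval)
    {
        int on = 1;

        if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)) < 0)
            return -1;   /* EPERM without CAP_NET_ADMIN */
        return setsockopt(fd, IPPROTO_TCP, TCP_TIMESTAMP,
                          &saved_tsval, sizeof(saved_tsval));
    }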
@@ -3023,6 +3043,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
@@ -3234,7 +3255,7 @@ __tcp_alloc_md5sig_pool(struct sock *sk)
struct crypto_hash *hash;
hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (!hash || IS_ERR(hash))
+ if (IS_ERR_OR_NULL(hash))
goto out_free;
per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
@@ -3589,8 +3610,7 @@ void __init tcp_init(void)
alloc_large_system_hash("TCP established",
sizeof(struct inet_ehash_bucket),
thash_entries,
- (totalram_pages >= 128 * 1024) ?
- 13 : 15,
+ 17, /* one slot per 128 KB of memory */
0,
NULL,
&tcp_hashinfo.ehash_mask,
@@ -3606,8 +3626,7 @@ void __init tcp_init(void)
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_mask + 1,
- (totalram_pages >= 128 * 1024) ?
- 13 : 15,
+ 17, /* one slot per 128 KB of memory */
0,
&tcp_hashinfo.bhash_size,
NULL,
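The two tcp_init() hunks above replace the old totalram_pages test (shift 13 on machines with at least 128K pages, 15 otherwise) with a flat scale of 17, i.e. one hash slot per 2^17 bytes of memory. A quick back-of-the-envelope check, using 4 GiB purely as an example figure:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long ram   = 4ULL << 30; /* assume 4 GiB of RAM */
        unsigned long long slots = ram >> 17;  /* one slot per 128 KiB */

        printf("ehash slots on 4 GiB: %llu\n", slots); /* 32768 */
        return 0;
    }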
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 1432cdb0644..019c2389a34 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,7 +1,7 @@
/*
* Pluggable TCP congestion control support and NewReno
* congestion control.
- * Based on ideas from I/O scheduler suport and Web100.
+ * Based on ideas from I/O scheduler support and Web100.
*
* Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
*/
@@ -259,7 +259,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
if (!ca)
err = -ENOENT;
- else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
err = -EPERM;
else if (!try_module_get(ca->owner))
@@ -309,35 +310,24 @@ void tcp_slow_start(struct tcp_sock *tp)
{
int cnt; /* increase in packets */
unsigned int delta = 0;
+ u32 snd_cwnd = tp->snd_cwnd;
- /* RFC3465: ABC Slow start
- * Increase only after a full MSS of bytes is acked
- *
- * TCP sender SHOULD increase cwnd by the number of
- * previously unacknowledged bytes ACKed by each incoming
- * acknowledgment, provided the increase is not more than L
- */
- if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
- return;
+ if (unlikely(!snd_cwnd)) {
+ pr_err_once("snd_cwnd is zero, please report this bug.\n");
+ snd_cwnd = 1U;
+ }
if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
else
- cnt = tp->snd_cwnd; /* exponential increase */
-
- /* RFC3465: ABC
- * We MAY increase by 2 if discovered delayed ack
- */
- if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
- cnt <<= 1;
- tp->bytes_acked = 0;
+ cnt = snd_cwnd; /* exponential increase */
tp->snd_cwnd_cnt += cnt;
- while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd_cnt -= tp->snd_cwnd;
+ while (tp->snd_cwnd_cnt >= snd_cwnd) {
+ tp->snd_cwnd_cnt -= snd_cwnd;
delta++;
}
- tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
+ tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
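With the RFC 3465 (ABC) byte counting removed, tcp_slow_start() reduces to the classic rule: each ACK adds a full cwnd of credit to snd_cwnd_cnt, the loop converts that into a +1 segment increase, and cwnd therefore doubles per round trip (the snd_cwnd == 0 guard only papers over a reported bug). A runnable toy model of the accounting above; the struct and its values are illustrative:

    #include <stdio.h>

    struct toy_tp { unsigned int snd_cwnd, snd_cwnd_cnt, snd_cwnd_clamp; };

    static void toy_slow_start(struct toy_tp *tp)
    {
        unsigned int snd_cwnd = tp->snd_cwnd ? tp->snd_cwnd : 1U;
        unsigned int cnt = snd_cwnd; /* exponential increase; no ABC */
        unsigned int delta = 0;

        tp->snd_cwnd_cnt += cnt;
        while (tp->snd_cwnd_cnt >= snd_cwnd) {
            tp->snd_cwnd_cnt -= snd_cwnd;
            delta++;
        }
        snd_cwnd += delta;
        tp->snd_cwnd = snd_cwnd < tp->snd_cwnd_clamp ?
                       snd_cwnd : tp->snd_cwnd_clamp;
    }

    int main(void)
    {
        struct toy_tp tp = { 10, 0, 65535 };

        toy_slow_start(&tp);                             /* one call per ACK */
        printf("cwnd after one ACK: %u\n", tp.snd_cwnd); /* 11 */
        return 0;
    }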
@@ -371,20 +361,9 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp);
-
/* In dangerous area, increase slowly. */
- else if (sysctl_tcp_abc) {
- /* RFC3465: Appropriate Byte Count
- * increase once for each full cwnd acked
- */
- if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
- tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- } else {
+ else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
- }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2c2b13a999e..0d9bdacce99 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -81,8 +81,6 @@ int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
EXPORT_SYMBOL(sysctl_tcp_reordering);
-int sysctl_tcp_ecn __read_mostly = 2;
-EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
@@ -100,7 +98,6 @@ int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_abc __read_mostly;
int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
@@ -2009,7 +2006,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->frto_counter = 0;
- tp->bytes_acked = 0;
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
@@ -2058,7 +2054,6 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->bytes_acked = 0;
tcp_clear_retrans_partial(tp);
if (tcp_is_reno(tp))
@@ -2686,7 +2681,6 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
struct tcp_sock *tp = tcp_sk(sk);
tp->high_seq = tp->snd_nxt;
- tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
@@ -2737,7 +2731,6 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0;
- tp->bytes_acked = 0;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
tcp_init_cwnd_reduction(sk, set_ssthresh);
@@ -3419,7 +3412,6 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_cnt = 0;
- tp->bytes_acked = 0;
TCP_ECN_queue_cwr(tp);
tcp_moderate_cwnd(tp);
}
@@ -3504,6 +3496,11 @@ static bool tcp_process_frto(struct sock *sk, int flag)
}
} else {
if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
+ if (!tcp_packets_in_flight(tp)) {
+ tcp_enter_frto_loss(sk, 2, flag);
+ return true;
+ }
+
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
@@ -3552,6 +3549,24 @@ static bool tcp_process_frto(struct sock *sk, int flag)
return false;
}
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+ /* unprotected vars; we don't care about overwrites */
+ static u32 challenge_timestamp;
+ static unsigned int challenge_count;
+ u32 now = jiffies / HZ;
+
+ if (now != challenge_timestamp) {
+ challenge_timestamp = now;
+ challenge_count = 0;
+ }
+ if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack(sk);
+ }
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
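tcp_send_challenge_ack(), moved up here so tcp_ack() can reach it, implements the RFC 5961 throttle with two deliberately unlocked statics: a one-second window stamp and a counter, so at most sysctl_tcp_challenge_ack_limit challenge ACKs leave per second system-wide. A userspace model of the same pattern; time() stands in for jiffies / HZ, and 100 is only the assumed sysctl default:

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static bool challenge_ack_allowed(unsigned int limit)
    {
        /* deliberately unlocked, like the kernel statics */
        static time_t window;
        static unsigned int count;
        time_t now = time(NULL);

        if (now != window) {   /* a new one-second window resets it */
            window = now;
            count = 0;
        }
        return ++count <= limit;
    }

    int main(void)
    {
        unsigned int sent = 0;

        for (int i = 0; i < 1000; i++)
            if (challenge_ack_allowed(100))
                sent++;
        printf("allowed this second: %u of 1000\n", sent); /* <= 100 */
        return 0;
    }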
@@ -3571,8 +3586,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
- if (before(ack, prior_snd_una))
+ if (before(ack, prior_snd_una)) {
+ /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
+ if (before(ack, prior_snd_una - tp->max_window)) {
+ tcp_send_challenge_ack(sk);
+ return -1;
+ }
goto old_ack;
+ }
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
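The new guard above only treats an ACK as "old" if it sits within max_window below snd_una; anything further back is likelier a blind-injection probe and earns a rate-limited challenge ACK instead. Both tests rely on wrap-safe 32-bit sequence arithmetic, sketched here with illustrative numbers:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Wrap-safe compare, same idea as the kernel's before()/after(). */
    static bool before(uint32_t seq1, uint32_t seq2)
    {
        return (int32_t)(seq1 - seq2) < 0;
    }

    int main(void)
    {
        uint32_t snd_una = 1000000, max_window = 65535;
        uint32_t ack = snd_una - 70000; /* further back than max_window */

        if (before(ack, snd_una)) {
            if (before(ack, snd_una - max_window))
                printf("suspicious: challenge ACK, drop segment\n");
            else
                printf("merely old: handle as old_ack\n");
        }
        return 0;
    }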
@@ -3586,15 +3607,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
- if (sysctl_tcp_abc) {
- if (icsk->icsk_ca_state < TCP_CA_CWR)
- tp->bytes_acked += ack - prior_snd_una;
- else if (icsk->icsk_ca_state == TCP_CA_Loss)
- /* we assume just one segment left network */
- tp->bytes_acked += min(ack - prior_snd_una,
- tp->mss_cache);
- }
-
prior_fackets = tp->fackets_out;
prior_in_flight = tcp_packets_in_flight(tp);
@@ -3848,7 +3860,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
- tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
return true;
}
return false;
@@ -3872,7 +3884,11 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
if (tcp_parse_aligned_timestamp(tp, th))
return true;
}
+
tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
+ if (tp->rx_opt.saw_tstamp)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
return true;
}
@@ -5244,23 +5260,6 @@ out:
}
#endif /* CONFIG_NET_DMA */
-static void tcp_send_challenge_ack(struct sock *sk)
-{
- /* unprotected vars, we dont care of overwrites */
- static u32 challenge_timestamp;
- static unsigned int challenge_count;
- u32 now = jiffies / HZ;
-
- if (now != challenge_timestamp) {
- challenge_timestamp = now;
- challenge_count = 0;
- }
- if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
- tcp_send_ack(sk);
- }
-}
-
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5313,11 +5312,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
goto discard;
}
- /* ts_recent update must be made after we are sure that the packet
- * is in window.
- */
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
/* step 3: check security and precedence [ignored] */
/* step 4: Check for a SYN
@@ -5491,6 +5485,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (tcp_checksum_complete_user(sk, skb))
goto csum_error;
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
@@ -5502,9 +5499,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
-
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
@@ -5541,6 +5535,9 @@ slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
goto csum_error;
+ if (!th->ack && !th->rst)
+ goto discard;
+
/*
* Standard slow path.
*/
@@ -5549,9 +5546,14 @@ slow_path:
return 0;
step5:
- if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
goto discard;
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
@@ -5639,13 +5641,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
* the remote receives only the retransmitted (regular) SYNs: either
* the original SYN-data or the corresponding SYN-ACK is lost.
*/
- syn_drop = (cookie->len <= 0 && data &&
- inet_csk(sk)->icsk_retransmits);
+ syn_drop = (cookie->len <= 0 && data && tp->total_retrans);
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
if (data) { /* Retransmit unacked data in SYN */
- tcp_retransmit_skb(sk, data);
+ tcp_for_write_queue_from(data, sk) {
+ if (data == tcp_send_head(sk) ||
+ __tcp_retransmit_skb(sk, data))
+ break;
+ }
tcp_rearm_rto(sk);
return true;
}
@@ -5664,6 +5669,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
int saved_clamp = tp->rx_opt.mss_clamp;
tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
+ if (tp->rx_opt.saw_tstamp)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
if (th->ack) {
/* rfc793:
@@ -5973,11 +5980,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
goto discard;
}
+
+ if (!th->ack && !th->rst)
+ goto discard;
+
if (!tcp_validate_incoming(sk, skb, th, 0))
return 0;
/* step 5: check the ACK field */
- if (th->ack) {
+ if (true) {
int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
switch (sk->sk_state) {
@@ -5988,7 +5999,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (req) {
tcp_synack_rtt_meas(sk, req);
- tp->total_retrans = req->retrans;
+ tp->total_retrans = req->num_retrans;
reqsk_fastopen_remove(sk, req, false);
} else {
@@ -6127,8 +6138,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
break;
}
- } else
- goto discard;
+ }
+
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0c4a6435560..4a8ec457310 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -138,14 +138,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
-static int tcp_repair_connect(struct sock *sk)
-{
- tcp_connect_init(sk);
- tcp_finish_connect(sk, NULL);
-
- return 0;
-}
-
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -250,10 +242,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
- if (likely(!tp->repair))
- err = tcp_connect(sk);
- else
- err = tcp_repair_connect(sk);
+ err = tcp_connect(sk);
rt = NULL;
if (err)
@@ -380,11 +369,10 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
* We do take care of the PMTU discovery (RFC 1191) special case:
* we can receive locally generated ICMP messages while the socket is held.
*/
- if (sock_owned_by_user(sk) &&
- type != ICMP_DEST_UNREACH &&
- code != ICMP_FRAG_NEEDED)
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
+ if (sock_owned_by_user(sk)) {
+ if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ }
if (sk->sk_state == TCP_CLOSE)
goto out;
@@ -508,6 +496,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
* errors returned from accept().
*/
inet_csk_reqsk_queue_drop(sk, req, prev);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
goto out;
case TCP_SYN_SENT:
@@ -668,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match.
*/
sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
- &tcp_hashinfo, ip_hdr(skb)->daddr,
+ &tcp_hashinfo, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
if (!sk1)
@@ -736,7 +726,7 @@ release_sk1:
*/
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
- u32 win, u32 ts, int oif,
+ u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
{
@@ -757,12 +747,12 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- if (ts) {
+ if (tsecr) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
- rep.opt[1] = htonl(tcp_time_stamp);
- rep.opt[2] = htonl(ts);
+ rep.opt[1] = htonl(tsval);
+ rep.opt[2] = htonl(tsecr);
arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
}
@@ -777,7 +767,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
#ifdef CONFIG_TCP_MD5SIG
if (key) {
- int offset = (ts) ? 3 : 0;
+ int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -812,6 +802,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
tcp_twsk_md5_key(tcptw),
@@ -831,6 +822,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_time_stamp,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -877,10 +869,13 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
}
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
- struct request_values *rvp)
+ struct request_values *rvp)
{
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
+ int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
+
+ if (!res)
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ return res;
}
/*
@@ -959,7 +954,6 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
- struct hlist_node *pos;
unsigned int size = sizeof(struct in_addr);
struct tcp_md5sig_info *md5sig;
@@ -973,7 +967,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
if (family == AF_INET6)
size = sizeof(struct in6_addr);
#endif
- hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
if (!memcmp(&key->addr, addr, size))
@@ -1070,18 +1064,18 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
}
EXPORT_SYMBOL(tcp_md5_do_del);
-void tcp_clear_md5_list(struct sock *sk)
+static void tcp_clear_md5_list(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
- struct hlist_node *pos, *n;
+ struct hlist_node *n;
struct tcp_md5sig_info *md5sig;
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
if (!hlist_empty(&md5sig->head))
tcp_free_md5sig_pool();
- hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
+ hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
hlist_del_rcu(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
kfree_rcu(key, rcu);
@@ -1386,7 +1380,8 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
struct sock *child;
int err;
- req->retrans = 0;
+ req->num_retrans = 0;
+ req->num_timeout = 0;
req->sk = NULL;
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
@@ -1508,8 +1503,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
+ }
req = inet_reqsk_alloc(&tcp_request_sock_ops);
if (!req)
@@ -1575,7 +1572,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, skb);
+ TCP_ECN_create_request(req, skb, sock_net(sk));
if (want_cookie) {
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
@@ -1674,6 +1671,7 @@ drop_and_release:
drop_and_free:
reqsk_free(req);
drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
@@ -1741,7 +1739,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
tcp_initialize_rcv_mss(newsk);
tcp_synack_rtt_meas(newsk, req);
- newtp->total_retrans = req->retrans;
+ newtp->total_retrans = req->num_retrans;
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
@@ -1774,10 +1772,8 @@ exit:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
put_and_exit:
- tcp_clear_xmit_timers(newsk);
- tcp_cleanup_congestion_control(newsk);
- bh_unlock_sock(newsk);
- sock_put(newsk);
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -1919,7 +1915,6 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
@@ -1927,16 +1922,16 @@ void tcp_v4_early_demux(struct sk_buff *skb)
if (skb->pkt_type != PACKET_HOST)
return;
- if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
return;
iph = ip_hdr(skb);
- th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
+ th = tcp_hdr(skb);
if (th->doff < sizeof(struct tcphdr) / 4)
return;
- sk = __inet_lookup_established(net, &tcp_hashinfo,
+ sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
skb->skb_iif);
@@ -2084,6 +2079,7 @@ do_time_wait:
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo,
+ iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
@@ -2619,7 +2615,7 @@ EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
- proc_net_remove(net, afinfo->name);
+ remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
@@ -2640,7 +2636,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
jiffies_delta_to_clock_t(delta),
- req->retrans,
+ req->num_timeout,
from_kuid_munged(seq_user_ns(f), uid),
0, /* non standard timer */
0, /* open_requests have no inode */
@@ -2898,6 +2894,7 @@ EXPORT_SYMBOL(tcp_prot);
static int __net_init tcp_sk_init(struct net *net)
{
+ net->ipv4.sysctl_tcp_ecn = 2;
return 0;
}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 53bc5847bfa..f696d7c2e9f 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1,7 +1,6 @@
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
-#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
@@ -9,6 +8,7 @@
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>
+#include <linux/vmalloc.h>
#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
@@ -1034,7 +1034,10 @@ static int __net_init tcp_net_metrics_init(struct net *net)
net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
- net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
+ net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!net->ipv4.tcp_metrics_hash)
+ net->ipv4.tcp_metrics_hash = vzalloc(size);
+
if (!net->ipv4.tcp_metrics_hash)
return -ENOMEM;
@@ -1055,7 +1058,10 @@ static void __net_exit tcp_net_metrics_exit(struct net *net)
tm = next;
}
}
- kfree(net->ipv4.tcp_metrics_hash);
+ if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
+ vfree(net->ipv4.tcp_metrics_hash);
+ else
+ kfree(net->ipv4.tcp_metrics_hash);
}
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
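Together the two hunks above form the "try kmalloc, fall back to vmalloc" pattern: allocate with __GFP_NOWARN so a failed high-order request stays silent, retry with vzalloc(), and free through is_vmalloc_addr() since the pointer's origin decides the release path. A compact kernel-style sketch of the pattern; the helper names are illustrative, and modern kernels provide kvzalloc()/kvfree() for exactly this:

    #include <linux/slab.h>
    #include <linux/vmalloc.h>
    #include <linux/mm.h>      /* is_vmalloc_addr() */

    static void *big_table_zalloc(size_t size)
    {
        /* high-order contiguous allocations may fail; do not warn */
        void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

        return p ? p : vzalloc(size);
    }

    static void big_table_free(const void *p)
    {
        if (is_vmalloc_addr(p))
            vfree(p);
        else
            kfree(p);
    }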
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a7302d974f3..b83a49cc381 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -102,6 +102,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
if (tmp_opt.saw_tstamp) {
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -288,6 +289,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tcptw->tw_ts_offset = tp->tsoffset;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
@@ -446,7 +448,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/
newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- newtp->bytes_acked = 0;
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
@@ -500,6 +501,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
+ newtp->tsoffset = 0;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -553,7 +555,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
+ tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -582,7 +584,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* Note that even if there is new data in the SYN packet
* they will be thrown away too.
*/
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ inet_rtx_syn_ack(sk, req);
return NULL;
}
@@ -696,7 +698,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
- else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
+ else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
tcp_rsk(req)->snt_synack = 0;
/* For Fast Open no more processing is needed (sk is the
@@ -706,7 +708,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return sk;
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
- if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cfe6ffe1c17..e2b4461074d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -314,7 +314,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sysctl_tcp_ecn == 1) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
@@ -622,7 +622,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
opts->options |= OPTION_TS;
- opts->tsval = TCP_SKB_CB(skb)->when;
+ opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -806,7 +806,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (likely(tp->rx_opt.tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcb ? tcb->when : 0;
+ opts->tsval = tcb ? tcb->when + tp->tsoffset : 0;
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
@@ -1331,7 +1331,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
- if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
__pskb_trim_head(skb, len);
@@ -1351,8 +1351,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
return 0;
}
-/* Calculate MSS. Not accounting for SACKs here. */
-int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+/* Calculate MSS, not accounting for any TCP options. */
+static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1381,13 +1381,17 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
/* Then reserve room for full set of TCP options and 8 bytes of data */
if (mss_now < 48)
mss_now = 48;
-
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
-
return mss_now;
}
+/* Calculate MSS. Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ /* Subtract TCP options size, not including SACKs */
+ return __tcp_mtu_to_mss(sk, pmtu) -
+ (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
+}
+
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
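The split matters for Fast Open: tcp_send_syn_data() further down needs the MSS with no option space deducted, because it reserves MAX_TCP_OPTION_SPACE separately, while regular callers still want tcp_mtu_to_mss() to subtract the fixed option bytes. The arithmetic, with illustrative IPv4 numbers and headers simplified to their fixed sizes:

    #include <stdio.h>

    int main(void)
    {
        int mtu = 1500;                  /* example Ethernet MTU */
        int ip_hdr = 20, tcp_hdr = 20;   /* no IP options */
        int ts_opt = 12;                 /* aligned timestamp option */

        int mss_bare = mtu - ip_hdr - tcp_hdr;  /* __tcp_mtu_to_mss: 1460 */
        int mss_opts = mss_bare - ts_opt;       /* tcp_mtu_to_mss:   1448 */

        printf("bare=%d with-options=%d\n", mss_bare, mss_opts);
        return 0;
    }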
@@ -1986,6 +1990,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+ goto repair; /* Skip network transmission */
+
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota)
break;
@@ -2026,6 +2033,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
+repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
@@ -2305,12 +2313,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
* state updates are done by the caller. Returns non-zero if an
* error occurred which prevented the send.
*/
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int cur_mss;
- int err;
/* Inconclusive MTU probe */
if (icsk->icsk_mtup.probe_size) {
@@ -2383,11 +2390,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
GFP_ATOMIC);
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
} else {
- err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
+}
+
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = __tcp_retransmit_skb(sk, skb);
if (err == 0) {
/* Update global TCP statistics. */
@@ -2921,7 +2934,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
*/
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
- space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
@@ -2983,6 +2996,11 @@ int tcp_connect(struct sock *sk)
tcp_connect_init(sk);
+ if (unlikely(tp->repair)) {
+ tcp_finish_connect(sk, NULL);
+ return 0;
+ }
+
buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 4526fe68e60..d4943f67aff 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -234,7 +234,7 @@ static __init int tcpprobe_init(void)
if (!tcp_probe.log)
goto err0;
- if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops))
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
goto err0;
ret = register_jprobe(&tcp_jprobe);
@@ -244,7 +244,7 @@ static __init int tcpprobe_init(void)
pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize);
return 0;
err1:
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
err0:
kfree(tcp_probe.log);
return ret;
@@ -253,7 +253,7 @@ module_init(tcpprobe_init);
static __exit void tcpprobe_exit(void)
{
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
unregister_jprobe(&tcp_jprobe);
kfree(tcp_probe.log);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d47c1b4421a..b78aac30c49 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -318,7 +318,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
req = tcp_sk(sk)->fastopen_rsk;
req->rsk_ops->syn_ack_timeout(sk, req);
- if (req->retrans >= max_retries) {
+ if (req->num_timeout >= max_retries) {
tcp_write_err(sk);
return;
}
@@ -327,10 +327,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
* regular retransmit because if the child socket has been accepted
* it's not good to give up too easily.
*/
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
- req->retrans++;
+ inet_rtx_syn_ack(sk, req);
+ req->num_timeout++;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
+ TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
/*
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 79c8dbe59b5..265c42cf963 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -139,6 +139,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
{
struct sock *sk2;
struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
sk_nulls_for_each(sk2, node, &hslot->head)
if (net_eq(sock_net(sk2), net) &&
@@ -147,6 +148,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
(*saddr_comp)(sk, sk2)) {
if (bitmap)
__set_bit(udp_sk(sk2)->udp_port_hash >> log,
@@ -169,6 +172,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
{
struct sock *sk2;
struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
@@ -179,6 +183,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
(*saddr_comp)(sk, sk2)) {
res = 1;
break;
@@ -337,26 +343,26 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
!ipv6_only_sock(sk)) {
struct inet_sock *inet = inet_sk(sk);
- score = (sk->sk_family == PF_INET ? 1 : 0);
+ score = (sk->sk_family == PF_INET ? 2 : 1);
if (inet->inet_rcv_saddr) {
if (inet->inet_rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_dport) {
if (inet->inet_dport != sport)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -365,7 +371,6 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
/*
* In this second variant, we check (daddr, dport) matches (inet_rcv_saddr, inet_num)
*/
-#define SCORE2_MAX (1 + 2 + 2 + 2)
static inline int compute_score2(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif)
@@ -380,21 +385,21 @@ static inline int compute_score2(struct sock *sk, struct net *net,
if (inet->inet_num != hnum)
return -1;
- score = (sk->sk_family == PF_INET ? 1 : 0);
+ score = (sk->sk_family == PF_INET ? 2 : 1);
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_dport) {
if (inet->inet_dport != sport)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -409,19 +414,29 @@ static struct sock *udp4_lib_lookup2(struct net *net,
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
- int score, badness;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
begin:
result = NULL;
- badness = -1;
+ badness = 0;
udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
score = compute_score2(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
result = sk;
badness = score;
- if (score == SCORE2_MAX)
- goto exact_match;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = inet_ehashfn(net, daddr, hnum,
+ saddr, htons(sport));
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (((u64)hash * matches) >> 32 == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
}
}
/*
@@ -431,9 +446,7 @@ begin:
*/
if (get_nulls_value(node) != slot2)
goto begin;
-
if (result) {
-exact_match:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -457,7 +470,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
- int score, badness;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
rcu_read_lock();
if (hslot->count > 10) {
@@ -486,13 +500,24 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
}
begin:
result = NULL;
- badness = -1;
+ badness = 0;
sk_nulls_for_each_rcu(sk, node, &hslot->head) {
score = compute_score(sk, net, saddr, hnum, sport,
daddr, dport, dif);
if (score > badness) {
result = sk;
badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = inet_ehashfn(net, daddr, hnum,
+ saddr, htons(sport));
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (((u64)hash * matches) >> 32 == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
}
}
/*
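The SO_REUSEPORT selection in the two lookup loops above is single-pass reservoir sampling: for the k-th socket tying the best score, ((u64)hash * matches) >> 32 maps the 32-bit hash into [0, matches), so the candidate replaces the current result with probability 1/k, leaving every tied socket equally likely. A runnable demonstration; xorshift32() merely stands in for the inet_ehashfn()/next_pseudo_random32() chain:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t xorshift32(uint32_t *s)
    {
        uint32_t x = *s;

        x ^= x << 13; x ^= x >> 17; x ^= x << 5;
        return *s = x;
    }

    int main(void)
    {
        uint32_t seed = 0x12345678;
        int picks[4] = { 0 };

        for (int trial = 0; trial < 100000; trial++) {
            int result = 0, matches = 0;

            for (int sk = 0; sk < 4; sk++) {  /* 4 equal-score sockets */
                matches++;
                if (((uint64_t)xorshift32(&seed) * matches) >> 32 == 0)
                    result = sk;
            }
            picks[result]++;
        }
        for (int i = 0; i < 4; i++)
            printf("socket %d picked %d times\n", i, picks[i]); /* ~25000 each */
        return 0;
    }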
@@ -971,7 +996,7 @@ back_from_confirm:
sizeof(struct udphdr), &ipc, &rt,
msg->msg_flags);
err = PTR_ERR(skb);
- if (skb && !IS_ERR(skb))
+ if (!IS_ERR_OR_NULL(skb))
err = udp_send_skb(skb, fl4);
goto out;
}
@@ -1952,6 +1977,7 @@ struct proto udp_prot = {
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.backlog_rcv = __udp_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
@@ -2096,7 +2122,7 @@ EXPORT_SYMBOL(udp_proc_register);
void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
{
- proc_net_remove(net, afinfo->name);
+ remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(udp_proc_unregister);
@@ -2279,7 +2305,8 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216d..1f12c8b4586 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -132,7 +132,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
* header and optional ESP marker bytes) and then modify the
* protocol to ESP, and then call into the transform receiver.
*/
- if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto drop;
/* Now we can update and verify the packet length... */
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ddee0a099a2..fe5189e2e11 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -142,8 +142,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
for_each_input_rcu(rcv_notify_handlers, handler)
handler->handler(skb);
- if (skb_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
goto out;
if (x->props.flags & XFRM_STATE_DECAP_DSCP)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 05c5ab8d983..9a459be24af 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -262,43 +262,66 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static struct ctl_table_header *sysctl_hdr;
-#endif
-
-static void __init xfrm4_policy_init(void)
+static int __net_init xfrm4_net_init(struct net *net)
{
- xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = xfrm4_policy_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.xfrm4_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
}
-static void __exit xfrm4_policy_fini(void)
+static void __net_exit xfrm4_net_exit(struct net *net)
{
-#ifdef CONFIG_SYSCTL
- if (sysctl_hdr)
- unregister_net_sysctl_table(sysctl_hdr);
+ struct ctl_table *table;
+
+ if (net->ipv4.xfrm4_hdr == NULL)
+ return;
+
+ table = net->ipv4.xfrm4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations __net_initdata xfrm4_net_ops = {
+ .init = xfrm4_net_init,
+ .exit = xfrm4_net_exit,
+};
#endif
- xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
+
+static void __init xfrm4_policy_init(void)
+{
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
}
-void __init xfrm4_init(int rt_max_size)
+void __init xfrm4_init(void)
{
- /*
- * Select a default value for the gc_thresh based on the main route
- * table hash size. It seems to me the worst case scenario is when
- * we have ipsec operating in transport mode, in which we create a
- * dst_entry per socket. The xfrm gc algorithm starts trying to remove
- * entries at gc_thresh, and prevents new allocations as 2*gc_thresh
- * so lets set an initial xfrm gc_thresh value at the rt_max_size/2.
- * That will let us store an ipsec connection per route table entry,
- * and start cleaning when were 1/2 full
- */
- xfrm4_dst_ops.gc_thresh = rt_max_size/2;
dst_entries_init(&xfrm4_dst_ops);
xfrm4_state_init();
xfrm4_policy_init();
#ifdef CONFIG_SYSCTL
- sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4",
- xfrm4_policy_table);
+ register_pernet_subsys(&xfrm4_net_ops);
#endif
}