summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2012-06-15 12:32:04 -0700
committerSage Weil <sage@inktank.com>2012-06-15 12:32:04 -0700
commit9a64e8e0ace51b309fdcff4b4754b3649250382a (patch)
tree1f0d75c196c5ab0408c55ed6cf3a152f1f921e15 /net/ipv4
parentf3dea7edd3d449fe7a6d402c1ce56a294b985261 (diff)
parentf8f5701bdaf9134b1f90e5044a82c66324d2073f (diff)
Merge tag 'v3.5-rc1'
Linux 3.5-rc1 Conflicts: net/ceph/messenger.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig8
-rw-r--r--net/ipv4/af_inet.c33
-rw-r--r--net/ipv4/ah4.c23
-rw-r--r--net/ipv4/arp.c29
-rw-r--r--net/ipv4/cipso_ipv4.c11
-rw-r--r--net/ipv4/devinet.c68
-rw-r--r--net/ipv4/esp4.c34
-rw-r--r--net/ipv4/fib_frontend.c17
-rw-r--r--net/ipv4/fib_rules.c16
-rw-r--r--net/ipv4/fib_semantics.c62
-rw-r--r--net/ipv4/fib_trie.c8
-rw-r--r--net/ipv4/gre.c6
-rw-r--r--net/ipv4/icmp.c30
-rw-r--r--net/ipv4/igmp.c19
-rw-r--r--net/ipv4/inet_connection_sock.c32
-rw-r--r--net/ipv4/inet_diag.c24
-rw-r--r--net/ipv4/inet_hashtables.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c6
-rw-r--r--net/ipv4/ip_forward.c4
-rw-r--r--net/ipv4/ip_fragment.c54
-rw-r--r--net/ipv4/ip_gre.c131
-rw-r--r--net/ipv4/ip_input.c30
-rw-r--r--net/ipv4/ip_options.c34
-rw-r--r--net/ipv4/ip_output.c5
-rw-r--r--net/ipv4/ip_sockglue.c63
-rw-r--r--net/ipv4/ipcomp.c8
-rw-r--r--net/ipv4/ipconfig.c124
-rw-r--r--net/ipv4/ipip.c64
-rw-r--r--net/ipv4/ipmr.c15
-rw-r--r--net/ipv4/netfilter.c12
-rw-r--r--net/ipv4/netfilter/Kconfig9
-rw-r--r--net/ipv4/netfilter/Makefile4
-rw-r--r--net/ipv4/netfilter/arp_tables.c7
-rw-r--r--net/ipv4/netfilter/ip_queue.c639
-rw-r--r--net/ipv4/netfilter/ip_tables.c5
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c3
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c516
-rw-r--r--net/ipv4/netfilter/iptable_filter.c9
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c19
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c68
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c40
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c8
-rw-r--r--net/ipv4/ping.c43
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/raw.c12
-rw-r--r--net/ipv4/route.c194
-rw-r--r--net/ipv4/sysctl_net_ipv4.c26
-rw-r--r--net/ipv4/tcp.c352
-rw-r--r--net/ipv4/tcp_cong.c15
-rw-r--r--net/ipv4/tcp_hybla.c10
-rw-r--r--net/ipv4/tcp_input.c843
-rw-r--r--net/ipv4/tcp_ipv4.c431
-rw-r--r--net/ipv4/tcp_memcontrol.c111
-rw-r--r--net/ipv4/tcp_minisocks.c37
-rw-r--r--net/ipv4/tcp_output.c158
-rw-r--r--net/ipv4/tcp_probe.c8
-rw-r--r--net/ipv4/tcp_timer.c19
-rw-r--r--net/ipv4/tunnel4.c8
-rw-r--r--net/ipv4/udp.c100
-rw-r--r--net/ipv4/udp_diag.c9
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv4/udplite.c7
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_tunnel.c16
66 files changed, 2089 insertions, 2636 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d183262943d..20f1cb5c8ab 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -262,8 +262,8 @@ config ARPD
bool "IP: ARP daemon support"
---help---
The kernel maintains an internal cache which maps IP addresses to
- hardware addresses on the local network, so that Ethernet/Token Ring/
- etc. frames are sent to the proper address on the physical networking
+ hardware addresses on the local network, so that Ethernet
+ frames are sent to the proper address on the physical networking
layer. Normally, kernel uses the ARP protocol to resolve these
mappings.
@@ -312,7 +312,7 @@ config SYN_COOKIES
config INET_AH
tristate "IP: AH transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_MD5
@@ -324,7 +324,7 @@ config INET_AH
config INET_ESP
tristate "IP: ESP transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
select CRYPTO_AUTHENC
select CRYPTO_HMAC
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f7b5670744f..c8f7aee587d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -65,6 +65,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -89,7 +91,6 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/inet.h>
#include <linux/igmp.h>
@@ -349,7 +350,7 @@ lookup_protocol:
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
+ sk->sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
@@ -381,6 +382,7 @@ lookup_protocol:
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
+ inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
@@ -539,7 +541,7 @@ out:
}
EXPORT_SYMBOL(inet_bind);
-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -1084,13 +1086,11 @@ out:
return;
out_permanent:
- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
- protocol);
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
- printk(KERN_ERR
- "Ignoring attempt to register invalid socket type %d.\n",
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
@@ -1099,8 +1099,7 @@ EXPORT_SYMBOL(inet_register_protosw);
void inet_unregister_protosw(struct inet_protosw *p)
{
if (INET_PROTOSW_PERMANENT & p->flags) {
- printk(KERN_ERR
- "Attempt to unregister permanent protocol %d.\n",
+ pr_err("Attempt to unregister permanent protocol %d\n",
p->protocol);
} else {
spin_lock_bh(&inetsw_lock);
@@ -1149,8 +1148,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
return 0;
if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
- __func__, &old_saddr, &new_saddr);
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
}
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
@@ -1679,14 +1678,14 @@ static int __init inet_init(void)
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+ pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+ pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+ pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+ pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
@@ -1733,14 +1732,14 @@ static int __init inet_init(void)
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
+ pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
+ pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 36d14406261..e8f2617ecd4 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -75,7 +77,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
{
- unsigned char * optptr = (unsigned char*)(iph+1);
+ unsigned char *optptr = (unsigned char *)(iph+1);
int l = iph->ihl*4 - sizeof(struct iphdr);
int optlen;
@@ -404,8 +406,8 @@ static void ah4_err(struct sk_buff *skb, u32 info)
ah->spi, IPPROTO_AH, AF_INET);
if (!x)
return;
- printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
- ntohl(ah->spi), ntohl(iph->daddr));
+ pr_debug("pmtu discovery on SA AH/%08x/%08x\n",
+ ntohl(ah->spi), ntohl(iph->daddr));
xfrm_state_put(x);
}
@@ -445,9 +447,10 @@ static int ah_init_state(struct xfrm_state *x)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_ahash_digestsize(ahash)) {
- printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_ahash_digestsize(ahash),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ pr_info("%s: %s digestsize %u != %hu\n",
+ __func__, x->aalg->alg_name,
+ crypto_ahash_digestsize(ahash),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
goto error;
}
@@ -510,11 +513,11 @@ static const struct net_protocol ah4_protocol = {
static int __init ah4_init(void)
{
if (xfrm_register_type(&ah_type, AF_INET) < 0) {
- printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
- printk(KERN_INFO "ip ah init: can't add protocol\n");
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ah_type, AF_INET);
return -EAGAIN;
}
@@ -524,9 +527,9 @@ static int __init ah4_init(void)
static void __exit ah4_fini(void)
{
if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
- printk(KERN_INFO "ip ah close: can't remove protocol\n");
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
- printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(ah4_init);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 63e49890ad3..cda37be02f8 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -73,6 +73,8 @@
* Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -89,7 +91,6 @@
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
#include <linux/if_arp.h>
-#include <linux/trdevice.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -113,7 +114,6 @@
#include <net/ax25.h>
#include <net/netrom.h>
-#include <asm/system.h>
#include <linux/uaccess.h>
#include <linux/netfilter_arp.h>
@@ -194,9 +194,6 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
case ARPHRD_IEEE802:
ip_eth_mc_map(addr, haddr);
return 0;
- case ARPHRD_IEEE802_TR:
- ip_tr_mc_map(addr, haddr);
- return 0;
case ARPHRD_INFINIBAND:
ip_ib_mc_map(addr, dev->broadcast, haddr);
return 0;
@@ -365,8 +362,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
probes -= neigh->parms->ucast_probes;
if (probes < 0) {
if (!(neigh->nud_state & NUD_VALID))
- printk(KERN_DEBUG
- "trying to ucast probe in NUD_INVALID\n");
+ pr_debug("trying to ucast probe in NUD_INVALID\n");
dst_ha = neigh->ha;
read_lock_bh(&neigh->lock);
} else {
@@ -453,7 +449,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr,
{
switch (addr_hint) {
case RTN_LOCAL:
- printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+ pr_debug("arp called for own IP address\n");
memcpy(haddr, dev->dev_addr, dev->addr_len);
return 1;
case RTN_MULTICAST:
@@ -474,7 +470,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
struct neighbour *n;
if (!skb_dst(skb)) {
- printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+ pr_debug("arp_find is called with dst==NULL\n");
kfree_skb(skb);
return 1;
}
@@ -649,12 +645,6 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp->ar_pro = htons(ETH_P_IP);
break;
#endif
-#if IS_ENABLED(CONFIG_TR)
- case ARPHRD_IEEE802_TR:
- arp->ar_hrd = htons(ARPHRD_IEEE802);
- arp->ar_pro = htons(ETH_P_IP);
- break;
-#endif
}
arp->ar_hln = dev->addr_len;
@@ -752,11 +742,10 @@ static int arp_process(struct sk_buff *skb)
goto out;
break;
case ARPHRD_ETHER:
- case ARPHRD_IEEE802_TR:
case ARPHRD_FDDI:
case ARPHRD_IEEE802:
/*
- * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
@@ -889,7 +878,7 @@ static int arp_process(struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
- if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) {
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
@@ -1060,7 +1049,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
err = PTR_ERR(neigh);
if (!IS_ERR(neigh)) {
- unsigned state = NUD_STALE;
+ unsigned int state = NUD_STALE;
if (r->arp_flags & ATF_PERM)
state = NUD_PERMANENT;
err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
@@ -1072,7 +1061,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
return err;
}
-static unsigned arp_state_to_flags(struct neighbour *neigh)
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
{
if (neigh->nud_state&NUD_PERMANENT)
return ATF_PERM | ATF_COM;
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 86f3b885b4f..c48adc565e9 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1857,11 +1857,6 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
return CIPSO_V4_HDR_LEN + ret_val;
}
-static void opt_kfree_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct ip_options_rcu, rcu));
-}
-
/**
* cipso_v4_sock_setattr - Add a CIPSO option to a socket
* @sk: the socket
@@ -1938,7 +1933,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
}
rcu_assign_pointer(sk_inet->inet_opt, opt);
if (old)
- call_rcu(&old->rcu, opt_kfree_rcu);
+ kfree_rcu(old, rcu);
return 0;
@@ -2005,7 +2000,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
req_inet = inet_rsk(req);
opt = xchg(&req_inet->opt, opt);
if (opt)
- call_rcu(&opt->rcu, opt_kfree_rcu);
+ kfree_rcu(opt, rcu);
return 0;
@@ -2075,7 +2070,7 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
* remove the entire option struct */
*opt_ptr = NULL;
hdr_delta = opt->opt.optlen;
- call_rcu(&opt->rcu, opt_kfree_rcu);
+ kfree_rcu(opt, rcu);
}
return hdr_delta;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e41c40f48cf..10e15a144e9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -27,7 +27,6 @@
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
@@ -218,8 +217,7 @@ void in_dev_finish_destroy(struct in_device *idev)
WARN_ON(idev->ifa_list);
WARN_ON(idev->mc_list);
#ifdef NET_REFCNT_DEBUG
- printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
- idev, dev ? dev->name : "NIL");
+ pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
dev_put(dev);
if (!idev->dead)
@@ -1079,6 +1077,7 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
return addr;
}
+EXPORT_SYMBOL(inet_confirm_addr);
/*
* Device notifier
@@ -1125,7 +1124,7 @@ skip:
}
}
-static inline bool inetdev_valid_mtu(unsigned mtu)
+static inline bool inetdev_valid_mtu(unsigned int mtu)
{
return mtu >= 68;
}
@@ -1174,7 +1173,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
switch (event) {
case NETDEV_REGISTER:
- printk(KERN_DEBUG "inetdev_event: bug\n");
+ pr_debug("%s: bug\n", __func__);
RCU_INIT_POINTER(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
@@ -1266,17 +1265,15 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
- if (ifa->ifa_address)
- NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
-
- if (ifa->ifa_local)
- NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
-
- if (ifa->ifa_broadcast)
- NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
-
- if (ifa->ifa_label[0])
- NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+ if ((ifa->ifa_address &&
+ nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_broadcast &&
+ nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)))
+ goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -1587,7 +1584,6 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
- char *dev_name;
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -1629,16 +1625,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
{
int i;
struct devinet_sysctl_table *t;
-
-#define DEVINET_CTL_PATH_DEV 3
-
- struct ctl_path devinet_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "conf", },
- { /* to be set */ },
- { },
- };
+ char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
if (!t)
@@ -1650,27 +1637,15 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
t->devinet_vars[i].extra2 = net;
}
- /*
- * Make a copy of dev_name, because '.procname' is regarded as const
- * by sysctl and we wouldn't want anyone to change it under our feet
- * (see SIOCSIFNAME).
- */
- t->dev_name = kstrdup(dev_name, GFP_KERNEL);
- if (!t->dev_name)
- goto free;
-
- devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
+ snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
- t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
- t->devinet_vars);
+ t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
if (!t->sysctl_header)
- goto free_procname;
+ goto free;
p->sysctl = t;
return 0;
-free_procname:
- kfree(t->dev_name);
free:
kfree(t);
out:
@@ -1686,7 +1661,6 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
cnf->sysctl = NULL;
unregister_net_sysctl_table(t->sysctl_header);
- kfree(t->dev_name);
kfree(t);
}
@@ -1716,12 +1690,6 @@ static struct ctl_table ctl_forward_entry[] = {
},
{ },
};
-
-static __net_initdata struct ctl_path net_ipv4_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
#endif
static __net_init int devinet_init_net(struct net *net)
@@ -1767,7 +1735,7 @@ static __net_init int devinet_init_net(struct net *net)
goto err_reg_dflt;
err = -ENOMEM;
- forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
+ forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
if (forw_hdr == NULL)
goto err_reg_ctl;
net->ipv4.forw_hdr = forw_hdr;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index a5b413416da..cb982a61536 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
#include <crypto/aead.h>
#include <crypto/authenc.h>
#include <linux/err.h>
@@ -457,28 +459,22 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
struct esp_data *esp = x->data;
u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
u32 align = max_t(u32, blksize, esp->padlen);
- u32 rem;
-
- mtu -= x->props.header_len + crypto_aead_authsize(esp->aead);
- rem = mtu & (align - 1);
- mtu &= ~(align - 1);
+ unsigned int net_adj;
switch (x->props.mode) {
- case XFRM_MODE_TUNNEL:
- break;
- default:
case XFRM_MODE_TRANSPORT:
- /* The worst case */
- mtu -= blksize - 4;
- mtu += min_t(u32, blksize - 4, rem);
- break;
case XFRM_MODE_BEET:
- /* The worst case. */
- mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem);
+ net_adj = sizeof(struct iphdr);
+ break;
+ case XFRM_MODE_TUNNEL:
+ net_adj = 0;
break;
+ default:
+ BUG();
}
- return mtu - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
+ net_adj) & ~(align - 1)) + (net_adj - 2);
}
static void esp4_err(struct sk_buff *skb, u32 info)
@@ -706,11 +702,11 @@ static const struct net_protocol esp4_protocol = {
static int __init esp4_init(void)
{
if (xfrm_register_type(&esp_type, AF_INET) < 0) {
- printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
- printk(KERN_INFO "ip esp init: can't add protocol\n");
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&esp_type, AF_INET);
return -EAGAIN;
}
@@ -720,9 +716,9 @@ static int __init esp4_init(void)
static void __exit esp4_fini(void)
{
if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
- printk(KERN_INFO "ip esp close: can't remove protocol\n");
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
- printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(esp4_init);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 92fc5f69f5d..3854411fa37 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -15,7 +15,6 @@
#include <linux/module.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
@@ -137,13 +136,13 @@ static void fib_flush(struct net *net)
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
*/
-static inline unsigned __inet_dev_addr_type(struct net *net,
- const struct net_device *dev,
- __be32 addr)
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
{
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
- unsigned ret = RTN_BROADCAST;
+ unsigned int ret = RTN_BROADCAST;
struct fib_table *local_table;
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
@@ -695,7 +694,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
if (prim == NULL) {
- printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
}
@@ -741,7 +740,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
- unsigned ok = 0;
+ unsigned int ok = 0;
int subnet = 0; /* Primary network */
int gone = 1; /* Address is missing */
int same_prefsrc = 0; /* Another primary with same IP */
@@ -749,11 +748,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (prim == NULL) {
- printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
if (iprim && iprim != prim) {
- printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
+ pr_warn("%s: bug: iprim != prim\n", __func__);
return;
}
} else if (!ipv4_is_zeronet(any) &&
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 799fc790b3c..2d043f71ef7 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -221,15 +221,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->src_len = rule4->src_len;
frh->tos = rule4->tos;
- if (rule4->dst_len)
- NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
-
- if (rule4->src_len)
- NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
-
+ if ((rule4->dst_len &&
+ nla_put_be32(skb, FRA_DST, rule4->dst)) ||
+ (rule4->src_len &&
+ nla_put_be32(skb, FRA_SRC, rule4->src)))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (rule4->tclassid)
- NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
+ if (rule4->tclassid &&
+ nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
+ goto nla_put_failure;
#endif
return 0;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 80106d89d54..e5b7182fa09 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -14,7 +14,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -146,6 +145,12 @@ static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_dev)
+ dev_put(nexthop_nh->nh_dev);
+ } endfor_nexthops(fi);
+
+ release_net(fi->fib_net);
if (fi->fib_metrics != (u32 *) dst_default_metrics)
kfree(fi->fib_metrics);
kfree(fi);
@@ -154,16 +159,10 @@ static void free_fib_info_rcu(struct rcu_head *head)
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- pr_warning("Freeing alive fib_info %p\n", fi);
+ pr_warn("Freeing alive fib_info %p\n", fi);
return;
}
- change_nexthops(fi) {
- if (nexthop_nh->nh_dev)
- dev_put(nexthop_nh->nh_dev);
- nexthop_nh->nh_dev = NULL;
- } endfor_nexthops(fi);
fib_info_cnt--;
- release_net(fi->fib_net);
call_rcu(&fi->rcu, free_fib_info_rcu);
}
@@ -932,33 +931,36 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtm->rtm_table = tb_id;
else
rtm->rtm_table = RT_TABLE_COMPAT;
- NLA_PUT_U32(skb, RTA_TABLE, tb_id);
+ if (nla_put_u32(skb, RTA_TABLE, tb_id))
+ goto nla_put_failure;
rtm->rtm_type = type;
rtm->rtm_flags = fi->fib_flags;
rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
- if (rtm->rtm_dst_len)
- NLA_PUT_BE32(skb, RTA_DST, dst);
-
- if (fi->fib_priority)
- NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
-
+ if (rtm->rtm_dst_len &&
+ nla_put_be32(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+ goto nla_put_failure;
if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
goto nla_put_failure;
- if (fi->fib_prefsrc)
- NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
-
+ if (fi->fib_prefsrc &&
+ nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ goto nla_put_failure;
if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
-
- if (fi->fib_nh->nh_oif)
- NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
+ if (fi->fib_nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+ goto nla_put_failure;
+ if (fi->fib_nh->nh_oif &&
+ nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (fi->fib_nh[0].nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
+ if (fi->fib_nh[0].nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
+ goto nla_put_failure;
#endif
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -979,11 +981,13 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
- if (nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
+ if (nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (nh->nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
#endif
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2b555a5521e..30b88d7b4bd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -51,7 +51,6 @@
#define VERSION "0.409"
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -1170,9 +1169,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
}
if (tp && tp->pos + tp->bits > 32)
- pr_warning("fib_trie"
- " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
+ pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+ tp, tp->pos, tp->bits, key, plen);
/* Rebalance the trie */
@@ -1372,6 +1370,8 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
continue;
+ if (fi->fib_dead)
+ continue;
if (fa->fa_info->fib_scope < flp->flowi4_scope)
continue;
fib_alias_accessed(fa);
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 8cb1ebb7cd7..42a491055c7 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
@@ -118,10 +120,10 @@ static const struct net_protocol net_gre_protocol = {
static int __init gre_init(void)
{
- pr_info("GRE over IPv4 demultiplexor driver");
+ pr_info("GRE over IPv4 demultiplexor driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
- pr_err("gre: can't add protocol\n");
+ pr_err("can't add protocol\n");
return -EAGAIN;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index ab188ae12fd..c75efbdc71c 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -62,6 +62,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/jiffies.h>
@@ -89,7 +91,6 @@
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -670,7 +671,7 @@ static void icmp_unreach(struct sk_buff *skb)
break;
case ICMP_FRAG_NEEDED:
if (ipv4_config.no_pmtu_disc) {
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n",
+ LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
&iph->daddr);
} else {
info = ip_rt_frag_needed(net, iph,
@@ -681,7 +682,7 @@ static void icmp_unreach(struct sk_buff *skb)
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n",
+ LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"),
&iph->daddr);
break;
default:
@@ -712,14 +713,10 @@ static void icmp_unreach(struct sk_buff *skb)
if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
- if (net_ratelimit())
- printk(KERN_WARNING "%pI4 sent an invalid ICMP "
- "type %u, code %u "
- "error to a broadcast: %pI4 on %s\n",
- &ip_hdr(skb)->saddr,
- icmph->type, icmph->code,
- &iph->daddr,
- skb->dev->name);
+ net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+ &ip_hdr(skb)->saddr,
+ icmph->type, icmph->code,
+ &iph->daddr, skb->dev->name);
goto out;
}
@@ -908,8 +905,7 @@ out_err:
static void icmp_address(struct sk_buff *skb)
{
#if 0
- if (net_ratelimit())
- printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
+ net_dbg_ratelimited("a guy asks for address mask. Who is it?\n");
#endif
}
@@ -945,10 +941,10 @@ static void icmp_address_reply(struct sk_buff *skb)
inet_ifa_match(ip_hdr(skb)->saddr, ifa))
break;
}
- if (!ifa && net_ratelimit()) {
- printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
- mp, dev->name, &ip_hdr(skb)->saddr);
- }
+ if (!ifa)
+ net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n",
+ mp,
+ dev->name, &ip_hdr(skb)->saddr);
}
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 450e5d21ed2..6699f23e6f5 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -73,7 +73,6 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -345,10 +344,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
ip_select_ident(pip, &rt->dst, NULL);
- ((u8*)&pip[1])[0] = IPOPT_RA;
- ((u8*)&pip[1])[1] = 4;
- ((u8*)&pip[1])[2] = 0;
- ((u8*)&pip[1])[3] = 0;
+ ((u8 *)&pip[1])[0] = IPOPT_RA;
+ ((u8 *)&pip[1])[1] = 4;
+ ((u8 *)&pip[1])[2] = 0;
+ ((u8 *)&pip[1])[3] = 0;
skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
skb_put(skb, sizeof(*pig));
@@ -689,10 +688,10 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
ip_select_ident(iph, &rt->dst, NULL);
- ((u8*)&iph[1])[0] = IPOPT_RA;
- ((u8*)&iph[1])[1] = 4;
- ((u8*)&iph[1])[2] = 0;
- ((u8*)&iph[1])[3] = 0;
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
ih->type = type;
@@ -775,7 +774,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
if (psf->sf_count[MCAST_INCLUDE] ||
pmc->sfcount[MCAST_EXCLUDE] !=
psf->sf_count[MCAST_EXCLUDE])
- continue;
+ break;
if (srcs[i] == psf->sf_inaddr) {
scount++;
break;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19d66cefd7d..f9ee7417f6a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -42,7 +42,8 @@ EXPORT_SYMBOL(sysctl_local_reserved_ports);
void inet_get_local_port_range(int *low, int *high)
{
- unsigned seq;
+ unsigned int seq;
+
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -53,7 +54,7 @@ void inet_get_local_port_range(int *low, int *high)
EXPORT_SYMBOL(inet_get_local_port_range);
int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
+ const struct inet_bind_bucket *tb, bool relax)
{
struct sock *sk2;
struct hlist_node *node;
@@ -79,6 +80,14 @@ int inet_csk_bind_conflict(const struct sock *sk,
sk2_rcv_saddr == sk_rcv_saddr(sk))
break;
}
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+ const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
+
+ if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
+ sk2_rcv_saddr == sk_rcv_saddr(sk))
+ break;
+ }
}
}
return node != NULL;
@@ -122,12 +131,13 @@ again:
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
- if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
+ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+ !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = smallest_rover;
goto tb_found;
}
}
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = rover;
goto tb_found;
}
@@ -172,18 +182,22 @@ have_snum:
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse == SK_FORCE_REUSE)
+ goto success;
+
if (tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
smallest_size == -1) {
goto success;
} else {
ret = 1;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
}
+
goto fail_unlock;
}
}
@@ -363,7 +377,8 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
- sk->sk_protocol, inet_sk_flowi_flags(sk),
+ sk->sk_protocol,
+ inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
@@ -514,7 +529,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
+ * If synack was not acknowledged for 1 second, it means
* one of the following things: synack was lost, ack was lost,
* rtt is high or nobody planned to ack (i.e. synflood).
* When server is a bit loaded, queue is populated with old
@@ -555,8 +570,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
syn_ack_recalc(req, thresh, max_retries,
queue->rskq_defer_accept,
&expire, &resend);
- if (req->rsk_ops->syn_ack_timeout)
- req->rsk_ops->syn_ack_timeout(parent, req);
+ req->rsk_ops->syn_ack_timeout(parent, req);
if (!expire &&
(!resend ||
!req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index fcf281819cd..46d1e7199a8 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -141,7 +141,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto rtattr_failure;
if (icsk == NULL) {
- r->idiag_rqueue = r->idiag_wqueue = 0;
+ handler->idiag_get_info(sk, r, NULL);
goto out;
}
@@ -960,9 +960,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
}
-
- return netlink_dump_start(sock_diag_nlsk, skb, nlh,
- inet_diag_dump_compat, NULL, 0);
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c);
+ }
}
return inet_diag_get_exact_compat(skb, nlh);
@@ -985,20 +988,23 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
}
-
- return netlink_dump_start(sock_diag_nlsk, skb, h,
- inet_diag_dump, NULL, 0);
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
+ }
}
return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h));
}
-static struct sock_diag_handler inet_diag_handler = {
+static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
.dump = inet_diag_handler_dump,
};
-static struct sock_diag_handler inet6_diag_handler = {
+static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
.dump = inet_diag_handler_dump,
};
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 984ec656b03..7880af97020 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -217,7 +217,7 @@ begin:
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
-struct sock * __inet_lookup_established(struct net *net,
+struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 89168c6351f..2784db3155f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -89,8 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
#ifdef SOCK_REFCNT_DEBUG
if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
- tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+ pr_debug("%s timewait_sock %p refcnt=%d\n",
+ tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
}
#endif
while (refcnt) {
@@ -263,7 +263,7 @@ rescan:
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
- int unsigned need_timer;
+ unsigned int need_timer;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 29a07b6c716..e5c44fc586a 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -41,7 +41,7 @@
static int ip_forward_finish(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
@@ -55,7 +55,7 @@ int ip_forward(struct sk_buff *skb)
{
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
if (skb_warn_if_lro(skb))
goto drop;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1f23a57aa9e..9dbd3dd6022 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -20,6 +20,8 @@
* Patrick McHardy : LRU queue of frag heads for evictor.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -146,17 +148,17 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
-static int ip4_frag_match(struct inet_frag_queue *q, void *a)
+static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
{
struct ipq *qp;
struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
- qp->saddr == arg->iph->saddr &&
- qp->daddr == arg->iph->daddr &&
- qp->protocol == arg->iph->protocol &&
- qp->user == arg->user;
+ qp->saddr == arg->iph->saddr &&
+ qp->daddr == arg->iph->daddr &&
+ qp->protocol == arg->iph->protocol &&
+ qp->user == arg->user;
}
/* Memory Tracking Functions. */
@@ -299,7 +301,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
return container_of(q, struct ipq, q);
out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
+ LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n"));
return NULL;
}
@@ -543,6 +545,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
+ int sum_truesize;
u8 ecn;
ipq_kill(qp);
@@ -567,7 +570,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
skb_morph(head, qp->q.fragments);
head->next = qp->q.fragments->next;
- kfree_skb(qp->q.fragments);
+ consume_skb(qp->q.fragments);
qp->q.fragments = head;
}
@@ -609,19 +612,32 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
atomic_add(clone->truesize, &qp->q.net->mem);
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
+
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
}
- atomic_sub(head->truesize, &qp->q.net->mem);
+ atomic_sub(sum_truesize, &qp->q.net->mem);
head->next = NULL;
head->dev = dev;
@@ -637,14 +653,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
return 0;
out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
- "queue %p\n", qp);
+ LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
+ qp);
err = -ENOMEM;
goto out_fail;
out_oversize:
- if (net_ratelimit())
- printk(KERN_INFO "Oversized IP packet from %pI4.\n",
- &qp->saddr);
+ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
return err;
@@ -781,7 +795,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[2].data = &net->ipv4.frags.timeout;
}
- hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
+ hdr = register_net_sysctl(net, "net/ipv4", table);
if (hdr == NULL)
goto err_reg;
@@ -806,7 +820,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
static void ip4_frags_ctl_register(void)
{
- register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
+ register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
#else
static inline int ip4_frags_ns_ctl_register(struct net *net)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38673d2860e..f49047b7960 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -167,37 +169,56 @@ struct ipgre_net {
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long tx_packets;
- unsigned long tx_bytes;
-} __attribute__((aligned(4*sizeof(unsigned long))));
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+};
-static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
{
- struct pcpu_tstats sum = { 0 };
int i;
for_each_possible_cpu(i) {
const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
-
- sum.rx_packets += tstats->rx_packets;
- sum.rx_bytes += tstats->rx_bytes;
- sum.tx_packets += tstats->tx_packets;
- sum.tx_bytes += tstats->tx_bytes;
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
}
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
+
+ tot->multicast = dev->stats.multicast;
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ return tot;
}
/* Given src, dst and key, find appropriate for input tunnel. */
-static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
- __be32 remote, __be32 local,
- __be32 key, __be16 gre_proto)
+static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
+ __be32 remote, __be32 local,
+ __be32 key, __be16 gre_proto)
{
struct net *net = dev_net(dev);
int link = dev->ifindex;
@@ -462,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
*/
const struct iphdr *iph = (const struct iphdr *)skb->data;
- __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
+ __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
int grehlen = (iph->ihl<<2) + 4;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
@@ -572,7 +593,7 @@ static int ipgre_rcv(struct sk_buff *skb)
iph = ip_hdr(skb);
h = skb->data;
- flags = *(__be16*)h;
+ flags = *(__be16 *)h;
if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
/* - Version must be 0.
@@ -596,11 +617,11 @@ static int ipgre_rcv(struct sk_buff *skb)
offset += 4;
}
if (flags&GRE_KEY) {
- key = *(__be32*)(h + offset);
+ key = *(__be32 *)(h + offset);
offset += 4;
}
if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32*)(h + offset));
+ seqno = ntohl(*(__be32 *)(h + offset));
offset += 4;
}
}
@@ -670,8 +691,10 @@ static int ipgre_rcv(struct sk_buff *skb)
}
tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
tstats->rx_packets++;
tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
__skb_tunnel_rx(skb, tunnel->dev);
@@ -730,15 +753,16 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if (skb->protocol == htons(ETH_P_IP)) {
rt = skb_rtable(skb);
- if ((dst = rt->rt_gateway) == 0)
- goto tx_error_icmp;
+ dst = rt->rt_gateway;
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct neighbour *neigh = dst_get_neighbour_noref(skb_dst(skb));
const struct in6_addr *addr6;
+ struct neighbour *neigh;
+ bool do_tx_error_icmp;
int addr_type;
+ neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
if (neigh == NULL)
goto tx_error;
@@ -751,9 +775,14 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ do_tx_error_icmp = true;
+ else {
+ do_tx_error_icmp = false;
+ dst = addr6->s6_addr32[3];
+ }
+ neigh_release(neigh);
+ if (do_tx_error_icmp)
goto tx_error_icmp;
-
- dst = addr6->s6_addr32[3];
}
#endif
else
@@ -892,7 +921,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
htons(ETH_P_TEB) : skb->protocol;
if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
- __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
+ __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
if (tunnel->parms.o_flags&GRE_SEQ) {
++tunnel->o_seqno;
@@ -905,7 +934,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
if (tunnel->parms.o_flags&GRE_CSUM) {
*ptr = 0;
- *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
+ *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
}
}
@@ -914,9 +943,10 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
__IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
+#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
dst_link_failure(skb);
-
+#endif
tx_error:
dev->stats.tx_errors++;
dev_kfree_skb(skb);
@@ -1160,7 +1190,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
{
struct ip_tunnel *t = netdev_priv(dev);
struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
- __be16 *p = (__be16*)(iph+1);
+ __be16 *p = (__be16 *)(iph+1);
memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
p[0] = t->parms.o_flags;
@@ -1244,7 +1274,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats = ipgre_get_stats,
+ .ndo_get_stats64 = ipgre_get_stats64,
};
static void ipgre_dev_free(struct net_device *dev)
@@ -1498,7 +1528,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats = ipgre_get_stats,
+ .ndo_get_stats64 = ipgre_get_stats64,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -1529,7 +1559,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
return -EEXIST;
if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
- random_ether_addr(dev->dev_addr);
+ eth_hw_addr_random(dev);
mtu = ipgre_tunnel_bind_dev(dev);
if (!tb[IFLA_MTU])
@@ -1645,17 +1675,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm *p = &t->parms;
- NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
- NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
- NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
- NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
- NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
- NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
- NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
- NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
- NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
- NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
-
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+ nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+ nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+ !!(p->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -1709,7 +1740,7 @@ static int __init ipgre_init(void)
{
int err;
- printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+ pr_info("GRE over IPv4 tunneling driver\n");
err = register_pernet_device(&ipgre_net_ops);
if (err < 0)
@@ -1717,7 +1748,7 @@ static int __init ipgre_init(void)
err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
- printk(KERN_INFO "ipgre init: can't add protocol\n");
+ pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed;
}
@@ -1746,7 +1777,7 @@ static void __exit ipgre_fini(void)
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
- printk(KERN_INFO "ipgre close: can't remove protocol\n");
+ pr_info("%s: can't remove protocol\n", __func__);
unregister_pernet_device(&ipgre_net_ops);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 073a9b01c40..8590144ca33 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -113,7 +113,8 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/system.h>
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -148,7 +149,7 @@
/*
* Process Router Attention IP option (RFC 2113)
*/
-int ip_call_ra_chain(struct sk_buff *skb)
+bool ip_call_ra_chain(struct sk_buff *skb)
{
struct ip_ra_chain *ra;
u8 protocol = ip_hdr(skb)->protocol;
@@ -167,7 +168,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
net_eq(sock_net(sk), dev_net(dev))) {
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
- return 1;
+ return true;
}
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -180,9 +181,9 @@ int ip_call_ra_chain(struct sk_buff *skb)
if (last) {
raw_rcv(last, skb);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static int ip_local_deliver_finish(struct sk_buff *skb)
@@ -209,9 +210,8 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
int ret;
if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
- if (net_ratelimit())
- printk("%s: proto %d isn't netns-ready\n",
- __func__, protocol);
+ net_info_ratelimited("%s: proto %d isn't netns-ready\n",
+ __func__, protocol);
kfree_skb(skb);
goto out;
}
@@ -265,7 +265,7 @@ int ip_local_deliver(struct sk_buff *skb)
ip_local_deliver_finish);
}
-static inline int ip_rcv_options(struct sk_buff *skb)
+static inline bool ip_rcv_options(struct sk_buff *skb)
{
struct ip_options *opt;
const struct iphdr *iph;
@@ -297,10 +297,10 @@ static inline int ip_rcv_options(struct sk_buff *skb)
if (in_dev) {
if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) &&
- net_ratelimit())
- printk(KERN_INFO "source route option %pI4 -> %pI4\n",
- &iph->saddr, &iph->daddr);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_info_ratelimited("source route option %pI4 -> %pI4\n",
+ &iph->saddr,
+ &iph->daddr);
goto drop;
}
}
@@ -309,9 +309,9 @@ static inline int ip_rcv_options(struct sk_buff *skb)
goto drop;
}
- return 0;
+ return false;
drop:
- return -1;
+ return true;
}
static int ip_rcv_finish(struct sk_buff *skb)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 42dd1a90ede..708b99494e2 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -208,10 +210,10 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
* Simple and stupid 8), but the most efficient way.
*/
-void ip_options_fragment(struct sk_buff * skb)
+void ip_options_fragment(struct sk_buff *skb)
{
unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
int l = opt->optlen;
int optlen;
@@ -246,13 +248,13 @@ void ip_options_fragment(struct sk_buff * skb)
*/
int ip_options_compile(struct net *net,
- struct ip_options * opt, struct sk_buff * skb)
+ struct ip_options *opt, struct sk_buff *skb)
{
int l;
- unsigned char * iph;
- unsigned char * optptr;
+ unsigned char *iph;
+ unsigned char *optptr;
int optlen;
- unsigned char * pp_ptr = NULL;
+ unsigned char *pp_ptr = NULL;
struct rtable *rt = NULL;
if (skb != NULL) {
@@ -411,7 +413,7 @@ int ip_options_compile(struct net *net,
opt->is_changed = 1;
}
} else {
- unsigned overflow = optptr[3]>>4;
+ unsigned int overflow = optptr[3]>>4;
if (overflow == 15) {
pp_ptr = optptr + 3;
goto error;
@@ -471,20 +473,20 @@ EXPORT_SYMBOL(ip_options_compile);
* Undo all the changes done by ip_options_compile().
*/
-void ip_options_undo(struct ip_options * opt)
+void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
- unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
memmove(optptr+7, optptr+3, optptr[1]-7);
memcpy(optptr+3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
- unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
}
if (opt->ts) {
- unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
if (opt->ts_needtime) {
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
@@ -547,8 +549,8 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp,
void ip_forward_options(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
- unsigned char * optptr;
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ unsigned char *optptr;
struct rtable *rt = skb_rtable(skb);
unsigned char *raw = skb_network_header(skb);
@@ -576,8 +578,10 @@ void ip_forward_options(struct sk_buff *skb)
ip_hdr(skb)->daddr = opt->nexthop;
ip_rt_get_source(&optptr[srrptr-1], skb, rt);
optptr[2] = srrptr+4;
- } else if (net_ratelimit())
- printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+ } else {
+ net_crit_ratelimited("%s(): Argh! Destination lost!\n",
+ __func__);
+ }
if (opt->ts_needaddr) {
optptr = raw + opt->ts;
ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ff302bde889..451f97c42eb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -43,7 +43,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -215,8 +214,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)
}
rcu_read_unlock();
- if (net_ratelimit())
- printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+ net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+ __func__);
kfree_skb(skb);
return -EINVAL;
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8aa87c19fa0..0d11f234d61 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -90,7 +90,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
- struct ip_options * opt = (struct ip_options *)optbuf;
+ struct ip_options *opt = (struct ip_options *)optbuf;
if (IPCB(skb)->opt.optlen == 0)
return;
@@ -147,7 +147,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
struct inet_sock *inet = inet_sk(skb->sk);
- unsigned flags = inet->cmsg_flags;
+ unsigned int flags = inet->cmsg_flags;
/* Ordered by supposed usage frequency */
if (flags & 1)
@@ -445,11 +445,6 @@ out:
}
-static void opt_kfree_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct ip_options_rcu, rcu));
-}
-
/*
* Socket option code for IP. This is the end of the line after any
* TCP,UDP etc options on an IP socket.
@@ -469,6 +464,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
(1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
(1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
(1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
+ optname == IP_UNICAST_IF ||
optname == IP_MULTICAST_TTL ||
optname == IP_MULTICAST_ALL ||
optname == IP_MULTICAST_LOOP ||
@@ -525,7 +521,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
rcu_assign_pointer(inet->inet_opt, opt);
if (old)
- call_rcu(&old->rcu, opt_kfree_rcu);
+ kfree_rcu(old, rcu);
break;
}
case IP_PKTINFO:
@@ -628,6 +624,35 @@ static int do_ip_setsockopt(struct sock *sk, int level,
goto e_inval;
inet->mc_loop = !!val;
break;
+ case IP_UNICAST_IF:
+ {
+ struct net_device *dev = NULL;
+ int ifindex;
+
+ if (optlen != sizeof(int))
+ goto e_inval;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (ifindex == 0) {
+ inet->uc_index = 0;
+ err = 0;
+ break;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+ dev_put(dev);
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if)
+ break;
+
+ inet->uc_index = ifindex;
+ err = 0;
+ break;
+ }
case IP_MULTICAST_IF:
{
struct ip_mreqn mreq;
@@ -648,10 +673,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
break;
} else {
memset(&mreq, 0, sizeof(mreq));
- if (optlen >= sizeof(struct in_addr) &&
- copy_from_user(&mreq.imr_address, optval,
- sizeof(struct in_addr)))
- break;
+ if (optlen >= sizeof(struct ip_mreq)) {
+ if (copy_from_user(&mreq, optval,
+ sizeof(struct ip_mreq)))
+ break;
+ } else if (optlen >= sizeof(struct in_addr)) {
+ if (copy_from_user(&mreq.imr_address, optval,
+ sizeof(struct in_addr)))
+ break;
+ }
}
if (!mreq.imr_ifindex) {
@@ -1069,7 +1099,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
*/
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen, unsigned flags)
+ char __user *optval, int __user *optlen, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
int val;
@@ -1178,6 +1208,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MULTICAST_LOOP:
val = inet->mc_loop;
break;
+ case IP_UNICAST_IF:
+ val = (__force int)htonl((__u32) inet->uc_index);
+ break;
case IP_MULTICAST_IF:
{
struct in_addr addr;
@@ -1256,6 +1289,10 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
int hlim = inet->mc_ttl;
put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
}
+ if (inet->cmsg_flags & IP_CMSG_TOS) {
+ int tos = inet->rcv_tos;
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
len -= msg.msg_controllen;
return put_user(len, optlen);
}
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index c857f6f49b0..63b64c45a82 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -156,11 +156,11 @@ static const struct net_protocol ipcomp4_protocol = {
static int __init ipcomp4_init(void)
{
if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
- printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
- printk(KERN_INFO "ipcomp init: can't add protocol\n");
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ipcomp_type, AF_INET);
return -EAGAIN;
}
@@ -170,9 +170,9 @@ static int __init ipcomp4_init(void)
static void __exit ipcomp4_fini(void)
{
if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
- printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
- printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(ipcomp4_init);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 6e412a60a91..67e8a6b086e 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -214,7 +214,7 @@ static int __init ic_open_devs(void)
if (!(dev->flags & IFF_LOOPBACK))
continue;
if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
- printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ pr_err("IP-Config: Failed to open %s\n", dev->name);
}
for_each_netdev(&init_net, dev) {
@@ -223,7 +223,8 @@ static int __init ic_open_devs(void)
if (dev->mtu >= 364)
able |= IC_BOOTP;
else
- printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
+ pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small",
+ dev->name, dev->mtu);
if (!(dev->flags & IFF_NOARP))
able |= IC_RARP;
able &= ic_proto_enabled;
@@ -231,7 +232,8 @@ static int __init ic_open_devs(void)
continue;
oflags = dev->flags;
if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
- printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ pr_err("IP-Config: Failed to open %s\n",
+ dev->name);
continue;
}
if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
@@ -273,9 +275,10 @@ have_carrier:
if (!ic_first_dev) {
if (user_dev_name[0])
- printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+ pr_err("IP-Config: Device `%s' not found\n",
+ user_dev_name);
else
- printk(KERN_ERR "IP-Config: No network devices available.\n");
+ pr_err("IP-Config: No network devices available\n");
return -ENODEV;
}
return 0;
@@ -359,17 +362,20 @@ static int __init ic_setup_if(void)
strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
set_sockaddr(sin, ic_myaddr, 0);
if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
+ pr_err("IP-Config: Unable to set interface address (%d)\n",
+ err);
return -1;
}
set_sockaddr(sin, ic_netmask, 0);
if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
+ pr_err("IP-Config: Unable to set interface netmask (%d)\n",
+ err);
return -1;
}
set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
+ pr_err("IP-Config: Unable to set interface broadcast address (%d)\n",
+ err);
return -1;
}
/* Handle the case where we need non-standard MTU on the boot link (a network
@@ -380,8 +386,8 @@ static int __init ic_setup_if(void)
strcpy(ir.ifr_name, ic_dev->name);
ir.ifr_mtu = ic_dev_mtu;
if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
- printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n",
- ic_dev_mtu, err);
+ pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
+ ic_dev_mtu, err);
}
return 0;
}
@@ -396,7 +402,7 @@ static int __init ic_setup_routes(void)
memset(&rm, 0, sizeof(rm));
if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
- printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+ pr_err("IP-Config: Gateway not on directly connected network\n");
return -1;
}
set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
@@ -404,7 +410,8 @@ static int __init ic_setup_routes(void)
set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
rm.rt_flags = RTF_UP | RTF_GATEWAY;
if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
- printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
+ pr_err("IP-Config: Cannot add default route (%d)\n",
+ err);
return -1;
}
}
@@ -437,8 +444,8 @@ static int __init ic_defaults(void)
else if (IN_CLASSC(ntohl(ic_myaddr)))
ic_netmask = htonl(IN_CLASSC_NET);
else {
- printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n",
- &ic_myaddr);
+ pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
+ &ic_myaddr);
return -1;
}
printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
@@ -688,8 +695,8 @@ ic_dhcp_init_options(u8 *options)
e += len;
}
if (*vendor_class_identifier) {
- printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
- vendor_class_identifier);
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
*e++ = 60; /* Class-identifier */
len = strlen(vendor_class_identifier);
*e++ = len;
@@ -801,8 +808,6 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
b->op = BOOTP_REQUEST;
if (dev->type < 256) /* check for false types */
b->htype = dev->type;
- else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
- b->htype = ARPHRD_IEEE802;
else if (dev->type == ARPHRD_FDDI)
b->htype = ARPHRD_ETHER;
else {
@@ -948,9 +953,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Fragments are not supported */
if (ip_is_fragment(h)) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
- "reply.\n");
+ net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
goto drop;
}
@@ -998,17 +1001,14 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Is it a reply to our BOOTP request? */
if (b->op != BOOTP_REPLY ||
b->xid != d->xid) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
- "op[%x] xid[%x]\n",
- b->op, b->xid);
+ net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
+ b->op, b->xid);
goto drop_unlock;
}
/* Is it a reply for the device we are configuring? */
if (b->xid != ic_dev_xid) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
+ net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
goto drop_unlock;
}
@@ -1146,17 +1146,17 @@ static int __init ic_dynamic(void)
* are missing, and without DHCP/BOOTP/RARP we are unable to get it.
*/
if (!ic_proto_enabled) {
- printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ pr_err("IP-Config: Incomplete network configuration information\n");
return -1;
}
#ifdef IPCONFIG_BOOTP
if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
- printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
+ pr_err("DHCP/BOOTP: No suitable device found\n");
#endif
#ifdef IPCONFIG_RARP
if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
- printk(KERN_ERR "RARP: No suitable device found.\n");
+ pr_err("RARP: No suitable device found\n");
#endif
if (!ic_proto_have_if)
@@ -1183,17 +1183,17 @@ static int __init ic_dynamic(void)
* [Actually we could now, but the nothing else running note still
* applies.. - AC]
*/
- printk(KERN_NOTICE "Sending %s%s%s requests .",
- do_bootp
- ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
- (do_bootp && do_rarp) ? " and " : "",
- do_rarp ? "RARP" : "");
+ pr_notice("Sending %s%s%s requests .",
+ do_bootp
+ ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+ (do_bootp && do_rarp) ? " and " : "",
+ do_rarp ? "RARP" : "");
start_jiffies = jiffies;
d = ic_first_dev;
retries = CONF_SEND_RETRIES;
get_random_bytes(&timeout, sizeof(timeout));
- timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
for (;;) {
/* Track the device we are configuring */
ic_dev_xid = d->xid;
@@ -1216,13 +1216,13 @@ static int __init ic_dynamic(void)
(ic_proto_enabled & IC_USE_DHCP) &&
ic_dhcp_msgtype != DHCPACK) {
ic_got_reply = 0;
- printk(KERN_CONT ",");
+ pr_cont(",");
continue;
}
#endif /* IPCONFIG_DHCP */
if (ic_got_reply) {
- printk(KERN_CONT " OK\n");
+ pr_cont(" OK\n");
break;
}
@@ -1230,7 +1230,7 @@ static int __init ic_dynamic(void)
continue;
if (! --retries) {
- printk(KERN_CONT " timed out!\n");
+ pr_cont(" timed out!\n");
break;
}
@@ -1240,7 +1240,7 @@ static int __init ic_dynamic(void)
if (timeout > CONF_TIMEOUT_MAX)
timeout = CONF_TIMEOUT_MAX;
- printk(KERN_CONT ".");
+ pr_cont(".");
}
#ifdef IPCONFIG_BOOTP
@@ -1260,8 +1260,8 @@ static int __init ic_dynamic(void)
printk("IP-Config: Got %s answer from %pI4, ",
((ic_got_reply & IC_RARP) ? "RARP"
: (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
- &ic_servaddr);
- printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
+ &ic_servaddr);
+ pr_cont("my address is %pI4\n", &ic_myaddr);
return 0;
}
@@ -1437,24 +1437,22 @@ static int __init ip_auto_config(void)
*/
#ifdef CONFIG_ROOT_NFS
if (ROOT_DEV == Root_NFS) {
- printk(KERN_ERR
- "IP-Config: Retrying forever (NFS root)...\n");
+ pr_err("IP-Config: Retrying forever (NFS root)...\n");
goto try_try_again;
}
#endif
if (--retries) {
- printk(KERN_ERR
- "IP-Config: Reopening network devices...\n");
+ pr_err("IP-Config: Reopening network devices...\n");
goto try_try_again;
}
/* Oh, well. At least we tried. */
- printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
+ pr_err("IP-Config: Auto-configuration of network failed\n");
return -1;
}
#else /* !DYNAMIC */
- printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ pr_err("IP-Config: Incomplete network configuration information\n");
ic_close_devs();
return -1;
#endif /* IPCONFIG_DYNAMIC */
@@ -1492,19 +1490,16 @@ static int __init ip_auto_config(void)
/*
* Clue in the operator.
*/
- printk("IP-Config: Complete:\n");
- printk(" device=%s", ic_dev->name);
- printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
- printk(KERN_CONT ", mask=%pI4", &ic_netmask);
- printk(KERN_CONT ", gw=%pI4", &ic_gateway);
- printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
- utsname()->nodename, ic_domain, utsname()->domainname);
- printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
- printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
- printk(KERN_CONT ", rootpath=%s", root_server_path);
+ pr_info("IP-Config: Complete:\n");
+ pr_info(" device=%s, addr=%pI4, mask=%pI4, gw=%pI4\n",
+ ic_dev->name, &ic_myaddr, &ic_netmask, &ic_gateway);
+ pr_info(" host=%s, domain=%s, nis-domain=%s\n",
+ utsname()->nodename, ic_domain, utsname()->domainname);
+ pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s",
+ &ic_servaddr, &root_server_addr, root_server_path);
if (ic_dev_mtu)
- printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
- printk(KERN_CONT "\n");
+ pr_cont(", mtu=%d", ic_dev_mtu);
+ pr_cont("\n");
#endif /* !SILENT */
return 0;
@@ -1626,22 +1621,21 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+__setup("ip=", ip_auto_config_setup);
static int __init nfsaddrs_config_setup(char *addrs)
{
return ip_auto_config_setup(addrs);
}
+__setup("nfsaddrs=", nfsaddrs_config_setup);
static int __init vendor_class_identifier_setup(char *addrs)
{
if (strlcpy(vendor_class_identifier, addrs,
sizeof(vendor_class_identifier))
>= sizeof(vendor_class_identifier))
- printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"",
- vendor_class_identifier);
+ pr_warn("DHCP: vendorclass too long, truncated to \"%s\"",
+ vendor_class_identifier);
return 1;
}
-
-__setup("ip=", ip_auto_config_setup);
-__setup("nfsaddrs=", nfsaddrs_config_setup);
__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 22a19931530..2d0f99bf61b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -144,33 +144,48 @@ static void ipip_dev_free(struct net_device *dev);
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long tx_packets;
- unsigned long tx_bytes;
-} __attribute__((aligned(4*sizeof(unsigned long))));
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+};
-static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
{
- struct pcpu_tstats sum = { 0 };
int i;
for_each_possible_cpu(i) {
const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
-
- sum.rx_packets += tstats->rx_packets;
- sum.rx_bytes += tstats->rx_bytes;
- sum.tx_packets += tstats->tx_packets;
- sum.tx_bytes += tstats->tx_bytes;
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
}
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
+
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+ tot->collisions = dev->stats.collisions;
+
+ return tot;
}
-static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
+static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
{
unsigned int h0 = HASH(remote);
@@ -245,7 +260,7 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
rcu_assign_pointer(*tp, t);
}
-static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
struct ip_tunnel_parm *parms, int create)
{
__be32 remote = parms->iph.daddr;
@@ -404,8 +419,10 @@ static int ipip_rcv(struct sk_buff *skb)
skb->pkt_type = PACKET_HOST;
tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
tstats->rx_packets++;
tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
__skb_tunnel_rx(skb, tunnel->dev);
@@ -454,8 +471,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
dev->stats.tx_fifo_errors++;
goto tx_error;
}
- if ((dst = rt->rt_gateway) == 0)
- goto tx_error_icmp;
+ dst = rt->rt_gateway;
}
rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
@@ -731,7 +747,7 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
.ndo_change_mtu = ipip_tunnel_change_mtu,
- .ndo_get_stats = ipip_get_stats,
+ .ndo_get_stats64 = ipip_get_stats64,
};
static void ipip_dev_free(struct net_device *dev)
@@ -893,7 +909,7 @@ static int __init ipip_init(void)
err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
if (err < 0) {
unregister_pernet_device(&ipip_net_ops);
- printk(KERN_INFO "ipip init: can't register tunnel\n");
+ pr_info("%s: can't register tunnel\n", __func__);
}
return err;
}
@@ -901,7 +917,7 @@ static int __init ipip_init(void)
static void __exit ipip_fini(void)
{
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
- printk(KERN_INFO "ipip close: can't deregister tunnel\n");
+ pr_info("%s: can't deregister tunnel\n", __func__);
unregister_pernet_device(&ipip_net_ops);
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7bc2db6db8d..a9e519ad6db 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,7 +26,6 @@
*
*/
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
@@ -950,8 +949,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
ret = sock_queue_rcv_skb(mroute_sk, skb);
rcu_read_unlock();
if (ret < 0) {
- if (net_ratelimit())
- printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
+ net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
kfree_skb(skb);
}
@@ -2120,15 +2118,16 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
rtm->rtm_src_len = 32;
rtm->rtm_tos = 0;
rtm->rtm_table = mrt->id;
- NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
rtm->rtm_type = RTN_MULTICAST;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = RTPROT_UNSPEC;
rtm->rtm_flags = 0;
- NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
- NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
-
+ if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) ||
+ nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp))
+ goto nla_put_failure;
if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
goto nla_put_failure;
@@ -2538,7 +2537,7 @@ int __init ip_mr_init(void)
goto reg_notif_fail;
#ifdef CONFIG_IP_PIMSM_V2
if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
- printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
+ pr_err("%s: can't add PIM protocol\n", __func__);
err = -EAGAIN;
goto add_proto_fail;
}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4f47e064e26..ed1b3678319 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -12,7 +12,7 @@
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
+int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
{
struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
@@ -237,13 +237,3 @@ static void ipv4_netfilter_fini(void)
module_init(ipv4_netfilter_init);
module_exit(ipv4_netfilter_fini);
-
-#ifdef CONFIG_SYSCTL
-struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "netfilter", },
- { }
-};
-EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
-#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 74dfc9e5211..fcc543cd987 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -123,15 +123,6 @@ config IP_NF_TARGET_REJECT
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_LOG
- tristate "LOG target support"
- default m if NETFILTER_ADVANCED=n
- help
- This option adds a `LOG' target, which allows you to create rules in
- any iptables table which records the packet header to the syslog.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_TARGET_ULOG
tristate "ULOG target support"
default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 213a462b739..c20674dc945 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -54,7 +54,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
# targets
obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
-obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
@@ -67,6 +66,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
# just filtering instance of ARP tables for now
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
-
-obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
-
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index fd7a3f68917..97e61eadf58 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -221,9 +221,8 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
static unsigned int
arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- pr_err("arp_tables: error: '%s'\n",
- (const char *)par->targinfo);
+ net_err_ratelimited("arp_tables: error: '%s'\n",
+ (const char *)par->targinfo);
return NF_DROP;
}
@@ -303,7 +302,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
- verdict = (unsigned)(-v) - 1;
+ verdict = (unsigned int)(-v) - 1;
break;
}
e = back;
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
deleted file mode 100644
index 94d45e1f888..00000000000
--- a/net/ipv4/netfilter/ip_queue.c
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
- * This is a module which is used for queueing IPv4 packets and
- * communicating with userspace via netlink.
- *
- * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
- * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/security.h>
-#include <linux/net.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <net/net_namespace.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <net/netfilter/nf_queue.h>
-#include <net/ip.h>
-
-#define IPQ_QMAX_DEFAULT 1024
-#define IPQ_PROC_FS_NAME "ip_queue"
-#define NET_IPQ_QMAX 2088
-#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-
-typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
-
-static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
-static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_SPINLOCK(queue_lock);
-static int peer_pid __read_mostly;
-static unsigned int copy_range __read_mostly;
-static unsigned int queue_total;
-static unsigned int queue_dropped = 0;
-static unsigned int queue_user_dropped = 0;
-static struct sock *ipqnl __read_mostly;
-static LIST_HEAD(queue_list);
-static DEFINE_MUTEX(ipqnl_mutex);
-
-static inline void
-__ipq_enqueue_entry(struct nf_queue_entry *entry)
-{
- list_add_tail(&entry->list, &queue_list);
- queue_total++;
-}
-
-static inline int
-__ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch(mode) {
- case IPQ_COPY_NONE:
- case IPQ_COPY_META:
- copy_mode = mode;
- copy_range = 0;
- break;
-
- case IPQ_COPY_PACKET:
- if (range > 0xFFFF)
- range = 0xFFFF;
- copy_range = range;
- copy_mode = mode;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
-
-static inline void
-__ipq_reset(void)
-{
- peer_pid = 0;
- net_disable_timestamp();
- __ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NULL, 0);
-}
-
-static struct nf_queue_entry *
-ipq_find_dequeue_entry(unsigned long id)
-{
- struct nf_queue_entry *entry = NULL, *i;
-
- spin_lock_bh(&queue_lock);
-
- list_for_each_entry(i, &queue_list, list) {
- if ((unsigned long)i == id) {
- entry = i;
- break;
- }
- }
-
- if (entry) {
- list_del(&entry->list);
- queue_total--;
- }
-
- spin_unlock_bh(&queue_lock);
- return entry;
-}
-
-static void
-__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct nf_queue_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &queue_list, list) {
- if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue_total--;
- nf_reinject(entry, NF_DROP);
- }
- }
-}
-
-static void
-ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- spin_lock_bh(&queue_lock);
- __ipq_flush(cmpfn, data);
- spin_unlock_bh(&queue_lock);
-}
-
-static struct sk_buff *
-ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
-{
- sk_buff_data_t old_tail;
- size_t size = 0;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct ipq_packet_msg *pmsg;
- struct nlmsghdr *nlh;
- struct timeval tv;
-
- switch (ACCESS_ONCE(copy_mode)) {
- case IPQ_COPY_META:
- case IPQ_COPY_NONE:
- size = NLMSG_SPACE(sizeof(*pmsg));
- break;
-
- case IPQ_COPY_PACKET:
- if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
- (*errp = skb_checksum_help(entry->skb)))
- return NULL;
-
- data_len = ACCESS_ONCE(copy_range);
- if (data_len == 0 || data_len > entry->skb->len)
- data_len = entry->skb->len;
-
- size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
- break;
-
- default:
- *errp = -EINVAL;
- return NULL;
- }
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail = skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
- pmsg = NLMSG_DATA(nlh);
- memset(pmsg, 0, sizeof(*pmsg));
-
- pmsg->packet_id = (unsigned long )entry;
- pmsg->data_len = data_len;
- tv = ktime_to_timeval(entry->skb->tstamp);
- pmsg->timestamp_sec = tv.tv_sec;
- pmsg->timestamp_usec = tv.tv_usec;
- pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->hook;
- pmsg->hw_protocol = entry->skb->protocol;
-
- if (entry->indev)
- strcpy(pmsg->indev_name, entry->indev->name);
- else
- pmsg->indev_name[0] = '\0';
-
- if (entry->outdev)
- strcpy(pmsg->outdev_name, entry->outdev->name);
- else
- pmsg->outdev_name[0] = '\0';
-
- if (entry->indev && entry->skb->dev &&
- entry->skb->mac_header != entry->skb->network_header) {
- pmsg->hw_type = entry->skb->dev->type;
- pmsg->hw_addrlen = dev_parse_header(entry->skb,
- pmsg->hw_addr);
- }
-
- if (data_len)
- if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
- BUG();
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- kfree_skb(skb);
- *errp = -EINVAL;
- printk(KERN_ERR "ip_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
-
- if (copy_mode == IPQ_COPY_NONE)
- return -EAGAIN;
-
- nskb = ipq_build_packet_message(entry, &status);
- if (nskb == NULL)
- return status;
-
- spin_lock_bh(&queue_lock);
-
- if (!peer_pid)
- goto err_out_free_nskb;
-
- if (queue_total >= queue_maxlen) {
- queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk (KERN_WARNING "ip_queue: full at %d entries, "
- "dropping packets(s). Dropped: %d\n", queue_total,
- queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* netlink_unicast will either free the nskb or attach it to a socket */
- status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __ipq_enqueue_entry(entry);
-
- spin_unlock_bh(&queue_lock);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
-{
- int diff;
- struct iphdr *user_iph = (struct iphdr *)v->payload;
- struct sk_buff *nskb;
-
- if (v->data_len < sizeof(*user_iph))
- return 0;
- diff = v->data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, v->data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (v->data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
- diff, GFP_ATOMIC);
- if (!nskb) {
- printk(KERN_WARNING "ip_queue: error "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- kfree_skb(e->skb);
- e->skb = nskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(e->skb, v->data_len))
- return -ENOMEM;
- skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
-
- return 0;
-}
-
-static int
-ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
-{
- struct nf_queue_entry *entry;
-
- if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN)
- return -EINVAL;
-
- entry = ipq_find_dequeue_entry(vmsg->id);
- if (entry == NULL)
- return -ENOENT;
- else {
- int verdict = vmsg->value;
-
- if (vmsg->data_len && vmsg->data_len == len)
- if (ipq_mangle_ipv4(vmsg, entry) < 0)
- verdict = NF_DROP;
-
- nf_reinject(entry, verdict);
- return 0;
- }
-}
-
-static int
-ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status;
-
- spin_lock_bh(&queue_lock);
- status = __ipq_set_mode(mode, range);
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_receive_peer(struct ipq_peer_msg *pmsg,
- unsigned char type, unsigned int len)
-{
- int status = 0;
-
- if (len < sizeof(*pmsg))
- return -EINVAL;
-
- switch (type) {
- case IPQM_MODE:
- status = ipq_set_mode(pmsg->msg.mode.value,
- pmsg->msg.mode.range);
- break;
-
- case IPQM_VERDICT:
- status = ipq_set_verdict(&pmsg->msg.verdict,
- len - sizeof(*pmsg));
- break;
- default:
- status = -EINVAL;
- }
- return status;
-}
-
-static int
-dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->indev)
- if (entry->indev->ifindex == ifindex)
- return 1;
- if (entry->outdev)
- if (entry->outdev->ifindex == ifindex)
- return 1;
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (entry->skb->nf_bridge) {
- if (entry->skb->nf_bridge->physindev &&
- entry->skb->nf_bridge->physindev->ifindex == ifindex)
- return 1;
- if (entry->skb->nf_bridge->physoutdev &&
- entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
- return 1;
- }
-#endif
- return 0;
-}
-
-static void
-ipq_dev_drop(int ifindex)
-{
- ipq_flush(dev_cmp, ifindex);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void
-__ipq_rcv_skb(struct sk_buff *skb)
-{
- int status, type, pid, flags;
- unsigned int nlmsglen, skblen;
- struct nlmsghdr *nlh;
- bool enable_timestamp = false;
-
- skblen = skb->len;
- if (skblen < sizeof(*nlh))
- return;
-
- nlh = nlmsg_hdr(skb);
- nlmsglen = nlh->nlmsg_len;
- if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
- return;
-
- pid = nlh->nlmsg_pid;
- flags = nlh->nlmsg_flags;
-
- if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
- RCV_SKB_FAIL(-EINVAL);
-
- if (flags & MSG_TRUNC)
- RCV_SKB_FAIL(-ECOMM);
-
- type = nlh->nlmsg_type;
- if (type < NLMSG_NOOP || type >= IPQM_MAX)
- RCV_SKB_FAIL(-EINVAL);
-
- if (type <= IPQM_BASE)
- return;
-
- if (!capable(CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- spin_lock_bh(&queue_lock);
-
- if (peer_pid) {
- if (peer_pid != pid) {
- spin_unlock_bh(&queue_lock);
- RCV_SKB_FAIL(-EBUSY);
- }
- } else {
- enable_timestamp = true;
- peer_pid = pid;
- }
-
- spin_unlock_bh(&queue_lock);
- if (enable_timestamp)
- net_enable_timestamp();
- status = ipq_receive_peer(NLMSG_DATA(nlh), type,
- nlmsglen - NLMSG_LENGTH(0));
- if (status < 0)
- RCV_SKB_FAIL(status);
-
- if (flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
-}
-
-static void
-ipq_rcv_skb(struct sk_buff *skb)
-{
- mutex_lock(&ipqnl_mutex);
- __ipq_rcv_skb(skb);
- mutex_unlock(&ipqnl_mutex);
-}
-
-static int
-ipq_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- ipq_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_dev_notifier = {
- .notifier_call = ipq_rcv_dev_event,
-};
-
-static int
-ipq_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
- spin_lock_bh(&queue_lock);
- if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
- __ipq_reset();
- spin_unlock_bh(&queue_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_nl_notifier = {
- .notifier_call = ipq_rcv_nl_event,
-};
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *ipq_sysctl_header;
-
-static ctl_table ipq_table[] = {
- {
- .procname = NET_IPQ_QMAX_NAME,
- .data = &queue_maxlen,
- .maxlen = sizeof(queue_maxlen),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- { }
-};
-#endif
-
-#ifdef CONFIG_PROC_FS
-static int ip_queue_show(struct seq_file *m, void *v)
-{
- spin_lock_bh(&queue_lock);
-
- seq_printf(m,
- "Peer PID : %d\n"
- "Copy mode : %hu\n"
- "Copy range : %u\n"
- "Queue length : %u\n"
- "Queue max. length : %u\n"
- "Queue dropped : %u\n"
- "Netlink dropped : %u\n",
- peer_pid,
- copy_mode,
- copy_range,
- queue_total,
- queue_maxlen,
- queue_dropped,
- queue_user_dropped);
-
- spin_unlock_bh(&queue_lock);
- return 0;
-}
-
-static int ip_queue_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ip_queue_show, NULL);
-}
-
-static const struct file_operations ip_queue_proc_fops = {
- .open = ip_queue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
-};
-#endif
-
-static const struct nf_queue_handler nfqh = {
- .name = "ip_queue",
- .outfn = &ipq_enqueue_packet,
-};
-
-static int __init ip_queue_init(void)
-{
- int status = -ENOMEM;
- struct proc_dir_entry *proc __maybe_unused;
-
- netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
- ipq_rcv_skb, NULL, THIS_MODULE);
- if (ipqnl == NULL) {
- printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
-#ifdef CONFIG_PROC_FS
- proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
- &ip_queue_proc_fops);
- if (!proc) {
- printk(KERN_ERR "ip_queue: failed to create proc entry\n");
- goto cleanup_ipqnl;
- }
-#endif
- register_netdevice_notifier(&ipq_dev_notifier);
-#ifdef CONFIG_SYSCTL
- ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table);
-#endif
- status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh);
- if (status < 0) {
- printk(KERN_ERR "ip_queue: failed to register queue handler\n");
- goto cleanup_sysctl;
- }
- return status;
-
-cleanup_sysctl:
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-cleanup_ipqnl: __maybe_unused
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&ipq_nl_notifier);
- return status;
-}
-
-static void __exit ip_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
-
- ipq_flush(NULL, 0);
-
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
- netlink_unregister_notifier(&ipq_nl_notifier);
-}
-
-MODULE_DESCRIPTION("IPv4 packet queue handler");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL);
-
-module_init(ip_queue_init);
-module_exit(ip_queue_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 24e556e83a3..170b1fdd6b7 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -153,8 +153,7 @@ ip_checkentry(const struct ipt_ip *ip)
static unsigned int
ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- pr_info("error: `%s'\n", (const char *)par->targinfo);
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
return NF_DROP;
}
@@ -377,7 +376,7 @@ ipt_do_table(struct sk_buff *skb,
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
- verdict = (unsigned)(-v) - 1;
+ verdict = (unsigned int)(-v) - 1;
break;
}
if (*stackptr <= origptr) {
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index a639967eb72..fe5daea5214 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -246,8 +246,7 @@ clusterip_hashfn(const struct sk_buff *skb,
dport = ports[1];
}
} else {
- if (net_ratelimit())
- pr_info("unknown protocol %u\n", iph->protocol);
+ net_info_ratelimited("unknown protocol %u\n", iph->protocol);
}
switch (config->hash_mode) {
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
deleted file mode 100644
index d76d6c9ed94..00000000000
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * This is a module which is used for logging packets.
- */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_LOG.h>
-#include <net/netfilter/nf_log.h>
-#include <net/netfilter/xt_log.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
-
-/* One level of recursion won't kill us */
-static void dump_packet(struct sbuff *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb,
- unsigned int iphoff)
-{
- struct iphdr _iph;
- const struct iphdr *ih;
- unsigned int logflags;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
- else
- logflags = NF_LOG_MASK;
-
- ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
- if (ih == NULL) {
- sb_add(m, "TRUNCATED");
- return;
- }
-
- /* Important fields:
- * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
- /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- sb_add(m, "SRC=%pI4 DST=%pI4 ",
- &ih->saddr, &ih->daddr);
-
- /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
- ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
- ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
-
- /* Max length: 6 "CE DF MF " */
- if (ntohs(ih->frag_off) & IP_CE)
- sb_add(m, "CE ");
- if (ntohs(ih->frag_off) & IP_DF)
- sb_add(m, "DF ");
- if (ntohs(ih->frag_off) & IP_MF)
- sb_add(m, "MF ");
-
- /* Max length: 11 "FRAG:65535 " */
- if (ntohs(ih->frag_off) & IP_OFFSET)
- sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
-
- if ((logflags & IPT_LOG_IPOPT) &&
- ih->ihl * 4 > sizeof(struct iphdr)) {
- const unsigned char *op;
- unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
- unsigned int i, optsize;
-
- optsize = ih->ihl * 4 - sizeof(struct iphdr);
- op = skb_header_pointer(skb, iphoff+sizeof(_iph),
- optsize, _opt);
- if (op == NULL) {
- sb_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- sb_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- sb_add(m, "%02X", op[i]);
- sb_add(m, ") ");
- }
-
- switch (ih->protocol) {
- case IPPROTO_TCP: {
- struct tcphdr _tcph;
- const struct tcphdr *th;
-
- /* Max length: 10 "PROTO=TCP " */
- sb_add(m, "PROTO=TCP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- sb_add(m, "SPT=%u DPT=%u ",
- ntohs(th->source), ntohs(th->dest));
- /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & IPT_LOG_TCPSEQ)
- sb_add(m, "SEQ=%u ACK=%u ",
- ntohl(th->seq), ntohl(th->ack_seq));
- /* Max length: 13 "WINDOW=65535 " */
- sb_add(m, "WINDOW=%u ", ntohs(th->window));
- /* Max length: 9 "RES=0x3F " */
- sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
- if (th->cwr)
- sb_add(m, "CWR ");
- if (th->ece)
- sb_add(m, "ECE ");
- if (th->urg)
- sb_add(m, "URG ");
- if (th->ack)
- sb_add(m, "ACK ");
- if (th->psh)
- sb_add(m, "PSH ");
- if (th->rst)
- sb_add(m, "RST ");
- if (th->syn)
- sb_add(m, "SYN ");
- if (th->fin)
- sb_add(m, "FIN ");
- /* Max length: 11 "URGP=65535 " */
- sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
-
- if ((logflags & IPT_LOG_TCPOPT) &&
- th->doff * 4 > sizeof(struct tcphdr)) {
- unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
- const unsigned char *op;
- unsigned int i, optsize;
-
- optsize = th->doff * 4 - sizeof(struct tcphdr);
- op = skb_header_pointer(skb,
- iphoff+ih->ihl*4+sizeof(_tcph),
- optsize, _opt);
- if (op == NULL) {
- sb_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- sb_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- sb_add(m, "%02X", op[i]);
- sb_add(m, ") ");
- }
- break;
- }
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE: {
- struct udphdr _udph;
- const struct udphdr *uh;
-
- if (ih->protocol == IPPROTO_UDP)
- /* Max length: 10 "PROTO=UDP " */
- sb_add(m, "PROTO=UDP " );
- else /* Max length: 14 "PROTO=UDPLITE " */
- sb_add(m, "PROTO=UDPLITE ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_udph), &_udph);
- if (uh == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- sb_add(m, "SPT=%u DPT=%u LEN=%u ",
- ntohs(uh->source), ntohs(uh->dest),
- ntohs(uh->len));
- break;
- }
- case IPPROTO_ICMP: {
- struct icmphdr _icmph;
- const struct icmphdr *ich;
- static const size_t required_len[NR_ICMP_TYPES+1]
- = { [ICMP_ECHOREPLY] = 4,
- [ICMP_DEST_UNREACH]
- = 8 + sizeof(struct iphdr),
- [ICMP_SOURCE_QUENCH]
- = 8 + sizeof(struct iphdr),
- [ICMP_REDIRECT]
- = 8 + sizeof(struct iphdr),
- [ICMP_ECHO] = 4,
- [ICMP_TIME_EXCEEDED]
- = 8 + sizeof(struct iphdr),
- [ICMP_PARAMETERPROB]
- = 8 + sizeof(struct iphdr),
- [ICMP_TIMESTAMP] = 20,
- [ICMP_TIMESTAMPREPLY] = 20,
- [ICMP_ADDRESS] = 12,
- [ICMP_ADDRESSREPLY] = 12 };
-
- /* Max length: 11 "PROTO=ICMP " */
- sb_add(m, "PROTO=ICMP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
- sizeof(_icmph), &_icmph);
- if (ich == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 18 "TYPE=255 CODE=255 " */
- sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- if (ich->type <= NR_ICMP_TYPES &&
- required_len[ich->type] &&
- skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- switch (ich->type) {
- case ICMP_ECHOREPLY:
- case ICMP_ECHO:
- /* Max length: 19 "ID=65535 SEQ=65535 " */
- sb_add(m, "ID=%u SEQ=%u ",
- ntohs(ich->un.echo.id),
- ntohs(ich->un.echo.sequence));
- break;
-
- case ICMP_PARAMETERPROB:
- /* Max length: 14 "PARAMETER=255 " */
- sb_add(m, "PARAMETER=%u ",
- ntohl(ich->un.gateway) >> 24);
- break;
- case ICMP_REDIRECT:
- /* Max length: 24 "GATEWAY=255.255.255.255 " */
- sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
- /* Fall through */
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_TIME_EXCEEDED:
- /* Max length: 3+maxlen */
- if (!iphoff) { /* Only recurse once. */
- sb_add(m, "[");
- dump_packet(m, info, skb,
- iphoff + ih->ihl*4+sizeof(_icmph));
- sb_add(m, "] ");
- }
-
- /* Max length: 10 "MTU=65535 " */
- if (ich->type == ICMP_DEST_UNREACH &&
- ich->code == ICMP_FRAG_NEEDED)
- sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
- }
- break;
- }
- /* Max Length */
- case IPPROTO_AH: {
- struct ip_auth_hdr _ahdr;
- const struct ip_auth_hdr *ah;
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 9 "PROTO=AH " */
- sb_add(m, "PROTO=AH ");
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_ahdr), &_ahdr);
- if (ah == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
- break;
- }
- case IPPROTO_ESP: {
- struct ip_esp_hdr _esph;
- const struct ip_esp_hdr *eh;
-
- /* Max length: 10 "PROTO=ESP " */
- sb_add(m, "PROTO=ESP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_esph), &_esph);
- if (eh == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
- break;
- }
- /* Max length: 10 "PROTO 255 " */
- default:
- sb_add(m, "PROTO=%u ", ih->protocol);
- }
-
- /* Max length: 15 "UID=4294967295 " */
- if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- sb_add(m, "UID=%u GID=%u ",
- skb->sk->sk_socket->file->f_cred->fsuid,
- skb->sk->sk_socket->file->f_cred->fsgid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
- }
-
- /* Max length: 16 "MARK=0xFFFFFFFF " */
- if (!iphoff && skb->mark)
- sb_add(m, "MARK=0x%x ", skb->mark);
-
- /* Proto Max log string length */
- /* IP: 40+46+6+11+127 = 230 */
- /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
- /* UDP: 10+max(25,20) = 35 */
- /* UDPLITE: 14+max(25,20) = 39 */
- /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
- /* ESP: 10+max(25)+15 = 50 */
- /* AH: 9+max(25)+15 = 49 */
- /* unknown: 10 */
-
- /* (ICMP allows recursion one level deep) */
- /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
- /* maxlen = 230+ 91 + 230 + 252 = 803 */
-}
-
-static void dump_mac_header(struct sbuff *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- unsigned int logflags = 0;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
-
- if (!(logflags & IPT_LOG_MACDECODE))
- goto fallback;
-
- switch (dev->type) {
- case ARPHRD_ETHER:
- sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
- ntohs(eth_hdr(skb)->h_proto));
- return;
- default:
- break;
- }
-
-fallback:
- sb_add(m, "MAC=");
- if (dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- const unsigned char *p = skb_mac_header(skb);
- unsigned int i;
-
- sb_add(m, "%02x", *p++);
- for (i = 1; i < dev->hard_header_len; i++, p++)
- sb_add(m, ":%02x", *p);
- }
- sb_add(m, " ");
-}
-
-static struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = 5,
- .logflags = NF_LOG_MASK,
- },
- },
-};
-
-static void
-ipt_log_packet(u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- struct sbuff *m = sb_open();
-
- if (!loginfo)
- loginfo = &default_loginfo;
-
- sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
- prefix,
- in ? in->name : "",
- out ? out->name : "");
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- const struct net_device *physindev;
- const struct net_device *physoutdev;
-
- physindev = skb->nf_bridge->physindev;
- if (physindev && in != physindev)
- sb_add(m, "PHYSIN=%s ", physindev->name);
- physoutdev = skb->nf_bridge->physoutdev;
- if (physoutdev && out != physoutdev)
- sb_add(m, "PHYSOUT=%s ", physoutdev->name);
- }
-#endif
-
- if (in != NULL)
- dump_mac_header(m, loginfo, skb);
-
- dump_packet(m, loginfo, skb, 0);
-
- sb_close(m);
-}
-
-static unsigned int
-log_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- const struct ipt_log_info *loginfo = par->targinfo;
- struct nf_loginfo li;
-
- li.type = NF_LOG_TYPE_LOG;
- li.u.log.level = loginfo->level;
- li.u.log.logflags = loginfo->logflags;
-
- ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li,
- loginfo->prefix);
- return XT_CONTINUE;
-}
-
-static int log_tg_check(const struct xt_tgchk_param *par)
-{
- const struct ipt_log_info *loginfo = par->targinfo;
-
- if (loginfo->level >= 8) {
- pr_debug("level %u >= 8\n", loginfo->level);
- return -EINVAL;
- }
- if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
- pr_debug("prefix is not null-terminated\n");
- return -EINVAL;
- }
- return 0;
-}
-
-static struct xt_target log_tg_reg __read_mostly = {
- .name = "LOG",
- .family = NFPROTO_IPV4,
- .target = log_tg,
- .targetsize = sizeof(struct ipt_log_info),
- .checkentry = log_tg_check,
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ipt_log_logger __read_mostly = {
- .name = "ipt_LOG",
- .logfn = &ipt_log_packet,
- .me = THIS_MODULE,
-};
-
-static int __init log_tg_init(void)
-{
- int ret;
-
- ret = xt_register_target(&log_tg_reg);
- if (ret < 0)
- return ret;
- nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
- return 0;
-}
-
-static void __exit log_tg_exit(void)
-{
- nf_log_unregister(&ipt_log_logger);
- xt_unregister_target(&log_tg_reg);
-}
-
-module_init(log_tg_init);
-module_exit(log_tg_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 0e58f09e59f..851acec852d 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -52,7 +52,7 @@ iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
-static bool forward = NF_ACCEPT;
+static bool forward = true;
module_param(forward, bool, 0000);
static int __net_init iptable_filter_net_init(struct net *net)
@@ -64,7 +64,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
return -ENOMEM;
/* Entry 1 is the FORWARD hook */
((struct ipt_standard *)repl->entries)[1].target.verdict =
- -forward - 1;
+ forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
net->ipv4.iptable_filter =
ipt_register_table(net, &packet_filter, repl);
@@ -88,11 +88,6 @@ static int __init iptable_filter_init(void)
{
int ret;
- if (forward < 0 || forward > NF_MAX_VERDICT) {
- pr_err("iptables forward must be 0 or 1\n");
- return -EINVAL;
- }
-
ret = register_pernet_subsys(&iptable_filter_net_ops);
if (ret < 0)
return ret;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index de9da21113a..91747d4ebc2 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -74,16 +74,24 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
if (iph == NULL)
- return -NF_DROP;
+ return -NF_ACCEPT;
/* Conntrack defragments packets, we might still see fragments
* inside ICMP packets though. */
if (iph->frag_off & htons(IP_OFFSET))
- return -NF_DROP;
+ return -NF_ACCEPT;
*dataoff = nhoff + (iph->ihl << 2);
*protonum = iph->protocol;
+ /* Check bogus IP headers */
+ if (*dataoff > skb->len) {
+ pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
+ "nhoff %u, ihl %u, skblen %u\n",
+ nhoff, iph->ihl << 2, skb->len);
+ return -NF_ACCEPT;
+ }
+
return NF_ACCEPT;
}
@@ -303,8 +311,9 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip);
- NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip);
+ if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -356,7 +365,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.nla_policy = ipv4_nla_policy,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path,
+ .ctl_table_path = "net/ipv4/netfilter",
.ctl_table = ip_ct_sysctl_table,
#endif
.me = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index ab5b27a2916..0847e373d33 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -75,25 +75,31 @@ static int icmp_print_tuple(struct seq_file *s,
ntohs(tuple->src.u.icmp.id));
}
+static unsigned int *icmp_get_timeouts(struct net *net)
+{
+ return &nf_ct_icmp_timeout;
+}
+
/* Returns verdict for packet, or -1 for invalid. */
static int icmp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
- unsigned int hooknum)
+ unsigned int hooknum,
+ unsigned int *timeout)
{
/* Do not immediately delete the connection after the first
successful reply to avoid excessive conntrackd traffic
and also to handle correctly ICMP echo reply duplicates. */
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
static const u_int8_t valid_new[] = {
[ICMP_ECHO] = 1,
@@ -222,10 +228,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
static int icmp_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *t)
{
- NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id);
- NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type);
- NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code);
-
+ if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -263,6 +269,44 @@ static int icmp_nlattr_tuple_size(void)
}
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data)
+{
+ unsigned int *timeout = data;
+
+ if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
+ *timeout =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
+ } else {
+ /* Set default ICMP timeout. */
+ *timeout = nf_ct_icmp_timeout;
+ }
+ return 0;
+}
+
+static int
+icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeout = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
+ [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
static struct ctl_table_header *icmp_sysctl_header;
static struct ctl_table icmp_sysctl_table[] = {
@@ -298,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.invert_tuple = icmp_invert_tuple,
.print_tuple = icmp_print_tuple,
.packet = icmp_packet,
+ .get_timeouts = icmp_get_timeouts,
.new = icmp_new,
.error = icmp_error,
.destroy = NULL,
@@ -308,6 +353,15 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.nlattr_to_tuple = icmp_nlattr_to_tuple,
.nla_policy = icmp_nla_policy,
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = icmp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = icmp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_ICMP_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = icmp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
#ifdef CONFIG_SYSCTL
.ctl_table_header = &icmp_sysctl_header,
.ctl_table = icmp_sysctl_table,
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index a708933dc23..abb52adf5ac 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -686,6 +686,11 @@ static struct pernet_operations nf_nat_net_ops = {
.exit = nf_nat_net_exit,
};
+static struct nf_ct_helper_expectfn follow_master_nat = {
+ .name = "nat-follow-master",
+ .expectfn = nf_nat_follow_master,
+};
+
static int __init nf_nat_init(void)
{
size_t i;
@@ -717,6 +722,8 @@ static int __init nf_nat_init(void)
l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
+ nf_ct_helper_expectfn_register(&follow_master_nat);
+
BUG_ON(nf_nat_seq_adjust_hook != NULL);
RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
@@ -736,6 +743,7 @@ static void __exit nf_nat_cleanup(void)
unregister_pernet_subsys(&nf_nat_net_ops);
nf_ct_l3proto_put(l3proto);
nf_ct_extend_unregister(&nat_extend);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index dc1dd912baf..cad29c12131 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -42,9 +42,7 @@ static int set_addr(struct sk_buff *skb,
if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
- " error\n");
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
return -1;
}
@@ -58,9 +56,7 @@ static int set_addr(struct sk_buff *skb,
if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
- " error\n");
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
return -1;
}
/* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
@@ -214,8 +210,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
/* Run out of expectations */
if (i >= H323_RTP_CHANNEL_MAX) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of expectations\n");
+ net_notice_ratelimited("nf_nat_h323: out of expectations\n");
return 0;
}
@@ -244,8 +239,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of RTP ports\n");
+ net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
return 0;
}
@@ -308,8 +302,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
return 0;
}
@@ -365,8 +358,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_q931: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
}
@@ -456,8 +448,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_ras: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
return 0;
}
@@ -545,8 +536,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_q931: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
}
@@ -568,6 +558,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
return 0;
}
+static struct nf_ct_helper_expectfn q931_nat = {
+ .name = "Q.931",
+ .expectfn = ip_nat_q931_expect,
+};
+
+static struct nf_ct_helper_expectfn callforwarding_nat = {
+ .name = "callforwarding",
+ .expectfn = ip_nat_callforwarding_expect,
+};
+
/****************************************************************************/
static int __init init(void)
{
@@ -590,6 +590,8 @@ static int __init init(void)
RCU_INIT_POINTER(nat_h245_hook, nat_h245);
RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding);
RCU_INIT_POINTER(nat_q931_hook, nat_q931);
+ nf_ct_helper_expectfn_register(&q931_nat);
+ nf_ct_helper_expectfn_register(&callforwarding_nat);
return 0;
}
@@ -605,6 +607,8 @@ static void __exit fini(void)
RCU_INIT_POINTER(nat_h245_hook, NULL);
RCU_INIT_POINTER(nat_callforwarding_hook, NULL);
RCU_INIT_POINTER(nat_q931_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&q931_nat);
+ nf_ct_helper_expectfn_unregister(&callforwarding_nat);
synchronize_rcu();
}
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index d0319f96269..ea4a23813d2 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -283,7 +283,7 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
__be32 newip;
u_int16_t port;
char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
- unsigned buflen;
+ unsigned int buflen;
/* Connection will come from reply */
if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
@@ -526,6 +526,11 @@ err1:
return NF_DROP;
}
+static struct nf_ct_helper_expectfn sip_nat = {
+ .name = "sip",
+ .expectfn = ip_nat_sip_expected,
+};
+
static void __exit nf_nat_sip_fini(void)
{
RCU_INIT_POINTER(nf_nat_sip_hook, NULL);
@@ -535,6 +540,7 @@ static void __exit nf_nat_sip_fini(void)
RCU_INIT_POINTER(nf_nat_sdp_port_hook, NULL);
RCU_INIT_POINTER(nf_nat_sdp_session_hook, NULL);
RCU_INIT_POINTER(nf_nat_sdp_media_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&sip_nat);
synchronize_rcu();
}
@@ -554,6 +560,7 @@ static int __init nf_nat_sip_init(void)
RCU_INIT_POINTER(nf_nat_sdp_port_hook, ip_nat_sdp_port);
RCU_INIT_POINTER(nf_nat_sdp_session_hook, ip_nat_sdp_session);
RCU_INIT_POINTER(nf_nat_sdp_media_hook, ip_nat_sdp_media);
+ nf_ct_helper_expectfn_register(&sip_nat);
return 0;
}
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 2133c30a4a5..746edec8b86 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1206,8 +1206,7 @@ static int snmp_translate(struct nf_conn *ct,
if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
paylen, &map, &udph->check)) {
- if (net_ratelimit())
- printk(KERN_WARNING "bsalg: parser failed\n");
+ net_warn_ratelimited("bsalg: parser failed\n");
return NF_DROP;
}
return NF_ACCEPT;
@@ -1241,9 +1240,8 @@ static int help(struct sk_buff *skb, unsigned int protoff,
* can mess around with the payload.
*/
if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
- if (net_ratelimit())
- printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
- &iph->saddr, &iph->daddr);
+ net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
+ &iph->saddr, &iph->daddr);
return NF_DROP;
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index b072386cee2..2c00e8bf684 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -20,7 +20,6 @@
*
*/
-#include <asm/system.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/fcntl.h>
@@ -52,15 +51,16 @@ static struct ping_table ping_table;
static u16 ping_port_rover;
-static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
+static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask)
{
int res = (num + net_hash_mix(net)) & mask;
+
pr_debug("hash(%d) = %d\n", num, res);
return res;
}
static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
- struct net *net, unsigned num)
+ struct net *net, unsigned int num)
{
return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
}
@@ -156,7 +156,7 @@ static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr,
struct hlist_nulls_node *hnode;
pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
- (int)ident, &daddr, dif);
+ (int)ident, &daddr, dif);
read_lock_bh(&ping_table.lock);
ping_portaddr_for_each_entry(sk, hnode, hslot) {
@@ -189,7 +189,8 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
gid_t *high)
{
gid_t *data = net->ipv4.sysctl_ping_group_range;
- unsigned seq;
+ unsigned int seq;
+
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -206,17 +207,22 @@ static int ping_init_sock(struct sock *sk)
gid_t range[2];
struct group_info *group_info = get_current_groups();
int i, j, count = group_info->ngroups;
+ kgid_t low, high;
inet_get_ping_group_range_net(net, range, range+1);
+ low = make_kgid(&init_user_ns, range[0]);
+ high = make_kgid(&init_user_ns, range[1]);
+ if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
+ return -EACCES;
+
if (range[0] <= group && group <= range[1])
return 0;
for (i = 0; i < group_info->nblocks; i++) {
int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
-
for (j = 0; j < cp_count; j++) {
- group = group_info->blocks[i][j];
- if (range[0] <= group && group <= range[1])
+ kgid_t gid = group_info->blocks[i][j];
+ if (gid_lte(low, gid) && gid_lte(gid, high))
return 0;
}
@@ -229,7 +235,7 @@ static int ping_init_sock(struct sock *sk)
static void ping_close(struct sock *sk, long timeout)
{
pr_debug("ping_close(sk=%p,sk->num=%u)\n",
- inet_sk(sk), inet_sk(sk)->inet_num);
+ inet_sk(sk), inet_sk(sk)->inet_num);
pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
sk_common_release(sk);
@@ -252,7 +258,7 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EINVAL;
pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
- sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
+ sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
@@ -280,9 +286,9 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n",
- (int)isk->inet_num,
- &isk->inet_rcv_saddr,
- (int)sk->sk_bound_dev_if);
+ (int)isk->inet_num,
+ &isk->inet_rcv_saddr,
+ (int)sk->sk_bound_dev_if);
err = 0;
if (isk->inet_rcv_saddr)
@@ -335,7 +341,7 @@ void ping_err(struct sk_buff *skb, u32 info)
return;
pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
- code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+ code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
ntohs(icmph->un.echo.id), skb->dev->ifindex);
@@ -411,7 +417,7 @@ struct pingfakehdr {
__wsum wcheck;
};
-static int ping_getfrag(void *from, char * to,
+static int ping_getfrag(void *from, char *to,
int offset, int fraglen, int odd, struct sk_buff *skb)
{
struct pingfakehdr *pfh = (struct pingfakehdr *)from;
@@ -556,7 +562,8 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
- }
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
@@ -678,7 +685,7 @@ out:
static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
- inet_sk(sk), inet_sk(sk)->inet_num, skb);
+ inet_sk(sk), inet_sk(sk)->inet_num, skb);
if (sock_queue_rcv_skb(sk, skb) < 0) {
kfree_skb(skb);
pr_debug("ping_queue_rcv_skb -> failed\n");
@@ -704,7 +711,7 @@ void ping_rcv(struct sk_buff *skb)
/* We assume the packet has already been checked by icmp_rcv */
pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
- skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+ skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
/* Push ICMP header back */
skb_push(skb, skb->data - (u8 *)icmph);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 6afc807ee2a..8af0d44e4e2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -256,6 +256,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
+ SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
+ SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 3ccda5ae8a2..4032b818f3e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -288,7 +288,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
read_unlock(&raw_v4_hashinfo.lock);
}
-static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Charge it to the socket. */
@@ -491,11 +491,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (msg->msg_namelen < sizeof(*usin))
goto out;
if (usin->sin_family != AF_INET) {
- static int complained;
- if (!complained++)
- printk(KERN_INFO "%s forgot to set AF_INET in "
- "raw sendmsg. Fix it!\n",
- current->comm);
+ pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n",
+ __func__, current->comm);
err = -EAFNOSUPPORT;
if (usin->sin_family)
goto out;
@@ -563,7 +560,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
- }
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 01977479617..98b30d08efe 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -62,9 +62,10 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/module.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -108,6 +109,7 @@
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
+#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
@@ -228,7 +230,7 @@ const __u8 ip_tos2prio[16] = {
TC_PRIO_INTERACTIVE_BULK,
ECN_OR_COST(INTERACTIVE_BULK)
};
-
+EXPORT_SYMBOL(ip_tos2prio);
/*
* Route cache.
@@ -295,7 +297,7 @@ static inline void rt_hash_lock_init(void)
#endif
static struct rt_hash_bucket *rt_hash_table __read_mostly;
-static unsigned rt_hash_mask __read_mostly;
+static unsigned int rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
@@ -958,8 +960,7 @@ void rt_cache_flush_batch(struct net *net)
static void rt_emergency_hash_rebuild(struct net *net)
{
- if (net_ratelimit())
- printk(KERN_WARNING "Route hash chain too long!\n");
+ net_warn_ratelimited("Route hash chain too long!\n");
rt_cache_invalidate(net);
}
@@ -1082,8 +1083,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
goto out;
if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
- if (net_ratelimit())
- printk(KERN_WARNING "dst cache overflow\n");
+ net_warn_ratelimited("dst cache overflow\n");
RT_CACHE_STAT_INC(gc_dst_overflow);
return 1;
@@ -1116,12 +1116,17 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
static const __be32 inaddr_any = 0;
struct net_device *dev = dst->dev;
const __be32 *pkey = daddr;
+ const struct rtable *rt;
struct neighbour *n;
+ rt = (const struct rtable *) dst;
+
if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
pkey = &inaddr_any;
+ else if (rt->rt_gateway)
+ pkey = (const __be32 *) &rt->rt_gateway;
- n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
+ n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
if (n)
return n;
return neigh_create(&arp_tbl, pkey, dev);
@@ -1137,7 +1142,7 @@ static int rt_bind_neighbour(struct rtable *rt)
return 0;
}
-static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
+static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
struct sk_buff *skb, int ifindex)
{
struct rtable *rth, *cand;
@@ -1175,9 +1180,7 @@ restart:
if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
int err = rt_bind_neighbour(rt);
if (err) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "Neighbour table failure & not caching routes.\n");
+ net_warn_ratelimited("Neighbour table failure & not caching routes\n");
ip_rt_put(rt);
return ERR_PTR(err);
}
@@ -1253,7 +1256,7 @@ restart:
struct net *net = dev_net(rt->dst.dev);
int num = ++net->ipv4.current_rt_cache_rebuild_count;
if (!rt_caching(net)) {
- printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+ pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
rt->dst.dev->name, num);
}
rt_emergency_hash_rebuild(net);
@@ -1293,8 +1296,7 @@ restart:
goto restart;
}
- if (net_ratelimit())
- printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
+ net_warn_ratelimited("Neighbour table overflow\n");
rt_drop(rt);
return ERR_PTR(-ENOBUFS);
}
@@ -1372,14 +1374,13 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
return;
}
} else if (!rt)
- printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
- __builtin_return_address(0));
+ pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
-static void rt_del(unsigned hash, struct rtable *rt)
+static void rt_del(unsigned int hash, struct rtable *rt)
{
struct rtable __rcu **rthp;
struct rtable *aux;
@@ -1497,11 +1498,11 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
- " Advised path = %pI4 -> %pI4\n",
- &old_gw, dev->name, &new_gw,
- &saddr, &daddr);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+ " Advised path = %pI4 -> %pI4\n",
+ &old_gw, dev->name, &new_gw,
+ &saddr, &daddr);
#endif
;
}
@@ -1533,7 +1534,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
ip_rt_put(rt);
ret = NULL;
} else if (rt->rt_flags & RTCF_REDIRECTED) {
- unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
+ unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
rt->rt_oif,
rt_genid(dev_net(dst->dev)));
rt_del(hash, rt);
@@ -1611,11 +1612,10 @@ void ip_rt_send_redirect(struct sk_buff *skb)
++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- peer->rate_tokens == ip_rt_redirect_number &&
- net_ratelimit())
- printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
- &ip_hdr(skb)->saddr, rt->rt_iif,
- &rt->rt_dst, &rt->rt_gateway);
+ peer->rate_tokens == ip_rt_redirect_number)
+ net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
+ &ip_hdr(skb)->saddr, rt->rt_iif,
+ &rt->rt_dst, &rt->rt_gateway);
#endif
}
}
@@ -1838,9 +1838,9 @@ static void ipv4_link_failure(struct sk_buff *skb)
static int ip_rt_bug(struct sk_buff *skb)
{
- printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
- &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
- skb->dev ? skb->dev->name : "?");
+ pr_debug("%s: %pI4 -> %pI4, %s\n",
+ __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ skb->dev ? skb->dev->name : "?");
kfree_skb(skb);
WARN_ON(1);
return 0;
@@ -2036,7 +2036,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (err < 0)
goto e_err;
}
- rth = rt_dst_alloc(init_net.loopback_dev,
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
@@ -2100,18 +2100,13 @@ static void ip_handle_martian_source(struct net_device *dev,
* RFC1812 recommendation, if source is martian,
* the only hint is MAC header.
*/
- printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
+ pr_warn("martian source %pI4 from %pI4, on dev %s\n",
&daddr, &saddr, dev->name);
if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
- int i;
- const unsigned char *p = skb_mac_header(skb);
- printk(KERN_WARNING "ll header: ");
- for (i = 0; i < dev->hard_header_len; i++, p++) {
- printk("%02x", *p);
- if (i < (dev->hard_header_len - 1))
- printk(":");
- }
- printk("\n");
+ print_hex_dump(KERN_WARNING, "ll header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ skb_mac_header(skb),
+ dev->hard_header_len, true);
}
}
#endif
@@ -2134,9 +2129,7 @@ static int __mkroute_input(struct sk_buff *skb,
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
if (out_dev == NULL) {
- if (net_ratelimit())
- printk(KERN_CRIT "Bug in ip_route_input" \
- "_slow(). Please, report\n");
+ net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
return -EINVAL;
}
@@ -2216,9 +2209,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
- struct rtable* rth = NULL;
+ struct rtable *rth = NULL;
int err;
- unsigned hash;
+ unsigned int hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
@@ -2256,13 +2249,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct flowi4 fl4;
- unsigned flags = 0;
+ unsigned int flags = 0;
u32 itag = 0;
- struct rtable * rth;
- unsigned hash;
+ struct rtable *rth;
+ unsigned int hash;
__be32 spec_dst;
int err = -EINVAL;
- struct net * net = dev_net(dev);
+ struct net *net = dev_net(dev);
/* IP on this device is disabled. */
@@ -2407,9 +2400,9 @@ no_route:
martian_destination:
RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
- &daddr, &saddr, dev->name);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
#endif
e_hostunreach:
@@ -2434,8 +2427,8 @@ martian_source_keep_err:
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, bool noref)
{
- struct rtable * rth;
- unsigned hash;
+ struct rtable *rth;
+ unsigned int hash;
int iif = dev->ifindex;
struct net *net;
int res;
@@ -2973,7 +2966,8 @@ static int rt_fill_info(struct net *net,
r->rtm_src_len = 0;
r->rtm_tos = rt->rt_key_tos;
r->rtm_table = RT_TABLE_MAIN;
- NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
+ if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ goto nla_put_failure;
r->rtm_type = rt->rt_type;
r->rtm_scope = RT_SCOPE_UNIVERSE;
r->rtm_protocol = RTPROT_UNSPEC;
@@ -2981,31 +2975,38 @@ static int rt_fill_info(struct net *net,
if (rt->rt_flags & RTCF_NOTIFY)
r->rtm_flags |= RTM_F_NOTIFY;
- NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
-
+ if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
+ goto nla_put_failure;
if (rt->rt_key_src) {
r->rtm_src_len = 32;
- NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
+ if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+ goto nla_put_failure;
}
- if (rt->dst.dev)
- NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
- if (rt->dst.tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
+ if (rt->dst.tclassid &&
+ nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
+ goto nla_put_failure;
#endif
- if (rt_is_input_route(rt))
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
- else if (rt->rt_src != rt->rt_key_src)
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
-
- if (rt->rt_dst != rt->rt_gateway)
- NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
+ if (rt_is_input_route(rt)) {
+ if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
+ goto nla_put_failure;
+ } else if (rt->rt_src != rt->rt_key_src) {
+ if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
+ goto nla_put_failure;
+ }
+ if (rt->rt_dst != rt->rt_gateway &&
+ nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
+ goto nla_put_failure;
if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
goto nla_put_failure;
- if (rt->rt_mark)
- NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
+ if (rt->rt_mark &&
+ nla_put_be32(skb, RTA_MARK, rt->rt_mark))
+ goto nla_put_failure;
error = rt->dst.error;
if (peer) {
@@ -3046,7 +3047,8 @@ static int rt_fill_info(struct net *net,
}
} else
#endif
- NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
+ if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+ goto nla_put_failure;
}
if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -3060,7 +3062,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
@@ -3335,23 +3337,6 @@ static ctl_table ipv4_route_table[] = {
{ }
};
-static struct ctl_table empty[1];
-
-static struct ctl_table ipv4_skeleton[] =
-{
- { .procname = "route",
- .mode = 0555, .child = ipv4_route_table},
- { .procname = "neigh",
- .mode = 0555, .child = empty},
- { }
-};
-
-static __net_initdata struct ctl_path ipv4_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
-
static struct ctl_table ipv4_route_flush_table[] = {
{
.procname = "flush",
@@ -3362,13 +3347,6 @@ static struct ctl_table ipv4_route_flush_table[] = {
{ },
};
-static __net_initdata struct ctl_path ipv4_route_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "route", },
- { },
-};
-
static __net_init int sysctl_route_net_init(struct net *net)
{
struct ctl_table *tbl;
@@ -3381,8 +3359,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
}
tbl[0].extra1 = net;
- net->ipv4.route_hdr =
- register_net_sysctl_table(net, ipv4_route_path, tbl);
+ net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
if (net->ipv4.route_hdr == NULL)
goto err_reg;
return 0;
@@ -3431,9 +3408,15 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- rhash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &rhash_entries);
+ if (ret)
+ return 0;
+
return 1;
}
__setup("rhash_entries=", set_rhash_entries);
@@ -3469,6 +3452,7 @@ int __init ip_rt_init(void)
0,
&rt_hash_log,
&rt_hash_mask,
+ 0,
rhash_entries ? 0 : 512 * 1024);
memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
rt_hash_lock_init();
@@ -3485,7 +3469,7 @@ int __init ip_rt_init(void)
net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
if (ip_rt_proc_init())
- printk(KERN_ERR "Unable to create route proc files\n");
+ pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
xfrm_init();
xfrm4_init(ip_rt_max_size);
@@ -3506,6 +3490,6 @@ int __init ip_rt_init(void)
*/
void __init ip_static_sysctl_init(void)
{
- register_sysctl_paths(ipv4_path, ipv4_skeleton);
+ register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7a7724da9bf..ef32956ed65 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -27,6 +27,7 @@
#include <net/tcp_memcontrol.h>
static int zero;
+static int two = 2;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -78,7 +79,7 @@ static int ipv4_local_port_range(ctl_table *table, int write,
static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
{
gid_t *data = table->data;
- unsigned seq;
+ unsigned int seq;
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -677,6 +678,15 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_early_retrans",
+ .data = &sysctl_tcp_early_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
@@ -768,13 +778,6 @@ static struct ctl_table ipv4_net_table[] = {
{ }
};
-struct ctl_path net_ipv4_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
-EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
-
static __net_init int ipv4_sysctl_init_net(struct net *net)
{
struct ctl_table *table;
@@ -815,8 +818,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
tcp_init_mem(net);
- net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
- net_ipv4_ctl_path, table);
+ net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (net->ipv4.ipv4_hdr == NULL)
goto err_reg;
@@ -857,12 +859,12 @@ static __init int sysctl_ipv4_init(void)
if (!i->procname)
return -EINVAL;
- hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
+ hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
if (hdr == NULL)
return -ENOMEM;
if (register_pernet_subsys(&ipv4_sysctl_ops)) {
- unregister_sysctl_table(hdr);
+ unregister_net_sysctl_table(hdr);
return -ENOMEM;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 22ef5f9fd2f..3ba605f60e4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -245,6 +245,8 @@
* TCP_CLOSE socket is finished
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -361,6 +363,71 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
return period;
}
+/* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+void tcp_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ skb_queue_head_init(&tp->out_of_order_queue);
+ tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
+
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ tp->mdev = TCP_TIMEOUT_INIT;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tp->snd_cwnd = TCP_INIT_CWND;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = TCP_MSS_DEFAULT;
+
+ tp->reordering = sysctl_tcp_reordering;
+ tcp_enable_early_retrans(tp);
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+
+ sk->sk_state = TCP_CLOSE;
+
+ sk->sk_write_space = sk_stream_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ icsk->icsk_sync_mss = tcp_sync_mss;
+
+ /* TCP Cookie Transactions */
+ if (sysctl_tcp_cookie_size > 0) {
+ /* Default, cookies without s_data_payload. */
+ tp->cookie_values =
+ kzalloc(sizeof(*tp->cookie_values),
+ sk->sk_allocation);
+ if (tp->cookie_values != NULL)
+ kref_init(&tp->cookie_values->kref);
+ }
+ /* Presumed zeroed, in order of appearance:
+ * cookie_in_always, cookie_out_never,
+ * s_data_constant, s_data_in, s_data_out
+ */
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+ local_bh_disable();
+ sock_update_memcg(sk);
+ sk_sockets_allocated_inc(sk);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(tcp_init_sock);
+
/*
* Wait for a TCP event.
*
@@ -526,7 +593,7 @@ static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
tp->pushed_seq = tp->write_seq;
}
-static inline int forced_push(const struct tcp_sock *tp)
+static inline bool forced_push(const struct tcp_sock *tp)
{
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
@@ -699,11 +766,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
if (skb) {
if (sk_wmem_schedule(sk, skb->truesize)) {
+ skb_reserve(skb, sk->sk_prot->max_header);
/*
* Make sure that we have exactly size bytes
* available to the caller, no more, no less.
*/
- skb_reserve(skb, skb_tailroom(skb) - size);
+ skb->avail_size = size;
return skb;
}
__kfree_skb(skb);
@@ -781,9 +849,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
while (psize > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
struct page *page = pages[poffset / PAGE_SIZE];
- int copy, i, can_coalesce;
+ int copy, i;
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);
+ bool can_coalesce;
if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
@@ -848,8 +917,7 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- if (copied)
- tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -858,7 +926,7 @@ wait_for_memory:
}
out:
- if (copied)
+ if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
tcp_push(sk, flags, mss_now, tp->nonagle);
return copied;
@@ -916,7 +984,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags, err, copied;
- int mss_now, size_goal;
+ int mss_now = 0, size_goal;
bool sg;
long timeo;
@@ -930,6 +998,19 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+ goto out;
+ }
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out_err;
+
+ /* 'common' sending to sendq */
+ }
+
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
@@ -993,15 +1074,14 @@ new_segment:
copy = seglen;
/* Where to copy to? */
- if (skb_tailroom(skb) > 0) {
+ if (skb_availroom(skb) > 0) {
/* We have some space in skb head. Superb! */
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
+ copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, from, copy);
if (err)
goto do_fault;
} else {
- int merge = 0;
+ bool merge = false;
int i = skb_shinfo(skb)->nr_frags;
struct page *page = sk->sk_sndmsg_page;
int off;
@@ -1015,7 +1095,7 @@ new_segment:
off != PAGE_SIZE) {
/* We can extend the last page
* fragment. */
- merge = 1;
+ merge = true;
} else if (i == MAX_SKB_FRAGS || !sg) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
@@ -1087,7 +1167,7 @@ new_segment:
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
- if (skb->len < max || (flags & MSG_OOB))
+ if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
if (forced_push(tp)) {
@@ -1100,7 +1180,7 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- if (copied)
+ if (copied && likely(!tp->repair))
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
@@ -1111,7 +1191,7 @@ wait_for_memory:
}
out:
- if (copied)
+ if (copied && likely(!tp->repair))
tcp_push(sk, flags, mss_now, tp->nonagle);
release_sock(sk);
return copied;
@@ -1185,6 +1265,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
return -EAGAIN;
}
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sk_buff *skb;
+ int copied = 0, err = 0;
+
+ /* XXX -- need to support SO_PEEK_OFF */
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+ if (err)
+ break;
+
+ copied += skb->len;
+ }
+
+ return err ?: copied;
+}
+
/* Clean up the receive buffer for full frames taken by the user,
* then send an ACK if necessary. COPIED is the number of bytes
* tcp_recvmsg has given to the user so far, it speeds up the
@@ -1194,7 +1292,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
- int time_to_ack = 0;
+ bool time_to_ack = false;
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
@@ -1220,7 +1318,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
!icsk->icsk_ack.pingpong)) &&
!atomic_read(&sk->sk_rmem_alloc)))
- time_to_ack = 1;
+ time_to_ack = true;
}
/* We send an ACK if we can now advertise a non-zero window
@@ -1242,7 +1340,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
* "Lots" means "at least twice" here.
*/
if (new_window && new_window >= 2 * rcv_window_now)
- time_to_ack = 1;
+ time_to_ack = true;
}
}
if (time_to_ack)
@@ -1374,11 +1472,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
break;
}
if (tcp_hdr(skb)->fin) {
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
++seq;
break;
}
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
if (!desc->count)
break;
tp->copied_seq = seq;
@@ -1414,7 +1512,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- int copied_early = 0;
+ bool copied_early = false;
struct sk_buff *skb;
u32 urg_hole = 0;
@@ -1430,6 +1528,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto recv_urg;
+ if (unlikely(tp->repair)) {
+ err = -EPERM;
+ if (!(flags & MSG_PEEK))
+ goto out;
+
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ goto recv_sndq;
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out;
+
+ /* 'common' recv queue MSG_PEEK-ing */
+ }
+
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
peek_seq = tp->copied_seq;
@@ -1450,7 +1563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if ((available < target) &&
(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
!sysctl_tcp_low_latency &&
- dma_find_channel(DMA_MEMCPY)) {
+ net_dma_find_channel()) {
preempt_enable_no_resched();
tp->ucopy.pinned_list =
dma_pin_iovec_pages(msg->msg_iov, len);
@@ -1631,9 +1744,9 @@ do_prequeue:
}
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
- if (net_ratelimit())
- printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
- current->comm, task_pid_nr(current));
+ net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
+ current->comm,
+ task_pid_nr(current));
peek_seq = tp->copied_seq;
}
continue;
@@ -1665,7 +1778,7 @@ do_prequeue:
if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan) {
tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
@@ -1675,7 +1788,8 @@ do_prequeue:
if (tp->ucopy.dma_cookie < 0) {
- printk(KERN_ALERT "dma_cookie < 0\n");
+ pr_alert("%s: dma_cookie < 0\n",
+ __func__);
/* Exception. Bailout! */
if (!copied)
@@ -1686,7 +1800,7 @@ do_prequeue:
dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
if ((offset + used) == skb->len)
- copied_early = 1;
+ copied_early = true;
} else
#endif
@@ -1720,7 +1834,7 @@ skip_copy:
goto found_fin_ok;
if (!(flags & MSG_PEEK)) {
sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
+ copied_early = false;
}
continue;
@@ -1729,7 +1843,7 @@ skip_copy:
++*seq;
if (!(flags & MSG_PEEK)) {
sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
+ copied_early = false;
}
break;
} while (len > 0);
@@ -1780,6 +1894,10 @@ out:
recv_urg:
err = tcp_recv_urg(sk, msg, len, flags);
goto out;
+
+recv_sndq:
+ err = tcp_peek_sndq(sk, msg, len);
+ goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);
@@ -1883,10 +2001,10 @@ bool tcp_check_oom(struct sock *sk, int shift)
too_many_orphans = tcp_too_many_orphans(sk, shift);
out_of_socket_memory = tcp_out_of_memory(sk);
- if (too_many_orphans && net_ratelimit())
- pr_info("TCP: too many orphaned sockets\n");
- if (out_of_socket_memory && net_ratelimit())
- pr_info("TCP: out of memory -- consider tuning tcp_mem\n");
+ if (too_many_orphans)
+ net_info_ratelimited("too many orphaned sockets\n");
+ if (out_of_socket_memory)
+ net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
return too_many_orphans || out_of_socket_memory;
}
@@ -1932,7 +2050,9 @@ void tcp_close(struct sock *sk, long timeout)
* advertise a zero window, then kill -9 the FTP client, wheee...
* Note: timeout is always zero in such a case.
*/
- if (data_was_unread) {
+ if (unlikely(tcp_sk(sk)->repair)) {
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
@@ -2050,7 +2170,7 @@ EXPORT_SYMBOL(tcp_close);
/* These states need RST on ABORT according to RFC793 */
-static inline int tcp_need_reset(int state)
+static inline bool tcp_need_reset(int state)
{
return (1 << state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
@@ -2071,6 +2191,8 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk);
+ } else if (unlikely(tp->repair)) {
+ sk->sk_err = ECONNABORTED;
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2122,6 +2244,54 @@ int tcp_disconnect(struct sock *sk, int flags)
}
EXPORT_SYMBOL(tcp_disconnect);
+static inline bool tcp_can_repair_sock(const struct sock *sk)
+{
+ return capable(CAP_NET_ADMIN) &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+}
+
+static int tcp_repair_options_est(struct tcp_sock *tp,
+ struct tcp_repair_opt __user *optbuf, unsigned int len)
+{
+ struct tcp_repair_opt opt;
+
+ while (len >= sizeof(opt)) {
+ if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+
+ optbuf++;
+ len -= sizeof(opt);
+
+ switch (opt.opt_code) {
+ case TCPOPT_MSS:
+ tp->rx_opt.mss_clamp = opt.opt_val;
+ break;
+ case TCPOPT_WINDOW:
+ if (opt.opt_val > 14)
+ return -EFBIG;
+
+ tp->rx_opt.snd_wscale = opt.opt_val;
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+ if (sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.tstamp_ok = 1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
/*
* Socket option code for TCP.
*/
@@ -2292,6 +2462,55 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
else
tp->thin_dupack = val;
+ if (tp->thin_dupack)
+ tcp_disable_early_retrans(tp);
+ break;
+
+ case TCP_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ err = -EPERM;
+ else if (val == 1) {
+ tp->repair = 1;
+ sk->sk_reuse = SK_FORCE_REUSE;
+ tp->repair_queue = TCP_NO_QUEUE;
+ } else if (val == 0) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
+ tcp_send_window_probe(sk);
+ } else
+ err = -EINVAL;
+
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (!tp->repair)
+ err = -EPERM;
+ else if (val < TCP_QUEUES_NR)
+ tp->repair_queue = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (sk->sk_state != TCP_CLOSE)
+ err = -EPERM;
+ else if (tp->repair_queue == TCP_SEND_QUEUE)
+ tp->write_seq = val;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ tp->rcv_nxt = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_REPAIR_OPTIONS:
+ if (!tp->repair)
+ err = -EINVAL;
+ else if (sk->sk_state == TCP_ESTABLISHED)
+ err = tcp_repair_options_est(tp,
+ (struct tcp_repair_opt __user *)optval,
+ optlen);
+ else
+ err = -EPERM;
break;
case TCP_CORK:
@@ -2527,6 +2746,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
+ if (tp->repair)
+ val = tp->rx_opt.mss_clamp;
break;
case TCP_NODELAY:
val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2629,6 +2850,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->thin_dupack;
break;
+ case TCP_REPAIR:
+ val = tp->repair;
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (tp->repair)
+ val = tp->repair_queue;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ val = tp->write_seq;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ val = tp->rcv_nxt;
+ else
+ return -EINVAL;
+ break;
+
case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout);
break;
@@ -2672,7 +2913,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct tcphdr *th;
- unsigned thlen;
+ unsigned int thlen;
unsigned int seq;
__be32 delta;
unsigned int oldlen;
@@ -2930,13 +3171,13 @@ out_free:
struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
{
struct tcp_md5sig_pool __percpu *pool;
- int alloc = 0;
+ bool alloc = false;
retry:
spin_lock_bh(&tcp_md5sig_pool_lock);
pool = tcp_md5sig_pool;
if (tcp_md5sig_users++ == 0) {
- alloc = 1;
+ alloc = true;
spin_unlock_bh(&tcp_md5sig_pool_lock);
} else if (!pool) {
tcp_md5sig_users--;
@@ -3030,9 +3271,9 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
struct scatterlist sg;
const struct tcphdr *tp = tcp_hdr(skb);
struct hash_desc *desc = &hp->md5_desc;
- unsigned i;
- const unsigned head_data_len = skb_headlen(skb) > header_len ?
- skb_headlen(skb) - header_len : 0;
+ unsigned int i;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
const struct skb_shared_info *shi = skb_shinfo(skb);
struct sk_buff *frag_iter;
@@ -3220,9 +3461,15 @@ extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- thash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &thash_entries);
+ if (ret)
+ return 0;
+
return 1;
}
__setup("thash_entries=", set_thash_entries);
@@ -3240,7 +3487,7 @@ void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long limit;
- int max_share, cnt;
+ int max_rshare, max_wshare, cnt;
unsigned int i;
unsigned long jiffy = jiffies;
@@ -3267,6 +3514,7 @@ void __init tcp_init(void)
0,
NULL,
&tcp_hashinfo.ehash_mask,
+ 0,
thash_entries ? 0 : 512 * 1024);
for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
@@ -3283,6 +3531,7 @@ void __init tcp_init(void)
0,
&tcp_hashinfo.bhash_size,
NULL,
+ 0,
64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
@@ -3299,21 +3548,20 @@ void __init tcp_init(void)
tcp_init_mem(&init_net);
/* Set per-socket limits to no more than 1/128 the pressure threshold */
- limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10);
- limit = max(limit, 128UL);
- max_share = min(4UL*1024*1024, limit);
+ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+ max_wshare = min(4UL*1024*1024, limit);
+ max_rshare = min(6UL*1024*1024, limit);
sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_wmem[1] = 16*1024;
- sysctl_tcp_wmem[2] = max(64*1024, max_share);
+ sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_rmem[1] = 87380;
- sysctl_tcp_rmem[2] = max(87380, max_share);
+ sysctl_tcp_rmem[2] = max(87380, max_rshare);
- printk(KERN_INFO "TCP: Hash tables configured "
- "(established %u bind %u)\n",
- tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+ pr_info("Hash tables configured (established %u bind %u)\n",
+ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_register_congestion_control(&tcp_reno);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index fc6d475f488..04dbd7ae7c6 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -6,6 +6,8 @@
* Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
@@ -41,18 +43,17 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
/* all algorithms must implement ssthresh and cong_avoid ops */
if (!ca->ssthresh || !ca->cong_avoid) {
- printk(KERN_ERR "TCP %s does not implement required ops\n",
- ca->name);
+ pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
spin_lock(&tcp_cong_list_lock);
if (tcp_ca_find(ca->name)) {
- printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+ pr_notice("%s already registered\n", ca->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
- printk(KERN_INFO "TCP %s registered\n", ca->name);
+ pr_info("%s registered\n", ca->name);
}
spin_unlock(&tcp_cong_list_lock);
@@ -279,19 +280,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
/* RFC2861 Check whether we are limited by application or congestion window
* This is the inverse of cwnd check in tcp_tso_should_defer
*/
-int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
+bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
const struct tcp_sock *tp = tcp_sk(sk);
u32 left;
if (in_flight >= tp->snd_cwnd)
- return 1;
+ return true;
left = tp->snd_cwnd - in_flight;
if (sk_can_gso(sk) &&
left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
left * tp->mss_cache < sk->sk_gso_max_size)
- return 1;
+ return true;
return left <= tcp_max_tso_deferred_mss(tp);
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index fe3ecf484b4..57bdd17dff4 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -15,7 +15,7 @@
/* Tcp Hybla structure. */
struct hybla {
- u8 hybla_en;
+ bool hybla_en;
u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
u32 rho; /* Rho parameter, integer part */
u32 rho2; /* Rho * Rho, integer part */
@@ -24,8 +24,7 @@ struct hybla {
u32 minrtt; /* Minimum smoothed round trip time value seen */
};
-/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
- expressed in jiffies */
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
static int rtt0 = 25;
module_param(rtt0, int, 0644);
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
@@ -39,7 +38,7 @@ static inline void hybla_recalc_param (struct sock *sk)
ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
- ca->rho2 = ca->rho2_7ls >>7;
+ ca->rho2 = ca->rho2_7ls >> 7;
}
static void hybla_init(struct sock *sk)
@@ -52,7 +51,7 @@ static void hybla_init(struct sock *sk)
ca->rho_3ls = 0;
ca->rho2_7ls = 0;
ca->snd_cwnd_cents = 0;
- ca->hybla_en = 1;
+ ca->hybla_en = true;
tp->snd_cwnd = 2;
tp->snd_cwnd_clamp = 65535;
@@ -67,6 +66,7 @@ static void hybla_init(struct sock *sk)
static void hybla_state(struct sock *sk, u8 ca_state)
{
struct hybla *ca = inet_csk_ca(sk);
+
ca->hybla_en = (ca_state == TCP_CA_Open);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b5e315f1364..b224eb8bce8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,6 +61,8 @@
* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
@@ -83,7 +85,7 @@ int sysctl_tcp_ecn __read_mostly = 2;
EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 2;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
int sysctl_tcp_stdurg __read_mostly;
@@ -97,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -173,7 +176,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
static void tcp_incr_quickack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+ unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
@@ -193,9 +196,10 @@ static void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline int tcp_in_quickack_mode(const struct sock *sk)
+static inline bool tcp_in_quickack_mode(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
@@ -250,11 +254,11 @@ static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
- return 1;
- return 0;
+ return true;
+ return false;
}
/* Buffer size and advertised window tuning.
@@ -333,6 +337,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
incr = __tcp_grow_window(sk, skb);
if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
tp->window_clamp);
inet_csk(sk)->icsk_ack.quick |= 1;
@@ -472,8 +477,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
if (!win_dep) {
m -= (new_sample >> 3);
new_sample += m;
- } else if (m < new_sample)
- new_sample = m << 3;
+ } else {
+ m <<= 3;
+ if (m < new_sample)
+ new_sample = m;
+ }
} else {
/* No previous measure. */
new_sample = m << 3;
@@ -489,7 +497,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
- tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
@@ -900,6 +908,7 @@ static void tcp_init_metrics(struct sock *sk)
if (dst_metric(dst, RTAX_REORDERING) &&
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
tp->reordering = dst_metric(dst, RTAX_REORDERING);
}
@@ -931,7 +940,7 @@ static void tcp_init_metrics(struct sock *sk)
tcp_set_rto(sk);
reset:
if (tp->srtt == 0) {
- /* RFC2988bis: We've failed to get a valid RTT sample from
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
* 3WHS. This is most likely due to retransmission,
* including spurious one. Reset the RTO back to 3secs
* from the more aggressive 1sec to avoid more spurious
@@ -941,7 +950,7 @@ reset:
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
}
/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
- * retransmitted. In light of RFC2988bis' more aggressive 1sec
+ * retransmitted. In light of RFC6298 more aggressive 1sec
* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
* retransmission has occurred.
*/
@@ -973,15 +982,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
- printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
- tp->reordering,
- tp->fackets_out,
- tp->sacked_out,
- tp->undo_marker ? tp->undo_retrans : 0);
+ pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+ tp->reordering,
+ tp->fackets_out,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
#endif
tcp_disable_fack(tp);
}
+
+ if (metric > 0)
+ tcp_disable_early_retrans(tp);
}
/* This must be called before lost_out is incremented */
@@ -1112,36 +1124,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
* the exact amount is rather hard to quantify. However, tp->max_window can
* be used as an exaggerated estimate.
*/
-static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
- u32 start_seq, u32 end_seq)
+static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
+ u32 start_seq, u32 end_seq)
{
/* Too far in future, or reversed (interpretation is ambiguous) */
if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
- return 0;
+ return false;
/* Nasty start_seq wrap-around check (see comments above) */
if (!before(start_seq, tp->snd_nxt))
- return 0;
+ return false;
/* In outstanding window? ...This is valid exit for D-SACKs too.
* start_seq == snd_una is non-sensical (see comments above)
*/
if (after(start_seq, tp->snd_una))
- return 1;
+ return true;
if (!is_dsack || !tp->undo_marker)
- return 0;
+ return false;
/* ...Then it's D-SACK, and must reside below snd_una completely */
if (after(end_seq, tp->snd_una))
- return 0;
+ return false;
if (!before(start_seq, tp->undo_marker))
- return 1;
+ return true;
/* Too old */
if (!after(end_seq, tp->undo_marker))
- return 0;
+ return false;
/* Undo_marker boundary crossing (overestimates a lot). Known already:
* start_seq < undo_marker and end_seq >= undo_marker.
@@ -1213,17 +1225,17 @@ static void tcp_mark_lost_retrans(struct sock *sk)
tp->lost_retrans_low = new_low_seq;
}
-static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
- struct tcp_sack_block_wire *sp, int num_sacks,
- u32 prior_snd_una)
+static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
+ struct tcp_sack_block_wire *sp, int num_sacks,
+ u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
- int dup_sack = 0;
+ bool dup_sack = false;
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
- dup_sack = 1;
+ dup_sack = true;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
@@ -1232,7 +1244,7 @@ static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
if (!after(end_seq_0, end_seq_1) &&
!before(start_seq_0, start_seq_1)) {
- dup_sack = 1;
+ dup_sack = true;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
@@ -1263,9 +1275,10 @@ struct tcp_sacktag_state {
* FIXME: this could be merged to shift decision code
*/
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
- u32 start_seq, u32 end_seq)
+ u32 start_seq, u32 end_seq)
{
- int in_sack, err;
+ int err;
+ bool in_sack;
unsigned int pkt_len;
unsigned int mss;
@@ -1311,7 +1324,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- int dup_sack, int pcount)
+ bool dup_sack, int pcount)
{
struct tcp_sock *tp = tcp_sk(sk);
int fack_count = state->fack_count;
@@ -1391,10 +1404,10 @@ static u8 tcp_sacktag_one(struct sock *sk,
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
-static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
- struct tcp_sacktag_state *state,
- unsigned int pcount, int shifted, int mss,
- int dup_sack)
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ unsigned int pcount, int shifted, int mss,
+ bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1444,7 +1457,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
- return 0;
+ return false;
}
/* Whole SKB was eaten :-) */
@@ -1467,7 +1480,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
- return 1;
+ return true;
}
/* I wish gso_size would have a bit more sane initialization than
@@ -1490,7 +1503,7 @@ static int skb_can_shift(const struct sk_buff *skb)
static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
- int dup_sack)
+ bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev;
@@ -1629,14 +1642,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sack_block *next_dup,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
- int dup_sack_in)
+ bool dup_sack_in)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *tmp;
tcp_for_write_queue_from(skb, sk) {
int in_sack = 0;
- int dup_sack = dup_sack_in;
+ bool dup_sack = dup_sack_in;
if (skb == tcp_send_head(sk))
break;
@@ -1651,7 +1664,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
next_dup->start_seq,
next_dup->end_seq);
if (in_sack > 0)
- dup_sack = 1;
+ dup_sack = true;
}
/* skb reference here is a bit tricky to get right, since
@@ -1756,7 +1769,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
- int found_dup_sack = 0;
+ bool found_dup_sack = false;
int i, j;
int first_sack_index;
@@ -1787,7 +1800,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
used_sacks = 0;
first_sack_index = 0;
for (i = 0; i < num_sacks; i++) {
- int dup_sack = !i && found_dup_sack;
+ bool dup_sack = !i && found_dup_sack;
sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
@@ -1854,7 +1867,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
while (i < used_sacks) {
u32 start_seq = sp[i].start_seq;
u32 end_seq = sp[i].end_seq;
- int dup_sack = (found_dup_sack && (i == first_sack_index));
+ bool dup_sack = (found_dup_sack && (i == first_sack_index));
struct tcp_sack_block *next_dup = NULL;
if (found_dup_sack && ((i + 1) == first_sack_index))
@@ -1956,9 +1969,9 @@ out:
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
- * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
+ * packets_out. Returns false if sacked_out adjustement wasn't necessary.
*/
-static int tcp_limit_reno_sacked(struct tcp_sock *tp)
+static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
u32 holes;
@@ -1967,9 +1980,9 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp)
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/* If we receive more dupacks than we expected counting segments
@@ -2023,40 +2036,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp)
/* F-RTO can only be used if TCP has never retransmitted anything other than
* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
*/
-int tcp_use_frto(struct sock *sk)
+bool tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
if (!sysctl_tcp_frto)
- return 0;
+ return false;
/* MTU probe and F-RTO won't really play nicely along currently */
if (icsk->icsk_mtup.probe_size)
- return 0;
+ return false;
if (tcp_is_sackfrto(tp))
- return 1;
+ return true;
/* Avoid expensive walking of rexmit queue if possible */
if (tp->retrans_out > 1)
- return 0;
+ return false;
skb = tcp_write_queue_head(sk);
if (tcp_skb_is_last(sk, skb))
- return 1;
+ return true;
skb = tcp_write_queue_next(sk, skb); /* Skips head */
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- return 0;
+ return false;
/* Short-circuit when first non-SACKed skb has been checked */
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
break;
}
- return 1;
+ return true;
}
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
@@ -2292,7 +2305,7 @@ void tcp_enter_loss(struct sock *sk, int how)
*
* Do processing similar to RTO timeout.
*/
-static int tcp_check_sack_reneging(struct sock *sk, int flag)
+static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
if (flag & FLAG_SACK_RENEGING) {
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2303,9 +2316,9 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag)
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, TCP_RTO_MAX);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static inline int tcp_fackets_out(const struct tcp_sock *tp)
@@ -2333,6 +2346,27 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay;
+
+ /* Delay early retransmit and entering fast recovery for
+ * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+ * available, or RTO is scheduled to fire first.
+ */
+ if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+ return false;
+
+ delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
+ if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+ return false;
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
+ tp->early_retrans_delayed = 1;
+ return true;
+}
+
static inline int tcp_skb_timedout(const struct sock *sk,
const struct sk_buff *skb)
{
@@ -2440,28 +2474,28 @@ static inline int tcp_head_timedout(const struct sock *sk)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static int tcp_time_to_recover(struct sock *sk)
+static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
/* Do not perform any recovery during F-RTO algorithm */
if (tp->frto_counter)
- return 0;
+ return false;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
- return 1;
+ return true;
/* Not-A-Trick#2 : Classic rule... */
if (tcp_dupack_heuristics(tp) > tp->reordering)
- return 1;
+ return true;
/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
if (tcp_is_fack(tp) && tcp_head_timedout(sk))
- return 1;
+ return true;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
@@ -2473,7 +2507,7 @@ static int tcp_time_to_recover(struct sock *sk)
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
- return 1;
+ return true;
}
/* If a thin stream is detected, retransmit after first
@@ -2484,9 +2518,19 @@ static int tcp_time_to_recover(struct sock *sk)
if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
tcp_is_sack(tp) && !tcp_send_head(sk))
- return 1;
+ return true;
- return 0;
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+ * retransmissions due to small network reorderings, we implement
+ * Mitigation A.3 in the RFC and delay the retransmission for a short
+ * interval if appropriate.
+ */
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+ (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ !tcp_may_send_now(sk))
+ return !tcp_pause_early_retransmit(sk, flag);
+
+ return false;
}
/* New heuristics: it is possible only after we switched to restart timer
@@ -2674,22 +2718,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &inet->inet_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &np->daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &np->daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
#endif
}
@@ -2725,7 +2769,7 @@ static inline int tcp_may_undo(const struct tcp_sock *tp)
}
/* People celebrate: "We love our President!" */
-static int tcp_try_undo_recovery(struct sock *sk)
+static bool tcp_try_undo_recovery(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2750,10 +2794,10 @@ static int tcp_try_undo_recovery(struct sock *sk)
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp);
- return 1;
+ return true;
}
tcp_set_ca_state(sk, TCP_CA_Open);
- return 0;
+ return false;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
@@ -2783,19 +2827,19 @@ static void tcp_try_undo_dsack(struct sock *sk)
* that successive retransmissions of a segment must not advance
* retrans_stamp under any conditions.
*/
-static int tcp_any_retrans_done(const struct sock *sk)
+static bool tcp_any_retrans_done(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (tp->retrans_out)
- return 1;
+ return true;
skb = tcp_write_queue_head(sk);
if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
- return 1;
+ return true;
- return 0;
+ return false;
}
/* Undo during fast recovery after partial ACK. */
@@ -2829,7 +2873,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
}
/* Undo during loss recovery after partial ACK. */
-static int tcp_try_undo_loss(struct sock *sk)
+static bool tcp_try_undo_loss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2851,9 +2895,9 @@ static int tcp_try_undo_loss(struct sock *sk)
tp->undo_marker = 0;
if (tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static inline void tcp_complete_cwr(struct sock *sk)
@@ -2862,11 +2906,14 @@ static inline void tcp_complete_cwr(struct sock *sk)
/* Do not moderate cwnd if it's already undone in cwr or recovery. */
if (tp->undo_marker) {
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- else /* PRR */
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
+ /* PRR algorithm. */
tp->snd_cwnd = tp->snd_ssthresh;
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
}
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
@@ -3016,6 +3063,38 @@ static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
+static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mib_idx;
+
+ if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENORECOVERY;
+ else
+ mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->high_seq = tp->snd_nxt;
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = tp->snd_una;
+ tp->undo_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!ece_ack)
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ TCP_ECN_queue_cwr(tp);
+ }
+
+ tp->bytes_acked = 0;
+ tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tp->snd_cwnd;
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -3035,7 +3114,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
struct tcp_sock *tp = tcp_sk(sk);
int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int fast_rexmit = 0, mib_idx;
+ int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -3119,7 +3198,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- if (!tcp_time_to_recover(sk)) {
+ if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
}
@@ -3136,32 +3215,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
}
/* Otherwise enter Recovery state */
-
- if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENORECOVERY;
- else
- mib_idx = LINUX_MIB_TCPSACKRECOVERY;
-
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
- tp->high_seq = tp->snd_nxt;
- tp->prior_ssthresh = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out;
-
- if (icsk->icsk_ca_state < TCP_CA_CWR) {
- if (!(flag & FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- TCP_ECN_queue_cwr(tp);
- }
-
- tp->bytes_acked = 0;
- tp->snd_cwnd_cnt = 0;
- tp->prior_cwnd = tp->snd_cwnd;
- tp->prr_delivered = 0;
- tp->prr_out = 0;
- tcp_set_ca_state(sk, TCP_CA_Recovery);
+ tcp_enter_recovery(sk, (flag & FLAG_ECE));
fast_rexmit = 1;
}
@@ -3243,16 +3297,47 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
-static void tcp_rearm_rto(struct sock *sk)
+void tcp_rearm_rto(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ u32 rto = inet_csk(sk)->icsk_rto;
+ /* Offset the time elapsed after installing regular RTO */
+ if (tp->early_retrans_delayed) {
+ struct sk_buff *skb = tcp_write_queue_head(sk);
+ const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+ s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+ /* delta may not be positive if the socket is locked
+ * when the delayed ER timer fires and is rescheduled.
+ */
+ if (delta > 0)
+ rto = delta;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX);
}
+ tp->early_retrans_delayed = 0;
+}
+
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_rearm_rto(sk);
+
+ /* Stop if ER is disabled after the delayed ER timer is scheduled */
+ if (!tp->do_early_retrans)
+ return;
+
+ tcp_enter_recovery(sk, false);
+ tcp_update_scoreboard(sk, 1);
+ tcp_xmit_retransmit_queue(sk);
}
/* If we get here, the whole TSO packet has not been acked. */
@@ -3287,7 +3372,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
u32 now = tcp_time_stamp;
- int fully_acked = 1;
+ int fully_acked = true;
int flag = 0;
u32 pkts_acked = 0;
u32 reord = tp->packets_out;
@@ -3311,7 +3396,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!acked_pcount)
break;
- fully_acked = 0;
+ fully_acked = false;
} else {
acked_pcount = tcp_skb_pcount(skb);
}
@@ -3428,18 +3513,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!tp->packets_out && tcp_is_sack(tp)) {
icsk = inet_csk(sk);
if (tp->lost_out) {
- printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, icsk->icsk_ca_state);
+ pr_debug("Leak l=%u %d\n",
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
- printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, icsk->icsk_ca_state);
+ pr_debug("Leak s=%u %d\n",
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
- printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, icsk->icsk_ca_state);
+ pr_debug("Leak r=%u %d\n",
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
@@ -3590,7 +3675,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
* to prove that the RTO is indeed spurious. It transfers the control
* from F-RTO to the conventional RTO recovery
*/
-static int tcp_process_frto(struct sock *sk, int flag)
+static bool tcp_process_frto(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3606,7 +3691,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
if (!before(tp->snd_una, tp->frto_highmark)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
- return 1;
+ return true;
}
if (!tcp_is_sackfrto(tp)) {
@@ -3615,19 +3700,19 @@ static int tcp_process_frto(struct sock *sk, int flag)
* data, winupdate
*/
if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
- return 1;
+ return true;
if (!(flag & FLAG_DATA_ACKED)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
flag);
- return 1;
+ return true;
}
} else {
if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
- return 1;
+ return true;
}
if ((tp->frto_counter >= 2) &&
@@ -3637,10 +3722,10 @@ static int tcp_process_frto(struct sock *sk, int flag)
/* RFC4138 shortcoming (see comment above) */
if (!(flag & FLAG_FORWARD_PROGRESS) &&
(flag & FLAG_NOT_DUP))
- return 1;
+ return true;
tcp_enter_frto_loss(sk, 3, flag);
- return 1;
+ return true;
}
}
@@ -3652,7 +3737,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
if (!tcp_may_send_now(sk))
tcp_enter_frto_loss(sk, 2, flag);
- return 1;
+ return true;
} else {
switch (sysctl_tcp_frto_response) {
case 2:
@@ -3669,7 +3754,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
tp->undo_marker = 0;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
}
- return 0;
+ return false;
}
/* This routine deals with incoming acks, but not outgoing ones. */
@@ -3687,7 +3772,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
int prior_sacked = tp->sacked_out;
int pkts_acked = 0;
int newly_acked_sacked = 0;
- int frto_cwnd = 0;
+ bool frto_cwnd = false;
/* If the ack is older than previous acks
* then we can probably ignore it.
@@ -3701,6 +3786,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ if (tp->early_retrans_delayed)
+ tcp_rearm_rto(sk);
+
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3866,10 +3954,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
- if (net_ratelimit())
- printk(KERN_INFO "tcp_parse_options: Illegal window "
- "scaling value %d >14 received.\n",
- snd_wscale);
+ net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+ __func__,
+ snd_wscale);
snd_wscale = 14;
}
opt_rx->snd_wscale = snd_wscale;
@@ -3940,7 +4027,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
}
EXPORT_SYMBOL(tcp_parse_options);
-static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
+static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
const __be32 *ptr = (const __be32 *)(th + 1);
@@ -3951,31 +4038,31 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
tp->rx_opt.rcv_tsecr = ntohl(*ptr);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static int tcp_fast_parse_options(const struct sk_buff *skb,
- const struct tcphdr *th,
- struct tcp_sock *tp, const u8 **hvpp)
+static bool tcp_fast_parse_options(const struct sk_buff *skb,
+ const struct tcphdr *th,
+ struct tcp_sock *tp, const u8 **hvpp)
{
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
- return 0;
+ return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
- return 1;
+ return true;
}
tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
- return 1;
+ return true;
}
#ifdef CONFIG_TCP_MD5SIG
@@ -4191,7 +4278,7 @@ static void tcp_fin(struct sock *sk)
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
- printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
+ pr_err("%s: Impossible, sk->sk_state=%d\n",
__func__, sk->sk_state);
break;
}
@@ -4216,7 +4303,7 @@ static void tcp_fin(struct sock *sk)
}
}
-static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
@@ -4224,9 +4311,9 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
sp->start_seq = seq;
if (after(end_seq, sp->end_seq))
sp->end_seq = end_seq;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
@@ -4422,10 +4509,10 @@ static void tcp_ofo_queue(struct sock *sk)
}
}
-static int tcp_prune_ofo_queue(struct sock *sk);
+static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, size)) {
@@ -4444,11 +4531,225 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
return 0;
}
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+ if (tcp_hdr(from)->fin)
+ return false;
+
+ /* Its possible this segment overlaps with prior segment in queue */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ return true;
+}
+
+static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb1;
+ u32 seq, end_seq;
+
+ TCP_ECN_check_ce(tp, skb);
+
+ if (tcp_try_rmem_schedule(sk, skb->truesize)) {
+ /* TODO: should increment a counter */
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+ inet_csk_schedule_ack(sk);
+
+ SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+ skb1 = skb_peek_tail(&tp->out_of_order_queue);
+ if (!skb1) {
+ /* Initial out of order segment, build 1 SACK. */
+ if (tcp_is_sack(tp)) {
+ tp->rx_opt.num_sacks = 1;
+ tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+ tp->selective_acks[0].end_seq =
+ TCP_SKB_CB(skb)->end_seq;
+ }
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ goto end;
+ }
+
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (seq == TCP_SKB_CB(skb1)->end_seq) {
+ bool fragstolen;
+
+ if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ } else {
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
+ }
+
+ if (!tp->rx_opt.num_sacks ||
+ tp->selective_acks[0].end_seq != seq)
+ goto add_sack;
+
+ /* Common case: data arrive in order after hole. */
+ tp->selective_acks[0].end_seq = end_seq;
+ goto end;
+ }
+
+ /* Find place to insert this segment. */
+ while (1) {
+ if (!after(TCP_SKB_CB(skb1)->seq, seq))
+ break;
+ if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+ skb1 = NULL;
+ break;
+ }
+ skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+ }
+
+ /* Do skb overlap to previous one? */
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ __kfree_skb(skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ if (skb_queue_is_first(&tp->out_of_order_queue,
+ skb1))
+ skb1 = NULL;
+ else
+ skb1 = skb_queue_prev(
+ &tp->out_of_order_queue,
+ skb1);
+ }
+ }
+ if (!skb1)
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ else
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+ /* And clean segments covered by new one as whole. */
+ while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+ skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ break;
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ end_seq);
+ break;
+ }
+ __skb_unlink(skb1, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ __kfree_skb(skb1);
+ }
+
+add_sack:
+ if (tcp_is_sack(tp))
+ tcp_sack_new_ofo_skb(sk, seq, end_seq);
+end:
+ if (skb)
+ skb_set_owner_r(skb, sk);
+}
+
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+ bool *fragstolen)
+{
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+ __skb_pull(skb, hdrlen);
+ eaten = (tail &&
+ tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+ tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (!eaten) {
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ }
+ return eaten;
+}
+
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_buff *skb;
+ struct tcphdr *th;
+ bool fragstolen;
+
+ if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
+ goto err;
+
+ skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+ if (!skb)
+ goto err;
+
+ th = (struct tcphdr *)skb_put(skb, sizeof(*th));
+ skb_reset_transport_header(skb);
+ memset(th, 0, sizeof(*th));
+
+ if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+ goto err_free;
+
+ TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+ TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+ if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+ WARN_ON_ONCE(fragstolen); /* should not happen */
+ __kfree_skb(skb);
+ }
+ return size;
+
+err_free:
+ kfree_skb(skb);
+err:
+ return -ENOMEM;
+}
+
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
+ bool fragstolen = false;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
@@ -4493,8 +4794,7 @@ queue_and_out:
tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (skb->len)
@@ -4518,7 +4818,7 @@ queue_and_out:
tcp_fast_path_check(sk);
if (eaten > 0)
- __kfree_skb(skb);
+ kfree_skb_partial(skb, fragstolen);
else if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
return;
@@ -4559,105 +4859,7 @@ drop:
goto queue_and_out;
}
- TCP_ECN_check_ce(tp, skb);
-
- if (tcp_try_rmem_schedule(sk, skb->truesize))
- goto drop;
-
- /* Disable header prediction. */
- tp->pred_flags = 0;
- inet_csk_schedule_ack(sk);
-
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
-
- skb_set_owner_r(skb, sk);
-
- if (!skb_peek(&tp->out_of_order_queue)) {
- /* Initial out of order segment, build 1 SACK. */
- if (tcp_is_sack(tp)) {
- tp->rx_opt.num_sacks = 1;
- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
- tp->selective_acks[0].end_seq =
- TCP_SKB_CB(skb)->end_seq;
- }
- __skb_queue_head(&tp->out_of_order_queue, skb);
- } else {
- struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
- u32 seq = TCP_SKB_CB(skb)->seq;
- u32 end_seq = TCP_SKB_CB(skb)->end_seq;
-
- if (seq == TCP_SKB_CB(skb1)->end_seq) {
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-
- if (!tp->rx_opt.num_sacks ||
- tp->selective_acks[0].end_seq != seq)
- goto add_sack;
-
- /* Common case: data arrive in order after hole. */
- tp->selective_acks[0].end_seq = end_seq;
- return;
- }
-
- /* Find place to insert this segment. */
- while (1) {
- if (!after(TCP_SKB_CB(skb1)->seq, seq))
- break;
- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
- skb1 = NULL;
- break;
- }
- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
- }
-
- /* Do skb overlap to previous one? */
- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- /* All the bits are present. Drop. */
- __kfree_skb(skb);
- tcp_dsack_set(sk, seq, end_seq);
- goto add_sack;
- }
- if (after(seq, TCP_SKB_CB(skb1)->seq)) {
- /* Partial overlap. */
- tcp_dsack_set(sk, seq,
- TCP_SKB_CB(skb1)->end_seq);
- } else {
- if (skb_queue_is_first(&tp->out_of_order_queue,
- skb1))
- skb1 = NULL;
- else
- skb1 = skb_queue_prev(
- &tp->out_of_order_queue,
- skb1);
- }
- }
- if (!skb1)
- __skb_queue_head(&tp->out_of_order_queue, skb);
- else
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-
- /* And clean segments covered by new one as whole. */
- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
- skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
-
- if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
- break;
- if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
- end_seq);
- break;
- }
- __skb_unlink(skb1, &tp->out_of_order_queue);
- tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
- TCP_SKB_CB(skb1)->end_seq);
- __kfree_skb(skb1);
- }
-
-add_sack:
- if (tcp_is_sack(tp))
- tcp_sack_new_ofo_skb(sk, seq, end_seq);
- }
+ tcp_data_queue_ofo(sk, skb);
}
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4836,10 +5038,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
* Purge the out-of-order queue.
* Return true if queue was pruned.
*/
-static int tcp_prune_ofo_queue(struct sock *sk)
+static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- int res = 0;
+ bool res = false;
if (!skb_queue_empty(&tp->out_of_order_queue)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
@@ -4853,7 +5055,7 @@ static int tcp_prune_ofo_queue(struct sock *sk)
if (tp->rx_opt.sack_ok)
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
- res = 1;
+ res = true;
}
return res;
}
@@ -4930,7 +5132,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static int tcp_should_expand_sndbuf(const struct sock *sk)
+static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -4938,21 +5140,21 @@ static int tcp_should_expand_sndbuf(const struct sock *sk)
* not modify it.
*/
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
- return 0;
+ return false;
/* If we are under global TCP memory pressure, do not expand. */
if (sk_under_memory_pressure(sk))
- return 0;
+ return false;
/* If we are under soft global TCP memory pressure, do not expand. */
if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
- return 0;
+ return false;
/* If we filled the congestion window, do not expand. */
if (tp->packets_out >= tp->snd_cwnd)
- return 0;
+ return false;
- return 1;
+ return true;
}
/* When incoming ACK allowed to free some skb from write_queue,
@@ -5178,19 +5380,19 @@ static inline int tcp_checksum_complete_user(struct sock *sk,
}
#ifdef CONFIG_NET_DMA
-static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
+static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
int hlen)
{
struct tcp_sock *tp = tcp_sk(sk);
int chunk = skb->len - hlen;
int dma_cookie;
- int copied_early = 0;
+ bool copied_early = false;
if (tp->ucopy.wakeup)
- return 0;
+ return false;
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
@@ -5203,7 +5405,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
goto out;
tp->ucopy.dma_cookie = dma_cookie;
- copied_early = 1;
+ copied_early = true;
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
@@ -5395,6 +5597,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
} else {
int eaten = 0;
int copied_early = 0;
+ bool fragstolen = false;
if (tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len) {
@@ -5452,10 +5655,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- __skb_pull(skb, tcp_header_len);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
}
tcp_event_data_recv(sk, skb);
@@ -5477,7 +5678,7 @@ no_ack:
else
#endif
if (eaten)
- __kfree_skb(skb);
+ kfree_skb_partial(skb, fragstolen);
else
sk->sk_data_ready(sk, 0);
return 0;
@@ -5521,6 +5722,44 @@ discard:
}
EXPORT_SYMBOL(tcp_rcv_established);
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (skb != NULL)
+ security_inet_conn_established(sk, skb);
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ tcp_init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, unsigned int len)
{
@@ -5653,36 +5892,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
}
smp_mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
-
- security_inet_conn_established(sk, skb);
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
-
- tcp_init_congestion_control(sk);
-
- /* Prevent spurious tcp_cwnd_restart() on first data
- * packet.
- */
- tp->lsndtime = tcp_time_stamp;
-
- tcp_init_buffer_space(sk);
-
- if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
-
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
+ tcp_finish_connect(sk, skb);
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
@@ -5696,8 +5907,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
inet_csk_schedule_ack(sk);
icsk->icsk_ack.lrcvtime = tcp_time_stamp;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(sk);
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fd54c5f8a25..c8d28c433b2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -50,6 +50,7 @@
* a single port at the same time.
*/
+#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
@@ -90,16 +91,8 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency);
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
- __be32 addr);
-static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th);
-#else
-static inline
-struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
-{
- return NULL;
-}
#endif
struct inet_hashinfo tcp_hashinfo;
@@ -145,6 +138,14 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+static int tcp_repair_connect(struct sock *sk)
+{
+ tcp_connect_init(sk);
+ tcp_finish_connect(sk, NULL);
+
+ return 0;
+}
+
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -203,7 +204,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ if (likely(!tp->repair))
+ tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
@@ -254,7 +256,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
- if (!tp->write_seq)
+ if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
@@ -262,7 +264,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
- err = tcp_connect(sk);
+ if (likely(!tp->repair))
+ err = tcp_connect(sk);
+ else
+ err = tcp_repair_connect(sk);
+
rt = NULL;
if (err)
goto failure;
@@ -601,6 +607,10 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
+ const __u8 *hash_location = NULL;
+ unsigned char newhash[16];
+ int genhash;
+ struct sock *sk1 = NULL;
#endif
struct net *net;
@@ -631,7 +641,36 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_len = sizeof(rep.th);
#ifdef CONFIG_TCP_MD5SIG
- key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
+ hash_location = tcp_parse_md5sig_option(th);
+ if (!sk && hash_location) {
+ /*
+ * active side is lost. Try to find listening socket through
+ * source port, and then find md5 key through listening socket.
+ * we are not loose security here:
+ * Incoming packet is checked with md5 hash with finding key,
+ * no RST generated if md5 hash doesn't match.
+ */
+ sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
+ &tcp_hashinfo, ip_hdr(skb)->daddr,
+ ntohs(th->source), inet_iif(skb));
+ /* don't send rst if it can't find key */
+ if (!sk1)
+ return;
+ rcu_read_lock();
+ key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr, AF_INET);
+ if (!key)
+ goto release_sk1;
+
+ genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
+ if (genhash || memcmp(hash_location, newhash, 16) != 0)
+ goto release_sk1;
+ } else {
+ key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr,
+ AF_INET) : NULL;
+ }
+
if (key) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -664,6 +703,14 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+
+#ifdef CONFIG_TCP_MD5SIG
+release_sk1:
+ if (sk1) {
+ rcu_read_unlock();
+ sock_put(sk1);
+ }
+#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
@@ -764,7 +811,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
req->ts_recent,
0,
- tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
+ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+ AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos);
}
@@ -776,7 +824,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp)
+ struct request_values *rvp,
+ u16 queue_mapping)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -792,6 +841,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
if (skb) {
__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
+ skb_set_queue_mapping(skb, queue_mapping);
err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
ireq->rmt_addr,
ireq->opt);
@@ -806,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
struct request_values *rvp)
{
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- return tcp_v4_send_synack(sk, NULL, req, rvp);
+ return tcp_v4_send_synack(sk, NULL, req, rvp, 0);
}
/*
@@ -818,14 +868,14 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
}
/*
- * Return 1 if a syncookie should be sent
+ * Return true if a syncookie should be sent
*/
-int tcp_syn_flood_action(struct sock *sk,
+bool tcp_syn_flood_action(struct sock *sk,
const struct sk_buff *skb,
const char *proto)
{
const char *msg = "Dropping request";
- int want_cookie = 0;
+ bool want_cookie = false;
struct listen_sock *lopt;
@@ -833,7 +883,7 @@ int tcp_syn_flood_action(struct sock *sk,
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
msg = "Sending cookies";
- want_cookie = 1;
+ want_cookie = true;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
} else
#endif
@@ -842,8 +892,7 @@ int tcp_syn_flood_action(struct sock *sk,
lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
if (!lopt->synflood_warned) {
lopt->synflood_warned = 1;
- pr_info("%s: Possible SYN flooding on port %d. %s. "
- " Check SNMP counters.\n",
+ pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
}
return want_cookie;
@@ -881,153 +930,138 @@ static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
*/
/* Find the Key structure for an address. */
-static struct tcp_md5sig_key *
- tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
+struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family)
{
struct tcp_sock *tp = tcp_sk(sk);
- int i;
-
- if (!tp->md5sig_info || !tp->md5sig_info->entries4)
+ struct tcp_md5sig_key *key;
+ struct hlist_node *pos;
+ unsigned int size = sizeof(struct in_addr);
+ struct tcp_md5sig_info *md5sig;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
+ if (!md5sig)
return NULL;
- for (i = 0; i < tp->md5sig_info->entries4; i++) {
- if (tp->md5sig_info->keys4[i].addr == addr)
- return &tp->md5sig_info->keys4[i].base;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6)
+ size = sizeof(struct in6_addr);
+#endif
+ hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
+ if (key->family != family)
+ continue;
+ if (!memcmp(&key->addr, addr, size))
+ return key;
}
return NULL;
}
+EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
struct sock *addr_sk)
{
- return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
+ union tcp_md5_addr *addr;
+
+ addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
+ return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
struct request_sock *req)
{
- return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
+ union tcp_md5_addr *addr;
+
+ addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
+ return tcp_md5_do_lookup(sk, addr, AF_INET);
}
/* This can be called on a newly created socket, from other files */
-int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
- u8 *newkey, u8 newkeylen)
+int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp4_md5sig_key *keys;
+ struct tcp_md5sig_info *md5sig;
- key = tcp_v4_md5_do_lookup(sk, addr);
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
if (key) {
/* Pre-existing entry - just update that one. */
- kfree(key->key);
- key->key = newkey;
+ memcpy(key->key, newkey, newkeylen);
key->keylen = newkeylen;
- } else {
- struct tcp_md5sig_info *md5sig;
-
- if (!tp->md5sig_info) {
- tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
- GFP_ATOMIC);
- if (!tp->md5sig_info) {
- kfree(newkey);
- return -ENOMEM;
- }
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
- }
+ return 0;
+ }
- md5sig = tp->md5sig_info;
- if (md5sig->entries4 == 0 &&
- tcp_alloc_md5sig_pool(sk) == NULL) {
- kfree(newkey);
+ md5sig = rcu_dereference_protected(tp->md5sig_info,
+ sock_owned_by_user(sk));
+ if (!md5sig) {
+ md5sig = kmalloc(sizeof(*md5sig), gfp);
+ if (!md5sig)
return -ENOMEM;
- }
- if (md5sig->alloced4 == md5sig->entries4) {
- keys = kmalloc((sizeof(*keys) *
- (md5sig->entries4 + 1)), GFP_ATOMIC);
- if (!keys) {
- kfree(newkey);
- if (md5sig->entries4 == 0)
- tcp_free_md5sig_pool();
- return -ENOMEM;
- }
-
- if (md5sig->entries4)
- memcpy(keys, md5sig->keys4,
- sizeof(*keys) * md5sig->entries4);
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ INIT_HLIST_HEAD(&md5sig->head);
+ rcu_assign_pointer(tp->md5sig_info, md5sig);
+ }
- /* Free old key list, and reference new one */
- kfree(md5sig->keys4);
- md5sig->keys4 = keys;
- md5sig->alloced4++;
- }
- md5sig->entries4++;
- md5sig->keys4[md5sig->entries4 - 1].addr = addr;
- md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
- md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
+ key = sock_kmalloc(sk, sizeof(*key), gfp);
+ if (!key)
+ return -ENOMEM;
+ if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
+ sock_kfree_s(sk, key, sizeof(*key));
+ return -ENOMEM;
}
- return 0;
-}
-EXPORT_SYMBOL(tcp_v4_md5_do_add);
-static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
- u8 *newkey, u8 newkeylen)
-{
- return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
- newkey, newkeylen);
+ memcpy(key->key, newkey, newkeylen);
+ key->keylen = newkeylen;
+ key->family = family;
+ memcpy(&key->addr, addr,
+ (family == AF_INET6) ? sizeof(struct in6_addr) :
+ sizeof(struct in_addr));
+ hlist_add_head_rcu(&key->node, &md5sig->head);
+ return 0;
}
+EXPORT_SYMBOL(tcp_md5_do_add);
-int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
struct tcp_sock *tp = tcp_sk(sk);
- int i;
-
- for (i = 0; i < tp->md5sig_info->entries4; i++) {
- if (tp->md5sig_info->keys4[i].addr == addr) {
- /* Free the key */
- kfree(tp->md5sig_info->keys4[i].base.key);
- tp->md5sig_info->entries4--;
-
- if (tp->md5sig_info->entries4 == 0) {
- kfree(tp->md5sig_info->keys4);
- tp->md5sig_info->keys4 = NULL;
- tp->md5sig_info->alloced4 = 0;
- tcp_free_md5sig_pool();
- } else if (tp->md5sig_info->entries4 != i) {
- /* Need to do some manipulation */
- memmove(&tp->md5sig_info->keys4[i],
- &tp->md5sig_info->keys4[i+1],
- (tp->md5sig_info->entries4 - i) *
- sizeof(struct tcp4_md5sig_key));
- }
- return 0;
- }
- }
- return -ENOENT;
+ struct tcp_md5sig_key *key;
+ struct tcp_md5sig_info *md5sig;
+
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
+ if (!key)
+ return -ENOENT;
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
+ md5sig = rcu_dereference_protected(tp->md5sig_info,
+ sock_owned_by_user(sk));
+ if (hlist_empty(&md5sig->head))
+ tcp_free_md5sig_pool();
+ return 0;
}
-EXPORT_SYMBOL(tcp_v4_md5_do_del);
+EXPORT_SYMBOL(tcp_md5_do_del);
-static void tcp_v4_clear_md5_list(struct sock *sk)
+void tcp_clear_md5_list(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ struct hlist_node *pos, *n;
+ struct tcp_md5sig_info *md5sig;
- /* Free each key, then the set of key keys,
- * the crypto element, and then decrement our
- * hold on the last resort crypto.
- */
- if (tp->md5sig_info->entries4) {
- int i;
- for (i = 0; i < tp->md5sig_info->entries4; i++)
- kfree(tp->md5sig_info->keys4[i].base.key);
- tp->md5sig_info->entries4 = 0;
+ md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
+
+ if (!hlist_empty(&md5sig->head))
tcp_free_md5sig_pool();
- }
- if (tp->md5sig_info->keys4) {
- kfree(tp->md5sig_info->keys4);
- tp->md5sig_info->keys4 = NULL;
- tp->md5sig_info->alloced4 = 0;
+ hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
}
}
@@ -1036,7 +1070,6 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
- u8 *newkey;
if (optlen < sizeof(cmd))
return -EINVAL;
@@ -1047,32 +1080,16 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
if (sin->sin_family != AF_INET)
return -EINVAL;
- if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
- if (!tcp_sk(sk)->md5sig_info)
- return -ENOENT;
- return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
- }
+ if (!cmd.tcpm_key || !cmd.tcpm_keylen)
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- if (!tcp_sk(sk)->md5sig_info) {
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_md5sig_info *p;
-
- p = kzalloc(sizeof(*p), sk->sk_allocation);
- if (!p)
- return -EINVAL;
-
- tp->md5sig_info = p;
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
- }
-
- newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
- if (!newkey)
- return -ENOMEM;
- return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
- newkey, cmd.tcpm_keylen);
+ return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
+ GFP_KERNEL);
}
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
@@ -1098,7 +1115,7 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
-static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
@@ -1181,7 +1198,7 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
-static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
/*
* This gets called for each TCP segment that arrives
@@ -1198,21 +1215,22 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
int genhash;
unsigned char newhash[16];
- hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
+ hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
+ AF_INET);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
if (!hash_expected && !hash_location)
- return 0;
+ return false;
if (hash_expected && !hash_location) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
- return 1;
+ return true;
}
if (!hash_expected && hash_location) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- return 1;
+ return true;
}
/* Okay, so this is hash_expected and hash_location -
@@ -1223,15 +1241,14 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
NULL, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- if (net_ratelimit()) {
- printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
- &iph->saddr, ntohs(th->source),
- &iph->daddr, ntohs(th->dest),
- genhash ? " tcp_v4_calc_md5_hash failed" : "");
- }
- return 1;
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest),
+ genhash ? " tcp_v4_calc_md5_hash failed"
+ : "");
+ return true;
}
- return 0;
+ return false;
}
#endif
@@ -1265,7 +1282,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
- int want_cookie = 0;
+ bool want_cookie = false;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1324,7 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
while (l-- > 0)
*c++ ^= *hash_location++;
- want_cookie = 0; /* not our kind of cookie */
+ want_cookie = false; /* not our kind of cookie */
tmp_ext.cookie_out_never = 0; /* false */
tmp_ext.cookie_plus = tmp_opt.cookie_plus;
} else if (!tp->rx_opt.cookie_in_always) {
@@ -1352,7 +1369,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, tcp_hdr(skb));
+ TCP_ECN_create_request(req, skb);
if (want_cookie) {
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
@@ -1396,7 +1413,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* to destinations, already remembered
* to the moment of synflood.
*/
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
&saddr, ntohs(tcp_hdr(skb)->source));
goto drop_and_release;
}
@@ -1407,7 +1424,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_rsk(req)->snt_synack = tcp_time_stamp;
if (tcp_v4_send_synack(sk, dst, req,
- (struct request_values *)&tmp_ext) ||
+ (struct request_values *)&tmp_ext,
+ skb_get_queue_mapping(skb)) ||
want_cookie)
goto drop_and_free;
@@ -1461,6 +1479,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
+ newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
@@ -1490,7 +1509,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
- key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET);
if (key != NULL) {
/*
* We're using one, so create a matching key
@@ -1498,10 +1518,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
* memory, then we end up not copying the key
* across. Shucks.
*/
- char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
- if (newkey != NULL)
- tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
- newkey, key->keylen);
+ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET, key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
@@ -1727,7 +1745,7 @@ process:
#ifdef CONFIG_NET_DMA
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
else
@@ -1736,7 +1754,8 @@ process:
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
}
- } else if (unlikely(sk_add_backlog(sk, skb))) {
+ } else if (unlikely(sk_add_backlog(sk, skb,
+ sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
@@ -1862,7 +1881,6 @@ EXPORT_SYMBOL(ipv4_specific);
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
.md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
- .md5_add = tcp_v4_md5_add_func,
.md5_parse = tcp_v4_parse_md5_keys,
};
#endif
@@ -1873,64 +1891,15 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- skb_queue_head_init(&tp->out_of_order_queue);
- tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
- icsk->icsk_rto = TCP_TIMEOUT_INIT;
- tp->mdev = TCP_TIMEOUT_INIT;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- tp->snd_cwnd = TCP_INIT_CWND;
-
- /* See draft-stevens-tcpca-spec-01 for discussion of the
- * initialization of these values.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tp->snd_cwnd_clamp = ~0;
- tp->mss_cache = TCP_MSS_DEFAULT;
-
- tp->reordering = sysctl_tcp_reordering;
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
-
- sk->sk_state = TCP_CLOSE;
-
- sk->sk_write_space = sk_stream_write_space;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv4_specific;
- icsk->icsk_sync_mss = tcp_sync_mss;
+
#ifdef CONFIG_TCP_MD5SIG
- tp->af_specific = &tcp_sock_ipv4_specific;
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif
- /* TCP Cookie Transactions */
- if (sysctl_tcp_cookie_size > 0) {
- /* Default, cookies without s_data_payload. */
- tp->cookie_values =
- kzalloc(sizeof(*tp->cookie_values),
- sk->sk_allocation);
- if (tp->cookie_values != NULL)
- kref_init(&tp->cookie_values->kref);
- }
- /* Presumed zeroed, in order of appearance:
- * cookie_in_always, cookie_out_never,
- * s_data_constant, s_data_in, s_data_out
- */
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
-
- local_bh_disable();
- sock_update_memcg(sk);
- sk_sockets_allocated_inc(sk);
- local_bh_enable();
-
return 0;
}
@@ -1951,8 +1920,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
#ifdef CONFIG_TCP_MD5SIG
/* Clean up the MD5 key list, if any */
if (tp->md5sig_info) {
- tcp_v4_clear_md5_list(sk);
- kfree(tp->md5sig_info);
+ tcp_clear_md5_list(sk);
+ kfree_rcu(tp->md5sig_info, rcu);
tp->md5sig_info = NULL;
}
#endif
@@ -2107,7 +2076,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
-static inline int empty_bucket(struct tcp_iter_state *st)
+static inline bool empty_bucket(struct tcp_iter_state *st)
{
return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 49978788a9d..b6f3583ddfe 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,37 +6,6 @@
#include <linux/memcontrol.h>
#include <linux/module.h>
-static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft);
-static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
- const char *buffer);
-static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event);
-
-static struct cftype tcp_files[] = {
- {
- .name = "kmem.tcp.limit_in_bytes",
- .write_string = tcp_cgroup_write,
- .read_u64 = tcp_cgroup_read,
- .private = RES_LIMIT,
- },
- {
- .name = "kmem.tcp.usage_in_bytes",
- .read_u64 = tcp_cgroup_read,
- .private = RES_USAGE,
- },
- {
- .name = "kmem.tcp.failcnt",
- .private = RES_FAILCNT,
- .trigger = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- {
- .name = "kmem.tcp.max_usage_in_bytes",
- .private = RES_MAX_USAGE,
- .trigger = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
-};
-
static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
{
return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -49,7 +18,7 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk)
}
EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
-int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
/*
* The root cgroup does not use res_counters, but rather,
@@ -59,13 +28,12 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
struct res_counter *res_parent = NULL;
struct cg_proto *cg_proto, *parent_cg;
struct tcp_memcontrol *tcp;
- struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct net *net = current->nsproxy->net_ns;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
- goto create_files;
+ return 0;
tcp = tcp_from_cgproto(cg_proto);
@@ -88,15 +56,12 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
cg_proto->memcg = memcg;
-create_files:
- return cgroup_add_files(cgrp, ss, tcp_files,
- ARRAY_SIZE(tcp_files));
+ return 0;
}
EXPORT_SYMBOL(tcp_init_cgroup);
-void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+void tcp_destroy_cgroup(struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct cg_proto *cg_proto;
struct tcp_memcontrol *tcp;
u64 val;
@@ -109,9 +74,6 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
percpu_counter_destroy(&tcp->tcp_sockets_allocated);
val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-
- if (val != RESOURCE_MAX)
- jump_label_dec(&memcg_socket_limit_enabled);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
@@ -142,10 +104,33 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
net->ipv4.sysctl_tcp_mem[i]);
- if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
- jump_label_dec(&memcg_socket_limit_enabled);
- else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
- jump_label_inc(&memcg_socket_limit_enabled);
+ if (val == RESOURCE_MAX)
+ clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ else if (val != RESOURCE_MAX) {
+ /*
+ * The active bit needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ *
+ * The activated bit is used to guarantee that no two writers
+ * will do the update in the same memcg. Without that, we can't
+ * properly shutdown the static key.
+ */
+ if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_inc(&memcg_socket_limit_enabled);
+ set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ }
return 0;
}
@@ -270,3 +255,37 @@ void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
tcp->tcp_prot_mem[idx] = val;
}
+
+static struct cftype tcp_files[] = {
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .write_string = tcp_cgroup_write,
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_LIMIT,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_USAGE,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = RES_FAILCNT,
+ .trigger = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .trigger = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ { } /* terminate */
+};
+
+static int __init tcp_memcontrol_init(void)
+{
+ WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
+ return 0;
+}
+__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 550e755747e..b85d9fe7d66 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(tcp_death_row);
* state.
*/
-static int tcp_remember_stamp(struct sock *sk)
+static bool tcp_remember_stamp(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -72,13 +72,13 @@ static int tcp_remember_stamp(struct sock *sk)
}
if (release_it)
inet_putpeer(peer);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
{
struct sock *sk = (struct sock *) tw;
struct inet_peer *peer;
@@ -94,17 +94,17 @@ static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
peer->tcp_ts = tcptw->tw_ts_recent;
}
inet_putpeer(peer);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
- return 1;
+ return true;
if (after(end_seq, s_win) && before(seq, e_win))
- return 1;
+ return true;
return seq == e_win && seq == end_seq;
}
@@ -143,7 +143,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
const u8 *hash_location;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
- int paws_reject = 0;
+ bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
@@ -316,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct inet_timewait_sock *tw = NULL;
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
- int recycle_ok = 0;
+ bool recycle_ok = false;
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
@@ -359,13 +359,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
*/
do {
struct tcp_md5sig_key *key;
- memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
- tcptw->tw_md5_keylen = 0;
+ tcptw->tw_md5_key = NULL;
key = tp->af_specific->md5_lookup(sk, sk);
if (key != NULL) {
- memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
- tcptw->tw_md5_keylen = key->keylen;
- if (tcp_alloc_md5sig_pool(sk) == NULL)
+ tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+ if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL)
BUG();
}
} while (0);
@@ -405,8 +403,10 @@ void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_keylen)
+ if (twsk->tw_md5_key) {
tcp_free_md5sig_pool();
+ kfree_rcu(twsk->tw_md5_key, rcu);
+ }
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
@@ -482,6 +482,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_enable_early_retrans(newtp);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -574,7 +575,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
- int paws_reject = 0;
+ bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4ff3b6dc74f..803cbfe82fb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -34,6 +34,8 @@
*
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <net/tcp.h>
#include <linux/compiler.h>
@@ -78,9 +80,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tp->frto_counter = 3;
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets)
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ if (!prior_packets || tp->early_retrans_delayed)
+ tcp_rearm_rto(sk);
}
/* SND.NXT, if window was not shrunk.
@@ -369,7 +370,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->end_seq = seq;
}
-static inline int tcp_urg_mode(const struct tcp_sock *tp)
+static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
return tp->snd_una != tp->snd_up;
}
@@ -563,13 +564,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
-static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_cookie_values *cvp = tp->cookie_values;
- unsigned remaining = MAX_TCP_OPTION_SPACE;
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
tcp_cookie_size_check(cvp->cookie_desired) :
0;
@@ -663,15 +664,15 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned tcp_synack_options(struct sock *sk,
+static unsigned int tcp_synack_options(struct sock *sk,
struct request_sock *req,
- unsigned mss, struct sk_buff *skb,
+ unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5,
struct tcp_extend_values *xvp)
{
struct inet_request_sock *ireq = inet_rsk(req);
- unsigned remaining = MAX_TCP_OPTION_SPACE;
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
xvp->cookie_plus :
0;
@@ -742,13 +743,13 @@ static unsigned tcp_synack_options(struct sock *sk,
/* Compute TCP options for ESTABLISHED sockets. This is not the
* final wire format yet.
*/
-static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
+static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
- unsigned size = 0;
+ unsigned int size = 0;
unsigned int eff_sacks;
#ifdef CONFIG_TCP_MD5SIG
@@ -770,9 +771,9 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
- const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
+ const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
opts->num_sack_blocks =
- min_t(unsigned, eff_sacks,
+ min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -801,7 +802,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
- unsigned tcp_options_size, tcp_header_size;
+ unsigned int tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
@@ -1096,6 +1097,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
eat = min_t(int, len, skb_headlen(skb));
if (eat) {
__skb_pull(skb, eat);
+ skb->avail_size -= eat;
len -= eat;
if (!len)
return;
@@ -1149,7 +1151,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
}
/* Calculate MSS. Not accounting for SACKs here. */
-int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1160,6 +1162,14 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+ }
+
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
@@ -1178,7 +1188,7 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
}
/* Inverse of above */
-int tcp_mss_to_mtu(const struct sock *sk, int mss)
+int tcp_mss_to_mtu(struct sock *sk, int mss)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1189,6 +1199,13 @@ int tcp_mss_to_mtu(const struct sock *sk, int mss)
icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len;
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mtu += icsk->icsk_af_ops->net_frag_header_len;
+ }
return mtu;
}
@@ -1258,7 +1275,7 @@ unsigned int tcp_current_mss(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
const struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- unsigned header_len;
+ unsigned int header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
@@ -1374,33 +1391,33 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
}
/* Minshall's variant of the Nagle send check. */
-static inline int tcp_minshall_check(const struct tcp_sock *tp)
+static inline bool tcp_minshall_check(const struct tcp_sock *tp)
{
return after(tp->snd_sml, tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
-/* Return 0, if packet can be sent now without violation Nagle's rules:
+/* Return false, if packet can be sent now without violation Nagle's rules:
* 1. It is full sized.
* 2. Or it contains FIN. (already checked by caller)
* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
* With Minshall's modification: all sent small packets are ACKed.
*/
-static inline int tcp_nagle_check(const struct tcp_sock *tp,
+static inline bool tcp_nagle_check(const struct tcp_sock *tp,
const struct sk_buff *skb,
- unsigned mss_now, int nonagle)
+ unsigned int mss_now, int nonagle)
{
return skb->len < mss_now &&
((nonagle & TCP_NAGLE_CORK) ||
(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
-/* Return non-zero if the Nagle test allows this packet to be
+/* Return true if the Nagle test allows this packet to be
* sent now.
*/
-static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
- unsigned int cur_mss, int nonagle)
+static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
{
/* Nagle rule does not apply to frames, which sit in the middle of the
* write_queue (they have no chances to get new data).
@@ -1409,24 +1426,25 @@ static inline int tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff
* argument based upon the location of SKB in the send queue.
*/
if (nonagle & TCP_NAGLE_PUSH)
- return 1;
+ return true;
/* Don't use the nagle rule for urgent data (or for the final FIN).
* Nagle can be ignored during F-RTO too (see RFC4138).
*/
if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
- return 1;
+ return true;
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
- return 1;
+ return true;
- return 0;
+ return false;
}
/* Does at least the first segment of SKB fit into the send window? */
-static inline int tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
- unsigned int cur_mss)
+static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -1459,7 +1477,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
}
/* Test if sending is allowed right now. */
-int tcp_may_send_now(struct sock *sk)
+bool tcp_may_send_now(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
@@ -1529,7 +1547,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
*
* This algorithm is from John Heffner.
*/
-static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1589,11 +1607,11 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
/* Ok, it looks like it is advisable to defer. */
tp->tso_deferred = 1 | (jiffies << 1);
- return 1;
+ return true;
send_now:
tp->tso_deferred = 0;
- return 0;
+ return false;
}
/* Create a new MTU probe if we are ready.
@@ -1735,11 +1753,11 @@ static int tcp_mtu_probe(struct sock *sk)
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
- * Returns 1, if no segments are in flight and we have queued segments, but
- * cannot send anything now because of SWS or another problem.
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
*/
-static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
- int push_one, gfp_t gfp)
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1753,7 +1771,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
- return 0;
+ return false;
} else if (result > 0) {
sent_pkts = 1;
}
@@ -1812,7 +1830,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (likely(sent_pkts)) {
tcp_cwnd_validate(sk);
- return 0;
+ return false;
}
return !tp->packets_out && tcp_send_head(sk);
}
@@ -2011,22 +2029,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
}
/* Check if coalescing SKBs is legal. */
-static int tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
+static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
- return 0;
+ return false;
/* TODO: SACK collapsing could be used to remove this condition */
if (skb_shinfo(skb)->nr_frags != 0)
- return 0;
+ return false;
if (skb_cloned(skb))
- return 0;
+ return false;
if (skb == tcp_send_head(sk))
- return 0;
+ return false;
/* Some heurestics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
- return 0;
+ return false;
- return 1;
+ return true;
}
/* Collapse packets in the retransmit queue to make to create
@@ -2037,7 +2055,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = to, *tmp;
- int first = 1;
+ bool first = true;
if (!sysctl_tcp_retrans_collapse)
return;
@@ -2051,7 +2069,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
space -= skb->len;
if (first) {
- first = 0;
+ first = false;
continue;
}
@@ -2060,7 +2078,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
/* Punt if not enough space exists in the first SKB for
* the data in the second
*/
- if (skb->len > skb_tailroom(to))
+ if (skb->len > skb_availroom(to))
break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
@@ -2166,8 +2184,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
#if FASTRETRANS_DEBUG > 0
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- if (net_ratelimit())
- printk(KERN_DEBUG "retrans_out leaked.\n");
+ net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
if (!tp->retrans_out)
@@ -2192,18 +2209,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
/* Check if we forward retransmits are possible in the current
* window/congestion state.
*/
-static int tcp_can_forward_retransmit(struct sock *sk)
+static bool tcp_can_forward_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
/* Forward retransmissions are possible only during Recovery. */
if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return 0;
+ return false;
/* No forward retransmissions in Reno are possible. */
if (tcp_is_reno(tp))
- return 0;
+ return false;
/* Yeah, we have to make difficult choice between forward transmission
* and retransmission... Both ways have their merits...
@@ -2214,9 +2231,9 @@ static int tcp_can_forward_retransmit(struct sock *sk)
*/
if (tcp_may_send_now(sk))
- return 0;
+ return false;
- return 1;
+ return true;
}
/* This gets called after a retransmit timeout, and the initially
@@ -2306,8 +2323,10 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
return;
+ }
NET_INC_STATS_BH(sock_net(sk), mib_idx);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
@@ -2399,7 +2418,7 @@ int tcp_send_synack(struct sock *sk)
skb = tcp_write_queue_head(sk);
if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
- printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+ pr_debug("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -2559,7 +2578,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
EXPORT_SYMBOL(tcp_make_synack);
/* Do all connect socket setups that can be done AF independent. */
-static void tcp_connect_init(struct sock *sk)
+void tcp_connect_init(struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2614,9 +2633,12 @@ static void tcp_connect_init(struct sock *sk)
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
- tp->rcv_nxt = 0;
- tp->rcv_wup = 0;
- tp->copied_seq = 0;
+ tp->snd_nxt = tp->write_seq;
+
+ if (likely(!tp->repair))
+ tp->rcv_nxt = 0;
+ tp->rcv_wup = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
inet_csk(sk)->icsk_retransmits = 0;
@@ -2639,7 +2661,6 @@ int tcp_connect(struct sock *sk)
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
- tp->snd_nxt = tp->write_seq;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
TCP_ECN_send_syn(sk, buff);
@@ -2788,6 +2809,15 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
+void tcp_send_window_probe(struct sock *sk)
+{
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
+ tcp_xmit_probe_skb(sk, 0);
+ }
+}
+
/* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk)
{
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 85ee7eb7e38..4526fe68e60 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/socket.h>
@@ -89,7 +91,7 @@ static inline int tcp_probe_avail(void)
* Note: arguments must match tcp_rcv_established()!
*/
static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+ struct tcphdr *th, unsigned int len)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_sock *inet = inet_sk(sk);
@@ -136,7 +138,7 @@ static struct jprobe tcp_jprobe = {
.entry = jtcp_rcv_established,
};
-static int tcpprobe_open(struct inode * inode, struct file * file)
+static int tcpprobe_open(struct inode *inode, struct file *file)
{
/* Reset (empty) log */
spin_lock_bh(&tcp_probe.lock);
@@ -239,7 +241,7 @@ static __init int tcpprobe_init(void)
if (ret)
goto err1;
- pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
+ pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize);
return 0;
err1:
proc_net_remove(&init_net, procname);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index cd2e0723266..e911e6c523e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -319,6 +319,11 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ if (tp->early_retrans_delayed) {
+ tcp_resume_early_retransmit(sk);
+ return;
+ }
+
if (!tp->packets_out)
goto out;
@@ -333,16 +338,18 @@ void tcp_retransmit_timer(struct sock *sk)
*/
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &inet->inet_daddr, ntohs(inet->inet_dport),
- inet->inet_num, tp->snd_una, tp->snd_nxt);
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
+ &inet->inet_daddr,
+ ntohs(inet->inet_dport), inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &np->daddr, ntohs(inet->inet_dport),
- inet->inet_num, tp->snd_una, tp->snd_nxt);
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
+ &np->daddr,
+ ntohs(inet->inet_dport), inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
}
#endif
if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 01775983b99..0d017183062 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -164,12 +164,12 @@ static const struct net_protocol tunnel64_protocol = {
static int __init tunnel4_init(void)
{
if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
- printk(KERN_ERR "tunnel4 init: can't add protocol\n");
+ pr_err("%s: can't add protocol\n", __func__);
return -EAGAIN;
}
#if IS_ENABLED(CONFIG_IPV6)
if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
- printk(KERN_ERR "tunnel64 init: can't add protocol\n");
+ pr_err("tunnel64 init: can't add protocol\n");
inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
return -EAGAIN;
}
@@ -181,10 +181,10 @@ static void __exit tunnel4_fini(void)
{
#if IS_ENABLED(CONFIG_IPV6)
if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
- printk(KERN_ERR "tunnel64 close: can't remove protocol\n");
+ pr_err("tunnel64 close: can't remove protocol\n");
#endif
if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
- printk(KERN_ERR "tunnel4 close: can't remove protocol\n");
+ pr_err("tunnel4 close: can't remove protocol\n");
}
module_init(tunnel4_init);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5d075b5f70f..eaca73644e7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -77,7 +77,8 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/system.h>
+#define pr_fmt(fmt) "UDP: " fmt
+
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <linux/bootmem.h>
@@ -106,6 +107,7 @@
#include <net/checksum.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
+#include <linux/static_key.h>
#include "udp_impl.h"
struct udp_table udp_table __read_mostly;
@@ -205,7 +207,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
if (!snum) {
int low, high, remaining;
- unsigned rand;
+ unsigned int rand;
unsigned short first, last;
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
@@ -845,7 +847,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
* Get and verify the address.
*/
if (msg->msg_name) {
- struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
+ struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET) {
@@ -917,7 +919,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (!saddr)
saddr = inet->mc_addr;
connected = 0;
- }
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
if (connected)
rt = (struct rtable *)sk_dst_check(sk, 0);
@@ -974,7 +977,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n"));
err = -EINVAL;
goto out;
}
@@ -1053,7 +1056,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
if (unlikely(!up->pending)) {
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n"));
return -EINVAL;
}
@@ -1166,7 +1169,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked;
+ int peeked, off = 0;
int err;
int is_udplite = IS_UDPLITE(sk);
bool slow;
@@ -1182,7 +1185,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
try_again:
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &err);
+ &peeked, &off, &err);
if (!skb)
goto out;
@@ -1377,6 +1380,14 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
+static struct static_key udp_encap_needed __read_mostly;
+void udp_encap_enable(void)
+{
+ if (!static_key_enabled(&udp_encap_needed))
+ static_key_slow_inc(&udp_encap_needed);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
/* returns:
* -1: error
* 0: success
@@ -1398,7 +1409,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
nf_reset(skb);
- if (up->encap_type) {
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -1446,9 +1457,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* provided by the application."
*/
if (up->pcrlen == 0) { /* full coverage was set */
- LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
- "%d while full coverage %d requested\n",
- UDP_SKB_CB(skb)->cscov, skb->len);
+ LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
/* The next case involves violating the min. coverage requested
@@ -1458,9 +1468,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* Therefore the above ...()->partial_cov statement is essential.
*/
if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
- LIMIT_NETDEBUG(KERN_WARNING
- "UDPLITE: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, up->pcrlen);
goto drop;
}
}
@@ -1470,7 +1479,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
- if (sk_rcvqueues_full(sk, skb))
+ if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf))
goto drop;
rc = 0;
@@ -1479,7 +1488,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb)) {
+ else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
bh_unlock_sock(sk);
goto drop;
}
@@ -1688,13 +1697,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
short_packet:
LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
- proto == IPPROTO_UDPLITE ? "-Lite" : "",
- &saddr,
- ntohs(uh->source),
- ulen,
- skb->len,
- &daddr,
- ntohs(uh->dest));
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source),
+ ulen, skb->len,
+ &daddr, ntohs(uh->dest));
goto drop;
csum_error:
@@ -1703,11 +1709,8 @@ csum_error:
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
- proto == IPPROTO_UDPLITE ? "-Lite" : "",
- &saddr,
- ntohs(uh->source),
- &daddr,
- ntohs(uh->dest),
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
ulen);
drop:
UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
@@ -1766,6 +1769,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
+ udp_encap_enable();
break;
default:
err = -ENOPROTOOPT;
@@ -2169,9 +2173,15 @@ void udp4_proc_exit(void)
static __initdata unsigned long uhash_entries;
static int __init set_uhash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- uhash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &uhash_entries);
+ if (ret)
+ return 0;
+
if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
uhash_entries = UDP_HTABLE_SIZE_MIN;
return 1;
@@ -2182,26 +2192,16 @@ void __init udp_table_init(struct udp_table *table, const char *name)
{
unsigned int i;
- if (!CONFIG_BASE_SMALL)
- table->hash = alloc_large_system_hash(name,
- 2 * sizeof(struct udp_hslot),
- uhash_entries,
- 21, /* one slot per 2 MB */
- 0,
- &table->log,
- &table->mask,
- 64 * 1024);
- /*
- * Make sure hash table has the minimum size
- */
- if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
- table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
- 2 * sizeof(struct udp_hslot), GFP_KERNEL);
- if (!table->hash)
- panic(name);
- table->log = ilog2(UDP_HTABLE_SIZE_MIN);
- table->mask = UDP_HTABLE_SIZE_MIN - 1;
- }
+ table->hash = alloc_large_system_hash(name,
+ 2 * sizeof(struct udp_hslot),
+ uhash_entries,
+ 21, /* one slot per 2 MB */
+ 0,
+ &table->log,
+ &table->mask,
+ UDP_HTABLE_SIZE_MIN,
+ 64 * 1024);
+
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 8a949f19deb..a7f86a3cd50 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -146,9 +146,17 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
return udp_dump_one(&udp_table, in_skb, nlh, req);
}
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
static const struct inet_diag_handler udp_diag_handler = {
.dump = udp_diag_dump,
.dump_one = udp_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDP,
};
@@ -167,6 +175,7 @@ static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *
static const struct inet_diag_handler udplite_diag_handler = {
.dump = udplite_diag_dump,
.dump_one = udplite_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDPLITE,
};
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index aaad650d47d..5a681e298b9 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int noblock, int flags, int *addr_len);
extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
-extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
+extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
extern void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 12e9499a1a6..2c46acd4cc3 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -10,6 +10,9 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+
+#define pr_fmt(fmt) "UDPLite: " fmt
+
#include <linux/export.h>
#include "udp_impl.h"
@@ -129,11 +132,11 @@ void __init udplite4_register(void)
inet_register_protosw(&udplite4_protosw);
if (udplite4_proc_init())
- printk(KERN_ERR "%s: Cannot register /proc!\n", __func__);
+ pr_err("%s: Cannot register /proc!\n", __func__);
return;
out_unregister_proto:
proto_unregister(&udplite_prot);
out_register_err:
- printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
+ pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a0b4c5da8d4..0d3426cb5c4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -152,7 +152,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
case IPPROTO_AH:
if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr = (__be32*)xprth;
+ __be32 *ah_hdr = (__be32 *)xprth;
fl4->fl4_ipsec_spi = ah_hdr[1];
}
@@ -298,8 +298,8 @@ void __init xfrm4_init(int rt_max_size)
xfrm4_state_init();
xfrm4_policy_init();
#ifdef CONFIG_SYSCTL
- sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
- xfrm4_policy_table);
+ sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4",
+ xfrm4_policy_table);
#endif
}
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 9247d9d70e9..05a5df2febc 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -3,6 +3,8 @@
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
*/
+#define pr_fmt(fmt) "IPsec: " fmt
+
#include <linux/skbuff.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -75,18 +77,18 @@ static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
static int __init ipip_init(void)
{
if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
- printk(KERN_INFO "ipip init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
- printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n");
+ pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);
xfrm_unregister_type(&ipip_type, AF_INET);
return -EAGAIN;
}
#if IS_ENABLED(CONFIG_IPV6)
if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
- printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n");
+ pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);
xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
xfrm_unregister_type(&ipip_type, AF_INET);
return -EAGAIN;
@@ -99,12 +101,14 @@ static void __exit ipip_fini(void)
{
#if IS_ENABLED(CONFIG_IPV6)
if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
- printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n");
+ pr_info("%s: can't remove xfrm handler for AF_INET6\n",
+ __func__);
#endif
if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
- printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n");
+ pr_info("%s: can't remove xfrm handler for AF_INET\n",
+ __func__);
if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
- printk(KERN_INFO "ipip close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(ipip_init);