Diffstat (limited to 'net/ipv4'): 94 files changed, 2275 insertions, 1490 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index fb790977425..d894f616c3d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -394,6 +394,14 @@ config INET_XFRM_MODE_BEET If unsure, say Y. +config INET_LRO + tristate "Large Receive Offload (ipv4/tcp)" + + ---help--- + Support for Large Receive Offload (ipv4/tcp). + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index fbf1674e0c2..a02c36d0a13 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o +obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 06c08e5740f..621b128897d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -241,7 +241,7 @@ EXPORT_SYMBOL(build_ehash_secret); * Create an inet socket. */ -static int inet_create(struct socket *sock, int protocol) +static int inet_create(struct net *net, struct socket *sock, int protocol) { struct sock *sk; struct list_head *p; @@ -253,6 +253,9 @@ static int inet_create(struct socket *sock, int protocol) int try_loading_module = 0; int err; + if (net != &init_net) + return -EAFNOSUPPORT; + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM && !inet_ehash_secret) @@ -320,7 +323,7 @@ lookup_protocol: BUG_TRAP(answer_prot->slab != NULL); err = -ENOBUFS; - sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1); + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, 1); if (sk == NULL) goto out; @@ -831,7 +834,7 @@ const struct proto_ops inet_stream_ops = { .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, + .sendmsg = tcp_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = tcp_sendpage, @@ -939,7 +942,7 @@ static struct inet_protosw inetsw_array[] = } }; -#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw)) +#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) void inet_register_protosw(struct inet_protosw *p) { @@ -1299,6 +1302,10 @@ static int __init init_ipv4_mibs(void) sizeof(struct icmp_mib), __alignof__(struct icmp_mib)) < 0) goto err_icmp_mib; + if (snmp_mib_init((void **)icmpmsg_statistics, + sizeof(struct icmpmsg_mib), + __alignof__(struct icmpmsg_mib)) < 0) + goto err_icmpmsg_mib; if (snmp_mib_init((void **)tcp_statistics, sizeof(struct tcp_mib), __alignof__(struct tcp_mib)) < 0) @@ -1321,6 +1328,8 @@ err_udplite_mib: err_udp_mib: snmp_mib_free((void **)tcp_statistics); err_tcp_mib: + snmp_mib_free((void **)icmpmsg_statistics); +err_icmpmsg_mib: snmp_mib_free((void **)icmp_statistics); err_icmp_mib: snmp_mib_free((void **)ip_statistics); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 7a23e59c374..4e8e3b079f5 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -5,6 +5,7 @@ #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> +#include <linux/spinlock.h> #include <net/icmp.h> #include <net/protocol.h> #include <asm/scatterlist.h> @@ -46,7 +47,7 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr) memcpy(daddr, optptr+optlen-4, 4); /* Fall through */ default: - memset(optptr+2, 0, optlen-2); + memset(optptr, 0, optlen); 
} l -= optlen; optptr += optlen; @@ -65,6 +66,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) char buf[60]; } tmp_iph; + skb_push(skb, -skb_network_offset(skb)); top_iph = ip_hdr(skb); iph = &tmp_iph.iph; @@ -80,28 +82,30 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) goto error; } - ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4); - ah->nexthdr = top_iph->protocol; + ah = ip_auth_hdr(skb); + ah->nexthdr = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_AH; top_iph->tos = 0; top_iph->tot_len = htons(skb->len); top_iph->frag_off = 0; top_iph->ttl = 0; - top_iph->protocol = IPPROTO_AH; top_iph->check = 0; ahp = x->data; - ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + - ahp->icv_trunc_len) >> 2) - 2; + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; - ah->seq_no = htonl(++x->replay.oseq); - xfrm_aevent_doreplay(x); + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq); + + spin_lock_bh(&x->lock); err = ah_mac_digest(ahp, skb, ah->auth_data); + memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len); + spin_unlock_bh(&x->lock); + if (err) goto error; - memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; @@ -111,8 +115,6 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } - ip_send_check(top_iph); - err = 0; error: @@ -123,21 +125,23 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; + int nexthdr; int err = -EINVAL; struct iphdr *iph; struct ip_auth_hdr *ah; struct ah_data *ahp; char work_buf[60]; - if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + if (!pskb_may_pull(skb, sizeof(*ah))) goto out; - ah = (struct ip_auth_hdr*)skb->data; + ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; + nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; - if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) && - ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len)) + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) goto out; if (!pskb_may_pull(skb, ah_hlen)) @@ -151,7 +155,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; - ah = (struct ip_auth_hdr*)skb->data; + ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); ihl = skb->data - skb_network_header(skb); @@ -180,13 +184,12 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) goto out; } } - ((struct iphdr*)work_buf)->protocol = ah->nexthdr; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_buf, ihl); skb->transport_header = skb->network_header; __skb_pull(skb, ah_hlen + ihl); - return 0; + return nexthdr; out: return err; @@ -219,10 +222,6 @@ static int ah_init_state(struct xfrm_state *x) if (!x->aalg) goto error; - /* null auth can use a zero length key */ - if (x->aalg->alg_key_len > 512) - goto error; - if (x->encap) goto error; @@ -230,14 +229,13 @@ static int ah_init_state(struct xfrm_state *x) if (ahp == NULL) return -ENOMEM; - ahp->key = x->aalg->alg_key; - ahp->key_len = (x->aalg->alg_key_len+7)/8; tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) goto error; ahp->tfm = tfm; - if (crypto_hash_setkey(tfm, ahp->key, ahp->key_len)) + if (crypto_hash_setkey(tfm, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) goto 
error; /* @@ -266,7 +264,8 @@ static int ah_init_state(struct xfrm_state *x) if (!ahp->work_icv) goto error; - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; @@ -302,6 +301,7 @@ static struct xfrm_type ah_type = .description = "AH4", .owner = THIS_MODULE, .proto = IPPROTO_AH, + .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah_init_state, .destructor = ah_destroy, .input = ah_input, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9ab9d534fba..36d6798947b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -103,6 +103,7 @@ #include <linux/sysctl.h> #endif +#include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> @@ -252,7 +253,7 @@ static int arp_constructor(struct neighbour *neigh) neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); - if (dev->hard_header == NULL) { + if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh->ops->queue_xmit; @@ -309,10 +310,12 @@ static int arp_constructor(struct neighbour *neigh) neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } - if (dev->hard_header_cache) + + if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; + if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else @@ -590,8 +593,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, /* * Fill the device header for the ARP frame */ - if (dev->hard_header && - dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0) + if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* @@ -931,6 +933,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, { struct arphdr *arp; + if (dev->nd_net != &init_net) + goto freeskb; + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, (sizeof(struct arphdr) + (2 * dev->addr_len) + @@ -977,7 +982,7 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev) if (mask && mask != htonl(0xFFFFFFFF)) return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data); + dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } @@ -1165,7 +1170,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(r.arp_dev)) == NULL) + if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL) goto out; /* Mmmm... It is wrong... 
ARPHRD_NETROM==0 */ @@ -1201,6 +1206,9 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); @@ -1370,24 +1378,8 @@ static const struct seq_operations arp_seq_ops = { static int arp_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct neigh_seq_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &arp_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &arp_seq_ops, + sizeof(struct neigh_seq_state)); } static const struct file_operations arp_seq_fops = { @@ -1400,7 +1392,7 @@ static const struct file_operations arp_seq_fops = { static int __init arp_proc_init(void) { - if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) + if (!proc_net_fops_create(&init_net, "arp", S_IRUGO, &arp_seq_fops)) return -ENOMEM; return 0; } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index ab56a052ce3..805a78e6ed5 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1831,68 +1831,75 @@ socket_setattr_failure: } /** - * cipso_v4_sock_getattr - Get the security attributes from a sock - * @sk: the sock + * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions + * @cipso: the CIPSO v4 option * @secattr: the security attributes * * Description: - * Query @sk to see if there is a CIPSO option attached to the sock and if - * there is return the CIPSO security attributes in @secattr. This function - * requires that @sk be locked, or privately held, but it does not do any - * locking itself. Returns zero on success and negative values on failure. + * Inspect @cipso and return the security attributes in @secattr. Returns zero + * on success and negative values on failure. * */ -int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +static int cipso_v4_getattr(const unsigned char *cipso, + struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; - struct inet_sock *sk_inet; - unsigned char *cipso_ptr; u32 doi; struct cipso_v4_doi *doi_def; - sk_inet = inet_sk(sk); - if (sk_inet->opt == NULL || sk_inet->opt->cipso == 0) - return -ENOMSG; - cipso_ptr = sk_inet->opt->__data + sk_inet->opt->cipso - - sizeof(struct iphdr); - ret_val = cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr); - if (ret_val == 0) - return ret_val; + if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) + return 0; - doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2])); + doi = ntohl(get_unaligned((__be32 *)&cipso[2])); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); - if (doi_def == NULL) { - rcu_read_unlock(); - return -ENOMSG; - } - + if (doi_def == NULL) + goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. 
*/ - switch (cipso_ptr[6]) { + switch (cipso[6]) { case CIPSO_V4_TAG_RBITMAP: - ret_val = cipso_v4_parsetag_rbm(doi_def, - &cipso_ptr[6], - secattr); + ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_ENUM: - ret_val = cipso_v4_parsetag_enum(doi_def, - &cipso_ptr[6], - secattr); + ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_RANGE: - ret_val = cipso_v4_parsetag_rng(doi_def, - &cipso_ptr[6], - secattr); + ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; } - rcu_read_unlock(); +getattr_return: + rcu_read_unlock(); return ret_val; } /** + * cipso_v4_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CIPSO option attached to the sock and if + * there is return the CIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. + * + */ +int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +{ + struct ip_options *opt; + + opt = inet_sk(sk)->opt; + if (opt == NULL || opt->cipso == 0) + return -ENOMSG; + + return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), + secattr); +} + +/** * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option * @skb: the packet * @secattr: the security attributes @@ -1905,45 +1912,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) int cipso_v4_skbuff_getattr(const struct sk_buff *skb, struct netlbl_lsm_secattr *secattr) { - int ret_val = -ENOMSG; - unsigned char *cipso_ptr; - u32 doi; - struct cipso_v4_doi *doi_def; - - cipso_ptr = CIPSO_V4_OPTPTR(skb); - if (cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr) == 0) - return 0; - - doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2])); - rcu_read_lock(); - doi_def = cipso_v4_doi_search(doi); - if (doi_def == NULL) - goto skbuff_getattr_return; - - /* XXX - This code assumes only one tag per CIPSO option which isn't - * really a good assumption to make but since we only support the MAC - * tags right now it is a safe assumption. 
*/ - switch (cipso_ptr[6]) { - case CIPSO_V4_TAG_RBITMAP: - ret_val = cipso_v4_parsetag_rbm(doi_def, - &cipso_ptr[6], - secattr); - break; - case CIPSO_V4_TAG_ENUM: - ret_val = cipso_v4_parsetag_enum(doi_def, - &cipso_ptr[6], - secattr); - break; - case CIPSO_V4_TAG_RANGE: - ret_val = cipso_v4_parsetag_rng(doi_def, - &cipso_ptr[6], - secattr); - break; - } - -skbuff_getattr_return: - rcu_read_unlock(); - return ret_val; + return cipso_v4_getattr(CIPSO_V4_OPTPTR(skb), secattr); } /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index abf6352f990..55d199e4ae2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -203,8 +203,6 @@ static void inetdev_destroy(struct in_device *in_dev) ASSERT_RTNL(); dev = in_dev->dev; - if (dev == &loopback_dev) - return; in_dev->dead = 1; @@ -420,7 +418,7 @@ struct in_device *inetdev_by_index(int ifindex) struct net_device *dev; struct in_device *in_dev = NULL; read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifindex); + dev = __dev_get_by_index(&init_net, ifindex); if (dev) in_dev = in_dev_get(dev); read_unlock(&dev_base_lock); @@ -506,7 +504,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh) goto errout; } - dev = __dev_get_by_index(ifm->ifa_index); + dev = __dev_get_by_index(&init_net, ifm->ifa_index); if (dev == NULL) { err = -ENODEV; goto errout; @@ -628,7 +626,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) *colon = 0; #ifdef CONFIG_KMOD - dev_load(ifr.ifr_name); + dev_load(&init_net, ifr.ifr_name); #endif switch (cmd) { @@ -669,7 +667,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifr.ifr_name)) == NULL) goto done; if (colon) @@ -909,7 +907,7 @@ no_in_dev: */ read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { if ((in_dev = __in_dev_get_rcu(dev)) == NULL) continue; @@ -988,7 +986,7 @@ __be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { if ((in_dev = __in_dev_get_rcu(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) @@ -1051,15 +1049,17 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); - if (dev == &loopback_dev) { - if (!in_dev) - panic("devinet: " - "Failed to create loopback\n"); + if (!in_dev) + return notifier_from_errno(-ENOMEM); + if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } @@ -1075,7 +1075,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: if (dev->mtu < 68) break; - if (dev == &loopback_dev) { + if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa; if ((ifa = inet_alloc_ifa()) != NULL) { ifa->ifa_local = @@ -1183,7 +1183,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) s_ip_idx = ip_idx = cb->args[1]; idx = 0; - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -1194,7 +1194,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; ifa = 
ifa->ifa_next, ip_idx++) { if (ip_idx < s_ip_idx) - goto cont; + continue; if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWADDR, NLM_F_MULTI) <= 0) @@ -1242,7 +1242,7 @@ static void devinet_copy_dflt_conf(int i) struct net_device *dev; read_lock(&dev_base_lock); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1331,7 +1331,7 @@ void inet_forward_change(void) IPV4_DEVCONF_DFLT(FORWARDING) = on; read_lock(&dev_base_lock); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 98767a4f118..6b1a31a74cf 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/pfkeyv2.h> #include <linux/random.h> +#include <linux/spinlock.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/udp.h> @@ -15,7 +16,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - struct iphdr *top_iph; struct ip_esp_hdr *esph; struct crypto_blkcipher *tfm; struct blkcipher_desc desc; @@ -27,9 +27,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int alen; int nfrags; - /* Strip IP+ESP header. */ - __skb_pull(skb, skb_transport_offset(skb)); - /* Now skb is pure payload to encrypt */ + /* skb is pure payload to encrypt */ err = -ENOMEM; @@ -59,12 +57,12 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) tail[clen - skb->len - 2] = (clen - skb->len) - 2; pskb_put(skb, trailer, clen - skb->len); - __skb_push(skb, skb->data - skb_network_header(skb)); - top_iph = ip_hdr(skb); - esph = (struct ip_esp_hdr *)(skb_network_header(skb) + - top_iph->ihl * 4); - top_iph->tot_len = htons(skb->len + alen); - *(skb_tail_pointer(trailer) - 1) = top_iph->protocol; + skb_push(skb, -skb_network_offset(skb)); + esph = ip_esp_hdr(skb); + *(skb_tail_pointer(trailer) - 1) = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + spin_lock_bh(&x->lock); /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { @@ -75,7 +73,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) uh = (struct udphdr *)esph; uh->source = encap->encap_sport; uh->dest = encap->encap_dport; - uh->len = htons(skb->len + alen - top_iph->ihl*4); + uh->len = htons(skb->len + alen - skb_transport_offset(skb)); uh->check = 0; switch (encap->encap_type) { @@ -90,13 +88,11 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) break; } - top_iph->protocol = IPPROTO_UDP; - } else - top_iph->protocol = IPPROTO_ESP; + *skb_mac_header(skb) = IPPROTO_UDP; + } esph->spi = x->id.spi; - esph->seq_no = htonl(++x->replay.oseq); - xfrm_aevent_doreplay(x); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq); if (esp->conf.ivlen) { if (unlikely(!esp->conf.ivinitted)) { @@ -112,7 +108,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) if (unlikely(nfrags > ESP_NUM_FAST_SG)) { sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); if (!sg) - goto error; + goto unlock; } skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); err = crypto_blkcipher_encrypt(&desc, sg, sg, clen); @@ -121,7 +117,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } while (0); if (unlikely(err)) - goto error; + goto unlock; if (esp->conf.ivlen) { memcpy(esph->enc_data, esp->conf.ivec, esp->conf.ivlen); @@ -134,7 +130,8 @@ static int 
esp_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(pskb_put(skb, trailer, alen), esp->auth.work_icv, alen); } - ip_send_check(top_iph); +unlock: + spin_unlock_bh(&x->lock); error: return err; @@ -155,7 +152,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct sk_buff *trailer; int blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4); int alen = esp->auth.icv_trunc_len; - int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; + int elen = skb->len - sizeof(*esph) - esp->conf.ivlen - alen; int nfrags; int ihl; u8 nexthdr[2]; @@ -163,7 +160,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) int padlen; int err; - if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) + if (!pskb_may_pull(skb, sizeof(*esph))) goto out; if (elen <= 0 || (elen & (blksize-1))) @@ -191,7 +188,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; - esph = (struct ip_esp_hdr*)skb->data; + esph = (struct ip_esp_hdr *)skb->data; /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -204,7 +201,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (!sg) goto out; } - skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen); + skb_to_sgvec(skb, sg, sizeof(*esph) + esp->conf.ivlen, elen); err = crypto_blkcipher_decrypt(&desc, sg, sg, elen); if (unlikely(sg != &esp->sgbuf[0])) kfree(sg); @@ -256,17 +253,15 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) * as per draft-ietf-ipsec-udp-encaps-06, * section 3.1.2 */ - if (x->props.mode == XFRM_MODE_TRANSPORT || - x->props.mode == XFRM_MODE_BEET) + if (x->props.mode == XFRM_MODE_TRANSPORT) skb->ip_summed = CHECKSUM_UNNECESSARY; } - iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen); skb_set_transport_header(skb, -ihl); - return 0; + return nexthdr[1]; out: return -EINVAL; @@ -343,11 +338,6 @@ static int esp_init_state(struct xfrm_state *x) struct crypto_blkcipher *tfm; u32 align; - /* null auth and encryption can have zero length keys */ - if (x->aalg) { - if (x->aalg->alg_key_len > 512) - goto error; - } if (x->ealg == NULL) goto error; @@ -359,15 +349,14 @@ static int esp_init_state(struct xfrm_state *x) struct xfrm_algo_desc *aalg_desc; struct crypto_hash *hash; - esp->auth.key = x->aalg->alg_key; - esp->auth.key_len = (x->aalg->alg_key_len+7)/8; hash = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash)) goto error; esp->auth.tfm = hash; - if (crypto_hash_setkey(hash, esp->auth.key, esp->auth.key_len)) + if (crypto_hash_setkey(hash, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) goto error; aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); @@ -389,8 +378,7 @@ static int esp_init_state(struct xfrm_state *x) if (!esp->auth.work_icv) goto error; } - esp->conf.key = x->ealg->alg_key; - esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + tfm = crypto_alloc_blkcipher(x->ealg->alg_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) goto error; @@ -403,7 +391,8 @@ static int esp_init_state(struct xfrm_state *x) goto error; esp->conf.ivinitted = 0; } - if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len)) + if (crypto_blkcipher_setkey(tfm, x->ealg->alg_key, + (x->ealg->alg_key_len + 7) / 8)) goto error; x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen; if (x->props.mode == XFRM_MODE_TUNNEL) @@ -443,6 +432,7 @@ static struct xfrm_type esp_type = .description 
= "ESP4", .owner = THIS_MODULE, .proto = IPPROTO_ESP, + .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp_init_state, .destructor = esp_destroy, .get_mtu = esp4_get_mtu, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2eb909be804..78b514ba141 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -49,6 +49,8 @@ #define FFprint(a...) printk(KERN_DEBUG a) +static struct sock *fibnl; + #ifndef CONFIG_IP_MULTIPLE_TABLES struct fib_table *ip_fib_local_table; @@ -334,7 +336,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, colon = strchr(devname, ':'); if (colon) *colon = 0; - dev = __dev_get_by_name(devname); + dev = __dev_get_by_name(&init_net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; @@ -487,7 +489,7 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh, } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { - switch (attr->nla_type) { + switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; @@ -784,17 +786,12 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) } } -static void nl_fib_input(struct sock *sk, int len) +static void nl_fib_input(struct sk_buff *skb) { - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh = NULL; struct fib_result_nl *frn; - u32 pid; + struct nlmsghdr *nlh; struct fib_table *tb; - - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb == NULL) - return; + u32 pid; nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || @@ -811,13 +808,13 @@ static void nl_fib_input(struct sock *sk, int len) pid = NETLINK_CB(skb).pid; /* pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ - netlink_unicast(sk, skb, pid, MSG_DONTWAIT); + netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT); } static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, NULL, - THIS_MODULE); + fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, + nl_fib_input, NULL, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) @@ -860,6 +857,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2); return NOTIFY_DONE; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 07e843a47dd..527a6e0af5b 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -35,6 +35,7 @@ #include <linux/netlink.h> #include <linux/init.h> +#include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> @@ -771,13 +772,13 @@ struct fib_table * __init fib_hash_init(u32 id) fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); if (fn_alias_kmem == NULL) fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL); @@ -1038,24 +1039,8 @@ static const struct seq_operations fib_seq_ops = { static int fib_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct fib_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &fib_seq_ops); - if (rc) - goto out_kfree; - - seq = 
file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &fib_seq_ops, + sizeof(struct fib_iter_state)); } static const struct file_operations fib_seq_fops = { @@ -1068,13 +1053,13 @@ static const struct file_operations fib_seq_fops = { int __init fib_proc_init(void) { - if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops)) + if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops)) return -ENOMEM; return 0; } void __init fib_proc_exit(void) { - proc_net_remove("route"); + proc_net_remove(&init_net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2a947840210..f16839c6a72 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -76,8 +76,6 @@ static struct fib4_rule local_rule = { }, }; -static LIST_HEAD(fib4_rules); - #ifdef CONFIG_NET_CLS_ROUTE u32 fib_rules_tclass(struct fib_result *res) { @@ -279,9 +277,9 @@ static u32 fib4_rule_default_pref(void) struct list_head *pos; struct fib_rule *rule; - if (!list_empty(&fib4_rules)) { - pos = fib4_rules.next; - if (pos->next != &fib4_rules) { + if (!list_empty(&fib4_rules_ops.rules_list)) { + pos = fib4_rules_ops.rules_list.next; + if (pos->next != &fib4_rules_ops.rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; @@ -317,15 +315,15 @@ static struct fib_rules_ops fib4_rules_ops = { .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .policy = fib4_rule_policy, - .rules_list = &fib4_rules, + .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list), .owner = THIS_MODULE, }; void __init fib4_rules_init(void) { - list_add_tail(&local_rule.common.list, &fib4_rules); - list_add_tail(&main_rule.common.list, &fib4_rules); - list_add_tail(&default_rule.common.list, &fib4_rules); + list_add_tail(&local_rule.common.list, &fib4_rules_ops.rules_list); + list_add_tail(&main_rule.common.list, &fib4_rules_ops.rules_list); + list_add_tail(&default_rule.common.list, &fib4_rules_ops.rules_list); fib_rules_register(&fib4_rules_ops); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c434119deb5..1351a2617dc 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -533,7 +533,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, return -EINVAL; if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) return -EINVAL; - if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL) + if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) return -ENODEV; if (!(dev->flags&IFF_UP)) return -ENETDOWN; @@ -743,7 +743,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int remaining; nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla->nla_type; + int type = nla_type(nla); if (type) { if (type > RTAX_MAX) @@ -799,7 +799,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (nhs != 1 || nh->nh_gw) goto err_inval; nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif); + nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); err = -ENODEV; if (nh->nh_dev == NULL) goto failure; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 30e332ade61..81a8285d6d6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -73,6 +73,7 @@ #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> +#include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> @@ -85,23 +86,14 @@ 
#define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) -#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) -#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) typedef unsigned int t_key; #define T_TNODE 0 #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL -#define NODE_PARENT(node) \ - ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK))) - #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) -#define NODE_SET_PARENT(node, ptr) \ - rcu_assign_pointer((node)->parent, \ - ((unsigned long)(ptr)) | NODE_TYPE(node)) - #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) @@ -174,6 +166,19 @@ static void tnode_free(struct tnode *tn); static struct kmem_cache *fn_alias_kmem __read_mostly; static struct trie *trie_local = NULL, *trie_main = NULL; +static inline struct tnode *node_parent(struct node *node) +{ + struct tnode *ret; + + ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK); + return rcu_dereference(ret); +} + +static inline void node_set_parent(struct node *node, struct tnode *ptr) +{ + rcu_assign_pointer(node->parent, + (unsigned long)ptr | NODE_TYPE(node)); +} /* rcu_read_lock needs to be hold by caller from readside */ @@ -189,6 +194,11 @@ static inline int tnode_child_length(const struct tnode *tn) return 1 << tn->bits; } +static inline t_key mask_pfx(t_key k, unsigned short l) +{ + return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); +} + static inline t_key tkey_extract_bits(t_key a, int offset, int bits) { if (offset < KEYLENGTH) @@ -446,7 +456,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w tn->full_children++; if (n) - NODE_SET_PARENT(n, tn); + node_set_parent(n, tn); rcu_assign_pointer(tn->child[i], n); } @@ -481,7 +491,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) continue; /* compress one level */ - NODE_SET_PARENT(n, NULL); + node_set_parent(n, NULL); tnode_free(tn); return n; } @@ -636,7 +646,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* compress one level */ - NODE_SET_PARENT(n, NULL); + node_set_parent(n, NULL); tnode_free(tn); return n; } @@ -673,7 +683,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode->pos == oldtnode->pos + oldtnode->bits && inode->bits > 1) { struct tnode *left, *right; - t_key m = TKEY_GET_MASK(inode->pos, 1); + t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos; left = tnode_new(inode->key&(~m), inode->pos + 1, inode->bits - 1); @@ -961,24 +971,21 @@ fib_find_node(struct trie *t, u32 key) static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { int wasfull; - t_key cindex, key; - struct tnode *tp = NULL; - - key = tn->key; - - while (tn != NULL && NODE_PARENT(tn) != NULL) { + t_key cindex, key = tn->key; + struct tnode *tp; - tp = NODE_PARENT(tn); + while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize (t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); - if (!NODE_PARENT(tn)) + tp = node_parent((struct node *) tn); + if (!tp) break; - - tn = NODE_PARENT(tn); + tn = tp; } + /* Handle last (top) tnode */ if (IS_TNODE(tn)) tn = (struct tnode*) resize(t, (struct tnode *)tn); @@ -1031,7 +1038,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) pos = tn->pos + tn->bits; n = tnode_get_child(tn, 
tkey_extract_bits(key, tn->pos, tn->bits)); - BUG_ON(n && NODE_PARENT(n) != tn); + BUG_ON(n && node_parent(n) != tn); } else break; } @@ -1083,7 +1090,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (t->trie && n == NULL) { /* Case 2: n is NULL, and will just insert a new leaf */ - NODE_SET_PARENT(l, tp); + node_set_parent((struct node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)l); @@ -1114,7 +1121,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) goto err; } - NODE_SET_PARENT(tn, tp); + node_set_parent((struct node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); put_child(t, tn, missbit, (struct node *)l); @@ -1364,7 +1371,8 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result bits = pn->bits; if (!chopped_off) - cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); + cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), + pos, bits); n = tnode_get_child(pn, cindex); @@ -1450,8 +1458,8 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result * to find a matching prefix. */ - node_prefix = MASK_PFX(cn->key, cn->pos); - key_prefix = MASK_PFX(key, cn->pos); + node_prefix = mask_pfx(cn->key, cn->pos); + key_prefix = mask_pfx(key, cn->pos); pref_mismatch = key_prefix^node_prefix; mp = 0; @@ -1495,12 +1503,13 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - if (NODE_PARENT(pn) == NULL) + struct tnode *parent = node_parent((struct node *) pn); + if (!parent) goto failed; /* Get Child's index */ - cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); - pn = NODE_PARENT(pn); + cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits); + pn = parent; chopped_off = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -1536,7 +1545,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - BUG_ON(n && NODE_PARENT(n) != tn); + BUG_ON(n && node_parent(n) != tn); } l = (struct leaf *) n; @@ -1551,7 +1560,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) t->revision++; t->size--; - tp = NODE_PARENT(n); + tp = node_parent(n); tnode_free((struct tnode *) n); if (tp) { @@ -1703,7 +1712,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) p = (struct tnode*) trie; /* Start */ } else - p = (struct tnode *) NODE_PARENT(c); + p = node_parent(c); while (p) { int pos, last; @@ -1740,7 +1749,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) up: /* No more children go up one step */ c = (struct node *) p; - p = (struct tnode *) NODE_PARENT(p); + p = node_parent(c); } return NULL; /* Ready. 
Root of trie */ } @@ -1970,7 +1979,7 @@ struct fib_table * __init fib_hash_init(u32 id) fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + NULL); tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), GFP_KERNEL); @@ -2043,7 +2052,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = NODE_PARENT(tn); + p = node_parent((struct node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2317,7 +2326,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) return 0; - if (!NODE_PARENT(n)) { + if (!node_parent(n)) { if (iter->trie == trie_local) seq_puts(seq, "<local>:\n"); else @@ -2326,7 +2335,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; - __be32 prf = htonl(MASK_PFX(tn->key, tn->pos)); + __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n", @@ -2370,25 +2379,8 @@ static const struct seq_operations fib_trie_seq_ops = { static int fib_trie_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &fib_trie_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; - memset(s, 0, sizeof(*s)); -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &fib_trie_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_trie_fops = { @@ -2491,25 +2483,8 @@ static const struct seq_operations fib_route_seq_ops = { static int fib_route_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &fib_route_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; - memset(s, 0, sizeof(*s)); -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &fib_route_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_route_fops = { @@ -2522,30 +2497,30 @@ static const struct file_operations fib_route_fops = { int __init fib_proc_init(void) { - if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops)) + if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops)) goto out1; - if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops)) + if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops)) goto out2; - if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops)) + if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops)) goto out3; return 0; out3: - proc_net_remove("fib_triestat"); + proc_net_remove(&init_net, "fib_triestat"); out2: - proc_net_remove("fib_trie"); + proc_net_remove(&init_net, "fib_trie"); out1: return -ENOMEM; } void __init fib_proc_exit(void) { - proc_net_remove("fib_trie"); - proc_net_remove("fib_triestat"); - proc_net_remove("route"); + proc_net_remove(&init_net, "fib_trie"); + proc_net_remove(&init_net, "fib_triestat"); + proc_net_remove(&init_net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 02a899bec19..272c69e106e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -115,6 +115,7 @@ struct icmp_bxm { * Statistics */ 
DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; +DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly; /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ @@ -214,8 +215,6 @@ int sysctl_icmp_errors_use_inbound_ifaddr __read_mostly; */ struct icmp_control { - int output_entry; /* Field for increment on output */ - int input_entry; /* Field for increment on input */ void (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; @@ -316,12 +315,10 @@ out: /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ -static void icmp_out_count(int type) +void icmp_out_count(unsigned char type) { - if (type <= NR_ICMP_TYPES) { - ICMP_INC_STATS(icmp_pointers[type].output_entry); - ICMP_INC_STATS(ICMP_MIB_OUTMSGS); - } + ICMPMSGOUT_INC_STATS(type); + ICMP_INC_STATS(ICMP_MIB_OUTMSGS); } /* @@ -390,7 +387,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) return; icmp_param->data.icmph.checksum = 0; - icmp_out_count(icmp_param->data.icmph.type); inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; @@ -517,7 +513,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct net_device *dev = NULL; if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(rt->fl.iif); + dev = dev_get_by_index(&init_net, rt->fl.iif); if (dev) { saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -952,6 +948,7 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); + ICMPMSGIN_INC_STATS_BH(icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -986,7 +983,6 @@ int icmp_rcv(struct sk_buff *skb) } } - ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry); icmp_pointers[icmph->type].handler(skb); drop: @@ -1002,109 +998,71 @@ error: */ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { - .output_entry = ICMP_MIB_OUTECHOREPS, - .input_entry = ICMP_MIB_INECHOREPS, .handler = icmp_discard, }, [1] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [2] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [ICMP_DEST_UNREACH] = { - .output_entry = ICMP_MIB_OUTDESTUNREACHS, - .input_entry = ICMP_MIB_INDESTUNREACHS, .handler = icmp_unreach, .error = 1, }, [ICMP_SOURCE_QUENCH] = { - .output_entry = ICMP_MIB_OUTSRCQUENCHS, - .input_entry = ICMP_MIB_INSRCQUENCHS, .handler = icmp_unreach, .error = 1, }, [ICMP_REDIRECT] = { - .output_entry = ICMP_MIB_OUTREDIRECTS, - .input_entry = ICMP_MIB_INREDIRECTS, .handler = icmp_redirect, .error = 1, }, [6] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [7] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [ICMP_ECHO] = { - .output_entry = ICMP_MIB_OUTECHOS, - .input_entry = ICMP_MIB_INECHOS, .handler = icmp_echo, }, [9] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [10] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [ICMP_TIME_EXCEEDED] = { - .output_entry = ICMP_MIB_OUTTIMEEXCDS, - .input_entry = ICMP_MIB_INTIMEEXCDS, .handler = icmp_unreach, 
.error = 1, }, [ICMP_PARAMETERPROB] = { - .output_entry = ICMP_MIB_OUTPARMPROBS, - .input_entry = ICMP_MIB_INPARMPROBS, .handler = icmp_unreach, .error = 1, }, [ICMP_TIMESTAMP] = { - .output_entry = ICMP_MIB_OUTTIMESTAMPS, - .input_entry = ICMP_MIB_INTIMESTAMPS, .handler = icmp_timestamp, }, [ICMP_TIMESTAMPREPLY] = { - .output_entry = ICMP_MIB_OUTTIMESTAMPREPS, - .input_entry = ICMP_MIB_INTIMESTAMPREPS, .handler = icmp_discard, }, [ICMP_INFO_REQUEST] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_DUMMY, .handler = icmp_discard, }, [ICMP_INFO_REPLY] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_DUMMY, .handler = icmp_discard, }, [ICMP_ADDRESS] = { - .output_entry = ICMP_MIB_OUTADDRMASKS, - .input_entry = ICMP_MIB_INADDRMASKS, .handler = icmp_address, }, [ICMP_ADDRESSREPLY] = { - .output_entry = ICMP_MIB_OUTADDRMASKREPS, - .input_entry = ICMP_MIB_INADDRMASKREPS, .handler = icmp_address_reply, }, }; @@ -1146,4 +1104,5 @@ void __init icmp_init(struct net_proto_family *ops) EXPORT_SYMBOL(icmp_err_convert); EXPORT_SYMBOL(icmp_send); EXPORT_SYMBOL(icmp_statistics); +EXPORT_SYMBOL(icmpmsg_statistics); EXPORT_SYMBOL(xrlim_allow); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a646409c2d0..7dbc282d4f9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,7 @@ #include <linux/rtnetlink.h> #include <linux/times.h> +#include <net/net_namespace.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> @@ -1694,8 +1695,8 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST - struct in_device *in_dev = pmc->interface; struct ip_sf_list *psf; + in_dev = pmc->interface; #endif /* filter mode change */ @@ -1798,7 +1799,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, { int err; - if (iml->sflist == 0) { + if (iml->sflist == NULL) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); @@ -2166,7 +2167,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, return -EFAULT; } for (i=0; i<copycount; i++) { - struct sockaddr_in *psin; struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; @@ -2291,7 +2291,7 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; - for_each_netdev(state->dev) { + for_each_netdev(&init_net, state->dev) { struct in_device *in_dev; in_dev = in_dev_get(state->dev); if (!in_dev) @@ -2410,23 +2410,8 @@ static const struct seq_operations igmp_mc_seq_ops = { static int igmp_mc_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct igmp_mc_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - rc = seq_open(file, &igmp_mc_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &igmp_mc_seq_ops, + sizeof(struct igmp_mc_iter_state)); } static const struct file_operations igmp_mc_seq_fops = { @@ -2453,7 +2438,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) state->idev = NULL; state->im = NULL; - for_each_netdev(state->dev) { + for_each_netdev(&init_net, state->dev) { struct in_device *idev; idev = in_dev_get(state->dev); if (unlikely(idev 
== NULL)) @@ -2584,23 +2569,8 @@ static const struct seq_operations igmp_mcf_seq_ops = { static int igmp_mcf_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct igmp_mcf_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - rc = seq_open(file, &igmp_mcf_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &igmp_mcf_seq_ops, + sizeof(struct igmp_mcf_iter_state)); } static const struct file_operations igmp_mcf_seq_fops = { @@ -2613,8 +2583,8 @@ static const struct file_operations igmp_mcf_seq_fops = { int __init igmp_mc_proc_init(void) { - proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops); - proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops); + proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); return 0; } #endif diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fbe7714f21d..3cef12835c4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -33,6 +33,19 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * This array holds the first and last local port number. */ int sysctl_local_port_range[2] = { 32768, 61000 }; +DEFINE_SEQLOCK(sysctl_port_range_lock); + +void inet_get_local_port_range(int *low, int *high) +{ + unsigned seq; + do { + seq = read_seqbegin(&sysctl_port_range_lock); + + *low = sysctl_local_port_range[0]; + *high = sysctl_local_port_range[1]; + } while (read_seqretry(&sysctl_port_range_lock, seq)); +} +EXPORT_SYMBOL(inet_get_local_port_range); int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) @@ -77,10 +90,11 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, local_bh_disable(); if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; + int remaining, rover, low, high; + + inet_get_local_port_range(&low, &high); + remaining = high - low; + rover = net_random() % remaining + low; do { head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index dbeacd8b0f9..7eb83ebed2e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -11,6 +11,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> @@ -112,7 +113,7 @@ static int inet_csk_diag_fill(struct sock *sk, } #endif -#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ +#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) if (icsk->icsk_pending == ICSK_TIME_RETRANS) { r->idiag_timer = 1; @@ -190,7 +191,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r->id.idiag_dst[0] = tw->tw_daddr; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; @@ -836,13 +837,13 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return inet_diag_get_exact(skb, nlh); } -static void inet_diag_rcv(struct sock *sk, int len) -{ - unsigned int qlen = 0; +static DEFINE_MUTEX(inet_diag_mutex); - do { - netlink_run_queue(sk, &qlen, &inet_diag_rcv_msg); - } while (qlen); +static void inet_diag_rcv(struct sk_buff *skb) +{ + mutex_lock(&inet_diag_mutex); + netlink_rcv_skb(skb, &inet_diag_rcv_msg); + mutex_unlock(&inet_diag_mutex); } static DEFINE_SPINLOCK(inet_diag_register_lock); @@ -892,8 +893,8 @@ static int __init inet_diag_init(void) if (!inet_diag_table) goto out; - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, - NULL, THIS_MODULE); + idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0, + inet_diag_rcv, NULL, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; err = 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fb662621c54..fac6398e436 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -279,19 +279,18 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, int ret; if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; + int i, remaining, low, high, port; static u32 hint; u32 offset = hint + inet_sk_port_offset(sk); struct hlist_node *node; struct inet_timewait_sock *tw = NULL; + inet_get_local_port_range(&low, &high); + remaining = high - low; + local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; + for (i = 1; i <= remaining; i++) { + port = low + (i + offset) % remaining; head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; spin_lock(&head->lock); diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c new file mode 100644 index 00000000000..4545b64e281 --- /dev/null +++ b/net/ipv4/inet_lro.c @@ -0,0 +1,600 @@ +/* + * linux/net/ipv4/inet_lro.c + * + * Large Receive Offload (ipv4 / tcp) + * + * (C) Copyright IBM Corp. 2007 + * + * Authors: + * Jan-Bernd Themann <themann@de.ibm.com> + * Christoph Raisch <raisch@de.ibm.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include <linux/module.h> +#include <linux/if_vlan.h> +#include <linux/inet_lro.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); +MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); + +#define TCP_HDR_LEN(tcph) (tcph->doff << 2) +#define IP_HDR_LEN(iph) (iph->ihl << 2) +#define TCP_PAYLOAD_LENGTH(iph, tcph) \ + (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) + +#define IPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_W_TIMESTAMP 8 + +#define LRO_MAX_PG_HLEN 64 + +#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } + +/* + * Basic tcp checks whether packet is suitable for LRO + */ + +static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, + int len, struct net_lro_desc *lro_desc) +{ + /* check ip header: don't aggregate padded frames */ + if (ntohs(iph->tot_len) != len) + return -1; + + if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) + return -1; + + if (iph->ihl != IPH_LEN_WO_OPTIONS) + return -1; + + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack + || tcph->rst || tcph->syn || tcph->fin) + return -1; + + if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) + return -1; + + if (tcph->doff != TCPH_LEN_WO_OPTIONS + && tcph->doff != TCPH_LEN_W_TIMESTAMP) + return -1; + + /* check tcp options (only timestamp allowed) */ + if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { + u32 *topt = (u32 *)(tcph + 1); + + if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return -1; + + /* timestamp should be in right order */ + topt++; + if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), + ntohl(*topt))) + return -1; + + /* timestamp reply should not be zero */ + topt++; + if (*topt == 0) + return -1; + } + + return 0; +} + +static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) +{ + struct iphdr *iph = lro_desc->iph; + struct tcphdr *tcph = lro_desc->tcph; + u32 *p; + __wsum tcp_hdr_csum; + + tcph->ack_seq = lro_desc->tcp_ack; + tcph->window = lro_desc->tcp_window; + + if (lro_desc->tcp_saw_tstamp) { + p = (u32 *)(tcph + 1); + *(p+2) = lro_desc->tcp_rcv_tsecr; + } + + iph->tot_len = htons(lro_desc->ip_tot_len); + + iph->check = 0; + iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); + + tcph->check = 0; + tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0); + lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); + tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + lro_desc->ip_tot_len - + IP_HDR_LEN(iph), IPPROTO_TCP, + lro_desc->data_csum); +} + +static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) +{ + __wsum tcp_csum; + __wsum tcp_hdr_csum; + __wsum tcp_ps_hdr_csum; + + tcp_csum = ~csum_unfold(tcph->check); + tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum); + + tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + len + TCP_HDR_LEN(tcph), + IPPROTO_TCP, 0); + + return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), + tcp_ps_hdr_csum); +} + +static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph, + u16 vlan_tag, struct vlan_group *vgrp) +{ + int nr_frags; + u32 *ptr; + u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + nr_frags = skb_shinfo(skb)->nr_frags; + lro_desc->parent = skb; + 
lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); + lro_desc->iph = iph; + lro_desc->tcph = tcph; + lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; + lro_desc->tcp_ack = ntohl(tcph->ack_seq); + lro_desc->tcp_window = tcph->window; + + lro_desc->pkt_aggr_cnt = 1; + lro_desc->ip_tot_len = ntohs(iph->tot_len); + + if (tcph->doff == 8) { + ptr = (u32 *)(tcph+1); + lro_desc->tcp_saw_tstamp = 1; + lro_desc->tcp_rcv_tsval = *(ptr+1); + lro_desc->tcp_rcv_tsecr = *(ptr+2); + } + + lro_desc->mss = tcp_data_len; + lro_desc->vgrp = vgrp; + lro_desc->vlan_tag = vlan_tag; + lro_desc->active = 1; + + lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, + tcp_data_len); +} + +static inline void lro_clear_desc(struct net_lro_desc *lro_desc) +{ + memset(lro_desc, 0, sizeof(struct net_lro_desc)); +} + +static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, + struct tcphdr *tcph, int tcp_data_len) +{ + struct sk_buff *parent = lro_desc->parent; + u32 *topt; + + lro_desc->pkt_aggr_cnt++; + lro_desc->ip_tot_len += tcp_data_len; + lro_desc->tcp_next_seq += tcp_data_len; + lro_desc->tcp_window = tcph->window; + lro_desc->tcp_ack = tcph->ack_seq; + + /* don't update tcp_rcv_tsval, would not work with PAWS */ + if (lro_desc->tcp_saw_tstamp) { + topt = (u32 *) (tcph + 1); + lro_desc->tcp_rcv_tsecr = *(topt + 2); + } + + lro_desc->data_csum = csum_block_add(lro_desc->data_csum, + lro_tcp_data_csum(iph, tcph, + tcp_data_len), + parent->len); + + parent->len += tcp_data_len; + parent->data_len += tcp_data_len; + if (tcp_data_len > lro_desc->mss) + lro_desc->mss = tcp_data_len; +} + +static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *parent = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb_pull(skb, (skb->len - tcp_data_len)); + parent->truesize += skb->truesize; + + if (lro_desc->last_skb) + lro_desc->last_skb->next = skb; + else + skb_shinfo(parent)->frag_list = skb; + + lro_desc->last_skb = skb; +} + +static void lro_add_frags(struct net_lro_desc *lro_desc, + int len, int hlen, int truesize, + struct skb_frag_struct *skb_frags, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *skb = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb->truesize += truesize; + + skb_frags[0].page_offset += hlen; + skb_frags[0].size -= hlen; + + while (tcp_data_len > 0) { + *(lro_desc->next_frag) = *skb_frags; + tcp_data_len -= skb_frags->size; + lro_desc->next_frag++; + skb_frags++; + skb_shinfo(skb)->nr_frags++; + } +} + +static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, + struct iphdr *iph, + struct tcphdr *tcph) +{ + if ((lro_desc->iph->saddr != iph->saddr) + || (lro_desc->iph->daddr != iph->daddr) + || (lro_desc->tcph->source != tcph->source) + || (lro_desc->tcph->dest != tcph->dest)) + return -1; + return 0; +} + +static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_arr, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc = NULL; + struct net_lro_desc *tmp; + int max_desc = lro_mgr->max_desc; + int i; + + for (i = 0; i < max_desc; i++) { + tmp = &lro_arr[i]; + if (tmp->active) + if (!lro_check_tcp_conn(tmp, iph, tcph)) { + lro_desc = tmp; + goto out; + } + } + + for (i = 0; i < max_desc; i++) { + if (!lro_arr[i].active) { + lro_desc = &lro_arr[i]; 
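/*
 * lro_get_desc() above keys active sessions on the {saddr, daddr, source,
 * dest} tuple, so each struct net_lro_desc tracks exactly one TCP flow.
 * A minimal sketch of the driver-side setup this API expects; all mydrv_*
 * names are hypothetical, the field names follow the new <linux/inet_lro.h>:
 */
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/inet_lro.h>

#define MYDRV_MAX_LRO_DESC 8			/* hypothetical per-port limit */

static struct net_lro_desc mydrv_lro_desc[MYDRV_MAX_LRO_DESC];

/* hypothetical callback: locate the IP/TCP headers for the LRO engine */
static int mydrv_get_skb_header(struct sk_buff *skb, void **iphdr,
				void **tcph, u64 *hdr_flags, void *priv)
{
	struct iphdr *iph = (struct iphdr *)skb->data; /* MAC hdr already pulled */

	if (iph->protocol != IPPROTO_TCP)
		return -1;
	*iphdr = iph;
	*tcph = (u8 *)iph + (iph->ihl << 2);
	*hdr_flags = LRO_IPV4 | LRO_TCP;
	return 0;
}

static void mydrv_setup_lro(struct net_lro_mgr *mgr, struct net_device *dev)
{
	mgr->dev = dev;
	mgr->features = LRO_F_NAPI;		/* deliver via netif_receive_skb */
	mgr->ip_summed = CHECKSUM_UNNECESSARY;
	mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
	mgr->max_desc = MYDRV_MAX_LRO_DESC;
	mgr->max_aggr = 32;			/* cap segments per super-packet */
	mgr->lro_arr = mydrv_lro_desc;
	mgr->get_skb_header = mydrv_get_skb_header;
}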
+ goto out; + } + } + + LRO_INC_STATS(lro_mgr, no_desc); +out: + return lro_desc; +} + +static void lro_flush(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_desc) +{ + if (lro_desc->pkt_aggr_cnt > 1) + lro_update_tcp_ip_header(lro_desc); + + skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; + + if (lro_desc->vgrp) { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + vlan_hwaccel_receive_skb(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + else + vlan_hwaccel_rx(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + + } else { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + netif_receive_skb(lro_desc->parent); + else + netif_rx(lro_desc->parent); + } + + LRO_INC_STATS(lro_mgr, flushed); + lro_clear_desc(lro_desc); +} + +static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, + struct vlan_group *vgrp, u16 vlan_tag, void *priv) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + u64 flags; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_skb_header + || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, + &flags, priv)) + goto out; + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) + && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features)) + vlan_hdr_len = VLAN_HLEN; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) + goto out; + + skb->ip_summed = lro_mgr->ip_summed_aggr; + lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp); + LRO_INC_STATS(lro_mgr, aggregated); + return 0; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) + goto out2; + + lro_add_packet(lro_desc, skb, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return 0; + +out2: /* send aggregated SKBs to stack */ + lro_flush(lro_mgr, lro_desc); + +out: /* Original SKB has to be posted to stack */ + skb->ip_summed = lro_mgr->ip_summed; + return 1; +} + + +static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + void *mac_hdr, + int hlen, __wsum sum, + u32 ip_summed) +{ + struct sk_buff *skb; + struct skb_frag_struct *skb_frags; + int data_len = len; + int hdr_len = min(len, hlen); + + skb = netdev_alloc_skb(lro_mgr->dev, hlen); + if (!skb) + return NULL; + + skb->len = len; + skb->data_len = len - hdr_len; + skb->truesize += true_size; + skb->tail += hdr_len; + + memcpy(skb->data, mac_hdr, hdr_len); + + skb_frags = skb_shinfo(skb)->frags; + while (data_len > 0) { + *skb_frags = *frags; + data_len -= frags->size; + skb_frags++; + frags++; + skb_shinfo(skb)->nr_frags++; + } + + skb_shinfo(skb)->frags[0].page_offset += hdr_len; + skb_shinfo(skb)->frags[0].size -= hdr_len; + + skb->ip_summed = ip_summed; + skb->csum = sum; + skb->protocol = eth_type_trans(skb, lro_mgr->dev); + return skb; +} + +static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + struct sk_buff *skb; + u64 flags; + void *mac_hdr; + int 
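/*
 * __lro_proc_skb() above flushes a descriptor once max_aggr segments have
 * been merged, or once one more MTU-sized segment could overflow the 16-bit
 * iph->tot_len field. A standalone restatement of that bound, assuming a
 * hypothetical helper name and that aggr_len tracks the aggregated
 * datagram length:
 */
#include <stdbool.h>

static bool lro_must_flush(unsigned int aggr_len, unsigned int pkt_aggr_cnt,
			   unsigned int max_aggr, unsigned int mtu)
{
	/* 0xFFFF is the largest value a 16-bit iph->tot_len can carry */
	return pkt_aggr_cnt >= max_aggr || aggr_len > 0xFFFF - mtu;
}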
mac_hdr_len; + int hdr_len = LRO_MAX_PG_HLEN; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_frag_header + || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, + (void *)&tcph, &flags, priv)) { + mac_hdr = page_address(frags->page) + frags->page_offset; + goto out1; + } + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out1; + + hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); + mac_hdr_len = (int)((void *)(iph) - mac_hdr); + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out1; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) + goto out1; + + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, 0, lro_mgr->ip_summed_aggr); + if (!skb) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) + && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features)) + vlan_hdr_len = VLAN_HLEN; + + iph = (void *)(skb->data + vlan_hdr_len); + tcph = (void *)((u8 *)skb->data + vlan_hdr_len + + IP_HDR_LEN(iph)); + + lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL); + LRO_INC_STATS(lro_mgr, aggregated); + return NULL; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) + goto out2; + + lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return NULL; + +out2: /* send aggregated packets to the stack */ + lro_flush(lro_mgr, lro_desc); + +out1: /* Original packet has to be posted to the stack */ + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, sum, lro_mgr->ip_summed); +out: + return skb; +} + +void lro_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + netif_receive_skb(skb); + else + netif_rx(skb); + } +} +EXPORT_SYMBOL(lro_receive_skb); + +void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + struct vlan_group *vgrp, + u16 vlan_tag, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); + } +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb); + +void lro_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0, + priv, sum); + if (!skb) + return; + + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + netif_receive_skb(skb); + else + netif_rx(skb); +} +EXPORT_SYMBOL(lro_receive_frags); + +void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp, + vlan_tag, priv, sum); + if (!skb) + return; + + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags); + +void lro_flush_all(struct net_lro_mgr *lro_mgr) 
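/*
 * The exported entry points above are what a driver's RX path calls per
 * frame; lro_flush_all(), continued below, ends an aggregation cycle.
 * A sketch of typical NAPI usage built only on those exports; the mydrv_*
 * wrappers are hypothetical:
 */
#include <linux/skbuff.h>
#include <linux/inet_lro.h>

static void mydrv_rx_one(struct net_lro_mgr *mgr, struct sk_buff *skb)
{
	/* aggregates the frame, or hands it straight to the stack */
	lro_receive_skb(mgr, skb, NULL);
}

static void mydrv_rx_done(struct net_lro_mgr *mgr)
{
	/* before leaving the poll loop, push out half-built super-packets */
	lro_flush_all(mgr);
}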
+{ + int i; + struct net_lro_desc *lro_desc = lro_mgr->lro_arr; + + for (i = 0; i < lro_mgr->max_desc; i++) { + if (lro_desc[i].active) + lro_flush(lro_mgr, &lro_desc[i]); + } +} +EXPORT_SYMBOL(lro_flush_all); + +void lro_flush_pkt(struct net_lro_mgr *lro_mgr, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (lro_desc->active) + lro_flush(lro_mgr, lro_desc); +} +EXPORT_SYMBOL(lro_flush_pkt); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2586df09b9b..4e189e28f30 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -8,7 +8,7 @@ * From code orinally in TCP */ - +#include <linux/kernel.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> @@ -292,7 +292,7 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, if (timeo >= timewait_len) { slot = INET_TWDR_TWKILL_SLOTS - 1; } else { - slot = (timeo + twdr->period - 1) / twdr->period; + slot = DIV_ROUND_UP(timeo, twdr->period); if (slot >= INET_TWDR_TWKILL_SLOTS) slot = INET_TWDR_TWKILL_SLOTS - 1; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2f44e612806..771031dfbd0 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -123,7 +123,7 @@ void __init inet_initpeers(void) peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); /* All the timers, started at system startup tend to synchronize. Perturb it a bit. @@ -158,7 +158,7 @@ static void unlink_from_unused(struct inet_peer *p) #define lookup(_daddr,_stack) \ ({ \ struct inet_peer *u, **v; \ - if (_stack) { \ + if (_stack != NULL) { \ stackptr = _stack; \ *stackptr++ = &peer_root; \ } \ @@ -169,7 +169,7 @@ static void unlink_from_unused(struct inet_peer *p) v = &u->avl_left; \ else \ v = &u->avl_right; \ - if (_stack) \ + if (_stack != NULL) \ *stackptr++ = v; \ u = *v; \ } \ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 9cb04df0054..afbf938836f 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -86,7 +86,7 @@ int ip_forward(struct sk_buff *skb) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->u.dst) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dst_mtu(&rt->u.dst))); @@ -105,7 +105,7 @@ int ip_forward(struct sk_buff *skb) * We now generate an ICMP HOST REDIRECT giving the route * we calculated. */ - if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) + if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp) ip_rt_send_redirect(skb); skb->priority = rt_tos2priority(iph->tos); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 0231bdcb2ab..fabb86db763 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -292,7 +292,7 @@ static void ip_expire(unsigned long arg) if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) { struct sk_buff *head = qp->fragments; /* Send an ICMP "Fragment Reassembly Timeout" message. 
*/ - if ((head->dev = dev_get_by_index(qp->iif)) != NULL) { + if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) { icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); dev_put(head->dev); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5c14ed63e56..f151900efaf 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -262,7 +262,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int int i; for (i=1; i<100; i++) { sprintf(name, "gre%d", i); - if (__dev_get_by_name(name) == NULL) + if (__dev_get_by_name(&init_net, name) == NULL) break; } if (i==100) @@ -684,7 +684,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error; } - if (dev->hard_header) { + if (dev->header_ops) { gre_hlen = 0; tiph = (struct iphdr*)skb->data; } else { @@ -1063,8 +1063,9 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) */ -static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - void *daddr, void *saddr, unsigned len) +static int ipgre_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); @@ -1091,6 +1092,10 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned sh return -t->hlen; } +static const struct header_ops ipgre_header_ops = { + .create = ipgre_header, +}; + static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1132,7 +1137,6 @@ static int ipgre_close(struct net_device *dev) static void ipgre_tunnel_setup(struct net_device *dev) { - SET_MODULE_OWNER(dev); dev->uninit = ipgre_tunnel_uninit; dev->destructor = free_netdev; dev->hard_start_xmit = ipgre_tunnel_xmit; @@ -1188,7 +1192,7 @@ static int ipgre_tunnel_init(struct net_device *dev) if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; - dev->hard_header = ipgre_header; + dev->header_ops = &ipgre_header_ops; dev->open = ipgre_open; dev->stop = ipgre_close; } @@ -1196,7 +1200,7 @@ static int ipgre_tunnel_init(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(tunnel->parms.link); + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 97069399d86..41d8964591e 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -382,6 +382,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct iphdr *iph; u32 len; + if (dev->nd_net != &init_net) + goto drop; + /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. 
*/ diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 251346828cb..2f14745a9e1 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -513,11 +513,8 @@ void ip_options_undo(struct ip_options * opt) static struct ip_options *ip_options_get_alloc(const int optlen) { - struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3), - GFP_KERNEL); - if (opt) - memset(opt, 0, sizeof(*opt)); - return opt; + return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3), + GFP_KERNEL); } static int ip_options_get_finish(struct ip_options **optp, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c9e2b5e6305..699f06781fd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -75,7 +75,6 @@ #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> -#include <net/checksum.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> @@ -170,7 +169,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); /* Be paranoid, rather than too clever. */ - if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); @@ -1262,6 +1261,10 @@ int ip_push_pending_frames(struct sock *sk) skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(((struct icmphdr *) + skb_transport_header(skb))->type); + /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4d544573f48..f51f20e487c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -602,7 +602,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, dev_put(dev); } } else - dev = __dev_get_by_index(mreq.imr_ifindex); + dev = __dev_get_by_index(&init_net, mreq.imr_ifindex); err = -EADDRNOTAVAIL; @@ -625,6 +625,10 @@ static int do_ip_setsockopt(struct sock *sk, int level, { struct ip_mreqn mreq; + err = -EPROTO; + if (inet_sk(sk)->is_icsk) + break; + if (optlen < sizeof(struct ip_mreq)) goto e_inval; err = -EFAULT; @@ -655,7 +659,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } msf = kmalloc(optlen, GFP_KERNEL); - if (msf == 0) { + if (!msf) { err = -ENOBUFS; break; } @@ -812,7 +816,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } gsf = kmalloc(optlen,GFP_KERNEL); - if (gsf == 0) { + if (!gsf) { err = -ENOBUFS; break; } @@ -832,7 +836,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); msf = kmalloc(msize,GFP_KERNEL); - if (msf == 0) { + if (!msf) { err = -ENOBUFS; goto mc_msf_out; } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index e787044a851..0bfeb02a5f8 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -75,7 +75,6 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -ENOMEM; - struct iphdr *iph; struct ip_comp_hdr *ipch; if (skb_linearize_cow(skb)) @@ -84,12 +83,14 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ - iph = ip_hdr(skb); ipch = (void *)skb->data; - iph->protocol = ipch->nexthdr; skb->transport_header = skb->network_header + sizeof(*ipch); __skb_pull(skb, sizeof(*ipch)); err = 
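/*
 * The ip_options_get_alloc() conversion above is one of several in this
 * patch that replace a kmalloc + memset pair with kzalloc. For reference,
 * a sketch of the equivalence; example_zalloc is a hypothetical name:
 */
#include <linux/slab.h>
#include <linux/string.h>

static void *example_zalloc(size_t sz)
{
	void *p = kmalloc(sz, GFP_KERNEL);

	if (p)
		memset(p, 0, sz);
	return p;	/* observable behaviour matches kzalloc(sz, GFP_KERNEL) */
}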
ipcomp_decompress(x, skb); + if (err) + goto out; + + err = ipch->nexthdr; out: return err; @@ -98,10 +99,9 @@ out: static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) { struct ipcomp_data *ipcd = x->data; - const int ihlen = ip_hdrlen(skb); - const int plen = skb->len - ihlen; + const int plen = skb->len; int dlen = IPCOMP_SCRATCH_SIZE; - u8 *start = skb->data + ihlen; + u8 *start = skb->data; const int cpu = get_cpu(); u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); @@ -118,7 +118,7 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); put_cpu(); - pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr)); + pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr)); return 0; out: @@ -131,12 +131,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) int err; struct ip_comp_hdr *ipch; struct ipcomp_data *ipcd = x->data; - int hdr_len = 0; - struct iphdr *iph = ip_hdr(skb); - iph->tot_len = htons(skb->len); - hdr_len = iph->ihl * 4; - if ((skb->len - hdr_len) < ipcd->threshold) { + if (skb->len < ipcd->threshold) { /* Don't bother compressing */ goto out_ok; } @@ -145,25 +141,19 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; err = ipcomp_compress(x, skb); - iph = ip_hdr(skb); if (err) { goto out_ok; } /* Install ipcomp header, convert into ipcomp datagram. */ - iph->tot_len = htons(skb->len); - ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4); - ipch->nexthdr = iph->protocol; + ipch = ip_comp_hdr(skb); + ipch->nexthdr = *skb_mac_header(skb); ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); - iph->protocol = IPPROTO_COMP; - ip_send_check(iph); - return 0; - + *skb_mac_header(skb) = IPPROTO_COMP; out_ok: - if (x->props.mode == XFRM_MODE_TUNNEL) - ip_send_check(iph); + skb_push(skb, -skb_network_offset(skb)); return 0; } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 342ca8d8945..c5c107a0182 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -55,6 +55,7 @@ #include <linux/root_dev.h> #include <linux/delay.h> #include <linux/nfs_fs.h> +#include <net/net_namespace.h> #include <net/arp.h> #include <net/ip.h> #include <net/ipconfig.h> @@ -189,11 +190,15 @@ static int __init ic_open_devs(void) rtnl_lock(); /* bring loopback device up first */ - if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0) - printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name); + for_each_netdev(&init_net, dev) { + if (!(dev->flags & IFF_LOOPBACK)) + continue; + if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + } - for_each_netdev(dev) { - if (dev == &loopback_dev) + for_each_netdev(&init_net, dev) { + if (dev->flags & IFF_LOOPBACK) continue; if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && @@ -425,6 +430,9 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; + if (dev->nd_net != &init_net) + goto drop; + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; @@ -749,8 +757,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d /* Chain packet down the line... 
*/ skb->dev = dev; skb->protocol = htons(ETH_P_IP); - if ((dev->hard_header && - dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) || + if (dev_hard_header(skb, dev, ntohs(skb->protocol), + dev->broadcast, dev->dev_addr, skb->len) < 0 || dev_queue_xmit(skb) < 0) printk("E"); } @@ -834,6 +842,9 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str struct ic_device *d; int len, ext_len; + if (dev->nd_net != &init_net) + goto drop; + /* Perform verifications before taking the lock. */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; @@ -1253,7 +1264,7 @@ static int __init ip_auto_config(void) __be32 addr; #ifdef CONFIG_PROC_FS - proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops); + proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) @@ -1281,9 +1292,9 @@ static int __init ip_auto_config(void) */ if (ic_myaddr == NONE || #ifdef CONFIG_ROOT_NFS - (MAJOR(ROOT_DEV) == UNNAMED_MAJOR - && root_server_addr == NONE - && ic_servaddr == NONE) || + (root_server_addr == NONE + && ic_servaddr == NONE + && ROOT_DEV == Root_NFS) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 396437242a1..5cd5bbe1379 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -225,7 +225,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c int i; for (i=1; i<100; i++) { sprintf(name, "tunl%d", i); - if (__dev_get_by_name(name) == NULL) + if (__dev_get_by_name(&init_net, name) == NULL) break; } if (i==100) @@ -237,7 +237,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c return NULL; nt = netdev_priv(dev); - SET_MODULE_OWNER(dev); dev->init = ipip_tunnel_init; nt->parms = *parms; @@ -775,7 +774,6 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) static void ipip_tunnel_setup(struct net_device *dev) { - SET_MODULE_OWNER(dev); dev->uninit = ipip_tunnel_uninit; dev->hard_start_xmit = ipip_tunnel_xmit; dev->get_stats = ipip_tunnel_get_stats; @@ -822,7 +820,7 @@ static int ipip_tunnel_init(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(tunnel->parms.link); + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d96582acdf6..37bb497d92a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -49,6 +49,7 @@ #include <linux/mroute.h> #include <linux/init.h> #include <linux/if_ether.h> +#include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -124,7 +125,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) { struct net_device *dev; - dev = __dev_get_by_name("tunl0"); + dev = __dev_get_by_name(&init_net, "tunl0"); if (dev) { int err; @@ -148,7 +149,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) dev = NULL; - if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) { + if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); @@ -1082,13 +1083,18 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct net_device *dev = ptr; struct vif_device *v; int ct; + + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + if (event != NETDEV_UNREGISTER) 
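/*
 * The nd_net test just added to ipmr_device_event() recurs throughout this
 * patch: with network namespaces, notifiers and packet handlers ignore
 * devices outside the initial namespace until each subsystem is converted.
 * A sketch of the guard in a hypothetical notifier:
 */
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>

static int example_device_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (dev->nd_net != &init_net)	/* not the initial namespace: ignore */
		return NOTIFY_DONE;

	if (event == NETDEV_UNREGISTER) {
		/* hypothetical per-device cleanup would run here */
	}
	return NOTIFY_DONE;
}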
return NOTIFY_DONE; v=&vif_table[0]; for (ct=0;ct<maxvif;ct++,v++) { - if (v->dev==ptr) + if (v->dev==dev) vif_delete(ct); } return NOTIFY_DONE; @@ -1708,26 +1714,8 @@ static const struct seq_operations ipmr_vif_seq_ops = { static int ipmr_vif_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &ipmr_vif_seq_ops); - if (rc) - goto out_kfree; - - s->ct = 0; - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; - + return seq_open_private(file, &ipmr_vif_seq_ops, + sizeof(struct ipmr_vif_iter)); } static const struct file_operations ipmr_vif_fops = { @@ -1871,25 +1859,8 @@ static const struct seq_operations ipmr_mfc_seq_ops = { static int ipmr_mfc_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &ipmr_mfc_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; - + return seq_open_private(file, &ipmr_mfc_seq_ops, + sizeof(struct ipmr_mfc_iter)); } static const struct file_operations ipmr_mfc_fops = { @@ -1917,12 +1888,12 @@ void __init ip_mr_init(void) mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL, NULL); + NULL); init_timer(&ipmr_expire_timer); ipmr_expire_timer.function=ipmr_expire_process; register_netdevice_notifier(&ip_mr_notifier); #ifdef CONFIG_PROC_FS - proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops); - proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops); + proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops); + proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops); #endif } diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 8d6901d4e94..341474eefa5 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -25,6 +25,7 @@ #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> +#include <net/net_namespace.h> #include <net/protocol.h> #include <net/tcp.h> #include <asm/system.h> @@ -616,12 +617,12 @@ int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, int ip_vs_app_init(void) { /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops); + proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } void ip_vs_app_cleanup(void) { - proc_net_remove("ip_vs_app"); + proc_net_remove(&init_net, "ip_vs_app"); } diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 3b446b1a6b9..4b702f708d3 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -35,6 +35,7 @@ #include <linux/jhash.h> #include <linux/random.h> +#include <net/net_namespace.h> #include <net/ip_vs.h> @@ -901,7 +902,7 @@ int ip_vs_conn_init(void) /* Allocate ip_vs_conn slab cache */ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL); if (!ip_vs_conn_cachep) { vfree(ip_vs_conn_tab); return -ENOMEM; @@ -922,7 +923,7 @@ int ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); /* calculate the random value for 
connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); @@ -938,6 +939,6 @@ void ip_vs_conn_cleanup(void) /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove("ip_vs_conn"); + proc_net_remove(&init_net, "ip_vs_conn"); vfree(ip_vs_conn_tab); } diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index f005a2f929f..fbca2a2ff29 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -961,7 +961,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, * ... don't know why 1st test DOES NOT include 2nd (?) */ if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev == &loopback_dev || skb->sk)) { + || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", skb->pkt_type, ip_hdr(skb)->protocol, diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index e1052bcf4ed..7345fc252a2 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -29,13 +29,13 @@ #include <linux/proc_fs.h> #include <linux/workqueue.h> #include <linux/swap.h> -#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> +#include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <net/sock.h> @@ -909,7 +909,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) write_lock_bh(&__ip_vs_svc_lock); /* Wait until all other svc users go away */ - while (atomic_read(&svc->usecnt) > 1) {}; + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); /* call the update_service, because server weight may be changed */ svc->scheduler->update_service(svc); @@ -1792,24 +1792,8 @@ static const struct seq_operations ip_vs_info_seq_ops = { static int ip_vs_info_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &ip_vs_info_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); } static const struct file_operations ip_vs_info_fops = { @@ -2340,6 +2324,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = { .get_optmin = IP_VS_BASE_CTL, .get_optmax = IP_VS_SO_GET_MAX+1, .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, }; @@ -2356,8 +2341,8 @@ int ip_vs_control_init(void) return ret; } - proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops); + proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); sysctl_header = register_sysctl_table(vs_root_table); @@ -2390,8 +2375,8 @@ void ip_vs_control_cleanup(void) cancel_work_sync(&defense_work.work); ip_vs_kill_estimator(&ip_vs_stats); unregister_sysctl_table(sysctl_header); - proc_net_remove("ip_vs_stats"); - proc_net_remove("ip_vs"); + proc_net_remove(&init_net, "ip_vs_stats"); + proc_net_remove(&init_net, "ip_vs"); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); } diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 6225acac7a3..6a1fec416ea 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -50,6 +50,7 @@ #include <linux/sysctl.h> /* for proc_net_create/proc_net_remove */ #include <linux/proc_fs.h> 
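/*
 * Several seq_file open handlers in this patch (ipmr, ipvs, conntrack)
 * collapse their kmalloc + seq_open + seq->private boilerplate into one
 * seq_open_private() call, which allocates a zeroed iterator for them.
 * A sketch of the resulting pattern; the example_* names and iterator
 * struct are hypothetical:
 */
#include <linux/fs.h>
#include <linux/seq_file.h>

struct example_iter_state {
	int bucket;
};

static const struct seq_operations example_seq_ops;	/* filled in elsewhere */

static int example_open(struct inode *inode, struct file *file)
{
	/* allocates a zeroed iterator and stores it in seq->private */
	return seq_open_private(file, &example_seq_ops,
				sizeof(struct example_iter_state));
}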
+#include <net/net_namespace.h> #include <net/ip_vs.h> @@ -843,7 +844,7 @@ static int __init ip_vs_lblcr_init(void) INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); sysctl_header = register_sysctl_table(lblcr_root_table); #ifdef CONFIG_IP_VS_LBLCR_DEBUG - proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); + proc_net_create(&init_net, "ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); #endif return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); } @@ -852,7 +853,7 @@ static int __init ip_vs_lblcr_init(void) static void __exit ip_vs_lblcr_cleanup(void) { #ifdef CONFIG_IP_VS_LBLCR_DEBUG - proc_net_remove("ip_vs_lblcr"); + proc_net_remove(&init_net, "ip_vs_lblcr"); #endif unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 356f067484e..1960747f354 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -387,7 +387,7 @@ static int set_mcast_if(struct sock *sk, char *ifname) struct net_device *dev; struct inet_sock *inet = inet_sk(sk); - if ((dev = __dev_get_by_name(ifname)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -412,7 +412,7 @@ static int set_sync_mesg_maxlen(int sync_state) int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - @@ -423,7 +423,7 @@ static int set_sync_mesg_maxlen(int sync_state) IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) return -ENODEV; sync_recv_mesg_maxlen = dev->mtu - @@ -451,7 +451,7 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(ifname)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -472,7 +472,7 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(ifname)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 900ce29db38..666e080a74a 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -128,7 +128,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT(skb, rt) \ do { \ (skb)->ipvs_property = 1; \ - (skb)->ip_summed = CHECKSUM_NONE; \ + skb_forward_csum(skb); \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ } while (0) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1149aba935..29114a9ccd1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1161,6 +1161,7 @@ static struct nf_sockopt_ops arpt_sockopts = { .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, + .owner = THIS_MODULE, }; static int __init arp_tables_init(void) diff --git 
a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 702d94db19b..23cbfc7c80f 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -24,6 +24,7 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/mutex.h> +#include <net/net_namespace.h> #include <net/sock.h> #include <net/route.h> @@ -249,10 +250,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) if (entry->info->indev && entry->skb->dev) { pmsg->hw_type = entry->skb->dev->type; - if (entry->skb->dev->hard_header_parse) - pmsg->hw_addrlen = - entry->skb->dev->hard_header_parse(entry->skb, - pmsg->hw_addr); + pmsg->hw_addrlen = dev_parse_header(entry->skb, + pmsg->hw_addr); } if (data_len) @@ -476,7 +475,7 @@ ipq_dev_drop(int ifindex) #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) static inline void -ipq_rcv_skb(struct sk_buff *skb) +__ipq_rcv_skb(struct sk_buff *skb) { int status, type, pid, flags, nlmsglen, skblen; struct nlmsghdr *nlh; @@ -534,19 +533,10 @@ ipq_rcv_skb(struct sk_buff *skb) } static void -ipq_rcv_sk(struct sock *sk, int len) +ipq_rcv_skb(struct sk_buff *skb) { - struct sk_buff *skb; - unsigned int qlen; - mutex_lock(&ipqnl_mutex); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - ipq_rcv_skb(skb); - kfree_skb(skb); - } - + __ipq_rcv_skb(skb); mutex_unlock(&ipqnl_mutex); } @@ -556,6 +546,9 @@ ipq_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); @@ -575,7 +568,7 @@ ipq_rcv_nl_event(struct notifier_block *this, if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL && n->pid) { write_lock_bh(&queue_lock); - if (n->pid == peer_pid) + if ((n->net == &init_net) && (n->pid == peer_pid)) __ipq_reset(); write_unlock_bh(&queue_lock); } @@ -667,14 +660,14 @@ static int __init ip_queue_init(void) struct proc_dir_entry *proc; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, - NULL, THIS_MODULE); + ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, + ipq_rcv_skb, NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; } - proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info); + proc = proc_net_create(&init_net, IPQ_PROC_FS_NAME, 0, ipq_get_info); if (proc) proc->owner = THIS_MODULE; else { @@ -695,8 +688,7 @@ static int __init ip_queue_init(void) cleanup_sysctl: unregister_sysctl_table(ipq_sysctl_header); unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(IPQ_PROC_FS_NAME); - + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); cleanup_ipqnl: sock_release(ipqnl->sk_socket); mutex_lock(&ipqnl_mutex); @@ -715,7 +707,7 @@ static void __exit ip_queue_fini(void) unregister_sysctl_table(ipq_sysctl_header); unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(IPQ_PROC_FS_NAME); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); sock_release(ipqnl->sk_socket); mutex_lock(&ipqnl_mutex); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e1b402c6b85..6486894f450 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -2296,6 +2296,7 @@ static struct nf_sockopt_ops ipt_sockopts = { #ifdef CONFIG_COMPAT .compat_get = 
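/*
 * The ip_queue conversion above shows the new kernel-netlink input model:
 * the socket gets a per-skb callback and does its own serialization,
 * instead of the old (sock, len) hook that drained sk_receive_queue by
 * hand. A minimal sketch; MY_NETLINK_PROTO and the handler are
 * hypothetical:
 */
#include <linux/mutex.h>
#include <linux/netlink.h>
#include <net/netlink.h>
#include <net/net_namespace.h>

#define MY_NETLINK_PROTO 31	/* hypothetical protocol number */

static DEFINE_MUTEX(example_nl_mutex);

static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	return 0;	/* hypothetical: accept every request */
}

static void example_nl_rcv(struct sk_buff *skb)
{
	mutex_lock(&example_nl_mutex);
	netlink_rcv_skb(skb, &example_rcv_msg);
	mutex_unlock(&example_nl_mutex);
}

/*
 * Created with:
 *   netlink_kernel_create(&init_net, MY_NETLINK_PROTO, 0,
 *                         example_nl_rcv, NULL, THIS_MODULE);
 */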
compat_do_ipt_get_ctl, #endif + .owner = THIS_MODULE, }; static struct xt_match icmp_matchstruct __read_mostly = { diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index dcc12b18347..27f14e1ebd8 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -19,13 +19,13 @@ #include <linux/udp.h> #include <linux/icmp.h> #include <linux/if_arp.h> -#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> #include <net/netfilter/nf_conntrack.h> +#include <net/net_namespace.h> #include <net/checksum.h> #define CLUSTERIP_VERSION "0.8" @@ -401,7 +401,7 @@ checkentry(const char *tablename, return false; } - dev = dev_get_by_name(e->ip.iniface); + dev = dev_get_by_name(&init_net, e->ip.iniface); if (!dev) { printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); return false; @@ -727,7 +727,7 @@ static int __init ipt_clusterip_init(void) goto cleanup_target; #ifdef CONFIG_PROC_FS - clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net); + clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); if (!clusterip_procdir) { printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); ret = -ENOMEM; diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 5937ad150b9..127a5e89bf1 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -479,10 +479,8 @@ static int __init ipt_log_init(void) ret = xt_register_target(&ipt_log_reg); if (ret < 0) return ret; - ret = nf_log_register(PF_INET, &ipt_log_logger); - if (ret < 0 && ret != -EEXIST) - xt_unregister_target(&ipt_log_reg); - return ret; + nf_log_register(PF_INET, &ipt_log_logger); + return 0; } static void __exit ipt_log_fini(void) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 7c4e4be7c8b..3e0b562b2db 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -125,6 +125,9 @@ static int masq_device_event(struct notifier_block *this, { const struct net_device *dev = ptr; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + if (event == NETDEV_DOWN) { /* Device was downed. 
Search entire table for conntracks which were associated with that device, diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 6ca43e4ca7e..c636d6d6357 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -409,7 +409,8 @@ static int __init ipt_ulog_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, + nflognl = netlink_kernel_create(&init_net, + NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 32180431565..11d39fb5f38 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -24,6 +24,7 @@ #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/inet.h> +#include <net/net_namespace.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_recent.h> @@ -380,20 +381,14 @@ static const struct seq_operations recent_seq_ops = { static int recent_seq_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); - struct seq_file *seq; struct recent_iter_state *st; - int ret; - st = kzalloc(sizeof(*st), GFP_KERNEL); + st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); if (st == NULL) return -ENOMEM; - ret = seq_open(file, &recent_seq_ops); - if (ret) - kfree(st); + st->table = pde->data; - seq = file->private_data; - seq->private = st; - return ret; + return 0; } static ssize_t recent_proc_write(struct file *file, const char __user *input, @@ -482,7 +477,7 @@ static int __init ipt_recent_init(void) #ifdef CONFIG_PROC_FS if (err) return err; - proc_dir = proc_mkdir("ipt_recent", proc_net); + proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); if (proc_dir == NULL) { xt_unregister_match(&recent_match); err = -ENOMEM; @@ -496,7 +491,7 @@ static void __exit ipt_recent_exit(void) BUG_ON(!list_empty(&tables)); xt_unregister_match(&recent_match); #ifdef CONFIG_PROC_FS - remove_proc_entry("ipt_recent", proc_net); + remove_proc_entry("ipt_recent", init_net.proc_net); #endif } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 64552afd01c..2fcb9249a8d 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -87,14 +87,10 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, if (iph == NULL) return -NF_DROP; - /* Never happen */ - if (iph->frag_off & htons(IP_OFFSET)) { - if (net_ratelimit()) { - printk(KERN_ERR "ipv4_get_l4proto: Frag of proto %u\n", - iph->protocol); - } + /* Conntrack defragments packets, we might still see fragments + * inside ICMP packets though. 
*/ + if (iph->frag_off & htons(IP_OFFSET)) return -NF_DROP; - } *dataoff = nhoff + (iph->ihl << 2); *protonum = iph->protocol; @@ -364,35 +360,32 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> -static int ipv4_tuple_to_nfattr(struct sk_buff *skb, +static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), + NLA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.u3.ip); - NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), + NLA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.u3.ip); return 0; -nfattr_failure: +nla_put_failure: return -1; } -static const size_t cta_min_ip[CTA_IP_MAX] = { - [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), - [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { + [CTA_IP_V4_SRC] = { .type = NLA_U32 }, + [CTA_IP_V4_DST] = { .type = NLA_U32 }, }; -static int ipv4_nfattr_to_tuple(struct nfattr *tb[], +static int ipv4_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t) { - if (!tb[CTA_IP_V4_SRC-1] || !tb[CTA_IP_V4_DST-1]) - return -EINVAL; - - if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; - t->src.u3.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); - t->dst.u3.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_DST]); return 0; } @@ -403,6 +396,7 @@ static struct nf_sockopt_ops so_getorigdst = { .get_optmin = SO_ORIGINAL_DST, .get_optmax = SO_ORIGINAL_DST+1, .get = &getorigdst, + .owner = THIS_MODULE, }; struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { @@ -414,8 +408,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .print_conntrack = ipv4_print_conntrack, .get_l4proto = ipv4_get_l4proto, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .tuple_to_nfattr = ipv4_tuple_to_nfattr, - .nfattr_to_tuple = ipv4_nfattr_to_tuple, + .tuple_to_nlattr = ipv4_tuple_to_nlattr, + .nlattr_to_tuple = ipv4_nlattr_to_tuple, + .nla_policy = ipv4_nla_policy, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path, @@ -509,3 +504,9 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) module_init(nf_conntrack_l3proto_ipv4_init); module_exit(nf_conntrack_l3proto_ipv4_fini); + +void need_ipv4_conntrack(void) +{ + return; +} +EXPORT_SYMBOL_GPL(need_ipv4_conntrack); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 3da9d73d1b5..741f3dfaa5a 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> +#include <net/net_namespace.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack_core.h> @@ -173,23 +174,8 @@ static const struct seq_operations ct_seq_ops = { static int ct_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct ct_iter_state *st; - int ret; - - st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); - if (st == NULL) - return -ENOMEM; - ret = seq_open(file, &ct_seq_ops); - if (ret) - goto out_free; - seq = file->private_data; 
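/*
 * The nfattr -> nlattr conversion above replaces hand-rolled size tables
 * (cta_min_ip) with declarative per-attribute validation via nla_policy,
 * and drops the error-prone "-1" array indexing. A sketch of the pattern
 * with hypothetical EX_ATTR_* attribute ids:
 */
#include <net/netlink.h>

enum {
	EX_ATTR_UNSPEC,
	EX_ATTR_ADDR,
	__EX_ATTR_MAX
};
#define EX_ATTR_MAX (__EX_ATTR_MAX - 1)

static const struct nla_policy ex_policy[EX_ATTR_MAX + 1] = {
	[EX_ATTR_ADDR] = { .type = NLA_U32 },	/* length implied by type */
};

static int ex_put_addr(struct sk_buff *skb, __be32 addr)
{
	NLA_PUT(skb, EX_ATTR_ADDR, sizeof(addr), &addr);
	return 0;

nla_put_failure:
	return -1;
}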
- seq->private = st; - memset(st, 0, sizeof(struct ct_iter_state)); - return ret; -out_free: - kfree(st); - return ret; + return seq_open_private(file, &ct_seq_ops, + sizeof(struct ct_iter_state)); } static const struct file_operations ct_file_ops = { @@ -291,23 +277,8 @@ static const struct seq_operations exp_seq_ops = { static int exp_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct ct_expect_iter_state *st; - int ret; - - st = kmalloc(sizeof(struct ct_expect_iter_state), GFP_KERNEL); - if (st == NULL) - return -ENOMEM; - ret = seq_open(file, &exp_seq_ops); - if (ret) - goto out_free; - seq = file->private_data; - seq->private = st; - memset(st, 0, sizeof(struct ct_expect_iter_state)); - return ret; -out_free: - kfree(st); - return ret; + return seq_open_private(file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); } static const struct file_operations ip_exp_file_ops = { @@ -410,16 +381,16 @@ int __init nf_conntrack_ipv4_compat_init(void) { struct proc_dir_entry *proc, *proc_exp, *proc_stat; - proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); + proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, &ip_exp_file_ops); if (!proc_exp) goto err2; - proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); + proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, init_net.proc_net_stat); if (!proc_stat) goto err3; @@ -429,16 +400,16 @@ int __init nf_conntrack_ipv4_compat_init(void) return 0; err3: - proc_net_remove("ip_conntrack_expect"); + proc_net_remove(&init_net, "ip_conntrack_expect"); err2: - proc_net_remove("ip_conntrack"); + proc_net_remove(&init_net, "ip_conntrack"); err1: return -ENOMEM; } void __exit nf_conntrack_ipv4_compat_fini(void) { - remove_proc_entry("ip_conntrack", proc_net_stat); - proc_net_remove("ip_conntrack_expect"); - proc_net_remove("ip_conntrack"); + remove_proc_entry("ip_conntrack", init_net.proc_net_stat); + proc_net_remove(&init_net, "ip_conntrack_expect"); + proc_net_remove(&init_net, "ip_conntrack"); } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 6593fd2c5b1..11fedc73049 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -232,45 +232,42 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> -static int icmp_tuple_to_nfattr(struct sk_buff *skb, +static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + NLA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), &t->src.u.icmp.id); - NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + NLA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), &t->dst.u.icmp.type); - NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + NLA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), &t->dst.u.icmp.code); return 0; -nfattr_failure: +nla_put_failure: return -1; } -static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t) +static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_CODE] = { .type = 
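/*
 * The proc registrations above now name the namespace explicitly
 * (&init_net), the same conversion applied across ipvs, ipmr and ipconfig
 * in this patch. A sketch of registering and removing a namespace-aware
 * proc entry; example_fops and the file name are hypothetical:
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <net/net_namespace.h>

static const struct file_operations example_fops;	/* hypothetical */

static int __init example_proc_init(void)
{
	if (!proc_net_fops_create(&init_net, "example", 0444, &example_fops))
		return -ENOMEM;
	return 0;
}

static void __exit example_proc_exit(void)
{
	proc_net_remove(&init_net, "example");
}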
NLA_U8 }, + [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, }; -static int icmp_nfattr_to_tuple(struct nfattr *tb[], +static int icmp_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple) { - if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1] - || !tb[CTA_PROTO_ICMP_ID-1]) - return -EINVAL; - - if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + if (!tb[CTA_PROTO_ICMP_TYPE] + || !tb[CTA_PROTO_ICMP_CODE] + || !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; tuple->dst.u.icmp.type = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_TYPE]); tuple->dst.u.icmp.code = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_CODE]); tuple->src.u.icmp.id = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + *(__be16 *)nla_data(tb[CTA_PROTO_ICMP_ID]); if (tuple->dst.u.icmp.type >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type]) @@ -327,8 +324,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .destroy = NULL, .me = NULL, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .tuple_to_nfattr = icmp_tuple_to_nfattr, - .nfattr_to_tuple = icmp_nfattr_to_tuple, + .tuple_to_nlattr = icmp_tuple_to_nlattr, + .nlattr_to_tuple = icmp_nlattr_to_tuple, + .nla_policy = icmp_nla_policy, #endif #ifdef CONFIG_SYSCTL .ctl_table_header = &icmp_sysctl_header, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index e848d8d6292..7221aa20e6f 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -77,7 +77,8 @@ static inline unsigned int hash_by_src(const struct nf_conntrack_tuple *tuple) { /* Original src, to ensure we map it consistently if poss. */ - return jhash_3words((__force u32)tuple->src.u3.ip, tuple->src.u.all, + return jhash_3words((__force u32)tuple->src.u3.ip, + (__force u32)tuple->src.u.all, tuple->dst.protonum, 0) % nf_nat_htable_size; } @@ -543,46 +544,46 @@ EXPORT_SYMBOL(nf_nat_protocol_unregister); #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) int -nf_nat_port_range_to_nfattr(struct sk_buff *skb, +nf_nat_port_range_to_nlattr(struct sk_buff *skb, const struct nf_nat_range *range) { - NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), + NLA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), &range->min.tcp.port); - NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), + NLA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), &range->max.tcp.port); return 0; -nfattr_failure: +nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nf_nat_port_nfattr_to_range); +EXPORT_SYMBOL_GPL(nf_nat_port_nlattr_to_range); int -nf_nat_port_nfattr_to_range(struct nfattr *tb[], struct nf_nat_range *range) +nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { int ret = 0; /* we have to return whether we actually parsed something or not */ - if (tb[CTA_PROTONAT_PORT_MIN-1]) { + if (tb[CTA_PROTONAT_PORT_MIN]) { ret = 1; range->min.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); + *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MIN]); } - if (!tb[CTA_PROTONAT_PORT_MAX-1]) { + if (!tb[CTA_PROTONAT_PORT_MAX]) { if (ret) range->max.tcp.port = range->min.tcp.port; } else { ret = 1; range->max.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); + *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MAX]); } return ret; } -EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nfattr); +EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nlattr); #endif /* Noone using conntrack by the time this called. 
*/ diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 2e40cc83526..d562290b182 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -142,8 +142,8 @@ static struct nf_nat_protocol gre __read_mostly = { .in_range = gre_in_range, .unique_tuple = gre_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index f71ef9b5f42..898d7377115 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -79,7 +79,7 @@ struct nf_nat_protocol nf_nat_protocol_icmp = { .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index 123c95913f2..5bbbb2acdc7 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -145,7 +145,7 @@ struct nf_nat_protocol nf_nat_protocol_tcp = { .in_range = tcp_in_range, .unique_tuple = tcp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index 1c4c70e25cd..a0af4fd9558 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -135,7 +135,7 @@ struct nf_nat_protocol nf_nat_protocol_udp = { .in_range = udp_in_range, .unique_tuple = udp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 0f45427e5fd..76ec59ae524 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -192,7 +192,7 @@ alloc_null_binding_confirmed(struct nf_conn *ct, unsigned int hooknum) = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - u_int16_t all + __be16 all = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? 
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all); diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index a889ec3ec83..e14d41976c2 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -104,7 +104,7 @@ static unsigned int ip_nat_sip(struct sk_buff **pskb, dataoff = ip_hdrlen(*pskb) + sizeof(struct udphdr); datalen = (*pskb)->len - dataoff; if (datalen < sizeof("SIP/2.0") - 1) - return NF_DROP; + return NF_ACCEPT; addr_map_init(ct, &map); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 332814dac50..46cc99def16 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -328,7 +328,7 @@ static int __init nf_nat_standalone_init(void) { int ret = 0; - need_conntrack(); + need_ipv4_conntrack(); #ifdef CONFIG_XFRM BUG_ON(ip_nat_decode_session != NULL); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3b690cf2a4e..e5b05b03910 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -34,6 +34,7 @@ * 2 of the License, or (at your option) any later version. */ #include <linux/types.h> +#include <net/net_namespace.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/tcp.h> @@ -123,33 +124,30 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { static const struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), - SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS), - SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS), - SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS), - SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS), - SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS), - SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS), - SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS), - SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS), - SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS), - SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS), - SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS), SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS), SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS), - SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS), - SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS), - SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS), - SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS), - SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS), - SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS), - SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS), - SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS), - SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS), - SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS), - SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS), SNMP_MIB_SENTINEL }; +static struct { + char *name; + int index; +} icmpmibmap[] = { + { "DestUnreachs", ICMP_DEST_UNREACH }, + { "TimeExcds", ICMP_TIME_EXCEEDED }, + { "ParmProbs", ICMP_PARAMETERPROB }, + { "SrcQuenchs", ICMP_SOURCE_QUENCH }, + { "Redirects", ICMP_REDIRECT }, + { "Echos", ICMP_ECHO }, + { "EchoReps", ICMP_ECHOREPLY }, + { "Timestamps", ICMP_TIMESTAMP }, + { "TimestampReps", ICMP_TIMESTAMPREPLY }, + { "AddrMasks", ICMP_ADDRESS }, + { "AddrMaskReps", ICMP_ADDRESSREPLY }, + { NULL, 0 } +}; + + static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), @@ -244,9 +242,79 @@ static const struct snmp_mib 
snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), + SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), + SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), + SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_SENTINEL }; +static void icmpmsg_put(struct seq_file *seq) +{ +#define PERLINE 16 + + int j, i, count; + static int out[PERLINE]; + + count = 0; + for (i = 0; i < ICMPMSG_MIB_MAX; i++) { + + if (snmp_fold_field((void **) icmpmsg_statistics, i)) + out[count++] = i; + if (count < PERLINE) + continue; + + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < PERLINE; ++j) + seq_printf(seq, " %sType%u", i & 0x100 ? "Out" : "In", + i & 0xff); + seq_printf(seq, "\nIcmpMsg: "); + for (j = 0; j < PERLINE; ++j) + seq_printf(seq, " %lu", + snmp_fold_field((void **) icmpmsg_statistics, + out[j])); + seq_putc(seq, '\n'); + } + if (count) { + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < count; ++j) + seq_printf(seq, " %sType%u", out[j] & 0x100 ? "Out" : + "In", out[j] & 0xff); + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < count; ++j) + seq_printf(seq, " %lu", snmp_fold_field((void **) + icmpmsg_statistics, out[j])); + } + +#undef PERLINE +} + +static void icmp_put(struct seq_file *seq) +{ + int i; + + seq_puts(seq, "\nIcmp: InMsgs InErrors"); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " In%s", icmpmibmap[i].name); + seq_printf(seq, " OutMsgs OutErrors"); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " Out%s", icmpmibmap[i].name); + seq_printf(seq, "\nIcmp: %lu %lu", + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void **) icmpmsg_statistics, + icmpmibmap[i].index)); + seq_printf(seq, " %lu %lu", + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void **) icmpmsg_statistics, + icmpmibmap[i].index)); +} + /* * Called from the PROCfs module. This outputs /proc/net/snmp. 
*/ @@ -267,15 +335,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) snmp_fold_field((void **)ip_statistics, snmp4_ipstats_list[i].entry)); - seq_puts(seq, "\nIcmp:"); - for (i = 0; snmp4_icmp_list[i].name != NULL; i++) - seq_printf(seq, " %s", snmp4_icmp_list[i].name); - - seq_puts(seq, "\nIcmp:"); - for (i = 0; snmp4_icmp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field((void **)icmp_statistics, - snmp4_icmp_list[i].entry)); + icmp_put(seq); /* RFC 2011 compatibility */ + icmpmsg_put(seq); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name != NULL; i++) @@ -332,6 +393,8 @@ static const struct file_operations snmp_seq_fops = { .release = single_release, }; + + /* * Output /proc/net/netstat */ @@ -380,20 +443,20 @@ int __init ip_misc_proc_init(void) { int rc = 0; - if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) + if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops)) goto out_netstat; - if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) + if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) goto out_snmp; - if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) + if (!proc_net_fops_create(&init_net, "sockstat", S_IRUGO, &sockstat_seq_fops)) goto out_sockstat; out: return rc; out_sockstat: - proc_net_remove("snmp"); + proc_net_remove(&init_net, "snmp"); out_snmp: - proc_net_remove("netstat"); + proc_net_remove(&init_net, "netstat"); out_netstat: rc = -ENOMEM; goto out; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 24d7c9f3191..3916faca3af 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -59,6 +59,7 @@ #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> +#include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> #include <linux/gfp.h> @@ -313,6 +314,9 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(((struct icmphdr *) + skb_transport_header(skb))->type); err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); @@ -898,24 +902,8 @@ static const struct seq_operations raw_seq_ops = { static int raw_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - rc = seq_open(file, &raw_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; - memset(s, 0, sizeof(*s)); -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &raw_seq_ops, + sizeof(struct raw_iter_state)); } static const struct file_operations raw_seq_fops = { @@ -928,13 +916,13 @@ static const struct file_operations raw_seq_fops = { int __init raw_proc_init(void) { - if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) + if (!proc_net_fops_create(&init_net, "raw", S_IRUGO, &raw_seq_fops)) return -ENOMEM; return 0; } void __init raw_proc_exit(void) { - proc_net_remove("raw"); + proc_net_remove(&init_net, "raw"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 88fa648d7ba..21b12de9e65 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -81,6 +81,7 @@ #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> +#include <linux/workqueue.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> @@ -91,6 +92,7 @@ #include <linux/jhash.h> 
#include <linux/rcupdate.h> #include <linux/times.h> +#include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> #include <net/route.h> @@ -135,7 +137,8 @@ static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) static struct timer_list rt_flush_timer; -static struct timer_list rt_periodic_timer; +static void rt_check_expire(struct work_struct *work); +static DECLARE_DELAYED_WORK(expires_work, rt_check_expire); static struct timer_list rt_secret_timer; /* @@ -243,7 +246,7 @@ static spinlock_t *rt_hash_locks; static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; -static int rt_hash_log; +static unsigned int rt_hash_log; static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); @@ -372,23 +375,8 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - rc = seq_open(file, &rt_cache_seq_ops); - if (rc) - goto out_kfree; - seq = file->private_data; - seq->private = s; - memset(s, 0, sizeof(*s)); -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &rt_cache_seq_ops, + sizeof(struct rt_cache_iter_state)); } static const struct file_operations rt_cache_seq_fops = { @@ -571,33 +559,32 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) (fl1->iif ^ fl2->iif)) == 0; } -/* This runs via a timer and thus is always in BH context. */ -static void rt_check_expire(unsigned long dummy) +static void rt_check_expire(struct work_struct *work) { static unsigned int rover; unsigned int i = rover, goal; struct rtable *rth, **rthp; - unsigned long now = jiffies; u64 mult; mult = ((u64)ip_rt_gc_interval) << rt_hash_log; if (ip_rt_gc_timeout > 1) do_div(mult, ip_rt_gc_timeout); goal = (unsigned int)mult; - if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + if (goal > rt_hash_mask) + goal = rt_hash_mask + 1; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - if (*rthp == 0) + if (*rthp == NULL) continue; - spin_lock(rt_hash_lock_addr(i)); + spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ - if (time_before_eq(now, rth->u.dst.expires)) { + if (time_before_eq(jiffies, rth->u.dst.expires)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; continue; @@ -612,14 +599,10 @@ static void rt_check_expire(unsigned long dummy) *rthp = rth->u.dst.rt_next; rt_free(rth); } - spin_unlock(rt_hash_lock_addr(i)); - - /* Fallback loop breaker. 
*/ - if (time_after(jiffies, now)) - break; + spin_unlock_bh(rt_hash_lock_addr(i)); } rover = i; - mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); + schedule_delayed_work(&expires_work, ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter @@ -1404,8 +1387,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != &loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(&loopback_dev); + if (dev != init_net.loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1557,7 +1540,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; + rth->u.dst.dev = init_net.loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -1814,7 +1797,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - loopback_dev.ifindex, + init_net.loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1881,7 +1864,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; + rth->u.dst.dev = init_net.loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -2151,7 +2134,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = loopback_dev.ifindex, + .iif = init_net.loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2212,7 +2195,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) if (oldflp->oif) { - dev_out = dev_get_by_index(oldflp->oif); + dev_out = dev_get_by_index(&init_net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; @@ -2245,9 +2228,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = init_net.loopback_dev; dev_hold(dev_out); - fl.oif = loopback_dev.ifindex; + fl.oif = init_net.loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2292,7 +2275,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = init_net.loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) @@ -2591,7 +2574,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (iif) { struct net_device *dev; - dev = __dev_get_by_index(iif); + dev = __dev_get_by_index(&init_net, iif); if (dev == NULL) { err = -ENODEV; goto errout_free; @@ -2967,7 +2950,7 @@ int __init ip_rt_init(void) ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; @@ -2992,17 +2975,14 @@ int __init ip_rt_init(void) init_timer(&rt_flush_timer); rt_flush_timer.function = rt_run_flush; - 
init_timer(&rt_periodic_timer); - rt_periodic_timer.function = rt_check_expire; init_timer(&rt_secret_timer); rt_secret_timer.function = rt_secret_rebuild; /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ - rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval + - ip_rt_gc_interval; - add_timer(&rt_periodic_timer); + schedule_delayed_work(&expires_work, + net_random() % ip_rt_gc_interval + ip_rt_gc_interval); rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + ip_rt_secret_interval; @@ -3011,15 +2991,15 @@ int __init ip_rt_init(void) #ifdef CONFIG_PROC_FS { struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ - if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || + if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) || !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, - proc_net_stat))) { + init_net.proc_net_stat))) { return -ENOMEM; } rtstat_pde->proc_fops = &rt_cpu_seq_fops; } #ifdef CONFIG_NET_CLS_ROUTE - create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); + create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL); #endif #endif #ifdef CONFIG_XFRM diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 53ef0f4bbda..eb286abcf5d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include <linux/sysctl.h> #include <linux/igmp.h> #include <linux/inetdevice.h> +#include <linux/seqlock.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -89,6 +90,74 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +extern seqlock_t sysctl_port_range_lock; +extern int sysctl_local_port_range[2]; + +/* Update system visible IP port range */ +static void set_local_port_range(int range[2]) +{ + write_seqlock(&sysctl_port_range_lock); + sysctl_local_port_range[0] = range[0]; + sysctl_local_port_range[1] = range[1]; + write_sequnlock(&sysctl_port_range_lock); +} + +/* Validate changes from /proc interface. */ +static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + int range[2] = { sysctl_local_port_range[0], + sysctl_local_port_range[1] }; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_local_port_range_min, + .extra2 = &ip_local_port_range_max, + }; + + ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); + + if (write && ret == 0) { + if (range[1] <= range[0]) + ret = -EINVAL; + else + set_local_port_range(range); + } + + return ret; +} + +/* Validate changes from sysctl interface. 
*/ +static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + int ret; + int range[2] = { sysctl_local_port_range[0], + sysctl_local_port_range[1] }; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_local_port_range_min, + .extra2 = &ip_local_port_range_max, + }; + + ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); + if (ret == 0 && newval && newlen) { + if (range[1] <= range[0]) + ret = -EINVAL; + else + set_local_port_range(range); + } + return ret; +} + + static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -427,10 +496,8 @@ ctl_table ipv4_table[] = { .data = &sysctl_local_port_range, .maxlen = sizeof(sysctl_local_port_range), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = ip_local_port_range_min, - .extra2 = ip_local_port_range_max + .proc_handler = &ipv4_local_port_range, + .strategy = &ipv4_sysctl_local_port_range, }, { .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 987b94403be..4f322003835 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -247,6 +247,7 @@ * TCP_CLOSE socket is finished */ +#include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> @@ -658,9 +659,10 @@ static inline int select_size(struct sock *sk) return tmp; } -int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { + struct sock *sk = sock->sk; struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -2013,7 +2015,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; @@ -2029,8 +2031,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - info->tcpi_unacked = tp->packets_out; - info->tcpi_sacked = tp->sacked_out; + if (sk->sk_state == TCP_LISTEN) { + info->tcpi_unacked = sk->sk_ack_backlog; + info->tcpi_sacked = sk->sk_max_ack_backlog; + } else { + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + } info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; @@ -2209,7 +2216,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) goto out; mss = skb_shinfo(skb)->gso_size; - skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); segs = NULL; goto out; @@ -2430,7 +2437,7 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* Size and allocate the main established and bind bucket * hash tables. 
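Before moving on to the congestion-control modules below, a note on the sysctl_net_ipv4.c hunks above: publishing the port range under a seqlock guarantees that a reader can never observe a half-updated pair such as { new_low, old_high }. The patch shows only the write side; the matching lock-free reader would follow the usual seqbegin/retry pattern. A minimal sketch, with the helper name assumed and the declarations copied from the hunk above:

#include <linux/seqlock.h>

extern seqlock_t sysctl_port_range_lock;
extern int sysctl_local_port_range[2];

/* Hypothetical reader: retry until the writer did not race with us, so
 * that *low and *high are guaranteed to come from the same update. */
static void get_local_port_range(int *low, int *high)
{
	unsigned seq;

	do {
		seq = read_seqbegin(&sysctl_port_range_lock);
		*low = sysctl_local_port_range[0];
		*high = sysctl_local_port_range[1];
	} while (read_seqretry(&sysctl_port_range_lock, seq));
}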
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index dd9ef65ad3f..5dba0fc8f57 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -137,7 +137,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } static void bictcp_cong_avoid(struct sock *sk, u32 ack, - u32 seq_rtt, u32 in_flight, int data_acked) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -206,11 +206,11 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) +static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1260e52ad77..55fca1820c3 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -324,8 +324,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, - int flag) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ebfaac2f9f4..80bd084a9f9 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -246,38 +246,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } - -/* Keep track of minimum rtt */ -static inline void measure_delay(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); - u32 delay; - - /* No time stamp */ - if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - (s32)(tcp_time_stamp - ca->epoch_start) < HZ) - return; - - delay = (tcp_time_stamp - tp->rx_opt.rcv_tsecr)<<3; - if (delay == 0) - delay = 1; - - /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) - ca->delay_min = delay; -} - static void bictcp_cong_avoid(struct sock *sk, u32 ack, - u32 seq_rtt, u32 in_flight, int data_acked) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (data_acked) - measure_delay(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) return; @@ -334,17 +308,33 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) +static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { - struct bictcp *ca = inet_csk_ca(sk); + if (icsk->icsk_ca_state == TCP_CA_Open) { cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; } -} + /* Some calls are for duplicates without timetamps */ + if (rtt_us < 0) + return; + + /* Discard delay samples right after fast recovery */ + if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + + delay = usecs_to_jiffies(rtt_us) << 3; + if (delay == 0) + delay = 1; + + /* first time call or 
link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; +} static struct tcp_congestion_ops cubictcp = { .init = bictcp_init, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 57c5f0b10e6..3904d2158a9 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -25,11 +25,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; - if (sk->sk_state == TCP_LISTEN) + if (sk->sk_state == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; - else + r->idiag_wqueue = sk->sk_max_ack_backlog; + } else { r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->idiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_wqueue = tp->write_seq - tp->snd_una; + } if (info != NULL) tcp_get_info(sk, info); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 43d624e5043..14a073d8b60 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,7 +109,7 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4ba4a7ae0a8..5215691f276 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -76,20 +76,17 @@ static u32 htcp_cwnd_undo(struct sock *sk) return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); } -static inline void measure_rtt(struct sock *sk) +static inline void measure_rtt(struct sock *sk, u32 srtt) { const struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - u32 srtt = tp->srtt >> 3; /* keep track of minimum RTT seen so far, minRTT is zero at first */ if (ca->minRTT > srtt || !ca->minRTT) ca->minRTT = srtt; /* max RTT */ - if (icsk->icsk_ca_state == TCP_CA_Open - && tp->snd_ssthresh < 0xFFFF && htcp_ccount(ca) > 3) { + if (icsk->icsk_ca_state == TCP_CA_Open) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; if (ca->maxRTT < srtt @@ -98,7 +95,7 @@ static inline void measure_rtt(struct sock *sk) } } -static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, ktime_t last) +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); @@ -108,6 +105,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, ktime_t if (icsk->icsk_ca_state == TCP_CA_Open) ca->pkts_acked = pkts_acked; + if (rtt > 0) + measure_rtt(sk, usecs_to_jiffies(rtt)); + if (!use_bandwidth_switch) return; @@ -225,7 +225,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -237,8 +237,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); else { - measure_rtt(sk); - /* In dangerous area, increase slowly. 
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index e5be3511722..b3e55cf5617 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -85,7 +85,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -103,7 +103,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, return; if (!ca->hybla_en) - return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight, flag); if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index b2b2256d3b8..64f1cbaf96e 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -83,18 +83,16 @@ static void tcp_illinois_init(struct sock *sk) } /* Measure RTT for each ack. */ -static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) +static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt) { struct illinois *ca = inet_csk_ca(sk); - u32 rtt; ca->acked = pkts_acked; - if (ktime_equal(last, net_invalid_timestamp())) + /* dup ack, no rtt sample */ + if (rtt < 0) return; - rtt = ktime_to_us(net_timedelta(last)); - /* ignore bogus values, this prevents wraparound in alpha math */ if (rtt > RTT_MAX) rtt = RTT_MAX; @@ -258,7 +256,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 rtt, +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4e5884ac8f2..0a42e934034 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; @@ -102,19 +102,20 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. 
*/ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ +#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) - -#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0) -#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) -#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define IsSackFrto() (sysctl_tcp_frto == 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) +#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) /* Adapt the MSS value used to make delayed ack decision to the * real world. @@ -195,6 +196,55 @@ static inline int tcp_in_quickack_mode(const struct sock *sk) return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } +static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +{ + if (tp->ecn_flags&TCP_ECN_OK) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tcp_hdr(skb)->cwr) + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tp->ecn_flags&TCP_ECN_OK) { + if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + /* Funny extension: if ECT is not set on a segment, + * it is surely retransmit. It is not in ECN RFC, + * but Linux follows this rule. */ + else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) + tcp_enter_quickack_mode((struct sock *)tp); + } +} + +static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) +{ + if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK)) + return 1; + return 0; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -552,6 +602,16 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_grow_window(sk, skb); } +static u32 tcp_rto_min(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + u32 rto_min = TCP_RTO_MIN; + + if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) + rto_min = dst->metrics[RTAX_RTO_MIN-1]; + return rto_min; +} + /* Called to compute a smoothed rtt estimate. 
The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -613,13 +673,13 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (tp->mdev_max < tp->rttvar) tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = TCP_RTO_MIN; + tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. */ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } } @@ -752,7 +812,15 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC2414. */ +/* Numbers are taken from RFC3390. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than use a multiplier in the relevant range. + */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); @@ -789,6 +857,21 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) } } +/* + * Packet counting of FACK is based on in-order assumptions, therefore TCP + * disables it when reordering is detected + */ +static void tcp_disable_fack(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok &= ~2; +} + +/* Take a notice that peer is sending DSACKs */ +static void tcp_dsack_seen(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok |= 4; +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -810,7 +893,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } @@ -872,9 +955,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); - else if (IsReno(tp)) + else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); - else if (IsFack(tp)) + else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); @@ -886,8 +969,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - /* Disable FACK yet. */ - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); } } @@ -938,7 +1020,216 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. + * + * SACK block validation. + * ---------------------- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself reporting + * SACK reneging when it should advance SND.UNA. Such SACK block this is + * perfectly valid, however, in light of RFC2018 which explicitly states + * that "SACK block MUST reflect the newest segment. 
Even if the newest + * segment is going to be discarded ...", not that it looks very clever + * in case of head skb. Due to potentional receiver driven attacks, we + * choose to avoid immediate execution of a walk in write queue due to + * reneging and defer head skb's loss recovery to standard loss recovery + * procedure that will eventually trigger (nothing forbids us doing this). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * <- outs wnd -> <- wrapzone -> + * u e n u_w e_w s n_w + * | | | | | | | + * |<------------+------+----- TCP seqno space --------------+---------->| + * ...-- <2^31 ->| |<--------... + * ...---- >2^31 ------>| |<--------... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap -> snd_una region will then become "well defined", i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, DSACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp->max_window can + * be used as an exaggerated estimate. + */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp->snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for DSACKs too. + * start_seq == snd_una is non-sensical (see comments above) + */ + if (after(start_seq, tp->snd_una)) + return 1; + + if (!is_dsack || !tp->undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (!after(end_seq, tp->snd_una)) + return 0; + + if (!before(start_seq, tp->undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp->undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: + * start_seq < undo_marker and end_seq >= undo_marker. + */ + return !before(start_seq, end_seq - tp->max_window); +} + +/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". + * Event "C". Later note: FACK people cheated me again 8), we have to account + * for reordering! Ugly, but should help. 
+ * + * Search retransmitted skbs from write_queue that were sent when snd_nxt was + * less than what is now known to be received by the other end (derived from + * SACK blocks by the caller). Also calculate the lowest snd_nxt among the + * remaining retransmitted skbs to avoid some costly processing per ACKs. + */ +static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int flag = 0; + int cnt = 0; + u32 new_low_seq = 0; + + tcp_for_write_queue(skb, sk) { + u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; + + if (skb == tcp_send_head(sk)) + break; + if (cnt == tp->retrans_out) + break; + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + continue; + + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) + continue; + + if (after(received_upto, ack_seq) && + (tcp_is_fack(tp) || + !before(received_upto, + ack_seq + tp->reordering * tp->mss_cache))) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + flag |= FLAG_DATA_SACKED; + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); + } + } else { + if (!new_low_seq || before(ack_seq, new_low_seq)) + new_low_seq = ack_seq; + cnt += tcp_skb_pcount(skb); + } + } + + if (tp->retrans_out) + tp->lost_retrans_low = new_low_seq; + + return flag; +} + +static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, + struct tcp_sack_block_wire *sp, int num_sacks, + u32 prior_snd_una) +{ + u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq)); + u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq)); + int dup_sack = 0; + + if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { + dup_sack = 1; + tcp_dsack_seen(tp); + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + } else if (num_sacks > 1) { + u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq)); + u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq)); + + if (!after(end_seq_0, end_seq_1) && + !before(start_seq_0, start_seq_1)) { + dup_sack = 1; + tcp_dsack_seen(tp); + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + } + } + + /* D-SACK for already forgotten data... Do dumb counting. */ + if (dup_sack && + !after(end_seq_0, prior_snd_una) && + after(end_seq_0, tp->undo_marker)) + tp->undo_retrans--; + + return dup_sack; +} + +/* Check if skb is fully within the SACK block. In presence of GSO skbs, + * the incoming SACK may not exactly match but we can find smaller MSS + * aligned portion of it that matches. Therefore we might need to fragment + * which may fail and creates some hassle (caller must handle error case + * returns). 
*/ +int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, + u32 start_seq, u32 end_seq) +{ + int in_sack, err; + unsigned int pkt_len; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + if (tcp_skb_pcount(skb) > 1 && !in_sack && + after(TCP_SKB_CB(skb)->end_seq, start_seq)) { + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); + + if (!in_sack) + pkt_len = start_seq - TCP_SKB_CB(skb)->seq; + else + pkt_len = end_seq - TCP_SKB_CB(skb)->seq; + err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); + if (err < 0) + return err; + } + + return in_sack; +} + static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { @@ -951,36 +1242,24 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; int reord = tp->packets_out; int prior_fackets; - u32 lost_retrans = 0; + u32 highest_sack_end_seq = 0; int flag = 0; int found_dup_sack = 0; int cached_fack_count; int i; int first_sack_index; - if (!tp->sacked_out) - tp->fackets_out = 0; - prior_fackets = tp->fackets_out; - - /* Check for D-SACK. */ - if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)->ack_seq)) { - found_dup_sack = 1; - tp->rx_opt.sack_ok |= 4; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); - } else if (num_sacks > 1 && - !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) && - !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) { - found_dup_sack = 1; - tp->rx_opt.sack_ok |= 4; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + if (!tp->sacked_out) { + if (WARN_ON(tp->fackets_out)) + tp->fackets_out = 0; + tp->highest_sack = tp->snd_una; } + prior_fackets = tp->fackets_out; - /* D-SACK for already forgotten data... - * Do dumb counting. 
*/ - if (found_dup_sack && - !after(ntohl(sp[0].end_seq), prior_snd_una) && - after(ntohl(sp[0].end_seq), tp->undo_marker)) - tp->undo_retrans--; + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, + num_sacks, prior_snd_una); + if (found_dup_sack) + flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1060,6 +1339,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int fack_count; int dup_sack = (found_dup_sack && (i == first_sack_index)); + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (dup_sack) { + if (!tp->undo_marker) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + } else { + /* Don't count olds caused by ACK reordering */ + if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && + !after(end_seq, tp->snd_una)) + continue; + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); + } + continue; + } + skb = cached_skb; fack_count = cached_fack_count; @@ -1068,7 +1363,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ flag |= FLAG_DATA_LOST; tcp_for_write_queue_from(skb, sk) { - int in_sack, pcount; + int in_sack; u8 sacked; if (skb == tcp_send_head(sk)) @@ -1087,30 +1382,11 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; - in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && - !before(end_seq, TCP_SKB_CB(skb)->end_seq); - - pcount = tcp_skb_pcount(skb); - - if (pcount > 1 && !in_sack && - after(TCP_SKB_CB(skb)->end_seq, start_seq)) { - unsigned int pkt_len; - - in_sack = !after(start_seq, - TCP_SKB_CB(skb)->seq); - - if (!in_sack) - pkt_len = (start_seq - - TCP_SKB_CB(skb)->seq); - else - pkt_len = (end_seq - - TCP_SKB_CB(skb)->seq); - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) - break; - pcount = tcp_skb_pcount(skb); - } + in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); + if (in_sack < 0) + break; - fack_count += pcount; + fack_count += tcp_skb_pcount(skb); sacked = TCP_SKB_CB(skb)->sacked; @@ -1137,11 +1413,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ continue; } - if ((sacked&TCPCB_SACKED_RETRANS) && - after(end_seq, TCP_SKB_CB(skb)->ack_seq) && - (!lost_retrans || after(end_seq, lost_retrans))) - lost_retrans = end_seq; - if (!in_sack) continue; @@ -1194,6 +1465,11 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; + + if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) { + tp->highest_sack = TCP_SKB_CB(skb)->seq; + highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq; + } } else { if (dup_sack && (sacked&TCPCB_RETRANS)) reord = min(fack_count, reord); @@ -1213,45 +1489,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } - /* Check for lost retransmit. This superb idea is - * borrowed from "ratehalving". Event "C". - * Later note: FACK people cheated me again 8), - * we have to account for reordering! Ugly, - * but should help. 
- */ - if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { - struct sk_buff *skb; + if (tp->retrans_out && + after(highest_sack_end_seq, tp->lost_retrans_low) && + icsk->icsk_ca_state == TCP_CA_Recovery) + flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) - break; - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - continue; - if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && - after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && - (IsFack(tp) || - !before(lost_retrans, - TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache))) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - - if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tp->lost_out += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - flag |= FLAG_DATA_SACKED; - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); - } - } - } - } - - tp->left_out = tp->sacked_out + tp->lost_out; + tcp_verify_left_out(tp); if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) @@ -1266,6 +1509,56 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. + */ +static void tcp_check_reno_reordering(struct sock *sk, const int addend) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 holes; + + holes = max(tp->lost_out, 1U); + holes = min(holes, tp->packets_out); + + if ((tp->sacked_out + holes) > tp->packets_out) { + tp->sacked_out = tp->packets_out - holes; + tcp_update_reordering(sk, tp->packets_out + addend, 0); + } +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tp->sacked_out++; + tcp_check_reno_reordering(sk, 0); + tcp_verify_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (acked > 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked-1; + } + tcp_check_reno_reordering(sk, acked); + tcp_verify_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out = 0; +} + /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ @@ -1353,11 +1646,13 @@ void tcp_enter_frto(struct sock *sk) tp->undo_retrans = 0; skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Earlier loss recovery underway (see RFC4138; Appendix B). * The last condition is necessary at least in tp->frto_counter case. 
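The tcp_input.c hunks above and below replace the old IsReno()/IsFack()/IsDSack() macros with tcp_is_reno()/tcp_is_fack()/tcp_is_sack() helpers; those are defined in the include/net/tcp.h part of this series, which is outside net/ipv4 and therefore not visible in this diff. Judging from tcp_disable_fack() (clears the 2 bit) and tcp_dsack_seen() (sets the 4 bit) earlier in the file, rx_opt.sack_ok packs SACK, FACK and seen-D-SACK flags into one small bitmask, and the predicates presumably reduce to something like this sketch:

/* Assumed shape of the helpers from the tcp.h side of the series. */
static inline int tcp_is_sack(const struct tcp_sock *tp)
{
	return tp->rx_opt.sack_ok & 1;	/* SACK negotiated on this connection */
}

static inline int tcp_is_reno(const struct tcp_sock *tp)
{
	return !tcp_is_sack(tp);	/* no SACK: holes are emulated from dupacks */
}

static inline int tcp_is_fack(const struct tcp_sock *tp)
{
	return tp->rx_opt.sack_ok & 2;	/* cleared by tcp_disable_fack() */
}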
@@ -1382,17 +1677,15 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt = 0; - tp->sacked_out = 0; tp->lost_out = 0; - tp->fackets_out = 0; tp->retrans_out = 0; + if (tcp_is_reno(tp)) + tcp_reset_reno_sack(tp); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; - cnt += tcp_skb_pcount(skb); /* * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... @@ -1404,30 +1697,25 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) /* ...enter this if branch just for the first segment */ flag |= FLAG_DATA_ACKED; } else { + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { - /* Do not mark those segments lost that were - * forward transmitted after RTO - */ - if (!after(TCP_SKB_CB(skb)->end_seq, - tp->frto_highmark)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - } - } else { - tp->sacked_out += tcp_skb_pcount(skb); - tp->fackets_out = cnt; + /* Don't lost mark skbs that were fwd transmitted after RTO */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) && + !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->undo_marker = 0; tp->frto_counter = 0; + tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); @@ -1435,22 +1723,26 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); } -void tcp_clear_retrans(struct tcp_sock *tp) +static void tcp_clear_retrans_partial(struct tcp_sock *tp) { - tp->left_out = 0; tp->retrans_out = 0; - - tp->fackets_out = 0; - tp->sacked_out = 0; tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = 0; } +void tcp_clear_retrans(struct tcp_sock *tp) +{ + tcp_clear_retrans_partial(tp); + + tp->fackets_out = 0; + tp->sacked_out = 0; +} + /* Enter Loss state. If "how" is not zero, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. @@ -1460,7 +1752,6 @@ void tcp_enter_loss(struct sock *sk, int how) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt = 0; /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || @@ -1474,17 +1765,26 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_stamp = tcp_time_stamp; tp->bytes_acked = 0; - tcp_clear_retrans(tp); + tcp_clear_retrans_partial(tp); + + if (tcp_is_reno(tp)) + tcp_reset_reno_sack(tp); - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - if (!how) + if (!how) { + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. 
*/ tp->undo_marker = tp->snd_una; + tcp_clear_retrans_hints_partial(tp); + } else { + tp->sacked_out = 0; + tp->fackets_out = 0; + tcp_clear_all_retrans_hints(tp); + } tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; - cnt += tcp_skb_pcount(skb); + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; @@ -1492,12 +1792,9 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - } else { - tp->sacked_out += tcp_skb_pcount(skb); - tp->fackets_out = cnt; } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); @@ -1506,8 +1803,6 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_ECN_queue_cwr(tp); /* Abort FRTO algorithm if one is in progress */ tp->frto_counter = 0; - - clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1537,7 +1832,7 @@ static int tcp_check_sack_reneging(struct sock *sk) static inline int tcp_fackets_out(struct tcp_sock *tp) { - return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) @@ -1685,55 +1980,18 @@ static int tcp_time_to_recover(struct sock *sk) return 0; } -/* If we receive more dupacks than we expected counting segments - * in assumption of absent reordering, interpret this as reordering. - * The only another reason could be bug in receiver TCP. +/* RFC: This is from the original, I doubt that this is necessary at all: + * clear xmit_retrans hint if seq of this skb is beyond hint. How could we + * retransmitted past LOST markings in the first place? I'm not fully sure + * about undo and end of connection cases, which can cause R without L? */ -static void tcp_check_reno_reordering(struct sock *sk, const int addend) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 holes; - - holes = max(tp->lost_out, 1U); - holes = min(holes, tp->packets_out); - - if ((tp->sacked_out + holes) > tp->packets_out) { - tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(sk, tp->packets_out + addend, 0); - } -} - -/* Emulate SACKs for SACKless connection: account for a new dupack. */ - -static void tcp_add_reno_sack(struct sock *sk) +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, + struct sk_buff *skb) { - struct tcp_sock *tp = tcp_sk(sk); - tp->sacked_out++; - tcp_check_reno_reordering(sk, 0); - tcp_sync_left_out(tp); -} - -/* Account for ACK, ACKing some data in Reno Recovery phase. */ - -static void tcp_remove_reno_sacks(struct sock *sk, int acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (acked > 0) { - /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 >= tp->sacked_out) - tp->sacked_out = 0; - else - tp->sacked_out -= acked-1; - } - tcp_check_reno_reordering(sk, acked); - tcp_sync_left_out(tp); -} - -static inline void tcp_reset_reno_sack(struct tcp_sock *tp) -{ - tp->sacked_out = 0; - tp->left_out = tp->lost_out; + if ((tp->retransmit_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + tp->retransmit_skb_hint = NULL; } /* Mark head of queue up as lost. 
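[Annotation] tcp_enter_loss() above now distinguishes a partial hint flush from a full one, so the SACK fastpath hint can survive operations that only invalidate the write-queue walk hints. The split helpers are assumed to look roughly like:

static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
{
	tp->lost_skb_hint = NULL;
	tp->scoreboard_skb_hint = NULL;
	tp->retransmit_skb_hint = NULL;
	tp->forward_skb_hint = NULL;
}

static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
{
	tcp_clear_retrans_hints_partial(tp);
	tp->fastpath_skb_hint = NULL;
}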
*/ @@ -1763,20 +2021,13 @@ static void tcp_mark_head_lost(struct sock *sk, cnt += tcp_skb_pcount(skb); if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - - /* clear xmit_retransmit_queue hints - * if this is beyond hint */ - if (tp->retransmit_skb_hint != NULL && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - tp->retransmit_skb_hint = NULL; - + tcp_verify_retransmit_hint(tp, skb); } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } /* Account newly detected lost packet(s) */ @@ -1785,7 +2036,7 @@ static void tcp_update_scoreboard(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (IsFack(tp)) { + if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; @@ -1799,7 +2050,7 @@ static void tcp_update_scoreboard(struct sock *sk) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk)) { + if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1814,19 +2065,13 @@ static void tcp_update_scoreboard(struct sock *sk) if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - - /* clear xmit_retrans hint */ - if (tp->retransmit_skb_hint && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - - tp->retransmit_skb_hint = NULL; + tcp_verify_retransmit_hint(tp, skb); } } tp->scoreboard_skb_hint = skb; - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } } @@ -1851,19 +2096,22 @@ static inline u32 tcp_cwnd_min(const struct sock *sk) } /* Decrease cwnd each second ack. 
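[Annotation] In tcp_mark_head_lost() above, the tag test narrows from TCPCB_TAGBITS to SACKED_ACKED|LOST, so a segment carrying only TCPCB_SACKED_RETRANS can now be marked lost as well. For reference, the pre-existing scoreboard tag bits from include/net/tcp.h:

#define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
#define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
#define TCPCB_LOST		0x04	/* SKB is lost			*/
#define TCPCB_TAGBITS		0x07	/* All tag bits			*/
#define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)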
*/ -static void tcp_cwnd_down(struct sock *sk) +static void tcp_cwnd_down(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; - tp->snd_cwnd_cnt = decr&1; - decr >>= 1; + if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || + (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) { + tp->snd_cwnd_cnt = decr&1; + decr >>= 1; - if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) - tp->snd_cwnd -= decr; + if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) + tp->snd_cwnd -= decr; - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd_stamp = tcp_time_stamp; + } } /* Nothing was retransmitted or returned timestamp is less @@ -1887,7 +2135,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), - tp->snd_cwnd, tp->left_out, + tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } @@ -1919,7 +2167,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) /* There is something screwy going on with the retrans hints after an undo */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1945,7 +2193,7 @@ static int tcp_try_undo_recovery(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); tp->undo_marker = 0; } - if (tp->snd_una == tp->high_seq && IsReno(tp)) { + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ @@ -1975,7 +2223,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -2012,16 +2260,15 @@ static int tcp_try_undo_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); DBGUNDO(sk, "partial loss"); tp->lost_out = 0; - tp->left_out = tp->sacked_out; tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; - if (!IsReno(tp)) + if (tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return 1; } @@ -2040,7 +2287,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); if (tp->retrans_out == 0) tp->retrans_stamp = 0; @@ -2051,7 +2298,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp->left_out || tp->retrans_out || tp->undo_marker) + if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { @@ -2060,7 +2307,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) } tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(sk); + tcp_cwnd_down(sk, flag); } } @@ -2104,19 +2351,20 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * tcp_xmit_retransmit_queue(). 
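[Annotation] tcp_cwnd_down() above now decrements only when the ACK made some progress (or is a genuine Reno dupack), via the FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK test. The flag bits involved are assumed to be defined near the top of tcp_input.c roughly as:

#define FLAG_DATA		0x01	/* Incoming frame contained data.     */
#define FLAG_WIN_UPDATE		0x02	/* Incoming ACK was a window update.  */
#define FLAG_DATA_ACKED		0x04	/* This ACK acknowledged new data.    */
#define FLAG_RETRANS_DATA_ACKED	0x08	/* ...some of which was retransmitted.*/
#define FLAG_SYN_ACKED		0x10	/* This ACK acknowledged SYN.         */
#define FLAG_DATA_SACKED	0x20	/* New SACK.                          */
#define FLAG_SND_UNA_ADVANCED	0x400	/* snd_una changed (!= FLAG_DATA_ACKED)*/
#define FLAG_DSACKING_ACK	0x800	/* SACK blocks contained DSACK info   */
#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
#define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)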
*/ static void -tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, - int prior_packets, int flag) +tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); + int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); + int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) && + (tp->fackets_out > tp->reordering)); /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ if (!tp->packets_out) tp->sacked_out = 0; - /* 2. SACK counts snd_fack in packets inaccurately. */ - if (tp->sacked_out == 0) + + if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; /* Now state machine starts. @@ -2137,8 +2385,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } - /* D. Synchronize left_out to current state. */ - tcp_sync_left_out(tp); + /* D. Check consistency of the current state. */ + tcp_verify_left_out(tp); /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ @@ -2167,14 +2415,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. */ - IsReno(tp) || tp->snd_una != tp->high_seq) { + tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Recovery: - if (IsReno(tp)) + if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; @@ -2186,15 +2434,11 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* F. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: - if (prior_snd_una == tp->snd_una) { - if (IsReno(tp) && is_dupack) + if (!(flag & FLAG_SND_UNA_ADVANCED)) { + if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else { - int acked = prior_packets - tp->packets_out; - if (IsReno(tp)) - tcp_remove_reno_sacks(sk, acked); - is_dupack = tcp_try_undo_partial(sk, acked); - } + } else + do_lost = tcp_try_undo_partial(sk, pkts_acked); break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) @@ -2208,8 +2452,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, return; /* Loss is undone; fall through to processing in Open state. 
*/ default: - if (IsReno(tp)) { - if (tp->snd_una != prior_snd_una) + if (tcp_is_reno(tp)) { + if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (is_dupack) tcp_add_reno_sack(sk); @@ -2236,7 +2480,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Otherwise enter Recovery state */ - if (IsReno(tp)) + if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); @@ -2258,9 +2502,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_set_ca_state(sk, TCP_CA_Recovery); } - if (is_dupack || tcp_head_timedout(sk)) + if (do_lost || tcp_head_timedout(sk)) tcp_update_scoreboard(sk); - tcp_cwnd_down(sk); + tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } @@ -2323,19 +2567,18 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int good) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight, good); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ - -static void tcp_ack_packets_out(struct sock *sk) +static void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2346,164 +2589,163 @@ static void tcp_ack_packets_out(struct sock *sk) } } -static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, - __u32 now, __s32 *seq_rtt) +/* If we get here, the whole TSO packet has not been acked. */ +static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u32 seq = tp->snd_una; - __u32 packets_acked; - int acked = 0; + u32 packets_acked; - /* If we get here, the whole TSO packet has not been - * acked. - */ - BUG_ON(!after(scb->end_seq, seq)); + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); packets_acked = tcp_skb_pcount(skb); - if (tcp_trim_head(sk, skb, seq - scb->seq)) + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { - __u8 sacked = scb->sacked; - - acked |= FLAG_DATA_ACKED; - if (sacked) { - if (sacked & TCPCB_RETRANS) { - if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; - *seq_rtt = -1; - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= packets_acked; - if (sacked & TCPCB_LOST) - tp->lost_out -= packets_acked; - if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(seq, tp->snd_up)) - tp->urg_mode = 0; - } - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - - if (tp->fackets_out) { - __u32 dval = min(tp->fackets_out, packets_acked); - tp->fackets_out -= dval; - } - tp->packets_out -= packets_acked; - BUG_ON(tcp_skb_pcount(skb) == 0); - BUG_ON(!before(scb->seq, scb->end_seq)); + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); } - return acked; + return packets_acked; } -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +/* Remove acknowledged frames from the retransmission queue. 
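[Annotation] Two congestion-control hooks change shape in this series: cong_avoid() drops the unused RTT argument (above), and pkts_acked() receives a ready-made microsecond RTT sample instead of a raw ktime stamp (converted per module further down). The resulting tcp_congestion_ops members are assumed to be:

struct tcp_congestion_ops {
	/* ... */
	/* do new cwnd calculation (required) */
	void (*cong_avoid)(struct sock *sk, u32 ack, u32 in_flight, int good);
	/* hook for packet ack accounting (optional); rtt_us < 0 means the
	 * sample was unusable, e.g. retransmitted data made timing ambiguous */
	void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us);
	/* ... */
};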
If our packet + * is before the ack sequence we can discard it as it's confirmed to have + * arrived at the other end. + */ +static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; - __u32 now = tcp_time_stamp; - int acked = 0; + u32 now = tcp_time_stamp; + int fully_acked = 1; + int flag = 0; int prior_packets = tp->packets_out; - __s32 seq_rtt = -1; + s32 seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); - while ((skb = tcp_write_queue_head(sk)) && - skb != tcp_send_head(sk)) { + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u8 sacked = scb->sacked; + u32 end_seq; + u32 packets_acked; + u8 sacked = scb->sacked; - /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived at - * the other end. - */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1 && - after(tp->snd_una, scb->seq)) - acked |= tcp_tso_acked(sk, skb, - now, &seq_rtt); - break; - } + if (tcp_skb_pcount(skb) == 1 || + !after(tp->snd_una, scb->seq)) + break; - /* Initial outgoing SYN's get put onto the write_queue - * just like anything else we transmit. It is not - * true data, and if we misinform our callers that - * this ACK acks real data, we will erroneously exit - * connection startup slow start one packet too - * quickly. This is severely frowned upon behavior. - */ - if (!(scb->flags & TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; + packets_acked = tcp_tso_acked(sk, skb); + if (!packets_acked) + break; + + fully_acked = 0; + end_seq = tp->snd_una; } else { - acked |= FLAG_SYN_ACKED; - tp->retrans_stamp = 0; + packets_acked = tcp_skb_pcount(skb); + end_seq = scb->end_seq; } /* MTU probing checks */ - if (icsk->icsk_mtup.probe_size) { - if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { - tcp_mtup_probe_success(sk, skb); - } + if (fully_acked && icsk->icsk_mtup.probe_size && + !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { + tcp_mtup_probe_success(sk, skb); } if (sacked) { if (sacked & TCPCB_RETRANS) { if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(skb); - acked |= FLAG_RETRANS_DATA_ACKED; + tp->retrans_out -= packets_acked; + flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || + (packets_acked > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - last_ackt = skb->tstamp; + if (fully_acked) + last_ackt = skb->tstamp; } + if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= tcp_skb_pcount(skb); + tp->sacked_out -= packets_acked; if (sacked & TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(skb); - if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(scb->end_seq, tp->snd_up)) - tp->urg_mode = 0; - } + tp->lost_out -= packets_acked; + + if ((sacked & TCPCB_URG) && tp->urg_mode && + !before(end_seq, tp->snd_up)) + tp->urg_mode = 0; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - last_ackt = skb->tstamp; + if (fully_acked) + last_ackt = skb->tstamp; } - tcp_dec_pcount_approx(&tp->fackets_out, skb); - tcp_packets_out_dec(tp, skb); + tp->packets_out -= packets_acked; + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. 
This is severely frowned upon behavior. + */ + if (!(scb->flags & TCPCB_FLAG_SYN)) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + if (!fully_acked) + break; + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } - if (acked&FLAG_ACKED) { + if (flag & FLAG_ACKED) { u32 pkts_acked = prior_packets - tp->packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk); - - /* Is the ACK triggering packet unambiguous? */ - if (acked & FLAG_RETRANS_DATA_ACKED) - last_ackt = net_invalid_timestamp(); + tcp_ack_update_rtt(sk, flag, seq_rtt); + tcp_rearm_rto(sk); + + tp->fackets_out -= min(pkts_acked, tp->fackets_out); + /* hint's skb might be NULL but we don't need to care */ + tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, + tp->fastpath_cnt_hint); + if (tcp_is_reno(tp)) + tcp_remove_reno_sacks(sk, pkts_acked); + + if (ca_ops->pkts_acked) { + s32 rtt_us = -1; + + /* Is the ACK triggering packet unambiguous? */ + if (!(flag & FLAG_RETRANS_DATA_ACKED)) { + /* High resolution needed and available? */ + if (ca_ops->flags & TCP_CONG_RTT_STAMP && + !ktime_equal(last_ackt, + net_invalid_timestamp())) + rtt_us = ktime_us_delta(ktime_get_real(), + last_ackt); + else if (seq_rtt > 0) + rtt_us = jiffies_to_usecs(seq_rtt); + } - if (ca_ops->pkts_acked) - ca_ops->pkts_acked(sk, pkts_acked, last_ackt); + ca_ops->pkts_acked(sk, pkts_acked, rtt_us); + } } #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); - if (!tp->packets_out && tp->rx_opt.sack_ok) { - const struct inet_connection_sock *icsk = inet_csk(sk); + if (!tp->packets_out && tcp_is_sack(tp)) { + icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", tp->lost_out, icsk->icsk_ca_state); @@ -2522,7 +2764,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } #endif *seq_rtt_p = seq_rtt; - return acked; + return flag; } static void tcp_ack_probe(struct sock *sk) @@ -2617,6 +2859,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; TCP_ECN_queue_cwr(tp); tcp_moderate_cwnd(tp); } @@ -2667,28 +2910,31 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) +static int tcp_process_frto(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Duplicate the behavior from Loss state (fastretrans_alert) */ if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || + ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) + tp->undo_marker = 0; + if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
2 : 3), flag); return 1; } - if (!IsSackFrto() || IsReno(tp)) { + if (!IsSackFrto() || tcp_is_reno(tp)) { /* RFC4138 shortcoming in step 2; should also have case c): * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate */ - if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && - !(flag&FLAG_FORWARD_PROGRESS)) + if (!(flag&FLAG_ANY_PROGRESS) && (flag&FLAG_NOT_DUP)) return 1; if (!(flag&FLAG_DATA_ACKED)) { @@ -2742,6 +2988,8 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) break; } tp->frto_counter = 0; + tp->undo_marker = 0; + NET_INC_STATS_BH(LINUX_MIB_TCPSPURIOUSRTOS); } return 0; } @@ -2768,6 +3016,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (after(ack, prior_snd_una)) + flag |= FLAG_SND_UNA_ADVANCED; + if (sysctl_tcp_abc) { if (icsk->icsk_ca_state < TCP_CA_CWR) tp->bytes_acked += ack - prior_snd_una; @@ -2819,18 +3070,21 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + /* Guarantee sacktag reordering detection against wrap-arounds */ + if (before(tp->frto_highmark, tp->snd_una)) + tp->frto_highmark = 0; if (tp->frto_counter) - frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag); + frto_cwnd = tcp_process_frto(sk, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); - tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); + tcp_cong_avoid(sk, ack, prior_in_flight, 0); + tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) - tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -3164,7 +3418,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * Probably, we should reset in this case. For now drop them. */ __skb_queue_purge(&tp->out_of_order_queue); - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); @@ -3194,7 +3448,7 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_se static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); else @@ -3224,7 +3478,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -3540,7 +3794,7 @@ drop: if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. 
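[Annotation] For context, IsSackFrto() in the hunk above is pre-existing and assumed to be the following one-liner; the SACK-enhanced F-RTO variant (RFC4138 Appendix B) is only attempted when the sysctl selects it explicitly:

#define IsSackFrto() (sysctl_tcp_frto == 0x2)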
*/ - if (tp->rx_opt.sack_ok) { + if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = 1; @@ -3605,7 +3859,7 @@ drop: } add_sack: - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); } } @@ -3794,7 +4048,7 @@ static int tcp_prune_queue(struct sock *sk) * is in a sad state like this, we care only about integrity * of the connection not performance. */ - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); } @@ -4495,8 +4749,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tp->rx_opt.sack_ok && sysctl_tcp_fack) - tp->rx_opt.sack_ok |= 2; + if (tcp_is_sack(tp) && sysctl_tcp_fack) + tcp_enable_fack(tp); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3f5f7423b95..38cf73a5673 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -62,6 +62,7 @@ #include <linux/init.h> #include <linux/times.h> +#include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/tcp.h> @@ -833,8 +834,7 @@ static struct tcp_md5sig_key * return NULL; for (i = 0; i < tp->md5sig_info->entries4; i++) { if (tp->md5sig_info->keys4[i].addr == addr) - return (struct tcp_md5sig_key *) - &tp->md5sig_info->keys4[i]; + return &tp->md5sig_info->keys4[i].base; } return NULL; } @@ -865,9 +865,9 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr); if (key) { /* Pre-existing entry - just update that one. */ - kfree(key->key); - key->key = newkey; - key->keylen = newkeylen; + kfree(key->base.key); + key->base.key = newkey; + key->base.keylen = newkeylen; } else { struct tcp_md5sig_info *md5sig; @@ -906,9 +906,9 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, md5sig->alloced4++; } md5sig->entries4++; - md5sig->keys4[md5sig->entries4 - 1].addr = addr; - md5sig->keys4[md5sig->entries4 - 1].key = newkey; - md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen; + md5sig->keys4[md5sig->entries4 - 1].addr = addr; + md5sig->keys4[md5sig->entries4 - 1].base.key = newkey; + md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen; } return 0; } @@ -930,7 +930,7 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) for (i = 0; i < tp->md5sig_info->entries4; i++) { if (tp->md5sig_info->keys4[i].addr == addr) { /* Free the key */ - kfree(tp->md5sig_info->keys4[i].key); + kfree(tp->md5sig_info->keys4[i].base.key); tp->md5sig_info->entries4--; if (tp->md5sig_info->entries4 == 0) { @@ -964,7 +964,7 @@ static void tcp_v4_clear_md5_list(struct sock *sk) if (tp->md5sig_info->entries4) { int i; for (i = 0; i < tp->md5sig_info->entries4; i++) - kfree(tp->md5sig_info->keys4[i].key); + kfree(tp->md5sig_info->keys4[i].base.key); tp->md5sig_info->entries4 = 0; tcp_free_md5sig_pool(); } @@ -2250,7 +2250,7 @@ int tcp_proc_register(struct tcp_seq_afinfo *afinfo) afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; - p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -2262,7 +2262,7 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo) { if (!afinfo) return; - proc_net_remove(afinfo->name); + proc_net_remove(&init_net, afinfo->name); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } @@ 
-2425,7 +2425,6 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, - .sendmsg = tcp_sendmsg, .recvmsg = tcp_recvmsg, .backlog_rcv = tcp_v4_do_rcv, .hash = tcp_v4_hash, @@ -2471,6 +2470,5 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock); EXPORT_SYMBOL(tcp_proc_register); EXPORT_SYMBOL(tcp_proc_unregister); #endif -EXPORT_SYMBOL(sysctl_local_port_range); EXPORT_SYMBOL(sysctl_tcp_low_latency); diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index e49836ce012..e7f5ef92cbd 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,13 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, - int flag) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight, flag); } /** @@ -261,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) * newReno in increase case. * We work it out by following the idea from TCP-LP's paper directly */ -static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - if (!ktime_equal(last, net_invalid_timestamp())) - tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + if (rtt_us > 0) + tcp_lp_rtt_sample(sk, rtt_us); /* calc inference */ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a12b08fca5a..b61b76847ad 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -368,6 +368,12 @@ void tcp_twsk_destructor(struct sock *sk) EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, + struct request_sock *req) +{ + tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -399,7 +405,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; - newtp->left_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; newtp->fackets_out = 0; @@ -440,7 +445,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { if (sysctl_tcp_fack) - newtp->rx_opt.sack_ok |= 2; + tcp_enable_fack(newtp); } newtp->window_clamp = req->window_clamp; newtp->rcv_ssthresh = req->rcv_wnd; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20aea1595c4..324b4207254 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -61,6 +61,18 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. 
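[Annotation] The TCP_ECN_* helpers move out of the shared header and now operate on a dedicated tp->ecn_flags word (see TCP_ECN_openreq_child above and the tcp_output.c helpers below). The bit definitions are assumed to be:

#define TCP_ECN_OK		1	/* ECN negotiated for the connection   */
#define TCP_ECN_QUEUE_CWR	2	/* CWR pending for next data segment   */
#define TCP_ECN_DEMAND_CWR	4	/* keep setting ECE until CWR is seen  */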
*/ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +static inline void tcp_packets_out_inc(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int orig = tp->packets_out; + + tp->packets_out += tcp_skb_pcount(skb); + if (!orig) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +} + static void update_send_head(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -269,6 +281,56 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } +static inline void TCP_ECN_send_synack(struct tcp_sock *tp, + struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; + if (!(tp->ecn_flags&TCP_ECN_OK)) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; +} + +static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->ecn_flags = 0; + if (sysctl_tcp_ecn) { + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; + tp->ecn_flags = TCP_ECN_OK; + } +} + +static __inline__ void +TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) +{ + if (inet_rsk(req)->ecn_ok) + th->ece = 1; +} + +static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, + int tcp_header_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->ecn_flags & TCP_ECN_OK) { + /* Not-retransmitted data segment: set ECT and inject CWR. */ + if (skb->len != tcp_header_len && + !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { + INET_ECN_xmit(sk); + if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; + tcp_hdr(skb)->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } + } else { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } + if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) + tcp_hdr(skb)->ece = 1; + } +} + static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, __u32 tstamp, __u8 **md5_hash) { @@ -584,16 +646,32 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_type = 0; } else { - unsigned int factor; - - factor = skb->len + (mss_now - 1); - factor /= mss_now; - skb_shinfo(skb)->gso_segs = factor; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); skb_shinfo(skb)->gso_size = mss_now; skb_shinfo(skb)->gso_type = sk->sk_gso_type; } } +/* When a modification to fackets out becomes necessary, we need to check + * skb is counted to fackets_out or not. Another important thing is to + * tweak SACK fastpath hint too as it would overwrite all changes unless + * hint is also changed. + */ +static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, + int decr) +{ + if (!tp->sacked_out || tcp_is_reno(tp)) + return; + + if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq)) + tp->fackets_out -= decr; + + /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */ + if (tp->fastpath_skb_hint != NULL && + after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) + tp->fastpath_cnt_hint -= decr; +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. 
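[Annotation] The DIV_ROUND_UP() used in tcp_set_skb_tso_segs() above comes from linux/kernel.h and expands to exactly the rounding it replaces:

/* e.g. len = 4000, mss = 1460: (4000 + 1459) / 1460 = 3 segments */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))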
@@ -609,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss BUG_ON(len > skb->len); - clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -634,6 +712,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + if (tcp_is_sack(tp) && tp->sacked_out && + (TCP_SKB_CB(skb)->seq == tp->highest_sack)) + tp->highest_sack = TCP_SKB_CB(buff)->seq; + /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); @@ -682,32 +764,15 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= diff; - if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= diff; - tp->left_out -= diff; - } - - if (diff > 0) { - /* Adjust Reno SACK estimate. */ - if (!tp->rx_opt.sack_ok) { - tp->sacked_out -= diff; - if ((int)tp->sacked_out < 0) - tp->sacked_out = 0; - tcp_sync_left_out(tp); - } - tp->fackets_out -= diff; - if ((int)tp->fackets_out < 0) - tp->fackets_out = 0; - /* SACK fastpath might overwrite it unless dealt with */ - if (tp->fastpath_skb_hint != NULL && - after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, - TCP_SKB_CB(skb)->seq)) { - tp->fastpath_cnt_hint -= diff; - if ((int)tp->fastpath_cnt_hint < 0) - tp->fastpath_cnt_hint = 0; - } + /* Adjust Reno SACK estimate. */ + if (tcp_is_reno(tp) && diff > 0) { + tcp_dec_pcount_approx_int(&tp->sacked_out, diff); + tcp_verify_left_out(tp); } + tcp_adjust_fackets_out(tp, skb, diff); } /* Link BUFF into the send queue. */ @@ -1615,7 +1680,7 @@ u32 __tcp_select_window(struct sock *sk) if (window <= free_space - mss || window > free_space) window = (free_space/mss)*mss; else if (mss == full_space && - free_space > window + full_space/2) + free_space > window + full_space/2) window = free_space; } @@ -1654,8 +1719,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* changing transmit queue under us so clear hints */ - clear_all_retrans_hints(tp); + if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out && + (TCP_SKB_CB(next_skb)->seq == tp->highest_sack))) + return; /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1683,21 +1749,23 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) tp->retrans_out -= tcp_skb_pcount(next_skb); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) tp->lost_out -= tcp_skb_pcount(next_skb); - tp->left_out -= tcp_skb_pcount(next_skb); - } /* Reno case is special. Sigh... 
*/ - if (!tp->rx_opt.sack_ok && tp->sacked_out) { + if (tcp_is_reno(tp) && tp->sacked_out) tcp_dec_pcount_approx(&tp->sacked_out, next_skb); - tp->left_out -= tcp_skb_pcount(next_skb); + + tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb)); + tp->packets_out -= tcp_skb_pcount(next_skb); + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + /* manually tune sacktag skb hint */ + if (tp->fastpath_skb_hint == next_skb) { + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint -= tcp_skb_pcount(skb); } - /* Not quite right: it can be > snd.fack, but - * it is better to underestimate fackets. - */ - tcp_dec_pcount_approx(&tp->fackets_out, next_skb); - tcp_packets_out_dec(tp, next_skb); sk_stream_free_skb(sk, next_skb); } } @@ -1731,12 +1799,12 @@ void tcp_simple_retransmit(struct sock *sk) } } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); if (!lost) return; - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Don't muck with the congestion window here. * Reason is that we do not increase amount of _data_ @@ -1846,6 +1914,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) printk(KERN_DEBUG "retrans_out leaked.\n"); } #endif + if (!tp->retrans_out) + tp->lost_retrans_low = tp->snd_nxt; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); @@ -1938,40 +2008,35 @@ void tcp_xmit_retransmit_queue(struct sock *sk) return; /* No forward retransmissions in Reno are possible. */ - if (!tp->rx_opt.sack_ok) + if (tcp_is_reno(tp)) return; /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... * * For now we do not retransmit anything, while we have some new - * segments to send. + * segments to send. In the other cases, follow rule 3 for + * NextSeg() specified in RFC3517. */ if (tcp_may_send_now(sk)) return; - if (tp->forward_skb_hint) { + /* If nothing is SACKed, highest_sack in the loop won't be valid */ + if (!tp->sacked_out) + return; + + if (tp->forward_skb_hint) skb = tp->forward_skb_hint; - packet_cnt = tp->forward_cnt_hint; - } else{ + else skb = tcp_write_queue_head(sk); - packet_cnt = 0; - } tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; - tp->forward_cnt_hint = packet_cnt; tp->forward_skb_hint = skb; - /* Similar to the retransmit loop above we - * can pretend that the retransmitted SKB - * we send out here will be composed of one - * real MSS sized packet because tcp_retransmit_skb() - * will fragment it if necessary. 
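[Annotation] tcp_retransmit_skb() above now seeds lost_retrans_low, which gates the tcp_mark_lost_retrans() call in the sacktag hunk near the top of this section. The factored-out function itself is not part of this section; a hedged reconstruction from the loop it replaces (hint handling and the new_low_seq bookkeeping are assumptions):

static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int flag = 0;
	u32 new_low_seq = tp->snd_nxt;

	tcp_for_write_queue(skb, sk) {
		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;

		if (skb == tcp_send_head(sk))
			break;
		if (after(TCP_SKB_CB(skb)->seq, received_upto))
			break;
		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
			continue;

		/* Declare a retransmission lost once SACKs show delivery
		 * well beyond the point it should have reached.
		 */
		if (after(received_upto, ack_seq) &&
		    (tcp_is_fack(tp) ||
		     !before(received_upto,
			     ack_seq + tp->reordering * tp->mss_cache))) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
			tp->retrans_out -= tcp_skb_pcount(skb);
			tp->retransmit_skb_hint = NULL;

			if (!(TCP_SKB_CB(skb)->sacked &
			      (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
				tp->lost_out += tcp_skb_pcount(skb);
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				flag |= FLAG_DATA_SACKED;
				NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
			}
		} else if (before(ack_seq, new_low_seq)) {
			new_low_seq = ack_seq;
		}
	}

	if (tp->retrans_out)
		tp->lost_retrans_low = new_low_seq;

	return flag;
}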
- */ - if (++packet_cnt > tp->fackets_out) + if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) break; if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f37d5928921..87dd5bff315 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -26,6 +26,7 @@ #include <linux/module.h> #include <linux/ktime.h> #include <linux/time.h> +#include <net/net_namespace.h> #include <net/tcp.h> @@ -130,7 +131,7 @@ static struct jprobe tcp_jprobe = { .kp = { .symbol_name = "tcp_rcv_established", }, - .entry = JPROBE_ENTRY(jtcp_rcv_established), + .entry = jtcp_rcv_established, }; static int tcpprobe_open(struct inode * inode, struct file * file) @@ -228,7 +229,7 @@ static __init int tcpprobe_init(void) if (!tcp_probe.log) goto err0; - if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) + if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops)) goto err0; ret = register_jprobe(&tcp_jprobe); @@ -238,7 +239,7 @@ static __init int tcpprobe_init(void) pr_info("TCP probe registered (port=%d)\n", port); return 0; err1: - proc_net_remove(procname); + proc_net_remove(&init_net, procname); err0: kfree(tcp_probe.log); return ret; @@ -247,7 +248,7 @@ module_init(tcpprobe_init); static __exit void tcpprobe_exit(void) { - proc_net_remove(procname); + proc_net_remove(&init_net, procname); unregister_jprobe(&tcp_jprobe); kfree(tcp_probe.log); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 4624501e968..be27a33a1c6 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,7 +15,7 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e9b151b3a59..d8970ecfcfc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -315,7 +315,7 @@ static void tcp_retransmit_timer(struct sock *sk) if (icsk->icsk_retransmits == 0) { if (icsk->icsk_ca_state == TCP_CA_Disorder || icsk->icsk_ca_state == TCP_CA_Recovery) { - if (tp->rx_opt.sack_ok) { + if (tcp_is_sack(tp)) { if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); else diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index e218a51cece..b49dedcda52 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -112,16 +112,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init); * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) { struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; - if (ktime_equal(last, net_invalid_timestamp())) + if (rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ - vrtt = ktime_to_us(net_timedelta(last)) + 1; + vrtt = rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) @@ -163,13 +163,13 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, - u32 seq_rtt, u32 in_flight, int flag) + u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) - return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + return 
tcp_reno_cong_avoid(sk, ack, in_flight, flag); /* The key players are v_beg_snd_una and v_beg_snd_nxt. * @@ -228,7 +228,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight, flag); } else { u32 rtt, target_cwnd, diff; diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 502fa818363..6c0eea2f824 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -17,7 +17,7 @@ struct vegas { extern void tcp_vegas_init(struct sock *sk); extern void tcp_vegas_state(struct sock *sk, u8 ca_state); -extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last); +extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index ec854cc5fad..8fb2aee0b1a 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -69,16 +69,16 @@ static void tcp_veno_init(struct sock *sk) } /* Do rtt sampling needed for Veno. */ -static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) +static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) { struct veno *veno = inet_csk_ca(sk); u32 vrtt; - if (ktime_equal(last, net_invalid_timestamp())) + if (rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ - vrtt = ktime_to_us(net_timedelta(last)) + 1; + vrtt = rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) @@ -115,13 +115,13 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) } static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, - u32 seq_rtt, u32 in_flight, int flag) + u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) - return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight, flag); /* limited by applications */ if (!tcp_is_cwnd_limited(sk, in_flight)) @@ -132,7 +132,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight, flag); } else { u32 rtt, target_cwnd; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index e61e09dd513..20151d6a624 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -100,11 +100,12 @@ static void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. 
*/ -static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt) { struct westwood *w = inet_csk_ca(sk); - if (cnt > 0) - w->rtt = tcp_sk(sk)->srtt >> 3; + + if (rtt > 0) + w->rtt = usecs_to_jiffies(rtt); } /* diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 545ed237ab5..c107fba7430 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -58,7 +58,7 @@ static void tcp_yeah_init(struct sock *sk) } -static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, ktime_t last) +static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) { const struct inet_connection_sock *icsk = inet_csk(sk); struct yeah *yeah = inet_csk_ca(sk); @@ -66,11 +66,11 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, ktime_t last) if (icsk->icsk_ca_state == TCP_CA_Open) yeah->pkts_acked = pkts_acked; - tcp_vegas_pkts_acked(sk, pkts_acked, last); + tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, - u32 seq_rtt, u32 in_flight, int flag) + u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 28355350fb6..cb9fc58efb2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -98,6 +98,7 @@ #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <net/net_namespace.h> #include <net/icmp.h> #include <net/route.h> #include <net/checksum.h> @@ -113,9 +114,8 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); -static int udp_port_rover; - -static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) +static inline int __udp_lib_lport_inuse(__u16 num, + const struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; @@ -132,11 +132,10 @@ static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) * @sk: socket struct in question * @snum: port number to look up * @udptable: hash list table, must be of UDP_HTABLE_SIZE - * @port_rover: pointer to record of last unallocated port * @saddr_comp: AF-dependent comparison of bound local IP addresses */ int __udp_lib_get_port(struct sock *sk, unsigned short snum, - struct hlist_head udptable[], int *port_rover, + struct hlist_head udptable[], int (*saddr_comp)(const struct sock *sk1, const struct sock *sk2 ) ) { @@ -146,49 +145,56 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, int error = 1; write_lock_bh(&udp_hash_lock); - if (snum == 0) { - int best_size_so_far, best, result, i; - - if (*port_rover > sysctl_local_port_range[1] || - *port_rover < sysctl_local_port_range[0]) - *port_rover = sysctl_local_port_range[0]; - best_size_so_far = 32767; - best = result = *port_rover; - for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { - int size; - - head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; - if (hlist_empty(head)) { - if (result > sysctl_local_port_range[1]) - result = sysctl_local_port_range[0] + - ((result - sysctl_local_port_range[0]) & - (UDP_HTABLE_SIZE - 1)); + + if (!snum) { + int i, low, high; + unsigned rover, best, best_size_so_far; + + inet_get_local_port_range(&low, &high); + + best_size_so_far = UINT_MAX; + best = rover = net_random() % (high - low) + low; + + /* 1st pass: look for empty (or shortest) hash chain */ + for (i = 0; i < UDP_HTABLE_SIZE; i++) { + int size = 0; + + head = 
&udptable[rover & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(head)) goto gotit; - } - size = 0; + sk_for_each(sk2, node, head) { if (++size >= best_size_so_far) goto next; } best_size_so_far = size; - best = result; + best = rover; next: - ; + /* fold back if end of range */ + if (++rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); + + } - result = best; - for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; - i++, result += UDP_HTABLE_SIZE) { - if (result > sysctl_local_port_range[1]) - result = sysctl_local_port_range[0] - + ((result - sysctl_local_port_range[0]) & - (UDP_HTABLE_SIZE - 1)); - if (! __udp_lib_lport_inuse(result, udptable)) - break; + + /* 2nd pass: find hole in shortest hash chain */ + rover = best; + for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { + if (! __udp_lib_lport_inuse(rover, udptable)) + goto gotit; + rover += UDP_HTABLE_SIZE; + if (rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); } - if (i >= (1 << 16) / UDP_HTABLE_SIZE) - goto fail; + + + /* All ports in use! */ + goto fail; + gotit: - *port_rover = snum = result; + snum = rover; } else { head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; @@ -201,6 +207,7 @@ gotit: (*saddr_comp)(sk, sk2) ) goto fail; } + inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { @@ -217,7 +224,7 @@ fail: int udp_get_port(struct sock *sk, unsigned short snum, int (*scmp)(const struct sock *, const struct sock *)) { - return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp); + return __udp_lib_get_port(sk, snum, udp_hash, scmp); } int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) @@ -505,6 +512,8 @@ send: out: up->len = 0; up->pending = 0; + if (!err) + UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, up->pcflag); return err; } @@ -693,10 +702,8 @@ out: ip_rt_put(rt); if (free) kfree(ipc.opt); - if (!err) { - UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); + if (!err) return len; - } /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. 
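[Annotation] The UDP port allocator above now starts from a random offset and shares the ephemeral range with TCP through inet_get_local_port_range() instead of keeping a private rover. That helper is assumed (inet_connection_sock.c) to read the sysctl pair under a seqlock, so a concurrent range update can never be observed half-applied:

void inet_get_local_port_range(int *low, int *high)
{
	unsigned seq;

	do {
		seq = read_seqbegin(&sysctl_port_range_lock);

		*low = sysctl_local_port_range[0];
		*high = sysctl_local_port_range[1];
	} while (read_seqretry(&sysctl_port_range_lock, seq));
}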
Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise @@ -1560,7 +1567,7 @@ int udp_proc_register(struct udp_seq_afinfo *afinfo) afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; - p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -1572,7 +1579,7 @@ void udp_proc_unregister(struct udp_seq_afinfo *afinfo) { if (!afinfo) return; - proc_net_remove(afinfo->name); + proc_net_remove(&init_net, afinfo->name); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 820a477cfaa..6c55828e41b 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -9,7 +9,7 @@ extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int ); extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []); extern int __udp_lib_get_port(struct sock *sk, unsigned short snum, - struct hlist_head udptable[], int *port_rover, + struct hlist_head udptable[], int (*)(const struct sock*,const struct sock*)); extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index f34fd686a8f..94977205abb 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -16,12 +16,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics) __read_mostly; struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; -static int udplite_port_rover; int udplite_get_port(struct sock *sk, unsigned short p, int (*c)(const struct sock *, const struct sock *)) { - return __udp_lib_get_port(sk, p, udplite_hash, &udplite_port_rover, c); + return __udp_lib_get_port(sk, p, udplite_hash, c); } static int udplite_v4_get_port(struct sock *sk, unsigned short snum) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 2fa10824541..e9bbfde19ac 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -54,12 +54,14 @@ static int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) int xfrm_nr = 0; int decaps = 0; int err = xfrm4_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq); + unsigned int nhoff = offsetof(struct iphdr, protocol); if (err != 0) goto drop; do { const struct iphdr *iph = ip_hdr(skb); + int nexthdr; if (xfrm_nr == XFRM_MAX_DEPTH) goto drop; @@ -82,9 +84,12 @@ static int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) if (xfrm_state_check_expire(x)) goto drop_unlock; - if (x->type->input(x, skb)) + nexthdr = x->type->input(x, skb); + if (nexthdr <= 0) goto drop_unlock; + skb_network_header(skb)[nhoff] = nexthdr; + /* only the first xfrm gets the encap type */ encap_type = 0; diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index a73e710740c..73d2338bec5 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -20,38 +20,33 @@ /* Add encapsulation header. * * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. - * The following fields in it shall be filled in by x->type->output: - * tot_len - * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. 
*/ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) { + struct ip_beet_phdr *ph; struct iphdr *iph, *top_iph; int hdrlen, optlen; iph = ip_hdr(skb); - skb->transport_header = skb->network_header; hdrlen = 0; optlen = iph->ihl * 4 - sizeof(*iph); if (unlikely(optlen)) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); - skb_push(skb, x->props.header_len - IPV4_BEET_PHMAXLEN + hdrlen); - skb_reset_network_header(skb); - top_iph = ip_hdr(skb); - skb->transport_header += sizeof(*iph) - hdrlen; + skb_set_network_header(skb, IPV4_BEET_PHMAXLEN - x->props.header_len - + hdrlen); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*iph); + + ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen); + top_iph = ip_hdr(skb); memmove(top_iph, iph, sizeof(*iph)); if (unlikely(optlen)) { - struct ip_beet_phdr *ph; - BUG_ON(optlen < 0); - ph = (struct ip_beet_phdr *)skb_transport_header(skb); ph->padlen = 4 - (optlen & 4); ph->hdrlen = optlen / 8; ph->nexthdr = top_iph->protocol; diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 601047161ea..fd840c7d75e 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -17,18 +17,17 @@ * * The IP header will be moved forward to make space for the encapsulation * header. - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. */ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; + skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); skb->transport_header = skb->network_header + ihl; - skb_push(skb, x->props.header_len); - skb_reset_network_header(skb); + __skb_pull(skb, ihl); memmove(skb_network_header(skb), iph, ihl); return 0; } diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 9963700e74c..1ae9d32276f 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -31,13 +31,7 @@ static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) /* Add encapsulation header. * - * The top IP header will be constructed per RFC 2401. The following fields - * in it shall be filled in by x->type->output: - * tot_len - * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. + * The top IP header will be constructed per RFC 2401. 
*/ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { @@ -47,10 +41,11 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) int flags; iph = ip_hdr(skb); - skb->transport_header = skb->network_header; - skb_push(skb, x->props.header_len); - skb_reset_network_header(skb); + skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*iph); top_iph = ip_hdr(skb); top_iph->ihl = 5; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 44ef208a75c..434ef302ba8 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -12,7 +12,6 @@ #include <linux/if_ether.h> #include <linux/kernel.h> #include <linux/skbuff.h> -#include <linux/spinlock.h> #include <linux/netfilter_ipv4.h> #include <net/ip.h> #include <net/xfrm.h> @@ -41,58 +40,32 @@ out: return ret; } -static int xfrm4_output_one(struct sk_buff *skb) +static inline int xfrm4_output_one(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; + struct iphdr *iph; int err; - if (skb->ip_summed == CHECKSUM_PARTIAL) { - err = skb_checksum_help(skb); - if (err) - goto error_nolock; - } - if (x->props.mode == XFRM_MODE_TUNNEL) { err = xfrm4_tunnel_check_size(skb); if (err) goto error_nolock; } - do { - spin_lock_bh(&x->lock); - err = xfrm_state_check(x, skb); - if (err) - goto error; - - err = x->mode->output(x, skb); - if (err) - goto error; - - err = x->type->output(x, skb); - if (err) - goto error; - - x->curlft.bytes += skb->len; - x->curlft.packets++; + err = xfrm_output(skb); + if (err) + goto error_nolock; - spin_unlock_bh(&x->lock); - - if (!(skb->dst = dst_pop(dst))) { - err = -EHOSTUNREACH; - goto error_nolock; - } - dst = skb->dst; - x = dst->xfrm; - } while (x && (x->props.mode != XFRM_MODE_TUNNEL)); + iph = ip_hdr(skb); + iph->tot_len = htons(skb->len); + ip_send_check(iph); IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; err = 0; out_exit: return err; -error: - spin_unlock_bh(&x->lock); error_nolock: kfree_skb(skb); goto out_exit; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 4ff8ed30024..329825ca68f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -306,7 +306,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt.idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(&loopback_dev); + struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); BUG_ON(!loopback_idev); do { diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 9275c79119b..1312417608e 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -12,17 +12,13 @@ static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); - - iph->tot_len = htons(skb->len); - ip_send_check(iph); - + skb_push(skb, -skb_network_offset(skb)); return 0; } static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb) { - return 0; + return IPPROTO_IP; } static int ipip_init_state(struct xfrm_state *x) |
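[Annotation] The xfrm4_output.c hunk above deletes the per-family transform loop; the removed code moves essentially unchanged into a generic xfrm_output(), and type->input() now returns the inner protocol number (hence ipip_xfrm_rcv() returning IPPROTO_IP). A hedged sketch of the consolidated function, reconstructed from the deleted loop (placement in net/xfrm/xfrm_output.c and the checksum handling are assumptions):

int xfrm_output(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct xfrm_state *x = dst->xfrm;
	int err;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		err = skb_checksum_help(skb);
		if (err)
			goto error_nolock;
	}

	do {
		spin_lock_bh(&x->lock);
		err = xfrm_state_check(x, skb);
		if (err)
			goto error;

		err = x->mode->output(x, skb);
		if (err)
			goto error;

		err = x->type->output(x, skb);
		if (err)
			goto error;

		x->curlft.bytes += skb->len;
		x->curlft.packets++;
		spin_unlock_bh(&x->lock);

		/* Pop to the next dst/xfrm in the bundle; a tunnel-mode
		 * transform ends the chain and restarts IP output. */
		if (!(skb->dst = dst_pop(dst))) {
			err = -EHOSTUNREACH;
			goto error_nolock;
		}
		dst = skb->dst;
		x = dst->xfrm;
	} while (x && (x->props.mode != XFRM_MODE_TUNNEL));

	return 0;

error:
	spin_unlock_bh(&x->lock);
error_nolock:
	return err;
}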