diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-18 18:02:35 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-18 18:02:35 -0700 |
commit | 334d094504c2fe1c44211ecb49146ae6bca8c321 (patch) | |
tree | d3c0f68e4b9f8e3d2ccc39e7dfe5de0534a5fad9 /net/ipv4 | |
parent | d1a4be630fb068f251d64b62919f143c49ca8057 (diff) | |
parent | d1643d24c61b725bef399cc1cf2944b4c9c23177 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6.26
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6.26: (1090 commits)
[NET]: Fix and allocate less memory for ->priv'less netdevices
[IPV6]: Fix dangling references on error in fib6_add().
[NETLABEL]: Fix NULL deref in netlbl_unlabel_staticlist_gen() if ifindex not found
[PKT_SCHED]: Fix datalen check in tcf_simp_init().
[INET]: Uninline the __inet_inherit_port call.
[INET]: Drop the inet_inherit_port() call.
SCTP: Initialize partial_bytes_acked to 0, when all of the data is acked.
[netdrvr] forcedeth: internal simplifications; changelog removal
phylib: factor out get_phy_id from within get_phy_device
PHY: add BCM5464 support to broadcom PHY driver
cxgb3: Fix __must_check warning with dev_dbg.
tc35815: Statistics cleanup
natsemi: fix MMIO for PPC 44x platforms
[TIPC]: Cleanup of TIPC reference table code
[TIPC]: Optimized initialization of TIPC reference table
[TIPC]: Remove inlining of reference table locking routines
e1000: convert uint16_t style integers to u16
ixgb: convert uint16_t style integers to u16
sb1000.c: make const arrays static
sb1000.c: stop inlining largish static functions
...
Diffstat (limited to 'net/ipv4')
83 files changed, 2819 insertions, 1784 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0d109504ed8..f2b5270efda 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -243,6 +243,23 @@ void build_ehash_secret(void) } EXPORT_SYMBOL(build_ehash_secret); +static inline int inet_netns_ok(struct net *net, int protocol) +{ + int hash; + struct net_protocol *ipprot; + + if (net == &init_net) + return 1; + + hash = protocol & (MAX_INET_PROTOS - 1); + ipprot = rcu_dereference(inet_protos[hash]); + + if (ipprot == NULL) + /* raw IP is OK */ + return 1; + return ipprot->netns_ok; +} + /* * Create an inet socket. */ @@ -259,9 +276,6 @@ static int inet_create(struct net *net, struct socket *sock, int protocol) int try_loading_module = 0; int err; - if (net != &init_net) - return -EAFNOSUPPORT; - if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM && !inet_ehash_secret) @@ -320,6 +334,10 @@ lookup_protocol: if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; + err = -EAFNOSUPPORT; + if (!inet_netns_ok(net, protocol)) + goto out_rcu_unlock; + sock->ops = answer->ops; answer_prot = answer->prot; answer_no_check = answer->no_check; @@ -446,7 +464,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(&init_net, addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since @@ -784,6 +802,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; int err = 0; + struct net *net = sock_net(sk); switch (cmd) { case SIOCGSTAMP: @@ -795,12 +814,12 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCADDRT: case SIOCDELRT: case SIOCRTMSG: - err = ip_rt_ioctl(sk->sk_net, cmd, (void __user *)arg); + err = ip_rt_ioctl(net, cmd, (void __user *)arg); break; case SIOCDARP: case SIOCGARP: case SIOCSARP: - err = arp_ioctl(sk->sk_net, cmd, (void __user *)arg); + err = arp_ioctl(net, cmd, (void __user *)arg); break; case SIOCGIFADDR: case SIOCSIFADDR: @@ -813,7 +832,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFPFLAGS: case SIOCGIFPFLAGS: case SIOCSIFFLAGS: - err = devinet_ioctl(cmd, (void __user *)arg); + err = devinet_ioctl(net, cmd, (void __user *)arg); break; default: if (sk->sk_prot->ioctl) @@ -1058,8 +1077,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (sysctl_ip_dynaddr > 1) { printk(KERN_INFO "%s(): shifting inet->" - "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - __FUNCTION__, + "saddr from " NIPQUAD_FMT " to " NIPQUAD_FMT "\n", + __func__, NIPQUAD(old_saddr), NIPQUAD(new_saddr)); } @@ -1113,7 +1132,7 @@ int inet_sk_rebuild_header(struct sock *sk) }; security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&init_net, &rt, &fl, sk, 0); + err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); } if (!err) sk_setup_caps(sk, &rt->u.dst); @@ -1231,6 +1250,29 @@ out: return segs; } +int inet_ctl_sock_create(struct sock **sk, unsigned short family, + unsigned short type, unsigned char protocol, + struct net *net) +{ + struct socket *sock; + int rc = sock_create_kern(family, type, protocol, &sock); + + if (rc == 0) { + *sk = sock->sk; + (*sk)->sk_allocation = GFP_ATOMIC; + /* + * Unhash it so that IP input processing does not even see it, + * we do not wish this socket to see incoming packets. + */ + (*sk)->sk_prot->unhash(*sk); + + sk_change_net(*sk, net); + } + return rc; +} + +EXPORT_SYMBOL_GPL(inet_ctl_sock_create); + unsigned long snmp_fold_field(void *mib[], int offt) { unsigned long res = 0; @@ -1283,17 +1325,20 @@ static struct net_protocol tcp_protocol = { .gso_send_check = tcp_v4_gso_send_check, .gso_segment = tcp_tso_segment, .no_policy = 1, + .netns_ok = 1, }; static struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, + .netns_ok = 1, }; static struct net_protocol icmp_protocol = { .handler = icmp_rcv, .no_policy = 1, + .netns_ok = 1, }; static int __init init_ipv4_mibs(void) @@ -1414,7 +1459,7 @@ static int __init inet_init(void) ip_init(); - tcp_v4_init(&inet_family_ops); + tcp_v4_init(); /* Setup TCP slab cache for open requests. */ tcp_init(); @@ -1429,7 +1474,8 @@ static int __init inet_init(void) * Set the ICMP layer up */ - icmp_init(&inet_family_ops); + if (icmp_init() < 0) + panic("Failed to create the ICMP control socket.\n"); /* * Initialise the multicast router diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 8e17f65f400..68b72a7a180 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -242,7 +242,7 @@ static int arp_constructor(struct neighbour *neigh) return -EINVAL; } - neigh->type = inet_addr_type(&init_net, addr); + neigh->type = inet_addr_type(dev_net(dev), addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); @@ -341,14 +341,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(&init_net, ip_hdr(skb)->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; - if (inet_addr_type(&init_net, saddr) == RTN_LOCAL) { + if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -424,7 +424,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) int flag = 0; /*unsigned long now; */ - if (ip_route_output_key(&init_net, &rt, &fl) < 0) + if (ip_route_output_key(dev_net(dev), &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); @@ -475,9 +475,9 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) return 1; } - paddr = ((struct rtable*)skb->dst)->rt_gateway; + paddr = skb->rtable->rt_gateway; - if (arp_set_predefined(inet_addr_type(&init_net, paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); @@ -570,14 +570,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, * Allocate a buffer */ - skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) - + LL_RESERVED_SPACE(dev), GFP_ATOMIC); + skb = alloc_skb(arp_hdr_len(dev) + LL_RESERVED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) return NULL; skb_reserve(skb, LL_RESERVED_SPACE(dev)); skb_reset_network_header(skb); - arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); + arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (src_hw == NULL) @@ -710,6 +709,7 @@ static int arp_process(struct sk_buff *skb) u16 dev_type = dev->type; int addr_type; struct neighbour *n; + struct net *net = dev_net(dev); /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. @@ -805,7 +805,7 @@ static int arp_process(struct sk_buff *skb) /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && - inet_addr_type(&init_net, tip) == RTN_LOCAL && + inet_addr_type(net, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha); @@ -815,7 +815,7 @@ static int arp_process(struct sk_buff *skb) if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input(skb, tip, sip, 0, dev) == 0) { - rt = (struct rtable*)skb->dst; + rt = skb->rtable; addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { @@ -835,7 +835,7 @@ static int arp_process(struct sk_buff *skb) goto out; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &init_net, &tip, dev, 0))) { + (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -858,14 +858,14 @@ static int arp_process(struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (IPV4_DEVCONF_ALL(dev->nd_net, ARP_ACCEPT)) { + if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (n == NULL && arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(&init_net, sip) == RTN_UNICAST) + inet_addr_type(net, sip) == RTN_UNICAST) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -912,13 +912,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, { struct arphdr *arp; - if (dev->nd_net != &init_net) - goto freeskb; - /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ - if (!pskb_may_pull(skb, (sizeof(struct arphdr) + - (2 * dev->addr_len) + - (2 * sizeof(u32))))) + if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto freeskb; arp = arp_hdr(skb); @@ -1201,9 +1196,6 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); @@ -1318,7 +1310,7 @@ static void arp_format_neigh_entry(struct seq_file *seq, #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) } #endif - sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key)); + sprintf(tbuf, NIPQUAD_FMT, NIPQUAD(*(u32*)n->primary_key)); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); @@ -1331,7 +1323,7 @@ static void arp_format_pneigh_entry(struct seq_file *seq, int hatype = dev ? dev->type : 0; char tbuf[16]; - sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key)); + sprintf(tbuf, NIPQUAD_FMT, NIPQUAD(*(u32*)n->key)); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); @@ -1385,13 +1377,29 @@ static const struct file_operations arp_seq_fops = { .release = seq_release_net, }; -static int __init arp_proc_init(void) + +static int __net_init arp_net_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "arp", S_IRUGO, &arp_seq_fops)) + if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops)) return -ENOMEM; return 0; } +static void __net_exit arp_net_exit(struct net *net) +{ + proc_net_remove(net, "arp"); +} + +static struct pernet_operations arp_net_ops = { + .init = arp_net_init, + .exit = arp_net_exit, +}; + +static int __init arp_proc_init(void) +{ + return register_pernet_subsys(&arp_net_ops); +} + #else /* CONFIG_PROC_FS */ static int __init arp_proc_init(void) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 8cd357f4128..4637ded3dba 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1800,7 +1800,6 @@ int cipso_v4_sock_setattr(struct sock *sk, } memcpy(opt->__data, buf, buf_len); opt->optlen = opt_len; - opt->is_data = 1; opt->cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 87490f7bb0f..6848e4760f3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -165,7 +165,7 @@ static struct in_device *inetdev_init(struct net_device *dev) if (!in_dev) goto out; INIT_RCU_HEAD(&in_dev->rcu_head); - memcpy(&in_dev->cnf, dev->nd_net->ipv4.devconf_dflt, + memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; @@ -437,7 +437,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; @@ -446,9 +446,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg ASSERT_RTNL(); - if (net != &init_net) - return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); if (err < 0) goto errout; @@ -555,14 +552,11 @@ errout: static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa; ASSERT_RTNL(); - if (net != &init_net) - return -EINVAL; - ifa = rtm_to_ifaddr(net, nlh); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -595,7 +589,7 @@ static __inline__ int inet_abc_len(__be32 addr) } -int devinet_ioctl(unsigned int cmd, void __user *arg) +int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct ifreq ifr; struct sockaddr_in sin_orig; @@ -624,7 +618,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) *colon = 0; #ifdef CONFIG_KMOD - dev_load(&init_net, ifr.ifr_name); + dev_load(net, ifr.ifr_name); #endif switch (cmd) { @@ -665,7 +659,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - if ((dev = __dev_get_by_name(&init_net, ifr.ifr_name)) == NULL) + if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL) goto done; if (colon) @@ -878,6 +872,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { __be32 addr = 0; struct in_device *in_dev; + struct net *net = dev_net(dev); rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -906,7 +901,7 @@ no_in_dev: */ read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { if ((in_dev = __in_dev_get_rcu(dev)) == NULL) continue; @@ -979,7 +974,7 @@ __be32 inet_confirm_addr(struct in_device *in_dev, if (scope != RT_SCOPE_LINK) return confirm_addr_indev(in_dev, dst, local, scope); - net = in_dev->dev->nd_net; + net = dev_net(in_dev->dev); read_lock(&dev_base_lock); rcu_read_lock(); for_each_netdev(net, dev) { @@ -1045,9 +1040,6 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - ASSERT_RTNL(); if (!in_dev) { @@ -1166,16 +1158,13 @@ nla_put_failure: static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); int idx, ip_idx; struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; int s_ip_idx, s_idx = cb->args[0]; - if (net != &init_net) - return 0; - s_ip_idx = ip_idx = cb->args[1]; idx = 0; for_each_netdev(net, dev) { @@ -1214,7 +1203,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa, struct nlmsghdr *nlh, int err = -ENOBUFS; struct net *net; - net = ifa->ifa_dev->dev->nd_net; + net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (skb == NULL) goto errout; @@ -1528,7 +1517,7 @@ static void devinet_sysctl_register(struct in_device *idev) { neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4", NULL, NULL); - __devinet_sysctl_register(idev->dev->nd_net, idev->dev->name, + __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 86ff2711fc9..0f1557a4ac7 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -257,7 +257,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (in_dev == NULL) goto e_inval; - net = dev->nd_net; + net = dev_net(dev); if (fib_lookup(net, &fl, &res)) goto last_resort; if (res.type != RTN_UNICAST) @@ -583,7 +583,7 @@ errout: static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; @@ -605,7 +605,7 @@ errout: static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; @@ -627,7 +627,7 @@ errout: static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; @@ -674,7 +674,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { - struct net *net = ifa->ifa_dev->dev->nd_net; + struct net *net = dev_net(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -801,15 +801,15 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(dev->nd_net, ifa->ifa_local) != RTN_LOCAL) { + if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. We must flush stray FIB entries. First of all, we scan fib_info list searching for stray nexthop entries, then ignite fib_flush. */ - if (fib_sync_down_addr(dev->nd_net, ifa->ifa_local)) - fib_flush(dev->nd_net); + if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) + fib_flush(dev_net(dev)); } } #undef LOCAL_OK @@ -857,7 +857,7 @@ static void nl_fib_input(struct sk_buff *skb) struct fib_table *tb; u32 pid; - net = skb->sk->sk_net; + net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) @@ -899,7 +899,7 @@ static void nl_fib_lookup_exit(struct net *net) static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) - fib_flush(dev->nd_net); + fib_flush(dev_net(dev)); rt_cache_flush(0); arp_ifdown(dev); } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 8d58d85dfac..02088deb046 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -821,7 +821,7 @@ static struct fib_alias *fib_get_first(struct seq_file *seq) struct fib_table *main_table; struct fn_hash *table; - main_table = fib_get_table(iter->p.net, RT_TABLE_MAIN); + main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); table = (struct fn_hash *)main_table->tb_data; iter->bucket = 0; @@ -959,11 +959,10 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) static void *fib_seq_start(struct seq_file *seq, loff_t *pos) __acquires(fib_hash_lock) { - struct fib_iter_state *iter = seq->private; void *v = NULL; read_lock(&fib_hash_lock); - if (fib_get_table(iter->p.net, RT_TABLE_MAIN)) + if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; return v; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 19274d01afa..1fb56876be5 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -137,7 +137,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_rule_hdr *frh, struct nlattr **tb) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index a13c84763d4..3b83c34019f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -152,6 +152,7 @@ void free_fib_info(struct fib_info *fi) nh->nh_dev = NULL; } endfor_nexthops(fi); fib_info_cnt--; + release_net(fi->fib_net); kfree(fi); } @@ -730,7 +731,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; fib_info_cnt++; - fi->fib_net = net; + fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f6cdc012eec..ea294fffb9c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -122,7 +122,10 @@ struct tnode { unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned int full_children; /* KEYLENGTH bits needed */ unsigned int empty_children; /* KEYLENGTH bits needed */ - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct work; + }; struct node *child[0]; }; @@ -160,7 +163,6 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); -static void tnode_free(struct tnode *tn); static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; @@ -334,6 +336,11 @@ static void __leaf_free_rcu(struct rcu_head *head) kmem_cache_free(trie_leaf_kmem, l); } +static inline void free_leaf(struct leaf *l) +{ + call_rcu_bh(&l->rcu, __leaf_free_rcu); +} + static void __leaf_info_free_rcu(struct rcu_head *head) { kfree(container_of(head, struct leaf_info, rcu)); @@ -346,16 +353,16 @@ static inline void free_leaf_info(struct leaf_info *leaf) static struct tnode *tnode_alloc(size_t size) { - struct page *pages; - if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); + else + return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); +} - pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); - if (!pages) - return NULL; - - return page_address(pages); +static void __tnode_vfree(struct work_struct *arg) +{ + struct tnode *tn = container_of(arg, struct tnode, work); + vfree(tn); } static void __tnode_free_rcu(struct rcu_head *head) @@ -366,16 +373,17 @@ static void __tnode_free_rcu(struct rcu_head *head) if (size <= PAGE_SIZE) kfree(tn); - else - free_pages((unsigned long)tn, get_order(size)); + else { + INIT_WORK(&tn->work, __tnode_vfree); + schedule_work(&tn->work); + } } static inline void tnode_free(struct tnode *tn) { - if (IS_LEAF(tn)) { - struct leaf *l = (struct leaf *) tn; - call_rcu_bh(&l->rcu, __leaf_free_rcu); - } else + if (IS_LEAF(tn)) + free_leaf((struct leaf *) tn); + else call_rcu(&tn->rcu, __tnode_free_rcu); } @@ -1086,7 +1094,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) li = leaf_info_new(plen); if (!li) { - tnode_free((struct tnode *) l); + free_leaf(l); return NULL; } @@ -1122,7 +1130,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (!tn) { free_leaf_info(li); - tnode_free((struct tnode *) l); + free_leaf(l); return NULL; } @@ -1578,7 +1586,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) } else rcu_assign_pointer(t->trie, NULL); - tnode_free((struct tnode *) l); + free_leaf(l); } /* @@ -1665,7 +1673,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) return 0; } -static int trie_flush_list(struct trie *t, struct list_head *head) +static int trie_flush_list(struct list_head *head) { struct fib_alias *fa, *fa_node; int found = 0; @@ -1683,7 +1691,7 @@ static int trie_flush_list(struct trie *t, struct list_head *head) return found; } -static int trie_flush_leaf(struct trie *t, struct leaf *l) +static int trie_flush_leaf(struct leaf *l) { int found = 0; struct hlist_head *lih = &l->list; @@ -1691,7 +1699,7 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) struct leaf_info *li = NULL; hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { - found += trie_flush_list(t, &li->falh); + found += trie_flush_list(&li->falh); if (list_empty(&li->falh)) { hlist_del_rcu(&li->hlist); @@ -1782,7 +1790,7 @@ static int fn_trie_flush(struct fib_table *tb) int found = 0; for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { - found += trie_flush_leaf(t, l); + found += trie_flush_leaf(l); if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll); @@ -2029,9 +2037,8 @@ struct fib_table *fib_hash_table(u32 id) /* Depth first Trie walk iterator */ struct fib_trie_iter { struct seq_net_private p; - struct trie *trie_local, *trie_main; + struct fib_table *tb; struct tnode *tnode; - struct trie *trie; unsigned index; unsigned depth; }; @@ -2084,31 +2091,26 @@ rescan: static struct node *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct node *n ; + struct node *n; if (!t) return NULL; n = rcu_dereference(t->trie); - - if (!iter) + if (!n) return NULL; - if (n) { - if (IS_TNODE(n)) { - iter->tnode = (struct tnode *) n; - iter->trie = t; - iter->index = 0; - iter->depth = 1; - } else { - iter->tnode = NULL; - iter->trie = t; - iter->index = 0; - iter->depth = 0; - } - return n; + if (IS_TNODE(n)) { + iter->tnode = (struct tnode *) n; + iter->index = 0; + iter->depth = 1; + } else { + iter->tnode = NULL; + iter->index = 0; + iter->depth = 0; } - return NULL; + + return n; } static void trie_collect_stats(struct trie *t, struct trie_stat *s) @@ -2119,8 +2121,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) memset(s, 0, sizeof(*s)); rcu_read_lock(); - for (n = fib_trie_get_first(&iter, t); n; - n = fib_trie_get_next(&iter)) { + for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { struct leaf *l = (struct leaf *)n; struct leaf_info *li; @@ -2209,36 +2210,48 @@ static void trie_show_usage(struct seq_file *seq, } #endif /* CONFIG_IP_FIB_TRIE_STATS */ -static void fib_trie_show(struct seq_file *seq, const char *name, - struct trie *trie) +static void fib_table_print(struct seq_file *seq, struct fib_table *tb) { - struct trie_stat stat; - - trie_collect_stats(trie, &stat); - seq_printf(seq, "%s:\n", name); - trie_show_stats(seq, &stat); -#ifdef CONFIG_IP_FIB_TRIE_STATS - trie_show_usage(seq, &trie->stats); -#endif + if (tb->tb_id == RT_TABLE_LOCAL) + seq_puts(seq, "Local:\n"); + else if (tb->tb_id == RT_TABLE_MAIN) + seq_puts(seq, "Main:\n"); + else + seq_printf(seq, "Id %d:\n", tb->tb_id); } + static int fib_triestat_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; - struct fib_table *tb; + unsigned int h; seq_printf(seq, "Basic info: size of leaf:" " %Zd bytes, size of tnode: %Zd bytes.\n", sizeof(struct leaf), sizeof(struct tnode)); - tb = fib_get_table(net, RT_TABLE_LOCAL); - if (tb) - fib_trie_show(seq, "Local", (struct trie *) tb->tb_data); + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct hlist_node *node; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + struct trie *t = (struct trie *) tb->tb_data; + struct trie_stat stat; + + if (!t) + continue; - tb = fib_get_table(net, RT_TABLE_MAIN); - if (tb) - fib_trie_show(seq, "Main", (struct trie *) tb->tb_data); + fib_table_print(seq, tb); + + trie_collect_stats(t, &stat); + trie_show_stats(seq, &stat); +#ifdef CONFIG_IP_FIB_TRIE_STATS + trie_show_usage(seq, &t->stats); +#endif + } + } return 0; } @@ -2274,67 +2287,79 @@ static const struct file_operations fib_triestat_fops = { .release = fib_triestat_seq_release, }; -static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, - loff_t pos) +static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { + struct fib_trie_iter *iter = seq->private; + struct net *net = seq_file_net(seq); loff_t idx = 0; - struct node *n; + unsigned int h; - for (n = fib_trie_get_first(iter, iter->trie_local); - n; ++idx, n = fib_trie_get_next(iter)) { - if (pos == idx) - return n; - } + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct hlist_node *node; + struct fib_table *tb; - for (n = fib_trie_get_first(iter, iter->trie_main); - n; ++idx, n = fib_trie_get_next(iter)) { - if (pos == idx) - return n; + hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { + struct node *n; + + for (n = fib_trie_get_first(iter, + (struct trie *) tb->tb_data); + n; n = fib_trie_get_next(iter)) + if (pos == idx++) { + iter->tb = tb; + return n; + } + } } + return NULL; } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { - struct fib_trie_iter *iter = seq->private; - struct fib_table *tb; - - if (!iter->trie_local) { - tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL); - if (tb) - iter->trie_local = (struct trie *) tb->tb_data; - } - if (!iter->trie_main) { - tb = fib_get_table(iter->p.net, RT_TABLE_MAIN); - if (tb) - iter->trie_main = (struct trie *) tb->tb_data; - } rcu_read_lock(); - if (*pos == 0) - return SEQ_START_TOKEN; - return fib_trie_get_idx(iter, *pos - 1); + return fib_trie_get_idx(seq, *pos); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_trie_iter *iter = seq->private; - void *l = v; + struct net *net = seq_file_net(seq); + struct fib_table *tb = iter->tb; + struct hlist_node *tb_node; + unsigned int h; + struct node *n; ++*pos; - if (v == SEQ_START_TOKEN) - return fib_trie_get_idx(iter, 0); - - v = fib_trie_get_next(iter); - BUG_ON(v == l); - if (v) - return v; + /* next node in same table */ + n = fib_trie_get_next(iter); + if (n) + return n; - /* continue scan in next trie */ - if (iter->trie == iter->trie_local) - return fib_trie_get_first(iter, iter->trie_main); + /* walk rest of this hash chain */ + h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); + while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { + tb = hlist_entry(tb_node, struct fib_table, tb_hlist); + n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); + if (n) + goto found; + } + /* new hash chain */ + while (++h < FIB_TABLE_HASHSZ) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) { + n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); + if (n) + goto found; + } + } return NULL; + +found: + iter->tb = tb; + return n; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) @@ -2391,22 +2416,15 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) const struct fib_trie_iter *iter = seq->private; struct node *n = v; - if (v == SEQ_START_TOKEN) - return 0; - - if (!node_parent_rcu(n)) { - if (iter->trie == iter->trie_local) - seq_puts(seq, "<local>:\n"); - else - seq_puts(seq, "<main>:\n"); - } + if (!node_parent_rcu(n)) + fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); seq_indent(seq, iter->depth-1); - seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n", + seq_printf(seq, " +-- " NIPQUAD_FMT "/%d %d %d %d\n", NIPQUAD(prf), tn->pos, tn->bits, tn->full_children, tn->empty_children); @@ -2417,7 +2435,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) __be32 val = htonl(l->key); seq_indent(seq, iter->depth); - seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val)); + seq_printf(seq, " |-- " NIPQUAD_FMT "\n", NIPQUAD(val)); hlist_for_each_entry_rcu(li, node, &l->list, hlist) { struct fib_alias *fa; @@ -2502,7 +2520,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) struct fib_table *tb; rcu_read_lock(); - tb = fib_get_table(iter->p.net, RT_TABLE_MAIN); + tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 40508babad8..f064031f203 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -93,6 +93,7 @@ #include <asm/uaccess.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/inet_common.h> /* * Build xmit assembly blocks @@ -188,29 +189,6 @@ struct icmp_err icmp_err_convert[] = { }, }; -/* Control parameters for ECHO replies. */ -int sysctl_icmp_echo_ignore_all __read_mostly; -int sysctl_icmp_echo_ignore_broadcasts __read_mostly = 1; - -/* Control parameter - ignore bogus broadcast responses? */ -int sysctl_icmp_ignore_bogus_error_responses __read_mostly = 1; - -/* - * Configurable global rate limit. - * - * ratelimit defines tokens/packet consumed for dst->rate_token bucket - * ratemask defines which icmp types are ratelimited by setting - * it's bit position. - * - * default: - * dest unreachable (3), source quench (4), - * time exceeded (11), parameter problem (12) - */ - -int sysctl_icmp_ratelimit __read_mostly = 1 * HZ; -int sysctl_icmp_ratemask __read_mostly = 0x1818; -int sysctl_icmp_errors_use_inbound_ifaddr __read_mostly; - /* * ICMP control array. This specifies what to do with each ICMP. */ @@ -229,14 +207,16 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; * * On SMP we have one ICMP socket per-cpu. */ -static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL; -#define icmp_socket __get_cpu_var(__icmp_socket) +static struct sock *icmp_sk(struct net *net) +{ + return net->ipv4.icmp_sk[smp_processor_id()]; +} -static inline int icmp_xmit_lock(void) +static inline int icmp_xmit_lock(struct sock *sk) { local_bh_disable(); - if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) { + if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ @@ -246,9 +226,9 @@ static inline int icmp_xmit_lock(void) return 0; } -static inline void icmp_xmit_unlock(void) +static inline void icmp_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&icmp_socket->sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); } /* @@ -291,7 +271,8 @@ int xrlim_allow(struct dst_entry *dst, int timeout) return rc; } -static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) +static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + int type, int code) { struct dst_entry *dst = &rt->u.dst; int rc = 1; @@ -308,8 +289,8 @@ static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & sysctl_icmp_ratemask) - rc = xrlim_allow(dst, sysctl_icmp_ratelimit); + if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) + rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); out: return rc; } @@ -346,19 +327,21 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, static void icmp_push_reply(struct icmp_bxm *icmp_param, struct ipcm_cookie *ipc, struct rtable *rt) { + struct sock *sk; struct sk_buff *skb; - if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, + sk = icmp_sk(dev_net(rt->u.dst.dev)); + if (ip_append_data(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) - ip_flush_pending_frames(icmp_socket->sk); - else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { + ip_flush_pending_frames(sk); + else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); __wsum csum = 0; struct sk_buff *skb1; - skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) { + skb_queue_walk(&sk->sk_write_queue, skb1) { csum = csum_add(csum, skb1->csum); } csum = csum_partial_copy_nocheck((void *)&icmp_param->data, @@ -366,7 +349,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, icmp_param->head_len, csum); icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(icmp_socket->sk); + ip_push_pending_frames(sk); } } @@ -376,16 +359,17 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct sock *sk = icmp_socket->sk; - struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; + struct net *net = dev_net(rt->u.dst.dev); + struct sock *sk = icmp_sk(net); + struct inet_sock *inet = inet_sk(sk); __be32 daddr; if (ip_options_echo(&icmp_param->replyopts, skb)) return; - if (icmp_xmit_lock()) + if (icmp_xmit_lock(sk)) return; icmp_param->data.icmph.checksum = 0; @@ -405,15 +389,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) .tos = RT_TOS(ip_hdr(skb)->tos) } }, .proto = IPPROTO_ICMP }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(rt->u.dst.dev->nd_net, &rt, &fl)) + if (ip_route_output_key(net, &rt, &fl)) goto out_unlock; } - if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, + if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, icmp_param->data.icmph.code)) icmp_push_reply(icmp_param, &ipc, rt); ip_rt_put(rt); out_unlock: - icmp_xmit_unlock(); + icmp_xmit_unlock(sk); } @@ -433,15 +417,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct iphdr *iph; int room; struct icmp_bxm icmp_param; - struct rtable *rt = (struct rtable *)skb_in->dst; + struct rtable *rt = skb_in->rtable; struct ipcm_cookie ipc; __be32 saddr; u8 tos; struct net *net; + struct sock *sk; if (!rt) goto out; - net = rt->u.dst.dev->nd_net; + net = dev_net(rt->u.dst.dev); + sk = icmp_sk(net); /* * Find the original header. It is expected to be valid, of course. @@ -505,7 +491,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } - if (icmp_xmit_lock()) + if (icmp_xmit_lock(sk)) return; /* @@ -516,7 +502,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; - if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) + if (rt->fl.iif && + net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) dev = dev_get_by_index(net, rt->fl.iif); if (dev) { @@ -544,7 +531,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); - inet_sk(icmp_socket->sk)->tos = tos; + inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; @@ -609,7 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) RT_TOS(tos), rt2->u.dst.dev); dst_release(&rt2->u.dst); - rt2 = (struct rtable *)skb_in->dst; + rt2 = skb_in->rtable; skb_in->dst = odst; } @@ -634,7 +621,7 @@ relookup_failed: } route_done: - if (!icmpv4_xrlim_allow(rt, type, code)) + if (!icmpv4_xrlim_allow(net, rt, type, code)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ @@ -654,7 +641,7 @@ route_done: ende: ip_rt_put(rt); out_unlock: - icmp_xmit_unlock(); + icmp_xmit_unlock(sk); out:; } @@ -672,7 +659,7 @@ static void icmp_unreach(struct sk_buff *skb) u32 info = 0; struct net *net; - net = skb->dst->dev->nd_net; + net = dev_net(skb->dst->dev); /* * Incomplete header ? @@ -698,7 +685,7 @@ static void icmp_unreach(struct sk_buff *skb) break; case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { - LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: " + LIMIT_NETDEBUG(KERN_INFO "ICMP: " NIPQUAD_FMT ": " "fragmentation needed " "and DF set.\n", NIPQUAD(iph->daddr)); @@ -710,7 +697,7 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source " + LIMIT_NETDEBUG(KERN_INFO "ICMP: " NIPQUAD_FMT ": Source " "Route Failed.\n", NIPQUAD(iph->daddr)); break; @@ -740,12 +727,12 @@ static void icmp_unreach(struct sk_buff *skb) * get the other vendor to fix their kit. */ - if (!sysctl_icmp_ignore_bogus_error_responses && + if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) - printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " + printk(KERN_WARNING NIPQUAD_FMT " sent an invalid ICMP " "type %u, code %u " - "error to a broadcast: %u.%u.%u.%u on %s\n", + "error to a broadcast: " NIPQUAD_FMT " on %s\n", NIPQUAD(ip_hdr(skb)->saddr), icmph->type, icmph->code, NIPQUAD(iph->daddr), @@ -835,7 +822,10 @@ out_err: static void icmp_echo(struct sk_buff *skb) { - if (!sysctl_icmp_echo_ignore_all) { + struct net *net; + + net = dev_net(skb->dst->dev); + if (!net->ipv4.sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; icmp_param.data.icmph = *icmp_hdr(skb); @@ -938,7 +928,7 @@ static void icmp_address(struct sk_buff *skb) static void icmp_address_reply(struct sk_buff *skb) { - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; struct net_device *dev = skb->dev; struct in_device *in_dev; struct in_ifaddr *ifa; @@ -963,8 +953,8 @@ static void icmp_address_reply(struct sk_buff *skb) break; } if (!ifa && net_ratelimit()) { - printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from " - "%s/%u.%u.%u.%u\n", + printk(KERN_INFO "Wrong address mask " NIPQUAD_FMT " from " + "%s/" NIPQUAD_FMT "\n", NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); } } @@ -983,7 +973,7 @@ static void icmp_discard(struct sk_buff *skb) int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { int nh; @@ -1038,6 +1028,9 @@ int icmp_rcv(struct sk_buff *skb) */ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { + struct net *net; + + net = dev_net(rt->u.dst.dev); /* * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be * silently ignored (we let user decide with a sysctl). @@ -1046,7 +1039,7 @@ int icmp_rcv(struct sk_buff *skb) */ if ((icmph->type == ICMP_ECHO || icmph->type == ICMP_TIMESTAMP) && - sysctl_icmp_echo_ignore_broadcasts) { + net->ipv4.sysctl_icmp_echo_ignore_broadcasts) { goto error; } if (icmph->type != ICMP_ECHO && @@ -1141,38 +1134,84 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { }, }; -void __init icmp_init(struct net_proto_family *ops) +static void __net_exit icmp_sk_exit(struct net *net) { - struct inet_sock *inet; int i; - for_each_possible_cpu(i) { - int err; + for_each_possible_cpu(i) + inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); + kfree(net->ipv4.icmp_sk); + net->ipv4.icmp_sk = NULL; +} + +int __net_init icmp_sk_init(struct net *net) +{ + int i, err; - err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, - &per_cpu(__icmp_socket, i)); + net->ipv4.icmp_sk = + kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); + if (net->ipv4.icmp_sk == NULL) + return -ENOMEM; + for_each_possible_cpu(i) { + struct sock *sk; + + err = inet_ctl_sock_create(&sk, PF_INET, + SOCK_RAW, IPPROTO_ICMP, net); if (err < 0) - panic("Failed to create the ICMP control socket.\n"); + goto fail; - per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC; + net->ipv4.icmp_sk[i] = sk; /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ - per_cpu(__icmp_socket, i)->sk->sk_sndbuf = + sk->sk_sndbuf = (2 * ((64 * 1024) + sizeof(struct sk_buff))); - inet = inet_sk(per_cpu(__icmp_socket, i)->sk); - inet->uc_ttl = -1; - inet->pmtudisc = IP_PMTUDISC_DONT; - - /* Unhash it so that IP input processing does not even - * see it, we do not wish this socket to see incoming - * packets. - */ - per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk); + inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; } + + /* Control parameters for ECHO replies. */ + net->ipv4.sysctl_icmp_echo_ignore_all = 0; + net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1; + + /* Control parameter - ignore bogus broadcast responses? */ + net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1; + + /* + * Configurable global rate limit. + * + * ratelimit defines tokens/packet consumed for dst->rate_token + * bucket ratemask defines which icmp types are ratelimited by + * setting it's bit position. + * + * default: + * dest unreachable (3), source quench (4), + * time exceeded (11), parameter problem (12) + */ + + net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; + net->ipv4.sysctl_icmp_ratemask = 0x1818; + net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + + return 0; + +fail: + for_each_possible_cpu(i) + inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); + kfree(net->ipv4.icmp_sk); + return err; +} + +static struct pernet_operations __net_initdata icmp_sk_ops = { + .init = icmp_sk_init, + .exit = icmp_sk_exit, +}; + +int __init icmp_init(void) +{ + return register_pernet_device(&icmp_sk_ops); } EXPORT_SYMBOL(icmp_err_convert); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 732cd07e607..6250f4239b6 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -130,12 +130,12 @@ */ #define IGMP_V1_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 1 || \ + (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 2 || \ + (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) @@ -948,7 +948,7 @@ int igmp_rcv(struct sk_buff *skb) case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ - if (((struct rtable*)skb->dst)->fl.iif == 0) + if (skb->rtable->fl.iif == 0) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || @@ -1198,6 +1198,9 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) ASSERT_RTNL(); + if (dev_net(in_dev->dev) != &init_net) + return; + for (im=in_dev->mc_list; im; im=im->next) { if (im->multiaddr == addr) { im->users++; @@ -1277,6 +1280,9 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) ASSERT_RTNL(); + if (dev_net(in_dev->dev) != &init_net) + return; + for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { if (i->multiaddr==addr) { if (--i->users == 0) { @@ -1304,6 +1310,9 @@ void ip_mc_down(struct in_device *in_dev) ASSERT_RTNL(); + if (dev_net(in_dev->dev) != &init_net) + return; + for (i=in_dev->mc_list; i; i=i->next) igmp_group_dropped(i); @@ -1324,6 +1333,9 @@ void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); + if (dev_net(in_dev->dev) != &init_net) + return; + in_dev->mc_tomb = NULL; #ifdef CONFIG_IP_MULTICAST in_dev->mr_gq_running = 0; @@ -1347,6 +1359,9 @@ void ip_mc_up(struct in_device *in_dev) ASSERT_RTNL(); + if (dev_net(in_dev->dev) != &init_net) + return; + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for (i=in_dev->mc_list; i; i=i->next) @@ -1363,6 +1378,9 @@ void ip_mc_destroy_dev(struct in_device *in_dev) ASSERT_RTNL(); + if (dev_net(in_dev->dev) != &init_net) + return; + /* Deactivate timers */ ip_mc_down(in_dev); @@ -1744,6 +1762,9 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) if (!ipv4_is_multicast(addr)) return -EINVAL; + if (sock_net(sk) != &init_net) + return -EPROTONOSUPPORT; + rtnl_lock(); in_dev = ip_mc_find_dev(imr); @@ -1812,6 +1833,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) u32 ifindex; int ret = -EADDRNOTAVAIL; + if (sock_net(sk) != &init_net) + return -EPROTONOSUPPORT; + rtnl_lock(); in_dev = ip_mc_find_dev(imr); ifindex = imr->imr_ifindex; @@ -1857,6 +1881,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!ipv4_is_multicast(addr)) return -EINVAL; + if (sock_net(sk) != &init_net) + return -EPROTONOSUPPORT; + rtnl_lock(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; @@ -1990,6 +2017,9 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; + if (sock_net(sk) != &init_net) + return -EPROTONOSUPPORT; + rtnl_lock(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; @@ -2070,6 +2100,9 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, if (!ipv4_is_multicast(addr)) return -EINVAL; + if (sock_net(sk) != &init_net) + return -EPROTONOSUPPORT; + rtnl_lock(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; @@ -2132,6 +2165,9 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; + if (sock_net(sk) != &init_net) + return -EPROTONOSUPPORT; + rtnl_lock(); err = -EADDRNOTAVAIL; @@ -2216,6 +2252,9 @@ void ip_mc_drop_socket(struct sock *sk) if (inet->mc_list == NULL) return; + if (sock_net(sk) != &init_net) + return; + rtnl_lock(); while ((iml = inet->mc_list) != NULL) { struct in_device *in_dev; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b189278c7bc..828ea211ff2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -55,6 +55,13 @@ int inet_csk_bind_conflict(const struct sock *sk, struct hlist_node *node; int reuse = sk->sk_reuse; + /* + * Unlike other sk lookup places we do not check + * for sk_net here, since _all_ the socks listed + * in tb->owners list belong to the same net - the + * one this bucket belongs to. + */ + sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && !inet_v6_ipv6only(sk2) && @@ -80,12 +87,12 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo; + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_bind_hashbucket *head; struct hlist_node *node; struct inet_bind_bucket *tb; int ret; - struct net *net = sk->sk_net; + struct net *net = sock_net(sk); local_bh_disable(); if (!snum) { @@ -133,8 +140,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) goto tb_not_found; tb_found: if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse > 1) - goto success; if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN) { goto success; @@ -333,7 +338,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .dport = ireq->rmt_port } } }; security_req_classify_flow(req, &fl); - if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) { + if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -414,8 +419,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, struct inet_connection_sock *icsk = inet_csk(parent); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; - int thresh = max_retries; + int thresh = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; int i, budget; @@ -451,9 +455,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, } } - if (queue->rskq_defer_accept) - max_retries = queue->rskq_defer_accept; - budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; @@ -461,9 +462,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + if (req->retrans < thresh && + !req->rsk_ops->rtx_syn_ack(parent, req)) { unsigned long timeo; if (req->retrans++ == 0) @@ -656,25 +656,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); -int inet_csk_ctl_sock_create(struct socket **sock, unsigned short family, - unsigned short type, unsigned char protocol) -{ - int rc = sock_create_kern(family, type, protocol, sock); - - if (rc == 0) { - (*sock)->sk->sk_allocation = GFP_ATOMIC; - inet_sk((*sock)->sk)->uc_ttl = -1; - /* - * Unhash it so that IP input processing does not even see it, - * we do not wish this socket to see incoming packets. - */ - (*sock)->sk->sk_prot->unhash((*sock)->sk); - } - return rc; -} - -EXPORT_SYMBOL_GPL(inet_csk_ctl_sock_create); - #ifdef CONFIG_COMPAT int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index a0a3c78cb5e..4ed429bd595 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -107,10 +107,10 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) if (del_timer(&fq->timer)) atomic_dec(&fq->refcnt); - if (!(fq->last_in & COMPLETE)) { + if (!(fq->last_in & INET_FRAG_COMPLETE)) { fq_unlink(fq, f); atomic_dec(&fq->refcnt); - fq->last_in |= COMPLETE; + fq->last_in |= INET_FRAG_COMPLETE; } } @@ -134,7 +134,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, struct sk_buff *fp; struct netns_frags *nf; - BUG_TRAP(q->last_in & COMPLETE); + BUG_TRAP(q->last_in & INET_FRAG_COMPLETE); BUG_TRAP(del_timer(&q->timer) == 0); /* Release all fragment data. */ @@ -177,7 +177,7 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) read_unlock(&f->lock); spin_lock(&q->lock); - if (!(q->last_in & COMPLETE)) + if (!(q->last_in & INET_FRAG_COMPLETE)) inet_frag_kill(q, f); spin_unlock(&q->lock); @@ -209,7 +209,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); write_unlock(&f->lock); - qp_in->last_in |= COMPLETE; + qp_in->last_in |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1aba606f6bb..2023d37b270 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -35,7 +35,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb != NULL) { - tb->ib_net = net; + tb->ib_net = hold_net(net); tb->port = snum; tb->fastreuse = 0; INIT_HLIST_HEAD(&tb->owners); @@ -51,6 +51,7 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); + release_net(tb->ib_net); kmem_cache_free(cachep, tb); } } @@ -68,7 +69,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo; + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; @@ -91,6 +92,22 @@ void inet_put_port(struct sock *sk) EXPORT_SYMBOL(inet_put_port); +void __inet_inherit_port(struct sock *sk, struct sock *child) +{ + struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; + const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); + struct inet_bind_hashbucket *head = &table->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + sk_add_bind_node(child, &tb->owners); + inet_csk(child)->icsk_bind_hash = tb; + spin_unlock(&head->lock); +} + +EXPORT_SYMBOL_GPL(__inet_inherit_port); + /* * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. * Look, when several writers sleep and reader wakes them up, all but one @@ -139,7 +156,7 @@ static struct sock *inet_lookup_listener_slow(struct net *net, sk_for_each(sk, node, head) { const struct inet_sock *inet = inet_sk(sk); - if (sk->sk_net == net && inet->num == hnum && + if (net_eq(sock_net(sk), net) && inet->num == hnum && !ipv6_only_sock(sk)) { const __be32 rcv_saddr = inet->rcv_saddr; int score = sk->sk_family == PF_INET ? 1 : 0; @@ -182,7 +199,7 @@ struct sock *__inet_lookup_listener(struct net *net, if (inet->num == hnum && !sk->sk_node.next && (!inet->rcv_saddr || inet->rcv_saddr == daddr) && (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - !sk->sk_bound_dev_if && sk->sk_net == net) + !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) goto sherry_cache; sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif); } @@ -254,7 +271,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; - struct net *net = sk->sk_net; + struct net *net = sock_net(sk); prefetch(head->chain.first); write_lock(lock); @@ -288,7 +305,7 @@ unique: sk->sk_hash = hash; BUG_TRAP(sk_unhashed(sk)); __sk_add_node(sk, &head->chain); - sock_prot_inuse_add(sk->sk_prot, 1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock(lock); if (twp) { @@ -318,7 +335,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk) void __inet_hash_nolisten(struct sock *sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo; + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_head *list; rwlock_t *lock; struct inet_ehash_bucket *head; @@ -332,14 +349,14 @@ void __inet_hash_nolisten(struct sock *sk) write_lock(lock); __sk_add_node(sk, list); - sock_prot_inuse_add(sk->sk_prot, 1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); static void __inet_hash(struct sock *sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo; + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_head *list; rwlock_t *lock; @@ -354,7 +371,7 @@ static void __inet_hash(struct sock *sk) inet_listen_wlock(hashinfo); __sk_add_node(sk, list); - sock_prot_inuse_add(sk->sk_prot, 1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock(lock); wake_up(&hashinfo->lhash_wait); } @@ -372,7 +389,7 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { rwlock_t *lock; - struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo; + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; if (sk_unhashed(sk)) goto out; @@ -387,7 +404,7 @@ void inet_unhash(struct sock *sk) } if (__sk_del_node_init(sk)) - sock_prot_inuse_add(sk->sk_prot, -1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(lock); out: if (sk->sk_state == TCP_LISTEN) @@ -406,7 +423,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; int ret; - struct net *net = sk->sk_net; + struct net *net = sock_net(sk); if (!snum) { int i, remaining, low, high, port; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 717c411a5c6..ce16e9ac24c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -57,6 +57,7 @@ void inet_twsk_put(struct inet_timewait_sock *tw) printk(KERN_DEBUG "%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif + release_net(twsk_net(tw)); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } @@ -91,7 +92,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* Step 2: Remove SK from established hash. */ if (__sk_del_node_init(sk)) - sock_prot_inuse_add(sk->sk_prot, -1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); /* Step 3: Hash TW into TIMEWAIT chain. */ inet_twsk_add_node(tw, &ehead->twchain); @@ -124,7 +125,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_prot = sk->sk_prot_creator; - tw->tw_net = sk->sk_net; + twsk_net_set(tw, hold_net(sock_net(sk))); atomic_set(&tw->tw_refcnt, 1); inet_twsk_dead_node_init(tw); __module_get(tw->tw_prot->owner); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index a4506c8cfef..4813c39b438 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -80,7 +80,7 @@ int ip_forward(struct sk_buff *skb) if (!xfrm4_route_forward(skb)) goto drop; - rt = (struct rtable*)skb->dst; + rt = skb->rtable; if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 3b2e5adca83..cd6ce6ac635 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -194,7 +194,7 @@ static void ip_expire(unsigned long arg) spin_lock(&qp->q.lock); - if (qp->q.last_in & COMPLETE) + if (qp->q.last_in & INET_FRAG_COMPLETE) goto out; ipq_kill(qp); @@ -202,10 +202,13 @@ static void ip_expire(unsigned long arg) IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); - if ((qp->q.last_in&FIRST_IN) && qp->q.fragments != NULL) { + if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; + struct net *net; + + net = container_of(qp->q.net, struct net, ipv4.frags); /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) { + if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); dev_put(head->dev); } @@ -298,7 +301,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) int ihl, end; int err = -ENOENT; - if (qp->q.last_in & COMPLETE) + if (qp->q.last_in & INET_FRAG_COMPLETE) goto err; if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && @@ -324,9 +327,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * or have different end, the segment is corrrupted. */ if (end < qp->q.len || - ((qp->q.last_in & LAST_IN) && end != qp->q.len)) + ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) goto err; - qp->q.last_in |= LAST_IN; + qp->q.last_in |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { @@ -336,7 +339,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ - if (qp->q.last_in & LAST_IN) + if (qp->q.last_in & INET_FRAG_LAST_IN) goto err; qp->q.len = end; } @@ -435,9 +438,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.meat += skb->len; atomic_add(skb->truesize, &qp->q.net->mem); if (offset == 0) - qp->q.last_in |= FIRST_IN; + qp->q.last_in |= INET_FRAG_FIRST_IN; - if (qp->q.last_in == (FIRST_IN | LAST_IN) && qp->q.meat == qp->q.len) + if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + qp->q.meat == qp->q.len) return ip_frag_reasm(qp, prev, dev); write_lock(&ip4_frags.lock); @@ -553,7 +557,7 @@ out_nomem: out_oversize: if (net_ratelimit()) printk(KERN_INFO - "Oversized IP packet from %d.%d.%d.%d.\n", + "Oversized IP packet from " NIPQUAD_FMT ".\n", NIPQUAD(qp->saddr)); out_fail: IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); @@ -568,7 +572,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); - net = skb->dev ? skb->dev->nd_net : skb->dst->dev->nd_net; + net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); /* Start by cleaning up the memory. */ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) ip_evictor(net); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e7821ba7a9a..2ada033406d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -39,6 +39,8 @@ #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #ifdef CONFIG_IPV6 #include <net/ipv6.h> @@ -122,7 +124,14 @@ static void ipgre_tunnel_setup(struct net_device *dev); static int ipgre_fb_tunnel_init(struct net_device *dev); -static struct net_device *ipgre_fb_tunnel_dev; +#define HASH_SIZE 16 + +static int ipgre_net_id; +struct ipgre_net { + struct ip_tunnel *tunnels[4][HASH_SIZE]; + + struct net_device *fb_tunnel_dev; +}; /* Tunnel hash table */ @@ -142,39 +151,38 @@ static struct net_device *ipgre_fb_tunnel_dev; will match fallback tunnel. */ -#define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) -static struct ip_tunnel *tunnels[4][HASH_SIZE]; - -#define tunnels_r_l (tunnels[3]) -#define tunnels_r (tunnels[2]) -#define tunnels_l (tunnels[1]) -#define tunnels_wc (tunnels[0]) +#define tunnels_r_l tunnels[3] +#define tunnels_r tunnels[2] +#define tunnels_l tunnels[1] +#define tunnels_wc tunnels[0] static DEFINE_RWLOCK(ipgre_lock); /* Given src, dst and key, find appropriate for input tunnel. */ -static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be32 key) +static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net, + __be32 remote, __be32 local, __be32 key) { unsigned h0 = HASH(remote); unsigned h1 = HASH(key); struct ip_tunnel *t; + struct ipgre_net *ign = net_generic(net, ipgre_net_id); - for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) return t; } } - for (t = tunnels_r[h0^h1]; t; t = t->next) { + for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { if (remote == t->parms.iph.daddr) { if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) return t; } } - for (t = tunnels_l[h1]; t; t = t->next) { + for (t = ign->tunnels_l[h1]; t; t = t->next) { if (local == t->parms.iph.saddr || (local == t->parms.iph.daddr && ipv4_is_multicast(local))) { @@ -182,17 +190,18 @@ static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be3 return t; } } - for (t = tunnels_wc[h1]; t; t = t->next) { + for (t = ign->tunnels_wc[h1]; t; t = t->next) { if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) return t; } - if (ipgre_fb_tunnel_dev->flags&IFF_UP) - return netdev_priv(ipgre_fb_tunnel_dev); + if (ign->fb_tunnel_dev->flags&IFF_UP) + return netdev_priv(ign->fb_tunnel_dev); return NULL; } -static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms) +static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, + struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -207,17 +216,18 @@ static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms) h ^= HASH(remote); } - return &tunnels[prio][h]; + return &ign->tunnels[prio][h]; } -static inline struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, + struct ip_tunnel *t) { - return __ipgre_bucket(&t->parms); + return __ipgre_bucket(ign, &t->parms); } -static void ipgre_tunnel_link(struct ip_tunnel *t) +static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipgre_bucket(t); + struct ip_tunnel **tp = ipgre_bucket(ign, t); t->next = *tp; write_lock_bh(&ipgre_lock); @@ -225,11 +235,11 @@ static void ipgre_tunnel_link(struct ip_tunnel *t) write_unlock_bh(&ipgre_lock); } -static void ipgre_tunnel_unlink(struct ip_tunnel *t) +static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) { struct ip_tunnel **tp; - for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) { + for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { if (t == *tp) { write_lock_bh(&ipgre_lock); *tp = t->next; @@ -239,7 +249,8 @@ static void ipgre_tunnel_unlink(struct ip_tunnel *t) } } -static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create) +static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -247,8 +258,9 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int struct ip_tunnel *t, **tp, *nt; struct net_device *dev; char name[IFNAMSIZ]; + struct ipgre_net *ign = net_generic(net, ipgre_net_id); - for (tp = __ipgre_bucket(parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { if (key == t->parms.i_key) return t; @@ -266,6 +278,8 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int if (!dev) return NULL; + dev_net_set(dev, net); + if (strchr(name, '%')) { if (dev_alloc_name(dev, name) < 0) goto failed_free; @@ -279,7 +293,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int goto failed_free; dev_hold(dev); - ipgre_tunnel_link(nt); + ipgre_tunnel_link(ign, nt); return nt; failed_free: @@ -289,7 +303,10 @@ failed_free: static void ipgre_tunnel_uninit(struct net_device *dev) { - ipgre_tunnel_unlink(netdev_priv(dev)); + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); + + ipgre_tunnel_unlink(ign, netdev_priv(dev)); dev_put(dev); } @@ -363,7 +380,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info) } read_lock(&ipgre_lock); - t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0); + t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr, + (flags&GRE_KEY) ? + *(((__be32*)p) + (grehlen>>2) - 1) : 0); if (t == NULL || t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) goto out; @@ -476,7 +495,7 @@ out: fl.fl4_dst = eiph->saddr; fl.fl4_tos = RT_TOS(eiph->tos); fl.proto = IPPROTO_GRE; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(dev_net(skb->dev), &rt, &fl)) { kfree_skb(skb2); return; } @@ -489,7 +508,7 @@ out: fl.fl4_dst = eiph->daddr; fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&init_net, &rt, &fl) || + if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); kfree_skb(skb2); @@ -596,7 +615,8 @@ static int ipgre_rcv(struct sk_buff *skb) } read_lock(&ipgre_lock); - if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) { + if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), + iph->saddr, iph->daddr, key)) != NULL) { secpath_reset(skb); skb->protocol = *(__be16*)(h + 2); @@ -619,7 +639,7 @@ static int ipgre_rcv(struct sk_buff *skb) #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { /* Looped back packet, drop it! */ - if (((struct rtable*)skb->dst)->fl.iif == 0) + if (skb->rtable->fl.iif == 0) goto drop; tunnel->stat.multicast++; skb->pkt_type = PACKET_BROADCAST; @@ -699,7 +719,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb->protocol == htons(ETH_P_IP)) { - rt = (struct rtable*)skb->dst; + rt = skb->rtable; if ((dst = rt->rt_gateway) == 0) goto tx_error_icmp; } @@ -744,7 +764,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .saddr = tiph->saddr, .tos = RT_TOS(tos) } }, .proto = IPPROTO_GRE }; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(dev_net(dev), &rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error; } @@ -917,7 +937,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_GRE }; struct rtable *rt; - if (!ip_route_output_key(&init_net, &rt, &fl)) { + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->u.dst.dev; ip_rt_put(rt); } @@ -925,7 +945,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len; @@ -954,16 +974,18 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) int err = 0; struct ip_tunnel_parm p; struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct ipgre_net *ign = net_generic(net, ipgre_net_id); switch (cmd) { case SIOCGETTUNNEL: t = NULL; - if (dev == ipgre_fb_tunnel_dev) { + if (dev == ign->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } - t = ipgre_tunnel_locate(&p, 0); + t = ipgre_tunnel_locate(net, &p, 0); } if (t == NULL) t = netdev_priv(dev); @@ -995,9 +1017,9 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if (!(p.o_flags&GRE_KEY)) p.o_key = 0; - t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { err = -EEXIST; @@ -1017,14 +1039,14 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EINVAL; break; } - ipgre_tunnel_unlink(t); + ipgre_tunnel_unlink(ign, t); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; t->parms.o_key = p.o_key; memcpy(dev->dev_addr, &p.iph.saddr, 4); memcpy(dev->broadcast, &p.iph.daddr, 4); - ipgre_tunnel_link(t); + ipgre_tunnel_link(ign, t); netdev_state_change(dev); } } @@ -1052,15 +1074,15 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if (!capable(CAP_NET_ADMIN)) goto done; - if (dev == ipgre_fb_tunnel_dev) { + if (dev == ign->fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; err = -ENOENT; - if ((t = ipgre_tunnel_locate(&p, 0)) == NULL) + if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL) goto done; err = -EPERM; - if (t == netdev_priv(ipgre_fb_tunnel_dev)) + if (t == netdev_priv(ign->fb_tunnel_dev)) goto done; dev = t->dev; } @@ -1173,7 +1195,7 @@ static int ipgre_open(struct net_device *dev) .tos = RT_TOS(t->parms.iph.tos) } }, .proto = IPPROTO_GRE }; struct rtable *rt; - if (ip_route_output_key(&init_net, &rt, &fl)) + if (ip_route_output_key(dev_net(dev), &rt, &fl)) return -EADDRNOTAVAIL; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1190,7 +1212,7 @@ static int ipgre_close(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; - in_dev = inetdev_by_index(dev->nd_net, t->mlink); + in_dev = inetdev_by_index(dev_net(dev), t->mlink); if (in_dev) { ip_mc_dec_group(in_dev, t->parms.iph.daddr); in_dev_put(in_dev); @@ -1216,6 +1238,7 @@ static void ipgre_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; + dev->features |= NETIF_F_NETNS_LOCAL; } static int ipgre_tunnel_init(struct net_device *dev) @@ -1251,10 +1274,11 @@ static int ipgre_tunnel_init(struct net_device *dev) return 0; } -static int __init ipgre_fb_tunnel_init(struct net_device *dev) +static int ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; + struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -1265,7 +1289,7 @@ static int __init ipgre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct iphdr) + 4; dev_hold(dev); - tunnels_wc[0] = tunnel; + ign->tunnels_wc[0] = tunnel; return 0; } @@ -1273,56 +1297,98 @@ static int __init ipgre_fb_tunnel_init(struct net_device *dev) static struct net_protocol ipgre_protocol = { .handler = ipgre_rcv, .err_handler = ipgre_err, + .netns_ok = 1, }; +static void ipgre_destroy_tunnels(struct ipgre_net *ign) +{ + int prio; -/* - * And now the modules code and kernel interface. - */ + for (prio = 0; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + while ((t = ign->tunnels[prio][h]) != NULL) + unregister_netdevice(t->dev); + } + } +} -static int __init ipgre_init(void) +static int ipgre_init_net(struct net *net) { int err; + struct ipgre_net *ign; - printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + err = -ENOMEM; + ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL); + if (ign == NULL) + goto err_alloc; - if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { - printk(KERN_INFO "ipgre init: can't add protocol\n"); - return -EAGAIN; - } + err = net_assign_generic(net, ipgre_net_id, ign); + if (err < 0) + goto err_assign; - ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", + ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", ipgre_tunnel_setup); - if (!ipgre_fb_tunnel_dev) { + if (!ign->fb_tunnel_dev) { err = -ENOMEM; - goto err1; + goto err_alloc_dev; } - ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init; + ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init; + dev_net_set(ign->fb_tunnel_dev, net); - if ((err = register_netdev(ipgre_fb_tunnel_dev))) - goto err2; -out: + if ((err = register_netdev(ign->fb_tunnel_dev))) + goto err_reg_dev; + + return 0; + +err_reg_dev: + free_netdev(ign->fb_tunnel_dev); +err_alloc_dev: + /* nothing */ +err_assign: + kfree(ign); +err_alloc: return err; -err2: - free_netdev(ipgre_fb_tunnel_dev); -err1: - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); - goto out; } -static void __exit ipgre_destroy_tunnels(void) +static void ipgre_exit_net(struct net *net) { - int prio; + struct ipgre_net *ign; - for (prio = 0; prio < 4; prio++) { - int h; - for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - while ((t = tunnels[prio][h]) != NULL) - unregister_netdevice(t->dev); - } + ign = net_generic(net, ipgre_net_id); + rtnl_lock(); + ipgre_destroy_tunnels(ign); + rtnl_unlock(); + kfree(ign); +} + +static struct pernet_operations ipgre_net_ops = { + .init = ipgre_init_net, + .exit = ipgre_exit_net, +}; + +/* + * And now the modules code and kernel interface. + */ + +static int __init ipgre_init(void) +{ + int err; + + printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + + if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { + printk(KERN_INFO "ipgre init: can't add protocol\n"); + return -EAGAIN; } + + err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops); + if (err < 0) + inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + + return err; } static void __exit ipgre_fini(void) @@ -1330,9 +1396,7 @@ static void __exit ipgre_fini(void) if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); - rtnl_lock(); - ipgre_destroy_tunnels(); - rtnl_unlock(); + unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); } module_init(ipgre_init); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 65631391d47..7b4bad6d572 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -160,6 +160,7 @@ int ip_call_ra_chain(struct sk_buff *skb) struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; + struct net_device *dev = skb->dev; read_lock(&ip_ra_lock); for (ra = ip_ra_chain; ra; ra = ra->next) { @@ -170,7 +171,8 @@ int ip_call_ra_chain(struct sk_buff *skb) */ if (sk && inet_sk(sk)->num == protocol && (!sk->sk_bound_dev_if || - sk->sk_bound_dev_if == skb->dev->ifindex)) { + sk->sk_bound_dev_if == dev->ifindex) && + sock_net(sk) == dev_net(dev)) { if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { read_unlock(&ip_ra_lock); @@ -197,6 +199,8 @@ int ip_call_ra_chain(struct sk_buff *skb) static int ip_local_deliver_finish(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); + __skb_pull(skb, ip_hdrlen(skb)); /* Point into the IP datagram, just past the header. */ @@ -212,7 +216,8 @@ static int ip_local_deliver_finish(struct sk_buff *skb) raw = raw_local_deliver(skb, protocol); hash = protocol & (MAX_INET_PROTOS - 1); - if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { + ipprot = rcu_dereference(inet_protos[hash]); + if (ipprot != NULL && (net == &init_net || ipprot->netns_ok)) { int ret; if (!ipprot->no_policy) { @@ -283,13 +288,14 @@ static inline int ip_rcv_options(struct sk_buff *skb) } iph = ip_hdr(skb); + opt = &(IPCB(skb)->opt); + opt->optlen = iph->ihl*4 - sizeof(struct iphdr); - if (ip_options_compile(NULL, skb)) { + if (ip_options_compile(dev_net(dev), opt, skb)) { IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); goto drop; } - opt = &(IPCB(skb)->opt); if (unlikely(opt->srr)) { struct in_device *in_dev = in_dev_get(dev); if (in_dev) { @@ -297,7 +303,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "source route option " - "%u.%u.%u.%u -> %u.%u.%u.%u\n", + NIPQUAD_FMT " -> " NIPQUAD_FMT "\n", NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); in_dev_put(in_dev); @@ -351,7 +357,7 @@ static int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; - rt = (struct rtable*)skb->dst; + rt = skb->rtable; if (rt->rt_type == RTN_MULTICAST) IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) @@ -372,9 +378,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct iphdr *iph; u32 len; - if (dev->nd_net != &init_net) - goto drop; - /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 4d315158fd3..d107543d3f8 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -45,7 +45,6 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); - opt->is_data = 0; if (opt->srr) memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); @@ -95,8 +94,6 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) memset(dopt, 0, sizeof(struct ip_options)); - dopt->is_data = 1; - sopt = &(IPCB(skb)->opt); if (sopt->optlen == 0) { @@ -107,10 +104,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) sptr = skb_network_header(skb); dptr = dopt->__data; - if (skb->dst) - daddr = ((struct rtable*)skb->dst)->rt_spec_dst; - else - daddr = ip_hdr(skb)->daddr; + daddr = skb->rtable->rt_spec_dst; if (sopt->rr) { optlen = sptr[sopt->rr+1]; @@ -151,7 +145,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) __be32 addr; memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(&init_net, addr) != RTN_LOCAL) { + if (inet_addr_type(dev_net(skb->dst->dev), addr) != RTN_LOCAL) { dopt->ts_needtime = 1; soffset += 8; } @@ -254,26 +248,22 @@ void ip_options_fragment(struct sk_buff * skb) * If opt == NULL, then skb->data should point to IP header. */ -int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) +int ip_options_compile(struct net *net, + struct ip_options * opt, struct sk_buff * skb) { int l; unsigned char * iph; unsigned char * optptr; int optlen; unsigned char * pp_ptr = NULL; - struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL; - - if (!opt) { - opt = &(IPCB(skb)->opt); - iph = skb_network_header(skb); - opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); - optptr = iph + sizeof(struct iphdr); - opt->is_data = 0; - } else { - optptr = opt->is_data ? opt->__data : - (unsigned char *)&(ip_hdr(skb)[1]); - iph = optptr - sizeof(struct iphdr); - } + struct rtable *rt = NULL; + + if (skb != NULL) { + rt = skb->rtable; + optptr = (unsigned char *)&(ip_hdr(skb)[1]); + } else + optptr = opt->__data; + iph = optptr - sizeof(struct iphdr); for (l = opt->optlen; l > 0; ) { switch (*optptr) { @@ -400,7 +390,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); - if (inet_addr_type(&init_net, addr) == RTN_UNICAST) + if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) timeptr = (__be32*)&optptr[optptr[2]+3]; @@ -517,14 +507,13 @@ static struct ip_options *ip_options_get_alloc(const int optlen) GFP_KERNEL); } -static int ip_options_get_finish(struct ip_options **optp, +static int ip_options_get_finish(struct net *net, struct ip_options **optp, struct ip_options *opt, int optlen) { while (optlen & 3) opt->__data[optlen++] = IPOPT_END; opt->optlen = optlen; - opt->is_data = 1; - if (optlen && ip_options_compile(opt, NULL)) { + if (optlen && ip_options_compile(net, opt, NULL)) { kfree(opt); return -EINVAL; } @@ -533,7 +522,8 @@ static int ip_options_get_finish(struct ip_options **optp, return 0; } -int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen) +int ip_options_get_from_user(struct net *net, struct ip_options **optp, + unsigned char __user *data, int optlen) { struct ip_options *opt = ip_options_get_alloc(optlen); @@ -543,10 +533,11 @@ int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *dat kfree(opt); return -EFAULT; } - return ip_options_get_finish(optp, opt, optlen); + return ip_options_get_finish(net, optp, opt, optlen); } -int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen) +int ip_options_get(struct net *net, struct ip_options **optp, + unsigned char *data, int optlen) { struct ip_options *opt = ip_options_get_alloc(optlen); @@ -554,14 +545,14 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen) return -ENOMEM; if (optlen) memcpy(opt->__data, data, optlen); - return ip_options_get_finish(optp, opt, optlen); + return ip_options_get_finish(net, optp, opt, optlen); } void ip_forward_options(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); unsigned char * optptr; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { @@ -609,7 +600,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; struct rtable *rt2; int err; @@ -634,13 +625,13 @@ int ip_options_rcv_srr(struct sk_buff *skb) } memcpy(&nexthop, &optptr[srrptr-1], 4); - rt = (struct rtable*)skb->dst; - skb->dst = NULL; + rt = skb->rtable; + skb->rtable = NULL; err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); - rt2 = (struct rtable*)skb->dst; + rt2 = skb->rtable; if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { ip_rt_put(rt2); - skb->dst = &rt->u.dst; + skb->rtable = rt; return -EINVAL; } ip_rt_put(rt); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 341779e685d..08349267ceb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -142,7 +142,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options *opt) { struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; struct iphdr *iph; /* Build the IP header. */ @@ -240,7 +240,7 @@ static int ip_finish_output(struct sk_buff *skb) int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; struct net_device *dev = rt->u.dst.dev; /* @@ -321,7 +321,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ - rt = (struct rtable *) skb->dst; + rt = skb->rtable; if (rt != NULL) goto packet_routed; @@ -351,7 +351,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) * itself out. */ security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) + if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } sk_setup_caps(sk, &rt->u.dst); @@ -441,7 +441,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; __be16 not_last_frag; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; int err = 0; dev = rt->u.dst.dev; @@ -825,7 +825,7 @@ int ip_append_data(struct sock *sk, inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); - inet->cork.rt = rt; + inet->cork.dst = &rt->u.dst; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; @@ -834,7 +834,7 @@ int ip_append_data(struct sock *sk, transhdrlen += exthdrlen; } } else { - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -1083,7 +1083,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = inet->cork.rt; + rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; @@ -1208,10 +1208,8 @@ static void ip_cork_release(struct inet_sock *inet) inet->cork.flags &= ~IPCORK_OPT; kfree(inet->cork.opt); inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + dst_release(inet->cork.dst); + inet->cork.dst = NULL; } /* @@ -1224,7 +1222,7 @@ int ip_push_pending_frames(struct sock *sk) struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = NULL; - struct rtable *rt = inet->cork.rt; + struct rtable *rt = (struct rtable *)inet->cork.dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; @@ -1357,7 +1355,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } replyopts; struct ipcm_cookie ipc; __be32 daddr; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; if (ip_options_echo(&replyopts.opt, skb)) return; @@ -1384,7 +1382,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar .dport = tcp_hdr(skb)->source } }, .proto = sk->sk_protocol }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(sk->sk_net, &rt, &fl)) + if (ip_route_output_key(sock_net(sk), &rt, &fl)) return; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c2921d01e92..d8adfd4972e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -57,7 +57,7 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info; - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; info.ipi_addr.s_addr = ip_hdr(skb)->daddr; if (rt) { @@ -163,7 +163,7 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) ip_cmsg_recv_security(msg, skb); } -int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) { int err; struct cmsghdr *cmsg; @@ -176,7 +176,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); - err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); + err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) return err; break; @@ -449,7 +449,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, struct ip_options * opt = NULL; if (optlen > 40 || optlen < 0) goto e_inval; - err = ip_options_get_from_user(&opt, optval, optlen); + err = ip_options_get_from_user(sock_net(sk), &opt, + optval, optlen); if (err) break; if (inet->is_icsk) { @@ -589,13 +590,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = 0; break; } - dev = ip_dev_find(&init_net, mreq.imr_address.s_addr); + dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); if (dev) { mreq.imr_ifindex = dev->ifindex; dev_put(dev); } } else - dev = __dev_get_by_index(&init_net, mreq.imr_ifindex); + dev = __dev_get_by_index(sock_net(sk), mreq.imr_ifindex); err = -EADDRNOTAVAIL; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 58b60b2fb01..fb53ddfea5b 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -179,7 +179,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", + NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIPQUAD_FMT "\n", spi, NIPQUAD(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 4824fe8996b..0f42d1c1f69 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -292,7 +292,7 @@ static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) mm_segment_t oldfs = get_fs(); set_fs(get_ds()); - res = devinet_ioctl(cmd, (struct ifreq __user *) arg); + res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg); set_fs(oldfs); return res; } @@ -376,7 +376,7 @@ static int __init ic_defaults(void) */ if (!ic_host_name_set) - sprintf(init_utsname()->nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + sprintf(init_utsname()->nodename, NIPQUAD_FMT, NIPQUAD(ic_myaddr)); if (root_server_addr == NONE) root_server_addr = ic_servaddr; @@ -389,11 +389,11 @@ static int __init ic_defaults(void) else if (IN_CLASSC(ntohl(ic_myaddr))) ic_netmask = htonl(IN_CLASSC_NET); else { - printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n", + printk(KERN_ERR "IP-Config: Unable to guess netmask for address " NIPQUAD_FMT "\n", NIPQUAD(ic_myaddr)); return -1; } - printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask)); + printk("IP-Config: Guessing netmask " NIPQUAD_FMT "\n", NIPQUAD(ic_netmask)); } return 0; @@ -434,7 +434,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; - if (dev->nd_net != &init_net) + if (dev_net(dev) != &init_net) goto drop; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) @@ -460,10 +460,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (rarp->ar_pro != htons(ETH_P_IP)) goto drop; - if (!pskb_may_pull(skb, - sizeof(struct arphdr) + - (2 * dev->addr_len) + - (2 * 4))) + if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto drop; /* OK, it is all there and looks valid, process... */ @@ -857,7 +854,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str struct ic_device *d; int len, ext_len; - if (dev->nd_net != &init_net) + if (dev_net(dev) != &init_net) goto drop; /* Perform verifications before taking the lock. */ @@ -984,9 +981,9 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str ic_myaddr = b->your_ip; ic_servaddr = server_id; #ifdef IPCONFIG_DEBUG - printk("DHCP: Offered address %u.%u.%u.%u", + printk("DHCP: Offered address " NIPQUAD_FMT, NIPQUAD(ic_myaddr)); - printk(" by server %u.%u.%u.%u\n", + printk(" by server " NIPQUAD_FMT "\n", NIPQUAD(ic_servaddr)); #endif /* The DHCP indicated server address takes @@ -1182,11 +1179,11 @@ static int __init ic_dynamic(void) return -1; } - printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", + printk("IP-Config: Got %s answer from " NIPQUAD_FMT ", ", ((ic_got_reply & IC_RARP) ? "RARP" : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), NIPQUAD(ic_servaddr)); - printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr)); + printk("my address is " NIPQUAD_FMT "\n", NIPQUAD(ic_myaddr)); return 0; } @@ -1212,12 +1209,12 @@ static int pnp_seq_show(struct seq_file *seq, void *v) for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { if (ic_nameservers[i] != NONE) seq_printf(seq, - "nameserver %u.%u.%u.%u\n", + "nameserver " NIPQUAD_FMT "\n", NIPQUAD(ic_nameservers[i])); } if (ic_servaddr != NONE) seq_printf(seq, - "bootserver %u.%u.%u.%u\n", + "bootserver " NIPQUAD_FMT "\n", NIPQUAD(ic_servaddr)); return 0; } @@ -1392,13 +1389,13 @@ static int __init ip_auto_config(void) */ printk("IP-Config: Complete:"); printk("\n device=%s", ic_dev->name); - printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr)); - printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask)); - printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway)); + printk(", addr=" NIPQUAD_FMT, NIPQUAD(ic_myaddr)); + printk(", mask=" NIPQUAD_FMT, NIPQUAD(ic_netmask)); + printk(", gw=" NIPQUAD_FMT, NIPQUAD(ic_gateway)); printk(",\n host=%s, domain=%s, nis-domain=%s", utsname()->nodename, ic_domain, utsname()->domainname); - printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr)); - printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr)); + printk(",\n bootserver=" NIPQUAD_FMT, NIPQUAD(ic_servaddr)); + printk(", rootserver=" NIPQUAD_FMT, NIPQUAD(root_server_addr)); printk(", rootpath=%s", root_server_path); printk("\n"); #endif /* !SILENT */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index dbaed69de06..149111f08e8 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -115,49 +115,57 @@ #include <net/ipip.h> #include <net/inet_ecn.h> #include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) +static int ipip_net_id; +struct ipip_net { + struct ip_tunnel *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel *tunnels_r[HASH_SIZE]; + struct ip_tunnel *tunnels_l[HASH_SIZE]; + struct ip_tunnel *tunnels_wc[1]; + struct ip_tunnel **tunnels[4]; + + struct net_device *fb_tunnel_dev; +}; + static int ipip_fb_tunnel_init(struct net_device *dev); static int ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); -static struct net_device *ipip_fb_tunnel_dev; - -static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; -static struct ip_tunnel *tunnels_r[HASH_SIZE]; -static struct ip_tunnel *tunnels_l[HASH_SIZE]; -static struct ip_tunnel *tunnels_wc[1]; -static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; - static DEFINE_RWLOCK(ipip_lock); -static struct ip_tunnel * ipip_tunnel_lookup(__be32 remote, __be32 local) +static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, + __be32 remote, __be32 local) { unsigned h0 = HASH(remote); unsigned h1 = HASH(local); struct ip_tunnel *t; + struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) return t; } - for (t = tunnels_r[h0]; t; t = t->next) { + for (t = ipn->tunnels_r[h0]; t; t = t->next) { if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) return t; } - for (t = tunnels_l[h1]; t; t = t->next) { + for (t = ipn->tunnels_l[h1]; t; t = t->next) { if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) return t; } - if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) return t; return NULL; } -static struct ip_tunnel **__ipip_bucket(struct ip_tunnel_parm *parms) +static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, + struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -172,19 +180,20 @@ static struct ip_tunnel **__ipip_bucket(struct ip_tunnel_parm *parms) prio |= 1; h ^= HASH(local); } - return &tunnels[prio][h]; + return &ipn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, + struct ip_tunnel *t) { - return __ipip_bucket(&t->parms); + return __ipip_bucket(ipn, &t->parms); } -static void ipip_tunnel_unlink(struct ip_tunnel *t) +static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) { struct ip_tunnel **tp; - for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) { + for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { if (t == *tp) { write_lock_bh(&ipip_lock); *tp = t->next; @@ -194,9 +203,9 @@ static void ipip_tunnel_unlink(struct ip_tunnel *t) } } -static void ipip_tunnel_link(struct ip_tunnel *t) +static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip_bucket(t); + struct ip_tunnel **tp = ipip_bucket(ipn, t); t->next = *tp; write_lock_bh(&ipip_lock); @@ -204,15 +213,17 @@ static void ipip_tunnel_link(struct ip_tunnel *t) write_unlock_bh(&ipip_lock); } -static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) +static struct ip_tunnel * ipip_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; struct ip_tunnel *t, **tp, *nt; struct net_device *dev; char name[IFNAMSIZ]; + struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (tp = __ipip_bucket(parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -228,6 +239,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c if (dev == NULL) return NULL; + dev_net_set(dev, net); + if (strchr(name, '%')) { if (dev_alloc_name(dev, name) < 0) goto failed_free; @@ -241,7 +254,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c goto failed_free; dev_hold(dev); - ipip_tunnel_link(nt); + ipip_tunnel_link(ipn, nt); return nt; failed_free: @@ -251,12 +264,15 @@ failed_free: static void ipip_tunnel_uninit(struct net_device *dev) { - if (dev == ipip_fb_tunnel_dev) { + struct net *net = dev_net(dev); + struct ipip_net *ipn = net_generic(net, ipip_net_id); + + if (dev == ipn->fb_tunnel_dev) { write_lock_bh(&ipip_lock); - tunnels_wc[0] = NULL; + ipn->tunnels_wc[0] = NULL; write_unlock_bh(&ipip_lock); } else - ipip_tunnel_unlink(netdev_priv(dev)); + ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); } @@ -305,7 +321,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) err = -ENOENT; read_lock(&ipip_lock); - t = ipip_tunnel_lookup(iph->daddr, iph->saddr); + t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); if (t == NULL || t->parms.iph.daddr == 0) goto out; @@ -401,7 +417,7 @@ out: fl.fl4_daddr = eiph->saddr; fl.fl4_tos = RT_TOS(eiph->tos); fl.proto = IPPROTO_IPIP; - if (ip_route_output_key(&init_net, &rt, &key)) { + if (ip_route_output_key(dev_net(skb->dev), &rt, &key)) { kfree_skb(skb2); return 0; } @@ -414,7 +430,7 @@ out: fl.fl4_daddr = eiph->daddr; fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&init_net, &rt, &fl) || + if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) || rt->u.dst.dev->type != ARPHRD_TUNNEL) { ip_rt_put(rt); kfree_skb(skb2); @@ -465,7 +481,8 @@ static int ipip_rcv(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); read_lock(&ipip_lock); - if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), + iph->saddr, iph->daddr)) != NULL) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { read_unlock(&ipip_lock); kfree_skb(skb); @@ -528,7 +545,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!dst) { /* NBMA tunnel */ - if ((rt = (struct rtable*)skb->dst) == NULL) { + if ((rt = skb->rtable) == NULL) { tunnel->stat.tx_fifo_errors++; goto tx_error; } @@ -543,7 +560,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .saddr = tiph->saddr, .tos = RT_TOS(tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(dev_net(dev), &rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } @@ -664,7 +681,7 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; struct rtable *rt; - if (!ip_route_output_key(&init_net, &rt, &fl)) { + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->u.dst.dev; ip_rt_put(rt); } @@ -672,7 +689,7 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); @@ -687,16 +704,18 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) int err = 0; struct ip_tunnel_parm p; struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct ipip_net *ipn = net_generic(net, ipip_net_id); switch (cmd) { case SIOCGETTUNNEL: t = NULL; - if (dev == ipip_fb_tunnel_dev) { + if (dev == ipn->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } - t = ipip_tunnel_locate(&p, 0); + t = ipip_tunnel_locate(net, &p, 0); } if (t == NULL) t = netdev_priv(dev); @@ -722,9 +741,9 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if (p.iph.ttl) p.iph.frag_off |= htons(IP_DF); - t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { err = -EEXIST; @@ -737,12 +756,12 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } t = netdev_priv(dev); - ipip_tunnel_unlink(t); + ipip_tunnel_unlink(ipn, t); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); memcpy(dev->broadcast, &p.iph.daddr, 4); - ipip_tunnel_link(t); + ipip_tunnel_link(ipn, t); netdev_state_change(dev); } } @@ -770,15 +789,15 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if (!capable(CAP_NET_ADMIN)) goto done; - if (dev == ipip_fb_tunnel_dev) { + if (dev == ipn->fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; err = -ENOENT; - if ((t = ipip_tunnel_locate(&p, 0)) == NULL) + if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) goto done; err = -EPERM; - if (t->dev == ipip_fb_tunnel_dev) + if (t->dev == ipn->fb_tunnel_dev) goto done; dev = t->dev; } @@ -822,6 +841,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; + dev->features |= NETIF_F_NETNS_LOCAL; } static int ipip_tunnel_init(struct net_device *dev) @@ -841,10 +861,11 @@ static int ipip_tunnel_init(struct net_device *dev) return 0; } -static int __init ipip_fb_tunnel_init(struct net_device *dev) +static int ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; + struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -854,7 +875,7 @@ static int __init ipip_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; dev_hold(dev); - tunnels_wc[0] = tunnel; + ipn->tunnels_wc[0] = tunnel; return 0; } @@ -867,50 +888,98 @@ static struct xfrm_tunnel ipip_handler = { static char banner[] __initdata = KERN_INFO "IPv4 over IPv4 tunneling driver\n"; -static int __init ipip_init(void) +static void ipip_destroy_tunnels(struct ipip_net *ipn) +{ + int prio; + + for (prio = 1; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + while ((t = ipn->tunnels[prio][h]) != NULL) + unregister_netdevice(t->dev); + } + } +} + +static int ipip_init_net(struct net *net) { int err; + struct ipip_net *ipn; - printk(banner); + err = -ENOMEM; + ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); + if (ipn == NULL) + goto err_alloc; - if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { - printk(KERN_INFO "ipip init: can't register tunnel\n"); - return -EAGAIN; - } + err = net_assign_generic(net, ipip_net_id, ipn); + if (err < 0) + goto err_assign; - ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), + ipn->tunnels[0] = ipn->tunnels_wc; + ipn->tunnels[1] = ipn->tunnels_l; + ipn->tunnels[2] = ipn->tunnels_r; + ipn->tunnels[3] = ipn->tunnels_r_l; + + ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "tunl0", ipip_tunnel_setup); - if (!ipip_fb_tunnel_dev) { + if (!ipn->fb_tunnel_dev) { err = -ENOMEM; - goto err1; + goto err_alloc_dev; } - ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init; + ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init; + dev_net_set(ipn->fb_tunnel_dev, net); + + if ((err = register_netdev(ipn->fb_tunnel_dev))) + goto err_reg_dev; + + return 0; - if ((err = register_netdev(ipip_fb_tunnel_dev))) - goto err2; - out: +err_reg_dev: + free_netdev(ipn->fb_tunnel_dev); +err_alloc_dev: + /* nothing */ +err_assign: + kfree(ipn); +err_alloc: return err; - err2: - free_netdev(ipip_fb_tunnel_dev); - err1: - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); - goto out; } -static void __exit ipip_destroy_tunnels(void) +static void ipip_exit_net(struct net *net) { - int prio; + struct ipip_net *ipn; - for (prio = 1; prio < 4; prio++) { - int h; - for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - while ((t = tunnels[prio][h]) != NULL) - unregister_netdevice(t->dev); - } + ipn = net_generic(net, ipip_net_id); + rtnl_lock(); + ipip_destroy_tunnels(ipn); + unregister_netdevice(ipn->fb_tunnel_dev); + rtnl_unlock(); + kfree(ipn); +} + +static struct pernet_operations ipip_net_ops = { + .init = ipip_init_net, + .exit = ipip_exit_net, +}; + +static int __init ipip_init(void) +{ + int err; + + printk(banner); + + if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { + printk(KERN_INFO "ipip init: can't register tunnel\n"); + return -EAGAIN; } + + err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); + if (err) + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); + + return err; } static void __exit ipip_fini(void) @@ -918,10 +987,7 @@ static void __exit ipip_fini(void) if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); - rtnl_lock(); - ipip_destroy_tunnels(); - unregister_netdevice(ipip_fb_tunnel_dev); - rtnl_unlock(); + unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); } module_init(ipip_init); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a94f52c207a..11700a4dcd9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -849,7 +849,7 @@ static void mrtsock_destruct(struct sock *sk) { rtnl_lock(); if (sk == mroute_socket) { - IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)--; + IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--; write_lock_bh(&mrt_lock); mroute_socket=NULL; @@ -898,7 +898,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt mroute_socket=sk; write_unlock_bh(&mrt_lock); - IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)++; + IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++; } rtnl_unlock(); return ret; @@ -1089,7 +1089,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v struct vif_device *v; int ct; - if (dev->nd_net != &init_net) + if (dev_net(dev) != &init_net) return NOTIFY_DONE; if (event != NETDEV_UNREGISTER) @@ -1283,7 +1283,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local if (vif_table[vif].dev != skb->dev) { int true_vifi; - if (((struct rtable*)skb->dst)->fl.iif == 0) { + if (skb->rtable->fl.iif == 0) { /* It is our own packet, looped back. Very complicated situation... @@ -1357,7 +1357,7 @@ dont_forward: int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; - int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL; + int local = skb->rtable->rt_flags&RTCF_LOCAL; /* Packet is looped back after forward, it should not be forwarded second time, but still can be delivered locally. @@ -1594,7 +1594,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) { int err; struct mfc_cache *cache; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; read_lock(&mrt_lock); cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 12dc0d640b6..620e40ff79a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -550,7 +550,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u to app %s on port %u\n", - __FUNCTION__, + __func__, NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 1fa7b330b9a..1caa2908373 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -344,7 +344,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u to app %s on port %u\n", - __FUNCTION__, + __func__, NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 948378d0a75..69c56663cc9 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -916,7 +916,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) if (!tinfo) return -ENOMEM; - IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, task_pid_nr(current)); + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", sizeof(struct ip_vs_sync_conn)); @@ -956,7 +956,7 @@ int stop_sync_thread(int state) (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) return -ESRCH; - IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, task_pid_nr(current)); + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); IP_VS_INFO("stopping sync thread %d ...\n", (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 9a904c6c0dc..f8edacdf991 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -182,21 +182,44 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, } return csum; } - EXPORT_SYMBOL(nf_ip_checksum); +static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, + skb->len - dataoff, 0); + skb->ip_summed = CHECKSUM_NONE; + csum = __skb_checksum_complete_head(skb, dataoff + len); + if (!csum) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return csum; +} + static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) { return ip_route_output_key(&init_net, (struct rtable **)dst, fl); } static const struct nf_afinfo nf_ip_afinfo = { - .family = AF_INET, - .checksum = nf_ip_checksum, - .route = nf_ip_route, - .saveroute = nf_ip_saveroute, - .reroute = nf_ip_reroute, - .route_key_size = sizeof(struct ip_rt_info), + .family = AF_INET, + .checksum = nf_ip_checksum, + .checksum_partial = nf_ip_checksum_partial, + .route = nf_ip_route, + .saveroute = nf_ip_saveroute, + .reroute = nf_ip_reroute, + .route_key_size = sizeof(struct ip_rt_info), }; static int ipv4_netfilter_init(void) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9a077cb2479..0c95cd5872f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -241,10 +241,25 @@ config NF_NAT_SNMP_BASIC # <expr> '&&' <expr> (6) # # (6) Returns the result of min(/expr/, /expr/). +config NF_NAT_PROTO_DCCP + tristate + depends on NF_NAT && NF_CT_PROTO_DCCP + default NF_NAT && NF_CT_PROTO_DCCP + config NF_NAT_PROTO_GRE tristate depends on NF_NAT && NF_CT_PROTO_GRE +config NF_NAT_PROTO_UDPLITE + tristate + depends on NF_NAT && NF_CT_PROTO_UDPLITE + default NF_NAT && NF_CT_PROTO_UDPLITE + +config NF_NAT_PROTO_SCTP + tristate + default NF_NAT && NF_CT_PROTO_SCTP + depends on NF_NAT && NF_CT_PROTO_SCTP + config NF_NAT_FTP tristate depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 0c7dc78a62e..d9b92fbf557 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -10,7 +10,7 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o endif endif -nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o +nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o # connection tracking @@ -29,7 +29,10 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o # NAT protocols (nf_nat) +obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o +obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o +obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a7591ce344d..03e83a65aec 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -52,14 +52,14 @@ MODULE_DESCRIPTION("arptables core"); do { \ if (!(x)) \ printk("ARP_NF_ASSERT: %s:%s:%u\n", \ - __FUNCTION__, __FILE__, __LINE__); \ + __func__, __FILE__, __LINE__); \ } while(0) #else #define ARP_NF_ASSERT(x) #endif static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, - char *hdr_addr, int len) + const char *hdr_addr, int len) { int i, ret; @@ -80,8 +80,8 @@ static inline int arp_packet_match(const struct arphdr *arphdr, const char *outdev, const struct arpt_arp *arpinfo) { - char *arpptr = (char *)(arphdr + 1); - char *src_devaddr, *tgt_devaddr; + const char *arpptr = (char *)(arphdr + 1); + const char *src_devaddr, *tgt_devaddr; __be32 src_ipaddr, tgt_ipaddr; int i, ret; @@ -222,21 +222,18 @@ unsigned int arpt_do_table(struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, - struct arpt_table *table) + struct xt_table *table) { static const char nulldevname[IFNAMSIZ]; unsigned int verdict = NF_DROP; - struct arphdr *arp; + const struct arphdr *arp; bool hotdrop = false; struct arpt_entry *e, *back; const char *indev, *outdev; void *table_base; - struct xt_table_info *private; + const struct xt_table_info *private; - /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ - if (!pskb_may_pull(skb, (sizeof(struct arphdr) + - (2 * skb->dev->addr_len) + - (2 * sizeof(u32))))) + if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; indev = in ? in->name : nulldevname; @@ -355,7 +352,7 @@ static int mark_source_chains(struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - struct arpt_standard_target *t + const struct arpt_standard_target *t = (void *)arpt_get_target(e); int visited = e->comefrom & (1 << hook); @@ -440,7 +437,7 @@ static int mark_source_chains(struct xt_table_info *newinfo, static inline int check_entry(struct arpt_entry *e, const char *name) { - struct arpt_entry_target *t; + const struct arpt_entry_target *t; if (!arp_checkentry(&e->arp)) { duprintf("arp_tables: arp check failed %p %s.\n", e, name); @@ -460,7 +457,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name) static inline int check_target(struct arpt_entry *e, const char *name) { struct arpt_entry_target *t; - struct arpt_target *target; + struct xt_target *target; int ret; t = arpt_get_target(e); @@ -483,7 +480,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, unsigned int *i) { struct arpt_entry_target *t; - struct arpt_target *target; + struct xt_target *target; int ret; ret = check_entry(e, name); @@ -709,11 +706,11 @@ static void get_counters(const struct xt_table_info *t, } } -static inline struct xt_counters *alloc_counters(struct arpt_table *table) +static inline struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -734,7 +731,7 @@ static inline struct xt_counters *alloc_counters(struct arpt_table *table) } static int copy_entries_to_user(unsigned int total_size, - struct arpt_table *table, + struct xt_table *table, void __user *userptr) { unsigned int off, num; @@ -854,7 +851,7 @@ static int compat_table_info(const struct xt_table_info *info, static int get_info(struct net *net, void __user *user, int *len, int compat) { char name[ARPT_TABLE_MAXNAMELEN]; - struct arpt_table *t; + struct xt_table *t; int ret; if (*len != sizeof(struct arpt_getinfo)) { @@ -875,7 +872,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) "arptable_%s", name); if (t && !IS_ERR(t)) { struct arpt_getinfo info; - struct xt_table_info *private = t->private; + const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT if (compat) { @@ -914,7 +911,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, { int ret; struct arpt_get_entries get; - struct arpt_table *t; + struct xt_table *t; if (*len < sizeof(get)) { duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); @@ -930,7 +927,8 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NF_ARP, get.name); if (t && !IS_ERR(t)) { - struct xt_table_info *private = t->private; + const struct xt_table_info *private = t->private; + duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) @@ -939,7 +937,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, else { duprintf("get_entries: I've got %u not %u!\n", private->size, get.size); - ret = -EINVAL; + ret = -EAGAIN; } module_put(t->me); xt_table_unlock(t); @@ -956,7 +954,7 @@ static int __do_replace(struct net *net, const char *name, void __user *counters_ptr) { int ret; - struct arpt_table *t; + struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; void *loc_cpu_old_entry; @@ -1090,11 +1088,11 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; - char *name; + const char *name; int size; void *ptmp; - struct arpt_table *t; - struct xt_table_info *private; + struct xt_table *t; + const struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; #ifdef CONFIG_COMPAT @@ -1499,11 +1497,11 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, switch (cmd) { case ARPT_SO_SET_REPLACE: - ret = compat_do_replace(sk->sk_net, user, len); + ret = compat_do_replace(sock_net(sk), user, len); break; case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sk->sk_net, user, len, 1); + ret = do_add_counters(sock_net(sk), user, len, 1); break; default: @@ -1557,11 +1555,11 @@ out: } static int compat_copy_entries_to_user(unsigned int total_size, - struct arpt_table *table, + struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; @@ -1595,7 +1593,7 @@ static int compat_get_entries(struct net *net, { int ret; struct compat_arpt_get_entries get; - struct arpt_table *t; + struct xt_table *t; if (*len < sizeof(get)) { duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); @@ -1612,7 +1610,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NF_ARP); t = xt_find_table_lock(net, NF_ARP, get.name); if (t && !IS_ERR(t)) { - struct xt_table_info *private = t->private; + const struct xt_table_info *private = t->private; struct xt_table_info info; duprintf("t->private->number = %u\n", private->number); @@ -1623,7 +1621,7 @@ static int compat_get_entries(struct net *net, } else if (!ret) { duprintf("compat_get_entries: I've got %u not %u!\n", private->size, get.size); - ret = -EINVAL; + ret = -EAGAIN; } xt_compat_flush_offsets(NF_ARP); module_put(t->me); @@ -1647,10 +1645,10 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, switch (cmd) { case ARPT_SO_GET_INFO: - ret = get_info(sk->sk_net, user, len, 1); + ret = get_info(sock_net(sk), user, len, 1); break; case ARPT_SO_GET_ENTRIES: - ret = compat_get_entries(sk->sk_net, user, len); + ret = compat_get_entries(sock_net(sk), user, len); break; default: ret = do_arpt_get_ctl(sk, cmd, user, len); @@ -1668,11 +1666,11 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned switch (cmd) { case ARPT_SO_SET_REPLACE: - ret = do_replace(sk->sk_net, user, len); + ret = do_replace(sock_net(sk), user, len); break; case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sk->sk_net, user, len, 0); + ret = do_add_counters(sock_net(sk), user, len, 0); break; default: @@ -1692,11 +1690,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len switch (cmd) { case ARPT_SO_GET_INFO: - ret = get_info(sk->sk_net, user, len, 0); + ret = get_info(sock_net(sk), user, len, 0); break; case ARPT_SO_GET_ENTRIES: - ret = get_entries(sk->sk_net, user, len); + ret = get_entries(sock_net(sk), user, len); break; case ARPT_SO_GET_REVISION_TARGET: { @@ -1725,9 +1723,8 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -struct arpt_table *arpt_register_table(struct net *net, - struct arpt_table *table, - const struct arpt_replace *repl) +struct xt_table *arpt_register_table(struct net *net, struct xt_table *table, + const struct arpt_replace *repl) { int ret; struct xt_table_info *newinfo; @@ -1769,7 +1766,7 @@ out: return ERR_PTR(ret); } -void arpt_unregister_table(struct arpt_table *table) +void arpt_unregister_table(struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; @@ -1787,7 +1784,7 @@ void arpt_unregister_table(struct arpt_table *table) } /* The built-in targets: standard (NULL) and error. */ -static struct arpt_target arpt_standard_target __read_mostly = { +static struct xt_target arpt_standard_target __read_mostly = { .name = ARPT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NF_ARP, @@ -1798,7 +1795,7 @@ static struct arpt_target arpt_standard_target __read_mostly = { #endif }; -static struct arpt_target arpt_error_target __read_mostly = { +static struct xt_target arpt_error_target __read_mostly = { .name = ARPT_ERROR_TARGET, .target = arpt_error, .targetsize = ARPT_FUNCTION_MAXNAMELEN, diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 3f4222b0a80..a385959d265 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -15,7 +15,7 @@ target(struct sk_buff *skb, const void *targinfo) { const struct arpt_mangle *mangle = targinfo; - struct arphdr *arp; + const struct arphdr *arp; unsigned char *arpptr; int pln, hln; @@ -73,8 +73,9 @@ checkentry(const char *tablename, const void *e, const struct xt_target *target, return true; } -static struct arpt_target arpt_mangle_reg __read_mostly = { +static struct xt_target arpt_mangle_reg __read_mostly = { .name = "mangle", + .family = NF_ARP, .target = target, .targetsize = sizeof(struct arpt_mangle), .checkentry = checkentry, @@ -83,15 +84,12 @@ static struct arpt_target arpt_mangle_reg __read_mostly = { static int __init arpt_mangle_init(void) { - if (arpt_register_target(&arpt_mangle_reg)) - return -EINVAL; - - return 0; + return xt_register_target(&arpt_mangle_reg); } static void __exit arpt_mangle_fini(void) { - arpt_unregister_target(&arpt_mangle_reg); + xt_unregister_target(&arpt_mangle_reg); } module_init(arpt_mangle_init); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 4e9c496a30c..3be4d07e7ed 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -45,10 +45,10 @@ static struct .term = ARPT_ERROR_INIT, }; -static struct arpt_table packet_filter = { +static struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, + .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), .private = NULL, .me = THIS_MODULE, .af = NF_ARP, @@ -70,18 +70,21 @@ static struct nf_hook_ops arpt_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = NF_ARP, .hooknum = NF_ARP_IN, + .priority = NF_IP_PRI_FILTER, }, { .hook = arpt_hook, .owner = THIS_MODULE, .pf = NF_ARP, .hooknum = NF_ARP_OUT, + .priority = NF_IP_PRI_FILTER, }, { .hook = arpt_hook, .owner = THIS_MODULE, .pf = NF_ARP, .hooknum = NF_ARP_FORWARD, + .priority = NF_IP_PRI_FILTER, }, }; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 4dc162894cb..719be29f750 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -481,7 +481,7 @@ ipq_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) + if (dev_net(dev) != &init_net) return NOTIFY_DONE; /* Drop any packets associated with the downed device */ diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 600737f122d..4e7c719445c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -53,7 +53,7 @@ MODULE_DESCRIPTION("IPv4 packet filter"); do { \ if (!(x)) \ printk("IP_NF_ASSERT: %s:%s:%u\n", \ - __FUNCTION__, __FILE__, __LINE__); \ + __func__, __FILE__, __LINE__); \ } while(0) #else #define IP_NF_ASSERT(x) @@ -296,7 +296,7 @@ static void trace_packet(struct sk_buff *skb, struct ipt_entry *e) { void *table_base; - struct ipt_entry *root; + const struct ipt_entry *root; char *hookname, *chainname, *comment; unsigned int rulenum = 0; @@ -327,7 +327,7 @@ ipt_do_table(struct sk_buff *skb, { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); u_int16_t offset; - struct iphdr *ip; + const struct iphdr *ip; u_int16_t datalen; bool hotdrop = false; /* Initializing verdict to NF_DROP keeps gcc happy. */ @@ -926,7 +926,7 @@ static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -953,9 +953,9 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; struct ipt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -975,8 +975,8 @@ copy_entries_to_user(unsigned int total_size, /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; - struct ipt_entry_match *m; - struct ipt_entry_target *t; + const struct ipt_entry_match *m; + const struct ipt_entry_target *t; e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -1116,7 +1116,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) "iptable_%s", name); if (t && !IS_ERR(t)) { struct ipt_getinfo info; - struct xt_table_info *private = t->private; + const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT if (compat) { @@ -1172,7 +1172,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len) t = xt_find_table_lock(net, AF_INET, get.name); if (t && !IS_ERR(t)) { - struct xt_table_info *private = t->private; + const struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1180,7 +1180,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len) else { duprintf("get_entries: I've got %u not %u!\n", private->size, get.size); - ret = -EINVAL; + ret = -EAGAIN; } module_put(t->me); xt_table_unlock(t); @@ -1337,11 +1337,11 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; - char *name; + const char *name; int size; void *ptmp; struct xt_table *t; - struct xt_table_info *private; + const struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; #ifdef CONFIG_COMPAT @@ -1852,11 +1852,11 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, switch (cmd) { case IPT_SO_SET_REPLACE: - ret = compat_do_replace(sk->sk_net, user, len); + ret = compat_do_replace(sock_net(sk), user, len); break; case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sk->sk_net, user, len, 1); + ret = do_add_counters(sock_net(sk), user, len, 1); break; default: @@ -1878,11 +1878,11 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; unsigned int i = 0; counters = alloc_counters(table); @@ -1929,7 +1929,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (t && !IS_ERR(t)) { - struct xt_table_info *private = t->private; + const struct xt_table_info *private = t->private; struct xt_table_info info; duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); @@ -1939,7 +1939,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, } else if (!ret) { duprintf("compat_get_entries: I've got %u not %u!\n", private->size, get.size); - ret = -EINVAL; + ret = -EAGAIN; } xt_compat_flush_offsets(AF_INET); module_put(t->me); @@ -1963,10 +1963,10 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IPT_SO_GET_INFO: - ret = get_info(sk->sk_net, user, len, 1); + ret = get_info(sock_net(sk), user, len, 1); break; case IPT_SO_GET_ENTRIES: - ret = compat_get_entries(sk->sk_net, user, len); + ret = compat_get_entries(sock_net(sk), user, len); break; default: ret = do_ipt_get_ctl(sk, cmd, user, len); @@ -1985,11 +1985,11 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) switch (cmd) { case IPT_SO_SET_REPLACE: - ret = do_replace(sk->sk_net, user, len); + ret = do_replace(sock_net(sk), user, len); break; case IPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sk->sk_net, user, len, 0); + ret = do_add_counters(sock_net(sk), user, len, 0); break; default: @@ -2010,11 +2010,11 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IPT_SO_GET_INFO: - ret = get_info(sk->sk_net, user, len, 0); + ret = get_info(sock_net(sk), user, len, 0); break; case IPT_SO_GET_ENTRIES: - ret = get_entries(sk->sk_net, user, len); + ret = get_entries(sock_net(sk), user, len); break; case IPT_SO_GET_REVISION_MATCH: @@ -2130,7 +2130,8 @@ icmp_match(const struct sk_buff *skb, unsigned int protoff, bool *hotdrop) { - struct icmphdr _icmph, *ic; + const struct icmphdr *ic; + struct icmphdr _icmph; const struct ipt_icmp *icmpinfo = matchinfo; /* Must not be a fragment. */ diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index a12dd329e20..22d8e7cd919 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -144,7 +144,7 @@ clusterip_config_init_nodelist(struct clusterip_config *c, } static struct clusterip_config * -clusterip_config_init(struct ipt_clusterip_tgt_info *i, __be32 ip, +clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, struct net_device *dev) { struct clusterip_config *c; @@ -333,7 +333,7 @@ clusterip_tg(struct sk_buff *skb, const struct net_device *in, } #ifdef DEBUG - DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif pr_debug("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { @@ -418,7 +418,7 @@ clusterip_tg_check(const char *tablename, const void *e_void, /* drop reference count of cluster config when rule is deleted */ static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) { - struct ipt_clusterip_tgt_info *cipinfo = targinfo; + const struct ipt_clusterip_tgt_info *cipinfo = targinfo; /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ @@ -567,7 +567,7 @@ struct clusterip_seq_position { static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) { - struct proc_dir_entry *pde = s->private; + const struct proc_dir_entry *pde = s->private; struct clusterip_config *c = pde->data; unsigned int weight; u_int32_t local_nodes; @@ -594,7 +594,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) { - struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v; + struct clusterip_seq_position *idx = v; *pos = ++idx->pos; if (*pos >= idx->weight) { @@ -613,7 +613,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v) static int clusterip_seq_show(struct seq_file *s, void *v) { - struct clusterip_seq_position *idx = (struct clusterip_seq_position *)v; + struct clusterip_seq_position *idx = v; if (idx->pos != 0) seq_putc(s, ','); @@ -669,7 +669,7 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, { #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); struct clusterip_config *c = pde->data; unsigned long nodenum; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 21395bc2b27..d60139c134c 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -100,7 +100,7 @@ ecn_tg_check(const char *tablename, const void *e_void, const struct xt_target *target, void *targinfo, unsigned int hook_mask) { - const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo; + const struct ipt_ECN_info *einfo = targinfo; const struct ipt_entry *e = e_void; if (einfo->operation & IPT_ECN_OP_MASK) { diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index b38d7850f50..0af14137137 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -76,7 +76,8 @@ static void dump_packet(const struct nf_loginfo *info, if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { - unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; unsigned int i, optsize; optsize = ih->ihl * 4 - sizeof(struct iphdr); @@ -338,12 +339,16 @@ static void dump_packet(const struct nf_loginfo *info, if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u", + printk("UID=%u GID=%u ", skb->sk->sk_socket->file->f_uid, skb->sk->sk_socket->file->f_gid); read_unlock_bh(&skb->sk->sk_callback_lock); } + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + printk("MARK=0x%x ", skb->mark); + /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index d80fee8327e..84c26dd27d8 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -77,7 +77,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, return NF_ACCEPT; mr = targinfo; - rt = (struct rtable *)skb->dst; + rt = skb->rtable; newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); if (!newsrc) { printk("MASQUERADE: %s ate my IP address\n", out->name); @@ -120,7 +120,7 @@ static int masq_device_event(struct notifier_block *this, { const struct net_device *dev = ptr; - if (dev->nd_net != &init_net) + if (dev_net(dev) != &init_net) return NOTIFY_DONE; if (event == NETDEV_DOWN) { @@ -139,18 +139,8 @@ static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { - const struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; - - if (event == NETDEV_DOWN) { - /* IP address was deleted. Search entire table for - conntracks which were associated with that device, - and forget them. */ - NF_CT_ASSERT(dev->ifindex != 0); - - nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); - } - - return NOTIFY_DONE; + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + return masq_device_event(this, event, dev); } static struct notifier_block masq_dev_notifier = { diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 22606e2baa1..2639872849d 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -35,8 +35,10 @@ MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - struct iphdr *oiph, *niph; - struct tcphdr _otcph, *oth, *tcph; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _otcph, *tcph; unsigned int addr_type; /* IP header checks: fragment. */ diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 50e06690eb5..21cb053f5d7 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -340,7 +340,7 @@ static void *recent_seq_start(struct seq_file *seq, loff_t *pos) static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct recent_iter_state *st = seq->private; - struct recent_table *t = st->table; + const struct recent_table *t = st->table; struct recent_entry *e = v; struct list_head *head = e->list.next; @@ -361,7 +361,7 @@ static void recent_seq_stop(struct seq_file *s, void *v) static int recent_seq_show(struct seq_file *seq, void *v) { - struct recent_entry *e = v; + const struct recent_entry *e = v; unsigned int i; i = (e->index - 1) % ip_pkt_list_tot; @@ -396,7 +396,7 @@ static int recent_seq_open(struct inode *inode, struct file *file) static ssize_t recent_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); struct recent_table *t = pde->data; struct recent_entry *e; char buf[sizeof("+255.255.255.255")], *c = buf; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 69f3d7e6e96..1ea677dcf84 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -56,20 +56,32 @@ static struct static struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, + .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), .me = THIS_MODULE, .af = AF_INET, }; /* The work comes in here from netfilter.c. */ static unsigned int +ipt_local_in_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_local_in_net(in, out)->ipv4.iptable_filter); +} + +static unsigned int ipt_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, init_net.ipv4.iptable_filter); + return ipt_do_table(skb, hook, in, out, + nf_forward_net(in, out)->ipv4.iptable_filter); } static unsigned int @@ -88,12 +100,13 @@ ipt_local_out_hook(unsigned int hook, return NF_ACCEPT; } - return ipt_do_table(skb, hook, in, out, init_net.ipv4.iptable_filter); + return ipt_do_table(skb, hook, in, out, + nf_local_out_net(in, out)->ipv4.iptable_filter); } static struct nf_hook_ops ipt_ops[] __read_mostly = { { - .hook = ipt_hook, + .hook = ipt_local_in_hook, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_LOCAL_IN, diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index c55a210853a..da59182f222 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -67,20 +67,54 @@ static struct static struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, + .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock), .me = THIS_MODULE, .af = AF_INET, }; /* The work comes in here from netfilter.c. */ static unsigned int -ipt_route_hook(unsigned int hook, +ipt_pre_routing_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_pre_routing_net(in, out)->ipv4.iptable_mangle); +} + +static unsigned int +ipt_post_routing_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_post_routing_net(in, out)->ipv4.iptable_mangle); +} + +static unsigned int +ipt_local_in_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_local_in_net(in, out)->ipv4.iptable_mangle); +} + +static unsigned int +ipt_forward_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, init_net.ipv4.iptable_mangle); + return ipt_do_table(skb, hook, in, out, + nf_forward_net(in, out)->ipv4.iptable_mangle); } static unsigned int @@ -112,7 +146,8 @@ ipt_local_hook(unsigned int hook, daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, hook, in, out, init_net.ipv4.iptable_mangle); + ret = ipt_do_table(skb, hook, in, out, + nf_local_out_net(in, out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { iph = ip_hdr(skb); @@ -130,21 +165,21 @@ ipt_local_hook(unsigned int hook, static struct nf_hook_ops ipt_ops[] __read_mostly = { { - .hook = ipt_route_hook, + .hook = ipt_pre_routing_hook, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_MANGLE, }, { - .hook = ipt_route_hook, + .hook = ipt_local_in_hook, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_MANGLE, }, { - .hook = ipt_route_hook, + .hook = ipt_forward_hook, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_FORWARD, @@ -158,7 +193,7 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = { .priority = NF_IP_PRI_MANGLE, }, { - .hook = ipt_route_hook, + .hook = ipt_post_routing_hook, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_POST_ROUTING, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index e41fe8ca4e1..fddce7754b7 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -39,7 +39,7 @@ static struct static struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, + .lock = __RW_LOCK_UNLOCKED(packet_raw.lock), .me = THIS_MODULE, .af = AF_INET, }; @@ -52,7 +52,8 @@ ipt_hook(unsigned int hook, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, init_net.ipv4.iptable_raw); + return ipt_do_table(skb, hook, in, out, + nf_pre_routing_net(in, out)->ipv4.iptable_raw); } static unsigned int @@ -70,7 +71,8 @@ ipt_local_hook(unsigned int hook, "packet.\n"); return NF_ACCEPT; } - return ipt_do_table(skb, hook, in, out, init_net.ipv4.iptable_raw); + return ipt_do_table(skb, hook, in, out, + nf_local_out_net(in, out)->ipv4.iptable_raw); } /* 'raw' is the very first table. */ diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index a65b845c5f1..cacb9cb27da 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -23,30 +23,36 @@ #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/nf_nat_helper.h> -static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) +int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook); + +static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) { const __be32 *ap; __be32 _addrs[2]; ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), sizeof(u_int32_t) * 2, _addrs); if (ap == NULL) - return 0; + return false; tuple->src.u3.ip = ap[0]; tuple->dst.u3.ip = ap[1]; - return 1; + return true; } -static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { tuple->src.u3.ip = orig->dst.u3.ip; tuple->dst.u3.ip = orig->src.u3.ip; - return 1; + return true; } static int ipv4_print_tuple(struct seq_file *s, @@ -101,35 +107,41 @@ static unsigned int ipv4_confirm(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv4_conntrack_help(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ struct nf_conn *ct; enum ip_conntrack_info ctinfo; const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; + unsigned int ret; /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) - return NF_ACCEPT; + goto out; help = nfct_help(ct); if (!help) - return NF_ACCEPT; + goto out; + /* rcu_read_lock()ed by nf_hook_slow */ helper = rcu_dereference(help->helper); if (!helper) - return NF_ACCEPT; - return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); + goto out; + + ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { + typeof(nf_nat_seq_adjust_hook) seq_adjust; + + seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); + if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) + return NF_DROP; + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); } static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, @@ -211,20 +223,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_conntrack_help, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_conntrack_help, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = PF_INET, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index f500b0fdaef..40a46d48249 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -106,21 +106,16 @@ static int ct_seq_show(struct seq_file *s, void *v) /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) return 0; - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num != AF_INET) + if (nf_ct_l3num(ct) != AF_INET) return 0; - l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num); + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); NF_CT_ASSERT(l3proto); - l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num, - ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); NF_CT_ASSERT(l4proto); if (seq_printf(s, "%-8s %u %ld ", - l4proto->name, - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + l4proto->name, nf_ct_protonum(ct), timer_pending(&ct->timeout) ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) return -ENOSPC; @@ -379,7 +374,7 @@ static const struct file_operations ct_cpu_seq_fops = { .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release, }; int __init nf_conntrack_ipv4_compat_init(void) diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 6873fddb352..78ab19accac 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -22,22 +22,21 @@ static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; -static int icmp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) { const struct icmphdr *hp; struct icmphdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) - return 0; + return false; tuple->dst.u.icmp.type = hp->type; tuple->src.u.icmp.id = hp->un.echo.id; tuple->dst.u.icmp.code = hp->code; - return 1; + return true; } /* Add 1; spaces filled with 0. */ @@ -52,17 +51,17 @@ static const u_int8_t invmap[] = { [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 }; -static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) - return 0; + return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ @@ -101,8 +100,8 @@ static int icmp_packet(struct nf_conn *ct, } /* Called when a new connection for this protocol found. */ -static int icmp_new(struct nf_conn *ct, - const struct sk_buff *skb, unsigned int dataoff) +static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, @@ -116,11 +115,11 @@ static int icmp_new(struct nf_conn *ct, /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); - NF_CT_DUMP_TUPLE(&ct->tuplehash[0].tuple); - return 0; + nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); + return false; } atomic_set(&ct->proto.icmp.count, 0); - return 1; + return true; } /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 36b4e3bb056..04578593e10 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -150,9 +150,9 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range) { unsigned int h = hash_by_src(tuple); - struct nf_conn_nat *nat; - struct nf_conn *ct; - struct hlist_node *n; + const struct nf_conn_nat *nat; + const struct nf_conn *ct; + const struct hlist_node *n; rcu_read_lock(); hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) { @@ -349,7 +349,7 @@ nf_nat_setup_info(struct nf_conn *ct, EXPORT_SYMBOL(nf_nat_setup_info); /* Returns true if succeeded. */ -static int +static bool manip_pkt(u_int16_t proto, struct sk_buff *skb, unsigned int iphdroff, @@ -360,7 +360,7 @@ manip_pkt(u_int16_t proto, const struct nf_nat_protocol *p; if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) - return 0; + return false; iph = (void *)skb->data + iphdroff; @@ -369,7 +369,7 @@ manip_pkt(u_int16_t proto, /* rcu_read_lock()ed by nf_hook_slow */ p = __nf_nat_proto_find(proto); if (!p->manip_pkt(skb, iphdroff, target, maniptype)) - return 0; + return false; iph = (void *)skb->data + iphdroff; @@ -380,7 +380,7 @@ manip_pkt(u_int16_t proto, csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } - return 1; + return true; } /* Do packet manipulations according to nf_nat_setup_info. */ @@ -426,7 +426,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, struct icmphdr icmp; struct iphdr ip; } *inside; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple inner, target; int hdrlen = ip_hdrlen(skb); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -544,46 +544,6 @@ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) } EXPORT_SYMBOL(nf_nat_protocol_unregister); -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -int -nf_nat_port_range_to_nlattr(struct sk_buff *skb, - const struct nf_nat_range *range) -{ - NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.tcp.port); - NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.tcp.port); - - return 0; - -nla_put_failure: - return -1; -} -EXPORT_SYMBOL_GPL(nf_nat_port_nlattr_to_range); - -int -nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) -{ - int ret = 0; - - /* we have to return whether we actually parsed something or not */ - - if (tb[CTA_PROTONAT_PORT_MIN]) { - ret = 1; - range->min.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); - } - - if (!tb[CTA_PROTONAT_PORT_MAX]) { - if (ret) - range->max.tcp.port = range->min.tcp.port; - } else { - ret = 1; - range->max.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); - } - - return ret; -} -EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nlattr); -#endif - /* Noone using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { @@ -660,6 +620,9 @@ static int __init nf_nat_init(void) nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); + + BUG_ON(nf_nat_seq_adjust_hook != NULL); + rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); return 0; cleanup_extend: @@ -686,6 +649,8 @@ static void __exit nf_nat_cleanup(void) nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); nf_ct_l3proto_put(l3proto); nf_ct_extend_unregister(&nat_extend); + rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); + synchronize_net(); } MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index ca57f47bbd2..11976ea2988 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -139,7 +139,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; struct iphdr *iph; struct tcphdr *tcph; int oldlen, datalen; @@ -217,7 +217,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb->rtable; struct iphdr *iph; struct udphdr *udph; int datalen, oldlen; @@ -416,7 +416,6 @@ nf_nat_seq_adjust(struct sk_buff *skb, return 1; } -EXPORT_SYMBOL(nf_nat_seq_adjust); /* Setup NAT on this expected conntrack so it follows master. */ /* If we fail to get a free NAT slot, we'll get dropped on confirm */ diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 3a1e6d6afc0..da3d91a5ef5 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -72,7 +72,7 @@ static void pptp_nat_expected(struct nf_conn *ct, } pr_debug("trying to unexpect other dir: "); - NF_CT_DUMP_TUPLE(&t); + nf_ct_dump_tuple_ip(&t); other_exp = nf_ct_expect_find_get(&t); if (other_exp) { nf_ct_unexpect_related(other_exp); diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c new file mode 100644 index 00000000000..91537f11273 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_common.c @@ -0,0 +1,120 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/random.h> +#include <linux/ip.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_rule.h> +#include <net/netfilter/nf_nat_protocol.h> + +bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + __be16 port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.all; + else + port = tuple->dst.u.all; + + return ntohs(port) >= ntohs(min->all) && + ntohs(port) <= ntohs(max->all); +} +EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); + +bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct, + u_int16_t *rover) +{ + unsigned int range_size, min, i; + __be16 *portptr; + u_int16_t off; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.all; + else + portptr = &tuple->dst.u.all; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return false; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr) < 512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.all); + range_size = ntohs(range->max.all) - min + 1; + } + + off = *rover; + if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) + off = net_random(); + + for (i = 0; i < range_size; i++, off++) { + *portptr = htons(min + off % range_size); + if (nf_nat_used_tuple(tuple, ct)) + continue; + if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) + *rover = off; + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +int nf_nat_proto_range_to_nlattr(struct sk_buff *skb, + const struct nf_nat_range *range) +{ + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.all); + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.all); + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range); + +int nf_nat_proto_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range *range) +{ + if (tb[CTA_PROTONAT_PORT_MIN]) { + range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); + range->max.all = range->min.tcp.port; + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + } + if (tb[CTA_PROTONAT_PORT_MAX]) { + range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + } + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_proto_range_to_nlattr); +#endif diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c new file mode 100644 index 00000000000..22485ce306d --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c @@ -0,0 +1,108 @@ +/* + * DCCP NAT protocol helper + * + * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/dccp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_protocol.h> + +static u_int16_t dccp_port_rover; + +static bool +dccp_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &dccp_port_rover); +} + +static bool +dccp_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (const void *)(skb->data + iphdroff); + struct dccp_hdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl * 4; + __be32 oldip, newip; + __be16 *portptr, oldport, newport; + int hdrsize = 8; /* DCCP connection tracking guarantees this much */ + + if (skb->len >= hdroff + sizeof(struct dccp_hdr)) + hdrsize = sizeof(struct dccp_hdr); + + if (!skb_make_writable(skb, hdroff + hdrsize)) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct dccp_hdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + oldip = iph->saddr; + newip = tuple->src.u3.ip; + newport = tuple->src.u.dccp.port; + portptr = &hdr->dccph_sport; + } else { + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + newport = tuple->dst.u.dccp.port; + portptr = &hdr->dccph_dport; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return true; + + inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, + 0); + return true; +} + +static const struct nf_nat_protocol nf_nat_protocol_dccp = { + .protonum = IPPROTO_DCCP, + .me = THIS_MODULE, + .manip_pkt = dccp_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = dccp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_dccp_init(void) +{ + return nf_nat_protocol_register(&nf_nat_protocol_dccp); +} + +static void __exit nf_nat_proto_dccp_fini(void) +{ + nf_nat_protocol_unregister(&nf_nat_protocol_dccp); +} + +module_init(nf_nat_proto_dccp_init); +module_exit(nf_nat_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("DCCP NAT protocol helper"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index a1e4da16da2..d7e89201351 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -36,26 +36,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); -/* is key in given range between min and max */ -static int -gre_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - __be16 key; - - if (maniptype == IP_NAT_MANIP_SRC) - key = tuple->src.u.gre.key; - else - key = tuple->dst.u.gre.key; - - return ntohs(key) >= ntohs(min->gre.key) && - ntohs(key) <= ntohs(max->gre.key); -} - /* generate unique tuple ... */ -static int +static bool gre_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, @@ -68,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) - return 0; + return false; if (maniptype == IP_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; @@ -89,20 +71,20 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, for (i = 0; i < range_size; i++, key++) { *keyptr = htons(min + key % range_size); if (!nf_nat_used_tuple(tuple, ct)) - return 1; + return true; } pr_debug("%p: no NAT mapping\n", ct); - return 0; + return false; } /* manipulate a GRE packet according to maniptype */ -static int +static bool gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { - struct gre_hdr *greh; + const struct gre_hdr *greh; struct gre_hdr_pptp *pgreh; const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); unsigned int hdroff = iphdroff + iph->ihl * 4; @@ -110,7 +92,7 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) - return 0; + return false; greh = (void *)skb->data + hdroff; pgreh = (struct gre_hdr_pptp *)greh; @@ -118,7 +100,7 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != IP_NAT_MANIP_DST) - return 1; + return true; switch (greh->version) { case GRE_VERSION_1701: /* We do not currently NAT any GREv0 packets. @@ -130,21 +112,20 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, break; default: pr_debug("can't nat unknown GRE version\n"); - return 0; + return false; } - return 1; + return true; } static const struct nf_nat_protocol gre = { - .name = "GRE", .protonum = IPPROTO_GRE, .me = THIS_MODULE, .manip_pkt = gre_manip_pkt, - .in_range = gre_in_range, + .in_range = nf_nat_proto_in_range, .unique_tuple = gre_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nlattr = nf_nat_port_range_to_nlattr, - .nlattr_to_range = nf_nat_port_nlattr_to_range, + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 03a02969aa5..19a8b0b07d8 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -17,7 +17,7 @@ #include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_nat_protocol.h> -static int +static bool icmp_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, @@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); } -static int +static bool icmp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, @@ -46,12 +46,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple, tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + (id % range_size)); if (!nf_nat_used_tuple(tuple, ct)) - return 1; + return true; } - return 0; + return false; } -static int +static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *tuple, @@ -62,24 +62,23 @@ icmp_manip_pkt(struct sk_buff *skb, unsigned int hdroff = iphdroff + iph->ihl*4; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return 0; + return false; hdr = (struct icmphdr *)(skb->data + hdroff); inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, 0); hdr->un.echo.id = tuple->src.u.icmp.id; - return 1; + return true; } const struct nf_nat_protocol nf_nat_protocol_icmp = { - .name = "ICMP", .protonum = IPPROTO_ICMP, .me = THIS_MODULE, .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nlattr = nf_nat_port_range_to_nlattr, - .nlattr_to_range = nf_nat_port_nlattr_to_range, + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c new file mode 100644 index 00000000000..82e4c0e286b --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <net/sctp/checksum.h> + +#include <net/netfilter/nf_nat_protocol.h> + +static u_int16_t nf_sctp_port_rover; + +static bool +sctp_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &nf_sctp_port_rover); +} + +static bool +sctp_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + sctp_sctphdr_t *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + __be32 oldip, newip; + u32 crc32; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct sctphdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.u3.ip; + hdr->source = tuple->src.u.sctp.port; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + hdr->dest = tuple->dst.u.sctp.port; + } + + crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff); + for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) + crc32 = sctp_update_cksum((u8 *)skb->data, skb_headlen(skb), + crc32); + crc32 = sctp_end_cksum(crc32); + hdr->checksum = htonl(crc32); + + return true; +} + +static const struct nf_nat_protocol nf_nat_protocol_sctp = { + .protonum = IPPROTO_SCTP, + .me = THIS_MODULE, + .manip_pkt = sctp_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = sctp_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_sctp_init(void) +{ + return nf_nat_protocol_register(&nf_nat_protocol_sctp); +} + +static void __exit nf_nat_proto_sctp_exit(void) +{ + nf_nat_protocol_unregister(&nf_nat_protocol_sctp); +} + +module_init(nf_nat_proto_sctp_init); +module_exit(nf_nat_proto_sctp_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SCTP NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index ffd5d1589ec..399e2cfa263 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -8,7 +8,6 @@ #include <linux/types.h> #include <linux/init.h> -#include <linux/random.h> #include <linux/ip.h> #include <linux/tcp.h> @@ -19,75 +18,19 @@ #include <net/netfilter/nf_nat_protocol.h> #include <net/netfilter/nf_nat_core.h> -static int -tcp_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - __be16 port; - - if (maniptype == IP_NAT_MANIP_SRC) - port = tuple->src.u.tcp.port; - else - port = tuple->dst.u.tcp.port; - - return ntohs(port) >= ntohs(min->tcp.port) && - ntohs(port) <= ntohs(max->tcp.port); -} +static u_int16_t tcp_port_rover; -static int +static bool tcp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - static u_int16_t port; - __be16 *portptr; - unsigned int range_size, min, i; - - if (maniptype == IP_NAT_MANIP_SRC) - portptr = &tuple->src.u.tcp.port; - else - portptr = &tuple->dst.u.tcp.port; - - /* If no range specified... */ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == IP_NAT_MANIP_DST) - return 0; - - /* Map privileged onto privileged. */ - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr)<512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min.tcp.port); - range_size = ntohs(range->max.tcp.port) - min + 1; - } - - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) - port = net_random(); - - for (i = 0; i < range_size; i++, port++) { - *portptr = htons(min + port % range_size); - if (!nf_nat_used_tuple(tuple, ct)) - return 1; - } - return 0; + return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &tcp_port_rover); } -static int +static bool tcp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *tuple, @@ -107,7 +50,7 @@ tcp_manip_pkt(struct sk_buff *skb, hdrsize = sizeof(struct tcphdr); if (!skb_make_writable(skb, hdroff + hdrsize)) - return 0; + return false; iph = (struct iphdr *)(skb->data + iphdroff); hdr = (struct tcphdr *)(skb->data + hdroff); @@ -130,22 +73,21 @@ tcp_manip_pkt(struct sk_buff *skb, *portptr = newport; if (hdrsize < sizeof(*hdr)) - return 1; + return true; inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); - return 1; + return true; } const struct nf_nat_protocol nf_nat_protocol_tcp = { - .name = "TCP", .protonum = IPPROTO_TCP, .me = THIS_MODULE, .manip_pkt = tcp_manip_pkt, - .in_range = tcp_in_range, + .in_range = nf_nat_proto_in_range, .unique_tuple = tcp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nlattr = nf_nat_port_range_to_nlattr, - .nlattr_to_range = nf_nat_port_nlattr_to_range, + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index 4b8f49910ff..9e61c79492e 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -8,7 +8,6 @@ #include <linux/types.h> #include <linux/init.h> -#include <linux/random.h> #include <linux/ip.h> #include <linux/udp.h> @@ -18,74 +17,19 @@ #include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_nat_protocol.h> -static int -udp_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - __be16 port; - - if (maniptype == IP_NAT_MANIP_SRC) - port = tuple->src.u.udp.port; - else - port = tuple->dst.u.udp.port; - - return ntohs(port) >= ntohs(min->udp.port) && - ntohs(port) <= ntohs(max->udp.port); -} +static u_int16_t udp_port_rover; -static int +static bool udp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - static u_int16_t port; - __be16 *portptr; - unsigned int range_size, min, i; - - if (maniptype == IP_NAT_MANIP_SRC) - portptr = &tuple->src.u.udp.port; - else - portptr = &tuple->dst.u.udp.port; - - /* If no range specified... */ - if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { - /* If it's dst rewrite, can't change port */ - if (maniptype == IP_NAT_MANIP_DST) - return 0; - - if (ntohs(*portptr) < 1024) { - /* Loose convention: >> 512 is credential passing */ - if (ntohs(*portptr)<512) { - min = 1; - range_size = 511 - min + 1; - } else { - min = 600; - range_size = 1023 - min + 1; - } - } else { - min = 1024; - range_size = 65535 - 1024 + 1; - } - } else { - min = ntohs(range->min.udp.port); - range_size = ntohs(range->max.udp.port) - min + 1; - } - - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) - port = net_random(); - - for (i = 0; i < range_size; i++, port++) { - *portptr = htons(min + port % range_size); - if (!nf_nat_used_tuple(tuple, ct)) - return 1; - } - return 0; + return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &udp_port_rover); } -static int +static bool udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *tuple, @@ -98,7 +42,7 @@ udp_manip_pkt(struct sk_buff *skb, __be16 *portptr, newport; if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return 0; + return false; iph = (struct iphdr *)(skb->data + iphdroff); hdr = (struct udphdr *)(skb->data + hdroff); @@ -124,18 +68,17 @@ udp_manip_pkt(struct sk_buff *skb, hdr->check = CSUM_MANGLED_0; } *portptr = newport; - return 1; + return true; } const struct nf_nat_protocol nf_nat_protocol_udp = { - .name = "UDP", .protonum = IPPROTO_UDP, .me = THIS_MODULE, .manip_pkt = udp_manip_pkt, - .in_range = udp_in_range, + .in_range = nf_nat_proto_in_range, .unique_tuple = udp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nlattr = nf_nat_port_range_to_nlattr, - .nlattr_to_range = nf_nat_port_nlattr_to_range, + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c new file mode 100644 index 00000000000..440a229bbd8 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c @@ -0,0 +1,99 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_protocol.h> + +static u_int16_t udplite_port_rover; + +static bool +udplite_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &udplite_port_rover); +} + +static bool +udplite_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); + struct udphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + __be32 oldip, newip; + __be16 *portptr, newport; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + iph = (struct iphdr *)(skb->data + iphdroff); + hdr = (struct udphdr *)(skb->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.u3.ip; + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.u3.ip; + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); + if (!hdr->check) + hdr->check = CSUM_MANGLED_0; + + *portptr = newport; + return true; +} + +static const struct nf_nat_protocol nf_nat_protocol_udplite = { + .protonum = IPPROTO_UDPLITE, + .me = THIS_MODULE, + .manip_pkt = udplite_manip_pkt, + .in_range = nf_nat_proto_in_range, + .unique_tuple = udplite_unique_tuple, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .range_to_nlattr = nf_nat_proto_range_to_nlattr, + .nlattr_to_range = nf_nat_proto_nlattr_to_range, +#endif +}; + +static int __init nf_nat_proto_udplite_init(void) +{ + return nf_nat_protocol_register(&nf_nat_protocol_udplite); +} + +static void __exit nf_nat_proto_udplite_fini(void) +{ + nf_nat_protocol_unregister(&nf_nat_protocol_udplite); +} + +module_init(nf_nat_proto_udplite_init); +module_exit(nf_nat_proto_udplite_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c index a26efeb073c..14381c62ace 100644 --- a/net/ipv4/netfilter/nf_nat_proto_unknown.c +++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c @@ -18,35 +18,34 @@ #include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_nat_protocol.h> -static int unknown_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type manip_type, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) +static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type manip_type, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) { - return 1; + return true; } -static int unknown_unique_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) +static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) { /* Sorry: we can't help you; if it's not unique, we can't frob anything. */ - return 0; + return false; } -static int +static bool unknown_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { - return 1; + return true; } const struct nf_nat_protocol nf_nat_unknown_protocol = { - .name = "unknown", /* .me isn't set: getting a ref to this cannot fail. */ .manip_pkt = unknown_manip_pkt, .in_range = unknown_in_range, diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index f8fda57ba20..e8b4d0d4439 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -61,7 +61,7 @@ static struct static struct xt_table __nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, + .lock = __RW_LOCK_UNLOCKED(__nat_table.lock), .me = THIS_MODULE, .af = AF_INET, }; @@ -143,7 +143,7 @@ static bool ipt_snat_checkentry(const char *tablename, void *targinfo, unsigned int hook_mask) { - struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { @@ -159,7 +159,7 @@ static bool ipt_dnat_checkentry(const char *tablename, void *targinfo, unsigned int hook_mask) { - struct nf_nat_multi_range_compat *mr = targinfo; + const struct nf_nat_multi_range_compat *mr = targinfo; /* Must be a valid range */ if (mr->rangesize != 1) { @@ -188,25 +188,6 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); } -unsigned int -alloc_null_binding_confirmed(struct nf_conn *ct, unsigned int hooknum) -{ - __be32 ip - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip - : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - __be16 all - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all - : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all); - struct nf_nat_range range - = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } }; - - pr_debug("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n", - ct, NIPQUAD(ip)); - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, const struct net_device *in, diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index b4c8d4968bb..4334d5cabc5 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -2,6 +2,8 @@ * * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> * based on RR's ip_nat_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -26,275 +28,461 @@ MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); MODULE_DESCRIPTION("SIP NAT helper"); MODULE_ALIAS("ip_nat_sip"); -struct addr_map { - struct { - char src[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - char dst[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - unsigned int srclen, srciplen; - unsigned int dstlen, dstiplen; - } addr[IP_CT_DIR_MAX]; -}; -static void addr_map_init(const struct nf_conn *ct, struct addr_map *map) +static unsigned int mangle_packet(struct sk_buff *skb, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + const char *buffer, unsigned int buflen) { - const struct nf_conntrack_tuple *t; - enum ip_conntrack_dir dir; - unsigned int n; - - for (dir = 0; dir < IP_CT_DIR_MAX; dir++) { - t = &ct->tuplehash[dir].tuple; - - n = sprintf(map->addr[dir].src, "%u.%u.%u.%u", - NIPQUAD(t->src.u3.ip)); - map->addr[dir].srciplen = n; - n += sprintf(map->addr[dir].src + n, ":%u", - ntohs(t->src.u.udp.port)); - map->addr[dir].srclen = n; - - n = sprintf(map->addr[dir].dst, "%u.%u.%u.%u", - NIPQUAD(t->dst.u3.ip)); - map->addr[dir].dstiplen = n; - n += sprintf(map->addr[dir].dst + n, ":%u", - ntohs(t->dst.u.udp.port)); - map->addr[dir].dstlen = n; - } + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, matchoff, matchlen, + buffer, buflen)) + return 0; + + /* Reload data pointer and adjust datalen value */ + *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); + *datalen += buflen - matchlen; + return 1; } -static int map_sip_addr(struct sk_buff *skb, enum ip_conntrack_info ctinfo, - struct nf_conn *ct, const char **dptr, size_t dlen, - enum sip_header_pos pos, struct addr_map *map) +static int map_addr(struct sk_buff *skb, + const char **dptr, unsigned int *datalen, + unsigned int matchoff, unsigned int matchlen, + union nf_inet_addr *addr, __be16 port) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned int matchlen, matchoff, addrlen; - char *addr; - - if (ct_sip_get_info(ct, *dptr, dlen, &matchoff, &matchlen, pos) <= 0) + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned int buflen; + __be32 newaddr; + __be16 newport; + + if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip && + ct->tuplehash[dir].tuple.src.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip; + newport = ct->tuplehash[!dir].tuple.dst.u.udp.port; + } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip && + ct->tuplehash[dir].tuple.dst.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.src.u3.ip; + newport = ct->tuplehash[!dir].tuple.src.u.udp.port; + } else return 1; - if ((matchlen == map->addr[dir].srciplen || - matchlen == map->addr[dir].srclen) && - memcmp(*dptr + matchoff, map->addr[dir].src, matchlen) == 0) { - addr = map->addr[!dir].dst; - addrlen = map->addr[!dir].dstlen; - } else if ((matchlen == map->addr[dir].dstiplen || - matchlen == map->addr[dir].dstlen) && - memcmp(*dptr + matchoff, map->addr[dir].dst, matchlen) == 0) { - addr = map->addr[!dir].src; - addrlen = map->addr[!dir].srclen; - } else + if (newaddr == addr->ip && newport == port) return 1; - if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, - matchoff, matchlen, addr, addrlen)) - return 0; - *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); - return 1; + buflen = sprintf(buffer, "%u.%u.%u.%u:%u", + NIPQUAD(newaddr), ntohs(newport)); + return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + buffer, buflen); } -static unsigned int ip_nat_sip(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conn *ct, - const char **dptr) +static int map_sip_addr(struct sk_buff *skb, + const char **dptr, unsigned int *datalen, + enum sip_header_types type) { - enum sip_header_pos pos; - struct addr_map map; - int dataoff, datalen; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + union nf_inet_addr addr; + __be16 port; - dataoff = ip_hdrlen(skb) + sizeof(struct udphdr); - datalen = skb->len - dataoff; - if (datalen < sizeof("SIP/2.0") - 1) - return NF_ACCEPT; + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, + &matchoff, &matchlen, &addr, &port) <= 0) + return 1; + return map_addr(skb, dptr, datalen, matchoff, matchlen, &addr, port); +} - addr_map_init(ct, &map); +static unsigned int ip_nat_sip(struct sk_buff *skb, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int dataoff, matchoff, matchlen; + union nf_inet_addr addr; + __be16 port; + int request, in_header; /* Basic rules: requests and responses. */ - if (strncmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) != 0) { - /* 10.2: Constructing the REGISTER Request: - * - * The "userinfo" and "@" components of the SIP URI MUST NOT - * be present. - */ - if (datalen >= sizeof("REGISTER") - 1 && - strncmp(*dptr, "REGISTER", sizeof("REGISTER") - 1) == 0) - pos = POS_REG_REQ_URI; - else - pos = POS_REQ_URI; - - if (!map_sip_addr(skb, ctinfo, ct, dptr, datalen, pos, &map)) + if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) { + if (ct_sip_parse_request(ct, *dptr, *datalen, + &matchoff, &matchlen, + &addr, &port) > 0 && + !map_addr(skb, dptr, datalen, matchoff, matchlen, + &addr, port)) + return NF_DROP; + request = 1; + } else + request = 0; + + /* Translate topmost Via header and parameters */ + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + SIP_HDR_VIA, NULL, &matchoff, &matchlen, + &addr, &port) > 0) { + unsigned int matchend, poff, plen, buflen, n; + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + + /* We're only interested in headers related to this + * connection */ + if (request) { + if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip || + port != ct->tuplehash[dir].tuple.src.u.udp.port) + goto next; + } else { + if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip || + port != ct->tuplehash[dir].tuple.dst.u.udp.port) + goto next; + } + + if (!map_addr(skb, dptr, datalen, matchoff, matchlen, + &addr, port)) return NF_DROP; + + matchend = matchoff + matchlen; + + /* The maddr= parameter (RFC 2361) specifies where to send + * the reply. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "maddr=", &poff, &plen, + &addr) > 0 && + addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && + addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { + __be32 ip = ct->tuplehash[!dir].tuple.dst.u3.ip; + buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); + if (!mangle_packet(skb, dptr, datalen, poff, plen, + buffer, buflen)) + return NF_DROP; + } + + /* The received= parameter (RFC 2361) contains the address + * from which the server received the request. */ + if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, + "received=", &poff, &plen, + &addr) > 0 && + addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && + addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { + __be32 ip = ct->tuplehash[!dir].tuple.src.u3.ip; + buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); + if (!mangle_packet(skb, dptr, datalen, poff, plen, + buffer, buflen)) + return NF_DROP; + } + + /* The rport= parameter (RFC 3581) contains the port number + * from which the server received the request. */ + if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen, + "rport=", &poff, &plen, + &n) > 0 && + htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && + htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { + __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; + buflen = sprintf(buffer, "%u", ntohs(p)); + if (!mangle_packet(skb, dptr, datalen, poff, plen, + buffer, buflen)) + return NF_DROP; + } } - if (!map_sip_addr(skb, ctinfo, ct, dptr, datalen, POS_FROM, &map) || - !map_sip_addr(skb, ctinfo, ct, dptr, datalen, POS_TO, &map) || - !map_sip_addr(skb, ctinfo, ct, dptr, datalen, POS_VIA, &map) || - !map_sip_addr(skb, ctinfo, ct, dptr, datalen, POS_CONTACT, &map)) +next: + /* Translate Contact headers */ + dataoff = 0; + in_header = 0; + while (ct_sip_parse_header_uri(ct, *dptr, &dataoff, *datalen, + SIP_HDR_CONTACT, &in_header, + &matchoff, &matchlen, + &addr, &port) > 0) { + if (!map_addr(skb, dptr, datalen, matchoff, matchlen, + &addr, port)) + return NF_DROP; + } + + if (!map_sip_addr(skb, dptr, datalen, SIP_HDR_FROM) || + !map_sip_addr(skb, dptr, datalen, SIP_HDR_TO)) return NF_DROP; return NF_ACCEPT; } -static unsigned int mangle_sip_packet(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conn *ct, - const char **dptr, size_t dlen, - char *buffer, int bufflen, - enum sip_header_pos pos) +/* Handles expected signalling connections and media streams */ +static void ip_nat_sip_expected(struct nf_conn *ct, + struct nf_conntrack_expect *exp) { - unsigned int matchlen, matchoff; + struct nf_nat_range range; - if (ct_sip_get_info(ct, *dptr, dlen, &matchoff, &matchlen, pos) <= 0) - return 0; + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); - if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, - matchoff, matchlen, buffer, bufflen)) - return 0; + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = exp->saved_proto; + range.min_ip = range.max_ip = exp->saved_ip; + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); - /* We need to reload this. Thanks Patrick. */ - *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); - return 1; + /* Change src to where master sends to, but only if the connection + * actually came from the same source. */ + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == + ct->master->tuplehash[exp->dir].tuple.src.u3.ip) { + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); + } } -static int mangle_content_len(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conn *ct, - const char *dptr) +static unsigned int ip_nat_sip_expect(struct sk_buff *skb, + const char **dptr, unsigned int *datalen, + struct nf_conntrack_expect *exp, + unsigned int matchoff, + unsigned int matchlen) { - unsigned int dataoff, matchoff, matchlen; - char buffer[sizeof("65536")]; - int bufflen; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + __be32 newip; + u_int16_t port; + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned buflen; - dataoff = ip_hdrlen(skb) + sizeof(struct udphdr); + /* Connection will come from reply */ + if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip) + newip = exp->tuple.dst.u3.ip; + else + newip = ct->tuplehash[!dir].tuple.dst.u3.ip; - /* Get actual SDP length */ - if (ct_sip_get_info(ct, dptr, skb->len - dataoff, &matchoff, - &matchlen, POS_SDP_HEADER) > 0) { + /* If the signalling port matches the connection's source port in the + * original direction, try to use the destination port in the opposite + * direction. */ + if (exp->tuple.dst.u.udp.port == + ct->tuplehash[dir].tuple.src.u.udp.port) + port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); + else + port = ntohs(exp->tuple.dst.u.udp.port); + + exp->saved_ip = exp->tuple.dst.u3.ip; + exp->tuple.dst.u3.ip = newip; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + exp->expectfn = ip_nat_sip_expected; - /* since ct_sip_get_info() give us a pointer passing 'v=' - we need to add 2 bytes in this count. */ - int c_len = skb->len - dataoff - matchoff + 2; + for (; port != 0; port++) { + exp->tuple.dst.u.udp.port = htons(port); + if (nf_ct_expect_related(exp) == 0) + break; + } - /* Now, update SDP length */ - if (ct_sip_get_info(ct, dptr, skb->len - dataoff, &matchoff, - &matchlen, POS_CONTENT) > 0) { + if (port == 0) + return NF_DROP; - bufflen = sprintf(buffer, "%u", c_len); - return nf_nat_mangle_udp_packet(skb, ct, ctinfo, - matchoff, matchlen, - buffer, bufflen); - } + if (exp->tuple.dst.u3.ip != exp->saved_ip || + exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { + buflen = sprintf(buffer, "%u.%u.%u.%u:%u", + NIPQUAD(newip), port); + if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, + buffer, buflen)) + goto err; } - return 0; + return NF_ACCEPT; + +err: + nf_ct_unexpect_related(exp); + return NF_DROP; } -static unsigned int mangle_sdp(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conn *ct, - __be32 newip, u_int16_t port, - const char *dptr) +static int mangle_content_len(struct sk_buff *skb, + const char **dptr, unsigned int *datalen) { - char buffer[sizeof("nnn.nnn.nnn.nnn")]; - unsigned int dataoff, bufflen; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + char buffer[sizeof("65536")]; + int buflen, c_len; - dataoff = ip_hdrlen(skb) + sizeof(struct udphdr); + /* Get actual SDP length */ + if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, + SDP_HDR_VERSION, SDP_HDR_UNSPEC, + &matchoff, &matchlen) <= 0) + return 0; + c_len = *datalen - matchoff + strlen("v="); - /* Mangle owner and contact info. */ - bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); - if (!mangle_sip_packet(skb, ctinfo, ct, &dptr, skb->len - dataoff, - buffer, bufflen, POS_OWNER_IP4)) + /* Now, update SDP length */ + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH, + &matchoff, &matchlen) <= 0) return 0; - if (!mangle_sip_packet(skb, ctinfo, ct, &dptr, skb->len - dataoff, - buffer, bufflen, POS_CONNECTION_IP4)) + buflen = sprintf(buffer, "%u", c_len); + return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + buffer, buflen); +} + +static unsigned mangle_sdp_packet(struct sk_buff *skb, const char **dptr, + unsigned int dataoff, unsigned int *datalen, + enum sdp_header_types type, + enum sdp_header_types term, + char *buffer, int buflen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchlen, matchoff; + + if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term, + &matchoff, &matchlen) <= 0) return 0; + return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + buffer, buflen); +} - /* Mangle media port. */ - bufflen = sprintf(buffer, "%u", port); - if (!mangle_sip_packet(skb, ctinfo, ct, &dptr, skb->len - dataoff, - buffer, bufflen, POS_MEDIA)) +static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, + unsigned int dataoff, + unsigned int *datalen, + enum sdp_header_types type, + enum sdp_header_types term, + const union nf_inet_addr *addr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int buflen; + + buflen = sprintf(buffer, NIPQUAD_FMT, NIPQUAD(addr->ip)); + if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, + buffer, buflen)) return 0; - return mangle_content_len(skb, ctinfo, ct, dptr); + return mangle_content_len(skb, dptr, datalen); } -static void ip_nat_sdp_expect(struct nf_conn *ct, - struct nf_conntrack_expect *exp) +static unsigned int ip_nat_sdp_port(struct sk_buff *skb, + const char **dptr, + unsigned int *datalen, + unsigned int matchoff, + unsigned int matchlen, + u_int16_t port) { - struct nf_nat_range range; + char buffer[sizeof("nnnnn")]; + unsigned int buflen; - /* This must be a fresh one. */ - BUG_ON(ct->status & IPS_NAT_DONE_MASK); + buflen = sprintf(buffer, "%u", port); + if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, + buffer, buflen)) + return 0; - /* Change src to where master sends to */ - range.flags = IP_NAT_RANGE_MAP_IPS; - range.min_ip = range.max_ip - = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; - nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); + return mangle_content_len(skb, dptr, datalen); +} - /* For DST manip, map port here to where it's expected. */ - range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); - range.min = range.max = exp->saved_proto; - range.min_ip = range.max_ip = exp->saved_ip; - nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); +static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, + unsigned int dataoff, + unsigned int *datalen, + const union nf_inet_addr *addr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int buflen; + + /* Mangle session description owner and contact addresses */ + buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(addr->ip)); + if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, + SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, + buffer, buflen)) + return 0; + + if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, + SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, + buffer, buflen)) + return 0; + + return mangle_content_len(skb, dptr, datalen); } /* So, this packet has hit the connection tracking matching code. Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_sdp(struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_expect *exp, - const char *dptr) +static unsigned int ip_nat_sdp_media(struct sk_buff *skb, + const char **dptr, + unsigned int *datalen, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp, + unsigned int mediaoff, + unsigned int medialen, + union nf_inet_addr *rtp_addr) { - struct nf_conn *ct = exp->master; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - __be32 newip; u_int16_t port; /* Connection will come from reply */ if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip) - newip = exp->tuple.dst.u3.ip; + rtp_addr->ip = rtp_exp->tuple.dst.u3.ip; else - newip = ct->tuplehash[!dir].tuple.dst.u3.ip; - - exp->saved_ip = exp->tuple.dst.u3.ip; - exp->tuple.dst.u3.ip = newip; - exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; - exp->dir = !dir; - - /* When you see the packet, we need to NAT it the same as the - this one. */ - exp->expectfn = ip_nat_sdp_expect; - - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { - exp->tuple.dst.u.udp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip; + + rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip; + rtp_exp->tuple.dst.u3.ip = rtp_addr->ip; + rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; + rtp_exp->dir = !dir; + rtp_exp->expectfn = ip_nat_sip_expected; + + rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip; + rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip; + rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; + rtcp_exp->dir = !dir; + rtcp_exp->expectfn = ip_nat_sip_expected; + + /* Try to get same pair of ports: if not, try to change them. */ + for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); + port != 0; port += 2) { + rtp_exp->tuple.dst.u.udp.port = htons(port); + if (nf_ct_expect_related(rtp_exp) != 0) + continue; + rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); + if (nf_ct_expect_related(rtcp_exp) == 0) break; + nf_ct_unexpect_related(rtp_exp); } if (port == 0) - return NF_DROP; + goto err1; + + /* Update media port. */ + if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && + !ip_nat_sdp_port(skb, dptr, datalen, mediaoff, medialen, port)) + goto err2; - if (!mangle_sdp(skb, ctinfo, ct, newip, port, dptr)) { - nf_ct_unexpect_related(exp); - return NF_DROP; - } return NF_ACCEPT; + +err2: + nf_ct_unexpect_related(rtp_exp); + nf_ct_unexpect_related(rtcp_exp); +err1: + return NF_DROP; } static void __exit nf_nat_sip_fini(void) { rcu_assign_pointer(nf_nat_sip_hook, NULL); - rcu_assign_pointer(nf_nat_sdp_hook, NULL); + rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_session_hook, NULL); + rcu_assign_pointer(nf_nat_sdp_media_hook, NULL); synchronize_rcu(); } static int __init nf_nat_sip_init(void) { BUG_ON(nf_nat_sip_hook != NULL); - BUG_ON(nf_nat_sdp_hook != NULL); + BUG_ON(nf_nat_sip_expect_hook != NULL); + BUG_ON(nf_nat_sdp_addr_hook != NULL); + BUG_ON(nf_nat_sdp_port_hook != NULL); + BUG_ON(nf_nat_sdp_session_hook != NULL); + BUG_ON(nf_nat_sdp_media_hook != NULL); rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); - rcu_assign_pointer(nf_nat_sdp_hook, ip_nat_sdp); + rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); + rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); + rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); + rcu_assign_pointer(nf_nat_sdp_session_hook, ip_nat_sdp_session); + rcu_assign_pointer(nf_nat_sdp_media_hook, ip_nat_sdp_media); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 540ce6ae887..5daefad3d19 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -50,6 +50,7 @@ #include <net/udp.h> #include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_nat_helper.h> @@ -219,7 +220,7 @@ static unsigned char asn1_length_decode(struct asn1_ctx *ctx, if (ch < 0x80) *len = ch; else { - cnt = (unsigned char) (ch & 0x7F); + cnt = ch & 0x7F; *len = 0; while (cnt > 0) { @@ -617,8 +618,7 @@ struct snmp_cnv int syntax; }; -static struct snmp_cnv snmp_conv [] = -{ +static const struct snmp_cnv snmp_conv[] = { {ASN1_UNI, ASN1_NUL, SNMP_NULL}, {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, @@ -643,7 +643,7 @@ static unsigned char snmp_tag_cls2syntax(unsigned int tag, unsigned int cls, unsigned short *syntax) { - struct snmp_cnv *cnv; + const struct snmp_cnv *cnv; cnv = snmp_conv; @@ -903,7 +903,7 @@ static inline void mangle_address(unsigned char *begin, u_int32_t old; if (debug) - memcpy(&old, (unsigned char *)addr, sizeof(old)); + memcpy(&old, addr, sizeof(old)); *addr = map->to; @@ -998,7 +998,7 @@ err_id_free: * *****************************************************************************/ -static void hex_dump(unsigned char *buf, size_t len) +static void hex_dump(const unsigned char *buf, size_t len) { size_t i; @@ -1079,7 +1079,7 @@ static int snmp_parse_mangle(unsigned char *msg, if (cls != ASN1_CTX || con != ASN1_CON) return 0; if (debug > 1) { - unsigned char *pdus[] = { + static const unsigned char *const pdus[] = { [SNMP_PDU_GET] = "get", [SNMP_PDU_NEXT] = "get-next", [SNMP_PDU_RESPONSE] = "response", @@ -1231,8 +1231,8 @@ static int help(struct sk_buff *skb, unsigned int protoff, { int dir = CTINFO2DIR(ctinfo); unsigned int ret; - struct iphdr *iph = ip_hdr(skb); - struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + const struct iphdr *iph = ip_hdr(skb); + const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); /* SNMP replies and originating SNMP traps get mangled */ if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) @@ -1267,11 +1267,15 @@ static int help(struct sk_buff *skb, unsigned int protoff, return ret; } +static const struct nf_conntrack_expect_policy snmp_exp_policy = { + .max_expected = 0, + .timeout = 180, +}; + static struct nf_conntrack_helper snmp_helper __read_mostly = { - .max_expected = 0, - .timeout = 180, .me = THIS_MODULE, .help = help, + .expect_policy = &snmp_exp_policy, .name = "snmp", .tuple.src.l3num = AF_INET, .tuple.src.u.udp.port = __constant_htons(SNMP_PORT), @@ -1279,10 +1283,9 @@ static struct nf_conntrack_helper snmp_helper __read_mostly = { }; static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { - .max_expected = 0, - .timeout = 180, .me = THIS_MODULE, .help = help, + .expect_policy = &snmp_exp_policy, .name = "snmp_trap", .tuple.src.l3num = AF_INET, .tuple.src.u.udp.port = __constant_htons(SNMP_TRAP_PORT), diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 99b2c788d5a..b7dd695691a 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -30,8 +30,8 @@ #ifdef CONFIG_XFRM static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) { - struct nf_conn *ct; - struct nf_conntrack_tuple *t; + const struct nf_conn *ct; + const struct nf_conntrack_tuple *t; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; @@ -50,7 +50,10 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) if (ct->status & statusbit) { fl->fl4_dst = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP) + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) fl->fl_ip_dport = t->dst.u.tcp.port; } @@ -59,7 +62,10 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) if (ct->status & statusbit) { fl->fl4_src = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || - t->dst.protonum == IPPROTO_UDP) + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) fl->fl_ip_sport = t->src.u.tcp.port; } } @@ -87,21 +93,8 @@ nf_nat_fn(unsigned int hooknum, have dropped it. Hence it's the user's responsibilty to packet filter it out, or implement conntrack/NAT for that protocol. 8) --RR */ - if (!ct) { - /* Exception: ICMP redirect to new connection (not in - hash table yet). We must not let this through, in - case we're doing NAT to the same network. */ - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - struct icmphdr _hdr, *hp; - - hp = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_hdr), &_hdr); - if (hp != NULL && - hp->type == ICMP_REDIRECT) - return NF_DROP; - } + if (!ct) return NF_ACCEPT; - } /* Don't try to NAT if this packet is not conntracked */ if (ct == &nf_conntrack_untracked) @@ -109,6 +102,9 @@ nf_nat_fn(unsigned int hooknum, nat = nfct_nat(ct); if (!nat) { + /* NAT module was loaded late. */ + if (nf_ct_is_confirmed(ct)) + return NF_ACCEPT; nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); if (nat == NULL) { pr_debug("failed to add NAT extension\n"); @@ -134,10 +130,7 @@ nf_nat_fn(unsigned int hooknum, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - if (unlikely(nf_ct_is_confirmed(ct))) - /* NAT module was loaded late */ - ret = alloc_null_binding_confirmed(ct, hooknum); - else if (hooknum == NF_INET_LOCAL_IN) + if (hooknum == NF_INET_LOCAL_IN) /* LOCAL_IN hook doesn't have a chain! */ ret = alloc_null_binding(ct, hooknum); else @@ -189,7 +182,7 @@ nf_nat_out(unsigned int hooknum, int (*okfn)(struct sk_buff *)) { #ifdef CONFIG_XFRM - struct nf_conn *ct; + const struct nf_conn *ct; enum ip_conntrack_info ctinfo; #endif unsigned int ret; @@ -223,7 +216,7 @@ nf_nat_local_fn(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct nf_conn *ct; + const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; @@ -252,25 +245,6 @@ nf_nat_local_fn(unsigned int hooknum, return ret; } -static unsigned int -nf_nat_adjust(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - ct = nf_ct_get(skb, &ctinfo); - if (ct && test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { - pr_debug("nf_nat_standalone: adjusting sequence number\n"); - if (!nf_nat_seq_adjust(skb, ct, ctinfo)) - return NF_DROP; - } - return NF_ACCEPT; -} - /* We must be after connection tracking and before packet filtering. */ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { @@ -290,14 +264,6 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, - /* After conntrack, adjust sequence number */ - { - .hook = nf_nat_adjust, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SEQ_ADJUST, - }, /* Before packet filtering, change destination */ { .hook = nf_nat_local_fn, @@ -314,14 +280,6 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, - /* After conntrack, adjust sequence number */ - { - .hook = nf_nat_adjust, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SEQ_ADJUST, - }, }; static int __init nf_nat_standalone_init(void) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index d63474c6b40..552169b41b1 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -51,24 +51,54 @@ */ static int sockstat_seq_show(struct seq_file *seq, void *v) { + struct net *net = seq->private; + socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", - sock_prot_inuse_get(&tcp_prot), + sock_prot_inuse_get(net, &tcp_prot), atomic_read(&tcp_orphan_count), tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse_get(&udp_prot), + seq_printf(seq, "UDP: inuse %d mem %d\n", + sock_prot_inuse_get(net, &udp_prot), atomic_read(&udp_memory_allocated)); - seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(&udplite_prot)); - seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(&raw_prot)); + seq_printf(seq, "UDPLITE: inuse %d\n", + sock_prot_inuse_get(net, &udplite_prot)); + seq_printf(seq, "RAW: inuse %d\n", + sock_prot_inuse_get(net, &raw_prot)); seq_printf(seq, "FRAG: inuse %d memory %d\n", - ip_frag_nqueues(&init_net), ip_frag_mem(&init_net)); + ip_frag_nqueues(net), ip_frag_mem(net)); return 0; } static int sockstat_seq_open(struct inode *inode, struct file *file) { - return single_open(file, sockstat_seq_show, NULL); + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, sockstat_seq_show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} + +static int sockstat_seq_release(struct inode *inode, struct file *file) +{ + struct net *net = ((struct seq_file *)file->private_data)->private; + + put_net(net); + return single_release(inode, file); } static const struct file_operations sockstat_seq_fops = { @@ -76,7 +106,7 @@ static const struct file_operations sockstat_seq_fops = { .open = sockstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = sockstat_seq_release, }; /* snmp items */ @@ -423,25 +453,42 @@ static const struct file_operations netstat_seq_fops = { .release = single_release, }; +static __net_init int ip_proc_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) + return -ENOMEM; + return 0; +} + +static __net_exit void ip_proc_exit_net(struct net *net) +{ + proc_net_remove(net, "sockstat"); +} + +static __net_initdata struct pernet_operations ip_proc_ops = { + .init = ip_proc_init_net, + .exit = ip_proc_exit_net, +}; + int __init ip_misc_proc_init(void) { int rc = 0; + if (register_pernet_subsys(&ip_proc_ops)) + goto out_pernet; + if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops)) goto out_netstat; if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) goto out_snmp; - - if (!proc_net_fops_create(&init_net, "sockstat", S_IRUGO, &sockstat_seq_fops)) - goto out_sockstat; out: return rc; -out_sockstat: - proc_net_remove(&init_net, "snmp"); out_snmp: proc_net_remove(&init_net, "netstat"); out_netstat: + unregister_pernet_subsys(&ip_proc_ops); +out_pernet: rc = -ENOMEM; goto out; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a3002fe65b7..11d7f753a82 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -81,41 +81,34 @@ #include <linux/netfilter_ipv4.h> static struct raw_hashinfo raw_v4_hashinfo = { - .lock = __RW_LOCK_UNLOCKED(), + .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; -void raw_hash_sk(struct sock *sk, struct raw_hashinfo *h) +void raw_hash_sk(struct sock *sk) { + struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; write_lock_bh(&h->lock); sk_add_node(sk, head); - sock_prot_inuse_add(sk->sk_prot, 1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); } EXPORT_SYMBOL_GPL(raw_hash_sk); -void raw_unhash_sk(struct sock *sk, struct raw_hashinfo *h) +void raw_unhash_sk(struct sock *sk) { + struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; + write_lock_bh(&h->lock); if (sk_del_node_init(sk)) - sock_prot_inuse_add(sk->sk_prot, -1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); -static void raw_v4_hash(struct sock *sk) -{ - raw_hash_sk(sk, &raw_v4_hashinfo); -} - -static void raw_v4_unhash(struct sock *sk) -{ - raw_unhash_sk(sk, &raw_v4_hashinfo); -} - static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif) { @@ -124,7 +117,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, sk_for_each_from(sk, node) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_net == net && inet->num == num && + if (net_eq(sock_net(sk), net) && inet->num == num && !(inet->daddr && inet->daddr != raddr) && !(inet->rcv_saddr && inet->rcv_saddr != laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) @@ -175,7 +168,7 @@ static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) if (hlist_empty(head)) goto out; - net = skb->dev->nd_net; + net = dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); @@ -283,7 +276,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); if (raw_sk != NULL) { iph = (struct iphdr *)skb->data; - net = skb->dev->nd_net; + net = dev_net(skb->dev); while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, iph->daddr, iph->saddr, @@ -506,7 +499,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc); if (err) goto out; if (ipc.opt) @@ -560,7 +553,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); + err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); } if (err) goto done; @@ -627,7 +620,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(sk->sk_net, addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -825,8 +818,6 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) } } -DEFINE_PROTO_INUSE(raw) - struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, @@ -841,14 +832,14 @@ struct proto raw_prot = { .recvmsg = raw_recvmsg, .bind = raw_bind, .backlog_rcv = raw_rcv_skb, - .hash = raw_v4_hash, - .unhash = raw_v4_unhash, + .hash = raw_hash_sk, + .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw_sock), + .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_raw_setsockopt, .compat_getsockopt = compat_raw_getsockopt, #endif - REF_PROTO_INUSE(raw) }; #ifdef CONFIG_PROC_FS @@ -862,7 +853,7 @@ static struct sock *raw_get_first(struct seq_file *seq) struct hlist_node *node; sk_for_each(sk, node, &state->h->ht[state->bucket]) - if (sk->sk_net == state->p.net) + if (sock_net(sk) == seq_file_net(seq)) goto found; } sk = NULL; @@ -878,7 +869,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) sk = sk_next(sk); try_again: ; - } while (sk && sk->sk_net != state->p.net); + } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { sk = sk_head(&state->h->ht[state->bucket]); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7b5e8e1d94b..780e9484c82 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -118,21 +118,19 @@ #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; -static int ip_rt_gc_timeout = RT_GC_TIMEOUT; -static int ip_rt_gc_interval = 60 * HZ; -static int ip_rt_gc_min_interval = HZ / 2; -static int ip_rt_redirect_number = 9; -static int ip_rt_redirect_load = HZ / 50; -static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1)); -static int ip_rt_error_cost = HZ; -static int ip_rt_error_burst = 5 * HZ; -static int ip_rt_gc_elasticity = 8; -static int ip_rt_mtu_expires = 10 * 60 * HZ; -static int ip_rt_min_pmtu = 512 + 20 + 20; -static int ip_rt_min_advmss = 256; -static int ip_rt_secret_interval = 10 * 60 * HZ; - -#define RTprint(a...) printk(KERN_DEBUG a) +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly = HZ / 2; +static int ip_rt_redirect_number __read_mostly = 9; +static int ip_rt_redirect_load __read_mostly = HZ / 50; +static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); +static int ip_rt_error_cost __read_mostly = HZ; +static int ip_rt_error_burst __read_mostly = 5 * HZ; +static int ip_rt_gc_elasticity __read_mostly = 8; +static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; +static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; +static int ip_rt_min_advmss __read_mostly = 256; +static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static void rt_worker_func(struct work_struct *work); static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); @@ -252,40 +250,41 @@ static inline void rt_hash_lock_init(void) } #endif -static struct rt_hash_bucket *rt_hash_table; -static unsigned rt_hash_mask; -static unsigned int rt_hash_log; -static atomic_t rt_genid; +static struct rt_hash_bucket *rt_hash_table __read_mostly; +static unsigned rt_hash_mask __read_mostly; +static unsigned int rt_hash_log __read_mostly; +static atomic_t rt_genid __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ (__raw_get_cpu_var(rt_cache_stat).field++) -static unsigned int rt_hash_code(u32 daddr, u32 saddr) +static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx) { - return jhash_2words(daddr, saddr, atomic_read(&rt_genid)) + return jhash_3words((__force u32)(__be32)(daddr), + (__force u32)(__be32)(saddr), + idx, atomic_read(&rt_genid)) & rt_hash_mask; } -#define rt_hash(daddr, saddr, idx) \ - rt_hash_code((__force u32)(__be32)(daddr),\ - (__force u32)(__be32)(saddr) ^ ((idx) << 5)) - #ifdef CONFIG_PROC_FS struct rt_cache_iter_state { + struct seq_net_private p; int bucket; int genid; }; -static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st) +static struct rtable *rt_cache_get_first(struct seq_file *seq) { + struct rt_cache_iter_state *st = seq->private; struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { rcu_read_lock_bh(); r = rcu_dereference(rt_hash_table[st->bucket].chain); while (r) { - if (r->rt_genid == st->genid) + if (dev_net(r->u.dst.dev) == seq_file_net(seq) && + r->rt_genid == st->genid) return r; r = rcu_dereference(r->u.dst.rt_next); } @@ -294,8 +293,10 @@ static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st) return r; } -static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r) +static struct rtable *__rt_cache_get_next(struct seq_file *seq, + struct rtable *r) { + struct rt_cache_iter_state *st = seq->private; r = r->u.dst.rt_next; while (!r) { rcu_read_unlock_bh(); @@ -307,25 +308,34 @@ static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct r return rcu_dereference(r); } -static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos) +static struct rtable *rt_cache_get_next(struct seq_file *seq, + struct rtable *r) +{ + struct rt_cache_iter_state *st = seq->private; + while ((r = __rt_cache_get_next(seq, r)) != NULL) { + if (dev_net(r->u.dst.dev) != seq_file_net(seq)) + continue; + if (r->rt_genid == st->genid) + break; + } + return r; +} + +static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) { - struct rtable *r = rt_cache_get_first(st); + struct rtable *r = rt_cache_get_first(seq); if (r) - while (pos && (r = rt_cache_get_next(st, r))) { - if (r->rt_genid != st->genid) - continue; + while (pos && (r = rt_cache_get_next(seq, r))) --pos; - } return pos ? NULL : r; } static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { struct rt_cache_iter_state *st = seq->private; - if (*pos) - return rt_cache_get_idx(st, *pos - 1); + return rt_cache_get_idx(seq, *pos - 1); st->genid = atomic_read(&rt_genid); return SEQ_START_TOKEN; } @@ -333,12 +343,11 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct rtable *r; - struct rt_cache_iter_state *st = seq->private; if (v == SEQ_START_TOKEN) - r = rt_cache_get_first(st); + r = rt_cache_get_first(seq); else - r = rt_cache_get_next(st, v); + r = rt_cache_get_next(seq, v); ++*pos; return r; } @@ -390,7 +399,7 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &rt_cache_seq_ops, + return seq_open_net(inode, file, &rt_cache_seq_ops, sizeof(struct rt_cache_iter_state)); } @@ -399,7 +408,7 @@ static const struct file_operations rt_cache_seq_fops = { .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; @@ -533,7 +542,7 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset, } #endif -static __init int ip_rt_proc_init(struct net *net) +static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; @@ -564,25 +573,43 @@ err2: err1: return -ENOMEM; } + +static void __net_exit ip_rt_do_proc_exit(struct net *net) +{ + remove_proc_entry("rt_cache", net->proc_net_stat); + remove_proc_entry("rt_cache", net->proc_net); + remove_proc_entry("rt_acct", net->proc_net); +} + +static struct pernet_operations ip_rt_proc_ops __net_initdata = { + .init = ip_rt_do_proc_init, + .exit = ip_rt_do_proc_exit, +}; + +static int __init ip_rt_proc_init(void) +{ + return register_pernet_subsys(&ip_rt_proc_ops); +} + #else -static inline int ip_rt_proc_init(struct net *net) +static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ -static __inline__ void rt_free(struct rtable *rt) +static inline void rt_free(struct rtable *rt) { call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); } -static __inline__ void rt_drop(struct rtable *rt) +static inline void rt_drop(struct rtable *rt) { ip_rt_put(rt); call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); } -static __inline__ int rt_fast_clean(struct rtable *rth) +static inline int rt_fast_clean(struct rtable *rth) { /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ @@ -590,7 +617,7 @@ static __inline__ int rt_fast_clean(struct rtable *rth) rth->fl.iif && rth->u.dst.rt_next; } -static __inline__ int rt_valuable(struct rtable *rth) +static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || rth->u.dst.expires; @@ -652,7 +679,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net; + return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); } /* @@ -1032,10 +1059,10 @@ restart: #if RT_CACHE_DEBUG >= 2 if (rt->u.dst.rt_next) { struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, + printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash, NIPQUAD(rt->rt_dst)); for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) - printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); + printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst)); printk("\n"); } #endif @@ -1131,10 +1158,12 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct netevent_redirect netevent; + struct net *net; if (!in_dev) return; + net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) @@ -1146,7 +1175,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { - if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST) + if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } @@ -1164,7 +1193,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rth->fl.fl4_src != skeys[i] || rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || - rth->rt_genid != atomic_read(&rt_genid)) { + rth->rt_genid != atomic_read(&rt_genid) || + !net_eq(dev_net(rth->u.dst.dev), net)) { rthp = &rth->u.dst.rt_next; continue; } @@ -1245,9 +1275,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about " - "%u.%u.%u.%u ignored.\n" - " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n", + printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about " + NIPQUAD_FMT " ignored.\n" + " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n", NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), NIPQUAD(saddr), NIPQUAD(daddr)); #endif @@ -1256,7 +1286,7 @@ reject_redirect: static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { - struct rtable *rt = (struct rtable*)dst; + struct rtable *rt = (struct rtable *)dst; struct dst_entry *ret = dst; if (rt) { @@ -1269,7 +1299,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) rt->fl.oif); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ipv4_negative_advice: redirect to " - "%u.%u.%u.%u/%02x dropped\n", + NIPQUAD_FMT "/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); #endif rt_del(hash, rt); @@ -1297,7 +1327,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; struct in_device *in_dev = in_dev_get(rt->u.dst.dev); if (!in_dev) @@ -1334,8 +1364,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (IN_DEV_LOG_MARTIANS(in_dev) && rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) - printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores " - "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n", + printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores " + "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n", NIPQUAD(rt->rt_src), rt->rt_iif, NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); #endif @@ -1346,7 +1376,7 @@ out: static int ip_error(struct sk_buff *skb) { - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; unsigned long now; int code; @@ -1388,7 +1418,7 @@ out: kfree_skb(skb); static const unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; -static __inline__ unsigned short guess_mtu(unsigned short old_mtu) +static inline unsigned short guess_mtu(unsigned short old_mtu) { int i; @@ -1423,7 +1453,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rth->rt_src == iph->saddr && rth->fl.iif == 0 && !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) && - rth->u.dst.dev->nd_net == net && + net_eq(dev_net(rth->u.dst.dev), net) && rth->rt_genid == atomic_read(&rt_genid)) { unsigned short mtu = new_mtu; @@ -1499,9 +1529,9 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) { + if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) { struct in_device *loopback_idev = - in_dev_get(dev->nd_net->loopback_dev); + in_dev_get(dev_net(dev)->loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1515,14 +1545,14 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - rt = (struct rtable *) skb->dst; + rt = skb->rtable; if (rt) dst_set_expires(&rt->u.dst, 0); } static int ip_rt_bug(struct sk_buff *skb) { - printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", + printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n", NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr), skb->dev ? skb->dev->name : "?"); kfree_skb(skb); @@ -1545,7 +1575,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) { + else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) { src = FIB_RES_PREFSRC(res); fib_res_put(&res); } else @@ -1675,7 +1705,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex); - return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); + return rt_intern_hash(hash, rth, &skb->rtable); e_nobufs: in_dev_put(in_dev); @@ -1700,8 +1730,8 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ - printk(KERN_WARNING "martian source %u.%u.%u.%u from " - "%u.%u.%u.%u, on dev %s\n", + printk(KERN_WARNING "martian source " NIPQUAD_FMT " from " + NIPQUAD_FMT", on dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; @@ -1718,11 +1748,11 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } -static inline int __mkroute_input(struct sk_buff *skb, - struct fib_result* res, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct rtable **result) +static int __mkroute_input(struct sk_buff *skb, + struct fib_result *res, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos, + struct rtable **result) { struct rtable *rth; @@ -1814,11 +1844,11 @@ static inline int __mkroute_input(struct sk_buff *skb, return err; } -static inline int ip_mkroute_input(struct sk_buff *skb, - struct fib_result* res, - const struct flowi *fl, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) +static int ip_mkroute_input(struct sk_buff *skb, + struct fib_result *res, + const struct flowi *fl, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos) { struct rtable* rth = NULL; int err; @@ -1836,7 +1866,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif); - return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + return rt_intern_hash(hash, rth, &skb->rtable); } /* @@ -1869,7 +1899,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, __be32 spec_dst; int err = -EINVAL; int free_res = 0; - struct net * net = dev->nd_net; + struct net * net = dev_net(dev); /* IP on this device is disabled. */ @@ -1992,7 +2022,7 @@ local_input: } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif); - err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + err = rt_intern_hash(hash, rth, &skb->rtable); goto done; no_route: @@ -2010,8 +2040,8 @@ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_WARNING "martian destination %u.%u.%u.%u from " - "%u.%u.%u.%u, dev %s\n", + printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from " + NIPQUAD_FMT ", dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); #endif @@ -2040,25 +2070,25 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, int iif = dev->ifindex; struct net *net; - net = dev->nd_net; + net = dev_net(dev); tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { - if (rth->fl.fl4_dst == daddr && - rth->fl.fl4_src == saddr && - rth->fl.iif == iif && - rth->fl.oif == 0 && + if (((rth->fl.fl4_dst ^ daddr) | + (rth->fl.fl4_src ^ saddr) | + (rth->fl.iif ^ iif) | + rth->fl.oif | + (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && - rth->fl.fl4_tos == tos && - rth->u.dst.dev->nd_net == net && + net_eq(dev_net(rth->u.dst.dev), net) && rth->rt_genid == atomic_read(&rt_genid)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb->dst = (struct dst_entry*)rth; + skb->rtable = rth; return 0; } RT_CACHE_STAT_INC(in_hlist_search); @@ -2100,12 +2130,12 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, return ip_route_input_slow(skb, daddr, saddr, tos, dev); } -static inline int __mkroute_output(struct rtable **result, - struct fib_result* res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +static int __mkroute_output(struct rtable **result, + struct fib_result *res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) { struct rtable *rth; struct in_device *in_dev; @@ -2220,12 +2250,12 @@ static inline int __mkroute_output(struct rtable **result, return err; } -static inline int ip_mkroute_output(struct rtable **rp, - struct fib_result* res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +static int ip_mkroute_output(struct rtable **rp, + struct fib_result *res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) { struct rtable *rth = NULL; int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); @@ -2455,7 +2485,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && - rth->u.dst.dev->nd_net == net && + net_eq(dev_net(rth->u.dst.dev), net) && rth->rt_genid == atomic_read(&rt_genid)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); @@ -2487,7 +2517,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { }; -static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk) +static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) { struct rtable *ort = *rp; struct rtable *rt = (struct rtable *) @@ -2547,7 +2577,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags ? XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) - err = ipv4_dst_blackhole(rp, flp, sk); + err = ipv4_dst_blackhole(rp, flp); return err; } @@ -2565,7 +2595,7 @@ int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb->rtable; struct rtmsg *r; struct nlmsghdr *nlh; long expires; @@ -2658,7 +2688,7 @@ nla_put_failure: static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { - struct net *net = in_skb->sk->sk_net; + struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; @@ -2668,9 +2698,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void int err; struct sk_buff *skb; - if (net != &init_net) - return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) goto errout; @@ -2700,7 +2727,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (iif) { struct net_device *dev; - dev = __dev_get_by_index(&init_net, iif); + dev = __dev_get_by_index(net, iif); if (dev == NULL) { err = -ENODEV; goto errout_free; @@ -2712,7 +2739,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); - rt = (struct rtable*) skb->dst; + rt = skb->rtable; if (err == 0 && rt->u.dst.error) err = -rt->u.dst.error; } else { @@ -2726,22 +2753,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void }, .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, }; - err = ip_route_output_key(&init_net, &rt, &fl); + err = ip_route_output_key(net, &rt, &fl); } if (err) goto errout_free; - skb->dst = &rt->u.dst; + skb->rtable = rt; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, - RTM_NEWROUTE, 0, 0); + RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; - err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); errout: return err; @@ -2755,6 +2782,9 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) struct rtable *rt; int h, s_h; int idx, s_idx; + struct net *net; + + net = sock_net(skb->sk); s_h = cb->args[0]; if (s_h < 0) @@ -2764,7 +2794,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock_bh(); for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; rt = rcu_dereference(rt->u.dst.rt_next), idx++) { - if (idx < s_idx) + if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) continue; if (rt->rt_genid != atomic_read(&rt_genid)) continue; @@ -3028,7 +3058,9 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - setup_timer(&rt_secret_timer, rt_secret_rebuild, 0); + rt_secret_timer.function = rt_secret_rebuild; + rt_secret_timer.data = 0; + init_timer_deferrable(&rt_secret_timer); /* All the timers, started at system startup tend to synchronize. Perturb it a bit. @@ -3040,7 +3072,7 @@ int __init ip_rt_init(void) ip_rt_secret_interval; add_timer(&rt_secret_timer); - if (ip_rt_proc_init(&init_net)) + if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index f470fe4511d..73ba98921d6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -10,8 +10,6 @@ * 2 of the License, or (at your option) any later version. * * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $ - * - * Missing: IPv6 support. */ #include <linux/tcp.h> @@ -21,26 +19,33 @@ #include <linux/kernel.h> #include <net/tcp.h> +/* Timestamps: lowest 9 bits store TCP options */ +#define TSBITS 9 +#define TSMASK (((__u32)1 << TSBITS) - 1) + extern int sysctl_tcp_syncookies; -static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS]; +__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +EXPORT_SYMBOL(syncookie_secret); static __init int init_syncookies(void) { get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); return 0; } -module_init(init_syncookies); +__initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) +static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; + static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS]; + __u32 *tmp = __get_cpu_var(cookie_scratch); - memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c])); + memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; tmp[1] = (__force u32)daddr; tmp[2] = ((__force u32)sport << 16) + (__force u32)dport; @@ -50,6 +55,39 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, return tmp[17]; } + +/* + * when syncookies are in effect and tcp timestamps are enabled we encode + * tcp options in the lowest 9 bits of the timestamp value that will be + * sent in the syn-ack. + * Since subsequent timestamps use the normal tcp_time_stamp value, we + * must make sure that the resulting initial timestamp is <= tcp_time_stamp. + */ +__u32 cookie_init_timestamp(struct request_sock *req) +{ + struct inet_request_sock *ireq; + u32 ts, ts_now = tcp_time_stamp; + u32 options = 0; + + ireq = inet_rsk(req); + if (ireq->wscale_ok) { + options = ireq->snd_wscale; + options |= ireq->rcv_wscale << 4; + } + options |= ireq->sack_ok << 8; + + ts = ts_now & ~TSMASK; + ts |= options; + if (ts > ts_now) { + ts >>= TSBITS; + ts--; + ts <<= TSBITS; + ts |= options; + } + return ts; +} + + static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 count, __u32 data) @@ -184,6 +222,35 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } + +/* + * when syncookies are in effect and tcp timestamps are enabled we stored + * additional tcp options in the timestamp. + * This extracts these options from the timestamp echo. + * + * The lowest 4 bits are for snd_wscale + * The next 4 lsb are for rcv_wscale + * The next lsb is for sack_ok + */ +void cookie_check_timestamp(struct tcp_options_received *tcp_opt) +{ + /* echoed timestamp, 9 lowest bits contain options */ + u32 options = tcp_opt->rcv_tsecr & TSMASK; + + tcp_opt->snd_wscale = options & 0xf; + options >>= 4; + tcp_opt->rcv_wscale = options & 0xf; + + tcp_opt->sack_ok = (options >> 4) & 0x1; + + if (tcp_opt->sack_ok) + tcp_sack_reset(tcp_opt); + + if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale) + tcp_opt->wscale_ok = 1; +} +EXPORT_SYMBOL(cookie_check_timestamp); + struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) { @@ -197,6 +264,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int mss; struct rtable *rt; __u8 rcv_wscale; + struct tcp_options_received tcp_opt; if (!sysctl_tcp_syncookies || !th->ack) goto out; @@ -209,6 +277,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); + tcp_parse_options(skb, &tcp_opt, 0); + + if (tcp_opt.saw_tstamp) + cookie_check_timestamp(&tcp_opt); + ret = NULL; req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */ if (!req) @@ -227,6 +302,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; ireq->opt = NULL; + ireq->snd_wscale = tcp_opt.snd_wscale; + ireq->rcv_wscale = tcp_opt.rcv_wscale; + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) @@ -241,8 +322,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } } - ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0; - ireq->wscale_ok = ireq->sack_ok = 0; req->expires = 0UL; req->retrans = 0; @@ -271,11 +350,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW); + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); + tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - 0, &rcv_wscale); - /* BTW win scale with syncookies is 0 by definition */ + ireq->wscale_ok, &rcv_wscale); + ireq->rcv_wscale = rcv_wscale; ret = get_cookie_sock(sk, skb, req, &rt->u.dst); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 88286f35d1e..c437f804ee3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -404,38 +404,6 @@ static struct ctl_table ipv4_table[] = { .strategy = &ipv4_sysctl_local_port_range, }, { - .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, - .procname = "icmp_echo_ignore_all", - .data = &sysctl_icmp_echo_ignore_all, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, - .procname = "icmp_echo_ignore_broadcasts", - .data = &sysctl_icmp_echo_ignore_broadcasts, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, - .procname = "icmp_ignore_bogus_error_responses", - .data = &sysctl_icmp_ignore_bogus_error_responses, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, - .procname = "icmp_errors_use_inbound_ifaddr", - .data = &sysctl_icmp_errors_use_inbound_ifaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_IPV4_ROUTE, .procname = "route", .maxlen = 0, @@ -586,22 +554,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_RATELIMIT, - .procname = "icmp_ratelimit", - .data = &sysctl_icmp_ratelimit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV4_ICMP_RATEMASK, - .procname = "icmp_ratemask", - .data = &sysctl_icmp_ratemask, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_TCP_TW_REUSE, .procname = "tcp_tw_reuse", .data = &sysctl_tcp_tw_reuse, @@ -804,6 +756,58 @@ static struct ctl_table ipv4_table[] = { { .ctl_name = 0 } }; +static struct ctl_table ipv4_net_table[] = { + { + .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, + .procname = "icmp_echo_ignore_all", + .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, + .procname = "icmp_echo_ignore_broadcasts", + .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, + .procname = "icmp_ignore_bogus_error_responses", + .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, + .procname = "icmp_errors_use_inbound_ifaddr", + .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_RATELIMIT, + .procname = "icmp_ratelimit", + .data = &init_net.ipv4.sysctl_icmp_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_RATEMASK, + .procname = "icmp_ratemask", + .data = &init_net.ipv4.sysctl_icmp_ratemask, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { } +}; + struct ctl_path net_ipv4_ctl_path[] = { { .procname = "net", .ctl_name = CTL_NET, }, { .procname = "ipv4", .ctl_name = NET_IPV4, }, @@ -811,12 +815,72 @@ struct ctl_path net_ipv4_ctl_path[] = { }; EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); +static __net_init int ipv4_sysctl_init_net(struct net *net) +{ + struct ctl_table *table; + + table = ipv4_net_table; + if (net != &init_net) { + table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = + &net->ipv4.sysctl_icmp_echo_ignore_all; + table[1].data = + &net->ipv4.sysctl_icmp_echo_ignore_broadcasts; + table[2].data = + &net->ipv4.sysctl_icmp_ignore_bogus_error_responses; + table[3].data = + &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr; + table[4].data = + &net->ipv4.sysctl_icmp_ratelimit; + table[5].data = + &net->ipv4.sysctl_icmp_ratemask; + } + + net->ipv4.ipv4_hdr = register_net_sysctl_table(net, + net_ipv4_ctl_path, table); + if (net->ipv4.ipv4_hdr == NULL) + goto err_reg; + + return 0; + +err_reg: + if (net != &init_net) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static __net_exit void ipv4_sysctl_exit_net(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv4.ipv4_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); + kfree(table); +} + +static __net_initdata struct pernet_operations ipv4_sysctl_ops = { + .init = ipv4_sysctl_init_net, + .exit = ipv4_sysctl_exit_net, +}; + static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); - return hdr == NULL ? -ENOMEM : 0; + if (hdr == NULL) + return -ENOMEM; + + if (register_pernet_subsys(&ipv4_sysctl_ops)) { + unregister_sysctl_table(hdr); + return -ENOMEM; + } + + return 0; } __initcall(sysctl_ipv4_init); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39b629ac240..58ac838bf46 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,15 +2105,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - icsk->icsk_accept_queue.rskq_defer_accept = 0; - if (val > 0) { - /* Translate value in seconds to number of - * retransmits */ - while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && - val > ((TCP_TIMEOUT_INIT / HZ) << - icsk->icsk_accept_queue.rskq_defer_accept)) - icsk->icsk_accept_queue.rskq_defer_accept++; - icsk->icsk_accept_queue.rskq_defer_accept++; + if (val < 0) { + err = -EINVAL; + } else { + if (val > MAX_TCP_ACCEPT_DEFERRED) + val = MAX_TCP_ACCEPT_DEFERRED; + icsk->icsk_accept_queue.rskq_defer_accept = val; } break; @@ -2295,8 +2292,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : - ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); + val = icsk->icsk_accept_queue.rskq_defer_accept; break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 3aa0b23c1ea..eb5b9854c8c 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -1,12 +1,13 @@ /* - * TCP CUBIC: Binary Increase Congestion control for TCP v2.1 - * + * TCP CUBIC: Binary Increase Congestion control for TCP v2.2 + * Home page: + * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC * This is from the implementation of CUBIC TCP in * Injong Rhee, Lisong Xu. * "CUBIC: A New TCP-Friendly High-Speed TCP Variant * in PFLDnet 2005 * Available from: - * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + * http://netsrv.csc.ncsu.edu/export/cubic-paper.pdf * * Unless CUBIC is enabled and congestion window is large * this behaves the same as the original Reno. @@ -20,15 +21,10 @@ #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ -#define BICTCP_B 4 /* - * In binary search, - * go to point (max+min)/N - */ #define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ static int fast_convergence __read_mostly = 1; -static int max_increment __read_mostly = 16; -static int beta __read_mostly = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ static int initial_ssthresh __read_mostly; static int bic_scale __read_mostly = 41; static int tcp_friendliness __read_mostly = 1; @@ -40,9 +36,7 @@ static u64 cube_factor __read_mostly; /* Note parameters that are used for precomputing scale factors are read-only */ module_param(fast_convergence, int, 0644); MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); -module_param(max_increment, int, 0644); -MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); -module_param(beta, int, 0444); +module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); @@ -145,7 +139,7 @@ static u32 cubic_root(u64 a) static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { u64 offs; - u32 delta, t, bic_target, min_cnt, max_cnt; + u32 delta, t, bic_target, max_cnt; ca->ack_cnt++; /* count the number of ACKs */ @@ -211,19 +205,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 100 * cwnd; /* very small increment*/ } - if (ca->delay_min > 0) { - /* max increment = Smax * rtt / 0.1 */ - min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); - - /* use concave growth when the target is above the origin */ - if (ca->cnt < min_cnt && t >= ca->bic_K) - ca->cnt = min_cnt; - } - - /* slow start and low utilization */ - if (ca->loss_cwnd == 0) /* could be aggressive in slow start */ - ca->cnt = 50; - /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; @@ -391,4 +372,4 @@ module_exit(cubictcp_unregister); MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CUBIC TCP"); -MODULE_VERSION("2.1"); +MODULE_VERSION("2.2"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bbb7d88a16b..cdc051bfdb4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2309,12 +2309,25 @@ static void DBGUNDO(struct sock *sk, const char *msg) struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); - printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", - msg, - NIPQUAD(inet->daddr), ntohs(inet->dport), - tp->snd_cwnd, tcp_left_out(tp), - tp->snd_ssthresh, tp->prior_ssthresh, - tp->packets_out); + if (sk->sk_family == AF_INET) { + printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n", + msg, + NIPQUAD(inet->daddr), ntohs(inet->dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n", + msg, + NIP6(np->daddr), ntohs(inet->dport), + tp->snd_cwnd, tcp_left_out(tp), + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); + } +#endif } #else #define DBGUNDO(x...) do { } while (0) @@ -3592,7 +3605,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * cases we should never reach this piece of code. */ printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", - __FUNCTION__, sk->sk_state); + __func__, sk->sk_state); break; } @@ -4012,7 +4025,7 @@ drop: u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - __skb_append(skb1, skb, &tp->out_of_order_queue); + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); if (!tp->rx_opt.num_sacks || tp->selective_acks[0].end_seq != seq) @@ -4508,6 +4521,49 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) } } +static int tcp_defer_accept_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->defer_tcp_accept.request) { + int queued_data = tp->rcv_nxt - tp->copied_seq; + int hasfin = !skb_queue_empty(&sk->sk_receive_queue) ? + tcp_hdr((struct sk_buff *) + sk->sk_receive_queue.prev)->fin : 0; + + if (queued_data && hasfin) + queued_data--; + + if (queued_data && + tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) { + if (sock_flag(sk, SOCK_KEEPOPEN)) { + inet_csk_reset_keepalive_timer(sk, + keepalive_time_when(tp)); + } else { + inet_csk_delete_keepalive_timer(sk); + } + + inet_csk_reqsk_queue_add( + tp->defer_tcp_accept.listen_sk, + tp->defer_tcp_accept.request, + sk); + + tp->defer_tcp_accept.listen_sk->sk_data_ready( + tp->defer_tcp_accept.listen_sk, 0); + + sock_put(tp->defer_tcp_accept.listen_sk); + sock_put(sk); + tp->defer_tcp_accept.listen_sk = NULL; + tp->defer_tcp_accept.request = NULL; + } else if (hasfin || + tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) { + tcp_reset(sk); + return -1; + } + } + return 0; +} + static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_sock *tp = tcp_sk(sk); @@ -4868,6 +4924,9 @@ step5: tcp_data_snd_check(sk); tcp_ack_snd_check(sk); + + if (tcp_defer_accept_check(sk)) + return -1; return 0; csum_error: @@ -5387,6 +5446,7 @@ discard: EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 00156bf421c..776615180b9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -88,9 +88,6 @@ int sysctl_tcp_low_latency __read_mostly; /* Check TCP sequence numbers in ICMP packets. */ #define ICMP_MIN_LENGTH 8 -/* Socket used for sending RSTs */ -static struct socket *tcp_socket __read_mostly; - void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); #ifdef CONFIG_TCP_MD5SIG @@ -353,7 +350,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } - sk = inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->daddr, th->dest, + sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); @@ -552,7 +549,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (th->rst) return; - if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL) + if (skb->rtable->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. */ @@ -598,7 +595,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); + ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); @@ -693,7 +691,8 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, if (twsk) arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if; - ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); + ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); } @@ -723,8 +722,8 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, - struct dst_entry *dst) +static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -732,7 +731,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) - goto out; + return -1; skb = tcp_make_synack(sk, dst, req); @@ -751,11 +750,15 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, err = net_xmit_eval(err); } -out: dst_release(dst); return err; } +static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) +{ + return __tcp_v4_send_synack(sk, req, NULL); +} + /* * IPv4 request_sock destructor. */ @@ -1258,8 +1261,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) #endif /* Never answer to SYNs send to broadcast or multicast */ - if (((struct rtable *)skb->dst)->rt_flags & - (RTCF_BROADCAST | RTCF_MULTICAST)) + if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; /* TW buckets are converted to open requests without @@ -1297,10 +1299,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_parse_options(skb, &tmp_opt, 0); - if (want_cookie) { + if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); - tmp_opt.saw_tstamp = 0; - } if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) { /* Some OSes (unknown ones, but I see them on web server, which @@ -1328,6 +1328,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie) { #ifdef CONFIG_SYN_COOKIES syn_flood_warning(skb); + req->cookie_ts = tmp_opt.tstamp_ok; #endif isn = cookie_v4_init_sequence(sk, skb, &req->mss); } else if (!isn) { @@ -1351,8 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); - dst_release(dst); - goto drop_and_free; + goto drop_and_release; } } /* Kill the following clause, if you dislike this way. */ @@ -1369,27 +1369,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * to the moment of synflood. */ LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " - "request from %u.%u.%u.%u/%u\n", + "request from " NIPQUAD_FMT "/%u\n", NIPQUAD(saddr), ntohs(tcp_hdr(skb)->source)); - dst_release(dst); - goto drop_and_free; + goto drop_and_release; } isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; - if (tcp_v4_send_synack(sk, req, dst)) + if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) goto drop_and_free; - if (want_cookie) { - reqsk_free(req); - } else { - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; +drop_and_release: + dst_release(dst); drop_and_free: reqsk_free(req); drop: @@ -1487,7 +1484,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); - nsk = inet_lookup_established(sk->sk_net, &tcp_hashinfo, iph->saddr, + nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (nsk) { @@ -1645,7 +1642,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr, + sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1719,7 +1716,7 @@ do_time_wait: } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(skb->dev->nd_net, + struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, inet_iif(skb)); @@ -1921,6 +1918,14 @@ int tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + if (tp->defer_tcp_accept.request) { + reqsk_free(tp->defer_tcp_accept.request); + sock_put(tp->defer_tcp_accept.listen_sk); + sock_put(sk); + tp->defer_tcp_accept.listen_sk = NULL; + tp->defer_tcp_accept.request = NULL; + } + atomic_dec(&tcp_sockets_allocated); return 0; @@ -1949,6 +1954,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; + struct net *net = seq_file_net(seq); if (!sk) { st->bucket = 0; @@ -1965,7 +1971,8 @@ static void *listening_get_next(struct seq_file *seq, void *cur) req = req->dl_next; while (1) { while (req) { - if (req->rsk_ops->family == st->family) { + if (req->rsk_ops->family == st->family && + net_eq(sock_net(req->sk), net)) { cur = req; goto out; } @@ -1989,7 +1996,7 @@ get_req: } get_sk: sk_for_each_from(sk, node) { - if (sk->sk_family == st->family) { + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { cur = sk; goto out; } @@ -2028,6 +2035,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state* st = seq->private; + struct net *net = seq_file_net(seq); void *rc = NULL; for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { @@ -2038,7 +2046,8 @@ static void *established_get_first(struct seq_file *seq) read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if (sk->sk_family != st->family) { + if (sk->sk_family != st->family || + !net_eq(sock_net(sk), net)) { continue; } rc = sk; @@ -2047,7 +2056,8 @@ static void *established_get_first(struct seq_file *seq) st->state = TCP_SEQ_STATE_TIME_WAIT; inet_twsk_for_each(tw, node, &tcp_hashinfo.ehash[st->bucket].twchain) { - if (tw->tw_family != st->family) { + if (tw->tw_family != st->family || + !net_eq(twsk_net(tw), net)) { continue; } rc = tw; @@ -2066,6 +2076,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; + struct net *net = seq_file_net(seq); ++st->num; @@ -2073,7 +2084,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) tw = cur; tw = tw_next(tw); get_tw: - while (tw && tw->tw_family != st->family) { + while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { tw = tw_next(tw); } if (tw) { @@ -2094,7 +2105,7 @@ get_tw: sk = sk_next(sk); sk_for_each_from(sk, node) { - if (sk->sk_family == st->family) + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) goto found; } @@ -2200,48 +2211,37 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) static int tcp_seq_open(struct inode *inode, struct file *file) { struct tcp_seq_afinfo *afinfo = PDE(inode)->data; - struct seq_file *seq; struct tcp_iter_state *s; - int rc; + int err; if (unlikely(afinfo == NULL)) return -EINVAL; - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + err = seq_open_net(inode, file, &afinfo->seq_ops, + sizeof(struct tcp_iter_state)); + if (err < 0) + return err; + + s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; - s->seq_ops.start = tcp_seq_start; - s->seq_ops.next = tcp_seq_next; - s->seq_ops.show = afinfo->seq_show; - s->seq_ops.stop = tcp_seq_stop; - - rc = seq_open(file, &s->seq_ops); - if (rc) - goto out_kfree; - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return 0; } -int tcp_proc_register(struct tcp_seq_afinfo *afinfo) +int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) { int rc = 0; struct proc_dir_entry *p; - if (!afinfo) - return -EINVAL; - afinfo->seq_fops->owner = afinfo->owner; - afinfo->seq_fops->open = tcp_seq_open; - afinfo->seq_fops->read = seq_read; - afinfo->seq_fops->llseek = seq_lseek; - afinfo->seq_fops->release = seq_release_private; + afinfo->seq_fops.open = tcp_seq_open; + afinfo->seq_fops.read = seq_read; + afinfo->seq_fops.llseek = seq_lseek; + afinfo->seq_fops.release = seq_release_net; + + afinfo->seq_ops.start = tcp_seq_start; + afinfo->seq_ops.next = tcp_seq_next; + afinfo->seq_ops.stop = tcp_seq_stop; - p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_net_fops_create(net, afinfo->name, S_IRUGO, &afinfo->seq_fops); if (p) p->data = afinfo; else @@ -2249,12 +2249,9 @@ int tcp_proc_register(struct tcp_seq_afinfo *afinfo) return rc; } -void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo) +void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) { - if (!afinfo) - return; - proc_net_remove(&init_net, afinfo->name); - memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); + proc_net_remove(net, afinfo->name); } static void get_openreq4(struct sock *sk, struct request_sock *req, @@ -2383,28 +2380,43 @@ out: return 0; } -static struct file_operations tcp4_seq_fops; static struct tcp_seq_afinfo tcp4_seq_afinfo = { - .owner = THIS_MODULE, .name = "tcp", .family = AF_INET, - .seq_show = tcp4_seq_show, - .seq_fops = &tcp4_seq_fops, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = tcp4_seq_show, + }, +}; + +static int tcp4_proc_init_net(struct net *net) +{ + return tcp_proc_register(net, &tcp4_seq_afinfo); +} + +static void tcp4_proc_exit_net(struct net *net) +{ + tcp_proc_unregister(net, &tcp4_seq_afinfo); +} + +static struct pernet_operations tcp4_net_ops = { + .init = tcp4_proc_init_net, + .exit = tcp4_proc_exit_net, }; int __init tcp4_proc_init(void) { - return tcp_proc_register(&tcp4_seq_afinfo); + return register_pernet_subsys(&tcp4_net_ops); } void tcp4_proc_exit(void) { - tcp_proc_unregister(&tcp4_seq_afinfo); + unregister_pernet_subsys(&tcp4_net_ops); } #endif /* CONFIG_PROC_FS */ -DEFINE_PROTO_INUSE(tcp) - struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -2435,18 +2447,33 @@ struct proto tcp_prot = { .obj_size = sizeof(struct tcp_sock), .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, - .hashinfo = &tcp_hashinfo, + .h.hashinfo = &tcp_hashinfo, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif - REF_PROTO_INUSE(tcp) }; -void __init tcp_v4_init(struct net_proto_family *ops) + +static int __net_init tcp_sk_init(struct net *net) +{ + return inet_ctl_sock_create(&net->ipv4.tcp_sock, + PF_INET, SOCK_RAW, IPPROTO_TCP, net); +} + +static void __net_exit tcp_sk_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv4.tcp_sock); +} + +static struct pernet_operations __net_initdata tcp_sk_ops = { + .init = tcp_sk_init, + .exit = tcp_sk_exit, +}; + +void __init tcp_v4_init(void) { - if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW, - IPPROTO_TCP) < 0) + if (register_pernet_device(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b61b76847ad..019c8c16e5c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -35,6 +35,8 @@ #endif int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; +EXPORT_SYMBOL(sysctl_tcp_syncookies); + int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { @@ -536,7 +538,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->rsk_ops->rtx_syn_ack(sk, req, NULL); + req->rsk_ops->rtx_syn_ack(sk, req); return NULL; } @@ -569,10 +571,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this - bare ACK. Otherwise, we create an established connection. Both - ends (listening sockets) accept the new incoming connection and try - to talk to each other. 8-) + Both ends (listening sockets) accept the new incoming + connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -640,13 +640,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - inet_rsk(req)->acked = 1; - return NULL; - } - /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO @@ -685,7 +678,24 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); - inet_csk_reqsk_queue_add(sk, req, child); + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + + /* the accept queue handling is done is est recv slow + * path so lets make sure to start there + */ + tcp_sk(child)->pred_flags = 0; + sock_hold(sk); + sock_hold(child); + tcp_sk(child)->defer_tcp_accept.listen_sk = sk; + tcp_sk(child)->defer_tcp_accept.request = req; + + inet_csk_reset_keepalive_timer(child, + inet_csk(sk)->icsk_accept_queue.rskq_defer_accept * HZ); + } else { + inet_csk_reqsk_queue_add(sk, req, child); + } + return child; listen_overflow: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d29ef79c00c..debf2358160 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -998,7 +998,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) xmit_size_goal = mss_now; if (doing_tso) { - xmit_size_goal = (65535 - + xmit_size_goal = ((sk->sk_gso_max_size - 1) - inet_csk(sk)->icsk_af_ops->net_header_len - inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); @@ -1057,7 +1057,7 @@ static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, needed = min(skb->len, window); - if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed) + if (cwnd_len <= needed) return cwnd_len; return needed - needed % mss_now; @@ -1282,7 +1282,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= 65536) + if (limit >= sk->sk_gso_max_size) goto send_now; if (sysctl_tcp_tso_win_divisor) { @@ -2236,7 +2236,11 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); - +#ifdef CONFIG_SYN_COOKIES + if (unlikely(req->cookie_ts)) + TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + else +#endif TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, @@ -2571,6 +2575,7 @@ void tcp_send_probe0(struct sock *sk) } } +EXPORT_SYMBOL(tcp_select_initial_window); EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 87dd5bff315..1c509592574 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -153,7 +153,7 @@ static int tcpprobe_sprint(char *tbuf, int n) = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); return snprintf(tbuf, n, - "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u" + "%lu.%09lu " NIPQUAD_FMT ":%u " NIPQUAD_FMT ":%u" " %d %#x %#x %u %u %u %u\n", (unsigned long) tv.tv_sec, (unsigned long) tv.tv_nsec, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 803d758a2b1..4de68cf5f2a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -299,12 +299,20 @@ static void tcp_retransmit_timer(struct sock *sk) * we cannot allow such beasts to hang infinitely. */ #ifdef TCP_DEBUG - if (1) { - struct inet_sock *inet = inet_sk(sk); - LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n", + struct inet_sock *inet = inet_sk(sk); + if (sk->sk_family == AF_INET) { + LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer " NIPQUAD_FMT ":%u/%u shrinks window %u:%u. Repaired.\n", NIPQUAD(inet->daddr), ntohs(inet->dport), inet->num, tp->snd_una, tp->snd_nxt); } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer " NIP6_FMT ":%u/%u shrinks window %u:%u. Repaired.\n", + NIP6(np->daddr), ntohs(inet->dport), + inet->num, tp->snd_una, tp->snd_nxt); + } +#endif #endif if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { tcp_write_err(sk); @@ -481,6 +489,11 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } + if (tp->defer_tcp_accept.request && sk->sk_state == TCP_ESTABLISHED) { + tcp_send_active_reset(sk, GFP_ATOMIC); + goto death; + } + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) goto out; diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 978b3fd61e6..d3b709a6f26 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -136,6 +136,7 @@ static struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, + .netns_ok = 1, }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) @@ -143,6 +144,7 @@ static struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, + .netns_ok = 1, }; #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1704c1474ea..b053ac79527 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -137,29 +137,28 @@ static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, struct hlist_node *node; sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (sk->sk_net == net && sk->sk_hash == num) + if (net_eq(sock_net(sk), net) && sk->sk_hash == num) return 1; return 0; } /** - * __udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 + * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up - * @udptable: hash list table, must be of UDP_HTABLE_SIZE * @saddr_comp: AF-dependent comparison of bound local IP addresses */ -int __udp_lib_get_port(struct sock *sk, unsigned short snum, - struct hlist_head udptable[], +int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, const struct sock *sk2 ) ) { + struct hlist_head *udptable = sk->sk_prot->h.udp_hash; struct hlist_node *node; struct hlist_head *head; struct sock *sk2; int error = 1; - struct net *net = sk->sk_net; + struct net *net = sock_net(sk); write_lock_bh(&udp_hash_lock); @@ -219,7 +218,7 @@ gotit: sk_for_each(sk2, node, head) if (sk2->sk_hash == snum && sk2 != sk && - sk2->sk_net == net && + net_eq(sock_net(sk2), net) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && @@ -232,7 +231,7 @@ gotit: if (sk_unhashed(sk)) { head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); - sock_prot_inuse_add(sk->sk_prot, 1); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } error = 0; fail: @@ -240,13 +239,7 @@ fail: return error; } -int udp_get_port(struct sock *sk, unsigned short snum, - int (*scmp)(const struct sock *, const struct sock *)) -{ - return __udp_lib_get_port(sk, snum, udp_hash, scmp); -} - -int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -255,9 +248,9 @@ int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) inet1->rcv_saddr == inet2->rcv_saddr )); } -static inline int udp_v4_get_port(struct sock *sk, unsigned short snum) +int udp_v4_get_port(struct sock *sk, unsigned short snum) { - return udp_get_port(sk, snum, ipv4_rcv_saddr_equal); + return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); } /* UDP is nearly always wildcards out the wazoo, it makes no sense to try @@ -276,7 +269,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_net == net && sk->sk_hash == hnum && + if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && !ipv6_only_sock(sk)) { int score = (sk->sk_family == PF_INET ? 1 : 0); if (inet->rcv_saddr) { @@ -364,7 +357,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) int harderr; int err; - sk = __udp4_lib_lookup(skb->dev->nd_net, iph->daddr, uh->dest, + sk = __udp4_lib_lookup(dev_net(skb->dev), iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, udptable); if (sk == NULL) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); @@ -614,7 +607,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(msg, &ipc); + err = ip_cmsg_send(sock_net(sk), msg, &ipc); if (err) return err; if (ipc.opt) @@ -663,7 +656,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { .sport = inet->sport, .dport = dport } } }; security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); + err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); @@ -1188,7 +1181,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); - sk = __udp4_lib_lookup(skb->dev->nd_net, saddr, uh->source, daddr, + sk = __udp4_lib_lookup(dev_net(skb->dev), saddr, uh->source, daddr, uh->dest, inet_iif(skb), udptable); if (sk != NULL) { @@ -1228,7 +1221,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", + LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From " NIPQUAD_FMT ":%u %d/%d to " NIPQUAD_FMT ":%u\n", proto == IPPROTO_UDPLITE ? "-Lite" : "", NIPQUAD(saddr), ntohs(uh->source), @@ -1243,7 +1236,7 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From " NIPQUAD_FMT ":%u to " NIPQUAD_FMT ":%u ulen %d\n", proto == IPPROTO_UDPLITE ? "-Lite" : "", NIPQUAD(saddr), ntohs(uh->source), @@ -1474,8 +1467,6 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } -DEFINE_PROTO_INUSE(udp) - struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -1498,11 +1489,11 @@ struct proto udp_prot = { .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), + .h.udp_hash = udp_hash, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif - REF_PROTO_INUSE(udp) }; /* ------------------------------------------------------------------------ */ @@ -1512,10 +1503,13 @@ static struct sock *udp_get_first(struct seq_file *seq) { struct sock *sk; struct udp_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { struct hlist_node *node; sk_for_each(sk, node, state->hashtable + state->bucket) { + if (!net_eq(sock_net(sk), net)) + continue; if (sk->sk_family == state->family) goto found; } @@ -1528,12 +1522,13 @@ found: static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); do { sk = sk_next(sk); try_again: ; - } while (sk && sk->sk_family != state->family); + } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { sk = sk_head(state->hashtable + state->bucket); @@ -1581,47 +1576,36 @@ static void udp_seq_stop(struct seq_file *seq, void *v) static int udp_seq_open(struct inode *inode, struct file *file) { struct udp_seq_afinfo *afinfo = PDE(inode)->data; - struct seq_file *seq; - int rc = -ENOMEM; - struct udp_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); + struct udp_iter_state *s; + int err; - if (!s) - goto out; + err = seq_open_net(inode, file, &afinfo->seq_ops, + sizeof(struct udp_iter_state)); + if (err < 0) + return err; + + s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; s->hashtable = afinfo->hashtable; - s->seq_ops.start = udp_seq_start; - s->seq_ops.next = udp_seq_next; - s->seq_ops.show = afinfo->seq_show; - s->seq_ops.stop = udp_seq_stop; - - rc = seq_open(file, &s->seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return err; } /* ------------------------------------------------------------------------ */ -int udp_proc_register(struct udp_seq_afinfo *afinfo) +int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) { struct proc_dir_entry *p; int rc = 0; - if (!afinfo) - return -EINVAL; - afinfo->seq_fops->owner = afinfo->owner; - afinfo->seq_fops->open = udp_seq_open; - afinfo->seq_fops->read = seq_read; - afinfo->seq_fops->llseek = seq_lseek; - afinfo->seq_fops->release = seq_release_private; + afinfo->seq_fops.open = udp_seq_open; + afinfo->seq_fops.read = seq_read; + afinfo->seq_fops.llseek = seq_lseek; + afinfo->seq_fops.release = seq_release_net; + + afinfo->seq_ops.start = udp_seq_start; + afinfo->seq_ops.next = udp_seq_next; + afinfo->seq_ops.stop = udp_seq_stop; - p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_net_fops_create(net, afinfo->name, S_IRUGO, &afinfo->seq_fops); if (p) p->data = afinfo; else @@ -1629,12 +1613,9 @@ int udp_proc_register(struct udp_seq_afinfo *afinfo) return rc; } -void udp_proc_unregister(struct udp_seq_afinfo *afinfo) +void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) { - if (!afinfo) - return; - proc_net_remove(&init_net, afinfo->name); - memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); + proc_net_remove(net, afinfo->name); } /* ------------------------------------------------------------------------ */ @@ -1673,24 +1654,41 @@ int udp4_seq_show(struct seq_file *seq, void *v) } /* ------------------------------------------------------------------------ */ -static struct file_operations udp4_seq_fops; static struct udp_seq_afinfo udp4_seq_afinfo = { - .owner = THIS_MODULE, .name = "udp", .family = AF_INET, .hashtable = udp_hash, - .seq_show = udp4_seq_show, - .seq_fops = &udp4_seq_fops, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = udp4_seq_show, + }, +}; + +static int udp4_proc_init_net(struct net *net) +{ + return udp_proc_register(net, &udp4_seq_afinfo); +} + +static void udp4_proc_exit_net(struct net *net) +{ + udp_proc_unregister(net, &udp4_seq_afinfo); +} + +static struct pernet_operations udp4_net_ops = { + .init = udp4_proc_init_net, + .exit = udp4_proc_exit_net, }; int __init udp4_proc_init(void) { - return udp_proc_register(&udp4_seq_afinfo); + return register_pernet_subsys(&udp4_net_ops); } void udp4_proc_exit(void) { - udp_proc_unregister(&udp4_seq_afinfo); + unregister_pernet_subsys(&udp4_net_ops); } #endif /* CONFIG_PROC_FS */ @@ -1717,12 +1715,12 @@ EXPORT_SYMBOL(udp_disconnect); EXPORT_SYMBOL(udp_hash); EXPORT_SYMBOL(udp_hash_lock); EXPORT_SYMBOL(udp_ioctl); -EXPORT_SYMBOL(udp_get_port); EXPORT_SYMBOL(udp_prot); EXPORT_SYMBOL(udp_sendmsg); EXPORT_SYMBOL(udp_lib_getsockopt); EXPORT_SYMBOL(udp_lib_setsockopt); EXPORT_SYMBOL(udp_poll); +EXPORT_SYMBOL(udp_lib_get_port); #ifdef CONFIG_PROC_FS EXPORT_SYMBOL(udp_proc_register); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 6c55828e41b..7288bf7977f 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -8,11 +8,7 @@ extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int ); extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []); -extern int __udp_lib_get_port(struct sock *sk, unsigned short snum, - struct hlist_head udptable[], - int (*)(const struct sock*,const struct sock*)); -extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *); - +extern int udp_v4_get_port(struct sock *sk, unsigned short snum); extern int udp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 001b881ca36..72ce26b6c4d 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -17,17 +17,6 @@ DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics) __read_mostly; struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; -int udplite_get_port(struct sock *sk, unsigned short p, - int (*c)(const struct sock *, const struct sock *)) -{ - return __udp_lib_get_port(sk, p, udplite_hash, c); -} - -static int udplite_v4_get_port(struct sock *sk, unsigned short snum) -{ - return udplite_get_port(sk, snum, ipv4_rcv_saddr_equal); -} - static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); @@ -42,10 +31,9 @@ static struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, + .netns_ok = 1, }; -DEFINE_PROTO_INUSE(udplite) - struct proto udplite_prot = { .name = "UDP-Lite", .owner = THIS_MODULE, @@ -63,13 +51,13 @@ struct proto udplite_prot = { .backlog_rcv = udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, - .get_port = udplite_v4_get_port, + .get_port = udp_v4_get_port, .obj_size = sizeof(struct udp_sock), + .h.udp_hash = udplite_hash, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif - REF_PROTO_INUSE(udplite) }; static struct inet_protosw udplite4_protosw = { @@ -83,15 +71,42 @@ static struct inet_protosw udplite4_protosw = { }; #ifdef CONFIG_PROC_FS -static struct file_operations udplite4_seq_fops; static struct udp_seq_afinfo udplite4_seq_afinfo = { - .owner = THIS_MODULE, .name = "udplite", .family = AF_INET, .hashtable = udplite_hash, - .seq_show = udp4_seq_show, - .seq_fops = &udplite4_seq_fops, + .seq_fops = { + .owner = THIS_MODULE, + }, + .seq_ops = { + .show = udp4_seq_show, + }, +}; + +static int udplite4_proc_init_net(struct net *net) +{ + return udp_proc_register(net, &udplite4_seq_afinfo); +} + +static void udplite4_proc_exit_net(struct net *net) +{ + udp_proc_unregister(net, &udplite4_seq_afinfo); +} + +static struct pernet_operations udplite4_net_ops = { + .init = udplite4_proc_init_net, + .exit = udplite4_proc_exit_net, }; + +static __init int udplite4_proc_init(void) +{ + return register_pernet_subsys(&udplite4_net_ops); +} +#else +static inline int udplite4_proc_init(void) +{ + return 0; +} #endif void __init udplite4_register(void) @@ -104,18 +119,15 @@ void __init udplite4_register(void) inet_register_protosw(&udplite4_protosw); -#ifdef CONFIG_PROC_FS - if (udp_proc_register(&udplite4_seq_afinfo)) /* udplite4_proc_init() */ - printk(KERN_ERR "%s: Cannot register /proc!\n", __FUNCTION__); -#endif + if (udplite4_proc_init()) + printk(KERN_ERR "%s: Cannot register /proc!\n", __func__); return; out_unregister_proto: proto_unregister(&udplite_prot); out_register_err: - printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __FUNCTION__); + printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); } EXPORT_SYMBOL(udplite_hash); EXPORT_SYMBOL(udplite_prot); -EXPORT_SYMBOL(udplite_get_port); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 10ed7049143..c63de0a72ab 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -221,7 +221,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt.idev->dev == dev) { struct in_device *loopback_idev = - in_dev_get(dev->nd_net->loopback_dev); + in_dev_get(dev_net(dev)->loopback_dev); BUG_ON(!loopback_idev); do { |