diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 14:54:29 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-05 14:54:29 -0700 |
commit | cc998ff8811530be521f6b316f37ab7676a07938 (patch) | |
tree | a054b3bf4b2ef406bf756a6cfc9be2f9115f17ae /net/ipv6 | |
parent | 57d730924d5cc2c3e280af16a9306587c3a511db (diff) | |
parent | 0d40f75bdab241868c0eb6f97aef9f8b3a66f7b3 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:
"Noteworthy changes this time around:
1) Multicast rejoin support for team driver, from Jiri Pirko.
2) Centralize and simplify TCP RTT measurement handling in order to
reduce the impact of bad RTO seeding from SYN/ACKs. Also, when
both timestamps and local RTT measurements are available prefer
the later because there are broken middleware devices which
scramble the timestamp.
From Yuchung Cheng.
3) Add TCP_NOTSENT_LOWAT socket option to limit the amount of kernel
memory consumed to queue up unsend user data. From Eric Dumazet.
4) Add a "physical port ID" abstraction for network devices, from
Jiri Pirko.
5) Add a "suppress" operation to influence fib_rules lookups, from
Stefan Tomanek.
6) Add a networking development FAQ, from Paul Gortmaker.
7) Extend the information provided by tcp_probe and add ipv6 support,
from Daniel Borkmann.
8) Use RCU locking more extensively in openvswitch data paths, from
Pravin B Shelar.
9) Add SCTP support to openvswitch, from Joe Stringer.
10) Add EF10 chip support to SFC driver, from Ben Hutchings.
11) Add new SYNPROXY netfilter target, from Patrick McHardy.
12) Compute a rate approximation for sending in TCP sockets, and use
this to more intelligently coalesce TSO frames. Furthermore, add
a new packet scheduler which takes advantage of this estimate when
available. From Eric Dumazet.
13) Allow AF_PACKET fanouts with random selection, from Daniel
Borkmann.
14) Add ipv6 support to vxlan driver, from Cong Wang"
Resolved conflicts as per discussion.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1218 commits)
openvswitch: Fix alignment of struct sw_flow_key.
netfilter: Fix build errors with xt_socket.c
tcp: Add missing braces to do_tcp_setsockopt
caif: Add missing braces to multiline if in cfctrl_linkup_request
bnx2x: Add missing braces in bnx2x:bnx2x_link_initialize
vxlan: Fix kernel panic on device delete.
net: mvneta: implement ->ndo_do_ioctl() to support PHY ioctls
net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
icplus: Use netif_running to determine device state
ethernet/arc/arc_emac: Fix huge delays in large file copies
tuntap: orphan frags before trying to set tx timestamp
tuntap: purge socket error queue on detach
qlcnic: use standard NAPI weights
ipv6:introduce function to find route for redirect
bnx2x: VF RSS support - VF side
bnx2x: VF RSS support - PF side
vxlan: Notify drivers for listening UDP port changes
net: usbnet: update addr_assign_type if appropriate
driver/net: enic: update enic maintainers and driver
driver/net: enic: Exposing symbols for Cisco's low latency driver
...
Diffstat (limited to 'net/ipv6')
33 files changed, 1280 insertions, 384 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 498ea99194a..d6ff12617f3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -99,9 +99,9 @@ #define ACONF_DEBUG 2 #if ACONF_DEBUG >= 3 -#define ADBG(x) printk x +#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define ADBG(x) +#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0) #endif #define INFINITY_LIFE_TIME 0xFFFFFFFF @@ -177,6 +177,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_redirects = 1, .autoconf = 1, .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, @@ -202,6 +204,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -211,6 +214,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra = 1, .accept_redirects = 1, .autoconf = 1, + .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, @@ -236,17 +242,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; -/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ -const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; -const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; -const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; -const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; -const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; -const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; -const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; - /* Check if a valid qdisc is available */ static inline bool addrconf_qdisc_ok(const struct net_device *dev) { @@ -306,36 +304,6 @@ err_ip: return -ENOMEM; } -static void snmp6_free_dev(struct inet6_dev *idev) -{ - kfree(idev->stats.icmpv6msgdev); - kfree(idev->stats.icmpv6dev); - snmp_mib_free((void __percpu **)idev->stats.ipv6); -} - -/* Nobody refers to this device, we may destroy it. */ - -void in6_dev_finish_destroy(struct inet6_dev *idev) -{ - struct net_device *dev = idev->dev; - - WARN_ON(!list_empty(&idev->addr_list)); - WARN_ON(idev->mc_list != NULL); - WARN_ON(timer_pending(&idev->rs_timer)); - -#ifdef NET_REFCNT_DEBUG - pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); -#endif - dev_put(dev); - if (!idev->dead) { - pr_warn("Freeing alive inet6 device %p\n", idev); - return; - } - snmp6_free_dev(idev); - kfree_rcu(idev, rcu); -} -EXPORT_SYMBOL(in6_dev_finish_destroy); - static struct inet6_dev *ipv6_add_dev(struct net_device *dev) { struct inet6_dev *ndev; @@ -369,9 +337,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) dev_hold(dev); if (snmp6_alloc_dev(ndev) < 0) { - ADBG((KERN_WARNING + ADBG(KERN_WARNING "%s: cannot allocate memory for statistics; dev=%s.\n", - __func__, dev->name)); + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); dev_put(dev); kfree(ndev); @@ -379,9 +347,9 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) } if (snmp6_register_dev(ndev) < 0) { - ADBG((KERN_WARNING + ADBG(KERN_WARNING "%s: cannot create /proc/net/dev_snmp6/%s\n", - __func__, dev->name)); + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); ndev->dead = 1; in6_dev_finish_destroy(ndev); @@ -844,7 +812,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, /* Ignore adding duplicate addresses on an interface */ if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { - ADBG(("ipv6_add_addr: already assigned\n")); + ADBG("ipv6_add_addr: already assigned\n"); err = -EEXIST; goto out; } @@ -852,7 +820,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); if (ifa == NULL) { - ADBG(("ipv6_add_addr: malloc failed\n")); + ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; goto out; } @@ -1054,7 +1022,6 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i unsigned long regen_advance; int tmp_plen; int ret = 0; - int max_addresses; u32 addr_flags; unsigned long now = jiffies; @@ -1100,7 +1067,6 @@ retry: idev->cnf.temp_prefered_lft + age - idev->cnf.max_desync_factor); tmp_plen = ifp->prefix_len; - max_addresses = idev->cnf.max_addresses; tmp_tstamp = ifp->tstamp; spin_unlock_bh(&ifp->lock); @@ -1807,6 +1773,16 @@ static int addrconf_ifid_gre(u8 *eui, struct net_device *dev) return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); } +static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev) +{ + memcpy(eui, dev->perm_addr, 3); + memcpy(eui + 5, dev->perm_addr + 3, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; +} + static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) { switch (dev->type) { @@ -1825,6 +1801,8 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) return addrconf_ifid_eui64(eui, dev); case ARPHRD_IEEE1394: return addrconf_ifid_ieee1394(eui, dev); + case ARPHRD_TUNNEL6: + return addrconf_ifid_ip6tnl(eui, dev); } return -1; } @@ -2050,7 +2028,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) pinfo = (struct prefix_info *) opt; if (len < sizeof(struct prefix_info)) { - ADBG(("addrconf: prefix option too short\n")); + ADBG("addrconf: prefix option too short\n"); return; } @@ -2702,7 +2680,8 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_ARCNET) && (dev->type != ARPHRD_INFINIBAND) && (dev->type != ARPHRD_IEEE802154) && - (dev->type != ARPHRD_IEEE1394)) { + (dev->type != ARPHRD_IEEE1394) && + (dev->type != ARPHRD_TUNNEL6)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } @@ -2788,44 +2767,6 @@ ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) return -1; } -static void ip6_tnl_add_linklocal(struct inet6_dev *idev) -{ - struct net_device *link_dev; - struct net *net = dev_net(idev->dev); - - /* first try to inherit the link-local address from the link device */ - if (idev->dev->iflink && - (link_dev = __dev_get_by_index(net, idev->dev->iflink))) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - /* then try to inherit it from any device */ - for_each_netdev(net, link_dev) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - pr_debug("init ip6-ip6: add_linklocal failed\n"); -} - -/* - * Autoconfigure tunnel with a link-local address so routing protocols, - * DHCPv6, MLD etc. can be run over the virtual link - */ - -static void addrconf_ip6_tnl_config(struct net_device *dev) -{ - struct inet6_dev *idev; - - ASSERT_RTNL(); - - idev = addrconf_add_dev(dev); - if (IS_ERR(idev)) { - pr_debug("init ip6-ip6: add_dev failed\n"); - return; - } - ip6_tnl_add_linklocal(idev); -} - static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -2893,9 +2834,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, addrconf_gre_config(dev); break; #endif - case ARPHRD_TUNNEL6: - addrconf_ip6_tnl_config(dev); - break; case ARPHRD_LOOPBACK: init_loopback(dev); break; @@ -3120,6 +3058,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) static void addrconf_rs_timer(unsigned long data) { struct inet6_dev *idev = (struct inet6_dev *)data; + struct net_device *dev = idev->dev; struct in6_addr lladdr; write_lock(&idev->lock); @@ -3134,12 +3073,14 @@ static void addrconf_rs_timer(unsigned long data) goto out; if (idev->rs_probes++ < idev->cnf.rtr_solicits) { - if (!__ipv6_get_lladdr(idev, &lladdr, IFA_F_TENTATIVE)) - ndisc_send_rs(idev->dev, &lladdr, + write_unlock(&idev->lock); + if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) + ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters); else - goto out; + goto put; + write_lock(&idev->lock); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == idev->cnf.rtr_solicits) ? @@ -3155,6 +3096,7 @@ static void addrconf_rs_timer(unsigned long data) out: write_unlock(&idev->lock); +put: in6_dev_put(idev); } @@ -3630,8 +3572,8 @@ restart: if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; - ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", - now, next, next_sec, next_sched)); + ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched); addr_chk_timer.expires = next_sched; add_timer(&addr_chk_timer); @@ -4177,6 +4119,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; + array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval); + array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval); #ifdef CONFIG_IPV6_PRIVACY array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; @@ -4207,6 +4153,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; + array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; } static inline size_t inet6_ifla6_size(void) @@ -4652,6 +4599,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) break; } atomic_inc(&net->ipv6.dev_addr_genid); + rt_genid_bump_ipv6(net); } static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -4859,6 +4807,22 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "mldv1_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv1_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "mldv2_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv2_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, #ifdef CONFIG_IPV6_PRIVACY { .procname = "use_tempaddr", @@ -5004,6 +4968,13 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec }, { + .procname = "suppress_frag_ndisc", + .data = &ipv6_devconf.suppress_frag_ndisc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { /* sentinel */ } }, diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index d2f87427244..4c11cbcf830 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -6,6 +6,7 @@ #include <linux/export.h> #include <net/ipv6.h> #include <net/addrconf.h> +#include <net/ip.h> #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) @@ -98,3 +99,52 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v) return atomic_notifier_call_chain(&inet6addr_chain, val, v); } EXPORT_SYMBOL(inet6addr_notifier_call_chain); + +const struct ipv6_stub *ipv6_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_stub); + +/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ +const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; +EXPORT_SYMBOL(in6addr_loopback); +const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +EXPORT_SYMBOL(in6addr_any); +const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +EXPORT_SYMBOL(in6addr_linklocal_allnodes); +const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_linklocal_allrouters); +const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; +EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); +const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); +const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_sitelocal_allrouters); + +static void snmp6_free_dev(struct inet6_dev *idev) +{ + kfree(idev->stats.icmpv6msgdev); + kfree(idev->stats.icmpv6dev); + snmp_mib_free((void __percpu **)idev->stats.ipv6); +} + +/* Nobody refers to this device, we may destroy it. */ + +void in6_dev_finish_destroy(struct inet6_dev *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(!list_empty(&idev->addr_list)); + WARN_ON(idev->mc_list != NULL); + WARN_ON(timer_pending(&idev->rs_timer)); + +#ifdef NET_REFCNT_DEBUG + pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) { + pr_warn("Freeing alive inet6 device %p\n", idev); + return; + } + snmp6_free_dev(idev); + kfree_rcu(idev, rcu); +} +EXPORT_SYMBOL(in6_dev_finish_destroy); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index f083a583a05..b30ad3741b4 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -251,38 +251,36 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, /* add a label */ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) { + struct hlist_node *n; + struct ip6addrlbl_entry *last = NULL, *p = NULL; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", - __func__, - newp, replace); + ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, + replace); - if (hlist_empty(&ip6addrlbl_table.head)) { - hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); - } else { - struct hlist_node *n; - struct ip6addrlbl_entry *p = NULL; - hlist_for_each_entry_safe(p, n, - &ip6addrlbl_table.head, list) { - if (p->prefixlen == newp->prefixlen && - net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && - p->ifindex == newp->ifindex && - ipv6_addr_equal(&p->prefix, &newp->prefix)) { - if (!replace) { - ret = -EEXIST; - goto out; - } - hlist_replace_rcu(&p->list, &newp->list); - ip6addrlbl_put(p); - goto out; - } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || - (p->prefixlen < newp->prefixlen)) { - hlist_add_before_rcu(&newp->list, &p->list); + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + if (p->prefixlen == newp->prefixlen && + net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && + p->ifindex == newp->ifindex && + ipv6_addr_equal(&p->prefix, &newp->prefix)) { + if (!replace) { + ret = -EEXIST; goto out; } + hlist_replace_rcu(&p->list, &newp->list); + ip6addrlbl_put(p); + goto out; + } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || + (p->prefixlen < newp->prefixlen)) { + hlist_add_before_rcu(&newp->list, &p->list); + goto out; } - hlist_add_after_rcu(&p->list, &newp->list); + last = p; } + if (last) + hlist_add_after_rcu(&last->list, &newp->list); + else + hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: if (!ret) ip6addrlbl_table.seq++; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a5ac969aeef..136fe55c1a4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -56,6 +56,7 @@ #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> +#include <net/ndisc.h> #ifdef CONFIG_IPV6_TUNNEL #include <net/ip6_tunnel.h> #endif @@ -766,6 +767,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; + atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); if (err) @@ -809,6 +811,15 @@ static struct pernet_operations inet6_net_ops = { .exit = inet6_net_exit, }; +static const struct ipv6_stub ipv6_stub_impl = { + .ipv6_sock_mc_join = ipv6_sock_mc_join, + .ipv6_sock_mc_drop = ipv6_sock_mc_drop, + .ipv6_dst_lookup = ip6_dst_lookup, + .udpv6_encap_enable = udpv6_encap_enable, + .ndisc_send_na = ndisc_send_na, + .nd_tbl = &nd_tbl, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -883,6 +894,9 @@ static int __init inet6_init(void) err = igmp6_init(); if (err) goto igmp_fail; + + ipv6_stub = &ipv6_stub_impl; + err = ipv6_netfilter_init(); if (err) goto netfilter_fail; @@ -1039,6 +1053,7 @@ static void __exit inet6_exit(void) raw6_proc_exit(); #endif ipv6_netfilter_fini(); + ipv6_stub = NULL; igmp6_cleanup(); ndisc_cleanup(); ip6_mr_cleanup(); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index bb02e176cb7..73784c3d464 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -628,7 +628,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, 0, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0); else ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 197e6f4a2b7..48b6bd2a9a1 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -890,7 +890,7 @@ void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, src = &np->rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index aeac0dc3635..d3618a78fca 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -447,7 +447,7 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, 0, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0); else ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2e1a432867c..a6c58ce43d3 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -55,26 +55,33 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, struct fib6_table *table; struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; + int err = 0; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: + err = -ENETUNREACH; rt = net->ipv6.ip6_null_entry; goto discard_pkt; default: case FR_ACT_BLACKHOLE: + err = -EINVAL; rt = net->ipv6.ip6_blk_hole_entry; goto discard_pkt; case FR_ACT_PROHIBIT: + err = -EACCES; rt = net->ipv6.ip6_prohibit_entry; goto discard_pkt; } table = fib6_get_table(net, rule->table); - if (table) - rt = lookup(net, table, flp6, flags); + if (!table) { + err = -EAGAIN; + goto out; + } + rt = lookup(net, table, flp6, flags); if (rt != net->ipv6.ip6_null_entry) { struct fib6_rule *r = (struct fib6_rule *)rule; @@ -101,6 +108,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, } again: ip6_rt_put(rt); + err = -EAGAIN; rt = NULL; goto out; @@ -108,9 +116,31 @@ discard_pkt: dst_hold(&rt->dst); out: arg->result = rt; - return rt == NULL ? -EAGAIN : 0; + return err; } +static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + struct rt6_info *rt = (struct rt6_info *) arg->result; + struct net_device *dev = rt->rt6i_idev->dev; + /* do not accept result if the route does + * not meet the required prefix length + */ + if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + + return false; + +suppress_route: + ip6_rt_put(rt); + return true; +} static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { @@ -244,6 +274,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, + .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7cfc8d28487..eef8d945b36 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, 0, 0); else if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, 0, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0); if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) @@ -940,6 +940,14 @@ static const struct icmp6_err { .err = ECONNREFUSED, .fatal = 1, }, + { /* POLICY_FAIL */ + .err = EACCES, + .fatal = 1, + }, + { /* REJECT_ROUTE */ + .err = EACCES, + .fatal = 1, + }, }; int icmpv6_err_convert(u8 type, u8 code, int *err) @@ -951,7 +959,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) switch (type) { case ICMPV6_DEST_UNREACH: fatal = 1; - if (code <= ICMPV6_PORT_UNREACH) { + if (code < ARRAY_SIZE(tab_unreach)) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c4ff5bbb45c..73db48eba1c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -425,8 +425,8 @@ out: * node. */ -static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, - int addrlen, int plen, +static struct fib6_node *fib6_add_1(struct fib6_node *root, + struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required) { @@ -543,7 +543,7 @@ insert_above: but if it is >= plen, the value is ignored in any case. */ - bit = __ipv6_addr_diff(addr, &key->addr, addrlen); + bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] @@ -822,9 +822,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), - allow_create, replace_required); + fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + offsetof(struct rt6_info, rt6i_dst), allow_create, + replace_required); if (IS_ERR(fn)) { err = PTR_ERR(fn); @@ -863,7 +863,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, + rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required); @@ -882,7 +882,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) fn->subtree = sfn; } else { sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, + rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), allow_create, replace_required); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 90747f1973f..6b26e9feafb 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -335,6 +335,7 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, dev->rtnl_link_ops = &ip6gre_link_ops; nt->dev = dev; + nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, 1); if (register_netdevice(dev) < 0) @@ -508,8 +509,6 @@ static int ip6gre_rcv(struct sk_buff *skb) goto drop; } - secpath_reset(skb); - skb->protocol = gre_proto; /* WCCP version 1 and 2 protocol decoding. * - Change protocol to IP @@ -524,7 +523,6 @@ static int ip6gre_rcv(struct sk_buff *skb) skb->mac_header = skb->network_header; __pskb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); - skb->pkt_type = PACKET_HOST; if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { @@ -556,7 +554,7 @@ static int ip6gre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } - __skb_tunnel_rx(skb, tunnel->dev); + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); skb_reset_network_header(skb); @@ -693,6 +691,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, tunnel->err_count = 0; } + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || @@ -709,8 +709,6 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - skb_dst_drop(skb); - if (fl6->flowi6_mark) { skb_dst_set(skb, dst); ndst = NULL; @@ -1260,6 +1258,7 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); @@ -1280,6 +1279,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) struct ip6_tnl *tunnel = netdev_priv(dev); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; @@ -1455,6 +1455,7 @@ static int ip6gre_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ip6gre_tnl_link_config(tunnel, 1); @@ -1506,6 +1507,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); nt->dev = dev; + nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); /* Can use a lockless transmit, unless we generate output sequences */ diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 2bab2aa5974..302d6fb1ff2 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -44,7 +44,7 @@ #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> - +#include <net/inet_ecn.h> int ip6_rcv_finish(struct sk_buff *skb) @@ -109,6 +109,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + IP6_ADD_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_NOECTPKTS + + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * A packet received on an interface with a destination address diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index a263b990ee1..d82de722810 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -91,6 +91,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, unsigned int unfrag_ip6hlen; u8 *prevhdr; int offset = 0; + bool tunnel; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_UDP | @@ -106,6 +107,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; + tunnel = skb->encapsulation; ipv6h = ipv6_hdr(skb); __skb_pull(skb, sizeof(*ipv6h)); segs = ERR_PTR(-EPROTONOSUPPORT); @@ -126,7 +128,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, ipv6h = ipv6_hdr(skb); ipv6h->payload_len = htons(skb->len - skb->mac_len - sizeof(*ipv6h)); - if (proto == IPPROTO_UDP) { + if (!tunnel && proto == IPPROTO_UDP) { unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e7ceb6c871d..3a692d52916 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,31 +56,6 @@ #include <net/checksum.h> #include <linux/mroute6.h> -int __ip6_local_out(struct sk_buff *skb) -{ - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; - ipv6_hdr(skb)->payload_len = htons(len); - - return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, - skb_dst(skb)->dev, dst_output); -} - -int ip6_local_out(struct sk_buff *skb) -{ - int err; - - err = __ip6_local_out(skb); - if (likely(err == 1)) - err = dst_output(skb); - - return err; -} -EXPORT_SYMBOL_GPL(ip6_local_out); - static int ip6_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 46ba243605a..61355f7f4da 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -41,6 +41,7 @@ #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> +#include <linux/etherdevice.h> #include <asm/uaccess.h> #include <linux/atomic.h> @@ -315,6 +316,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) t = netdev_priv(dev); t->parms = *p; + t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; @@ -374,7 +376,7 @@ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) @@ -741,7 +743,7 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; - struct net *net = dev_net(t->dev); + struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && @@ -800,14 +802,12 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, rcu_read_unlock(); goto discard; } - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->protocol = htons(protocol); - skb->pkt_type = PACKET_HOST; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - __skb_tunnel_rx(skb, t->dev); + __skb_tunnel_rx(skb, t->dev, t->net); err = dscp_ecn_decapsulate(t, ipv6h, skb); if (unlikely(err)) { @@ -895,7 +895,7 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; - struct net *net = dev_net(t->dev); + struct net *net = t->net; if (p->flags & IP6_TNL_F_CAP_XMIT) { struct net_device *ldev = NULL; @@ -945,8 +945,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, int encap_limit, __u32 *pmtu) { - struct net *net = dev_net(dev); struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct net_device_stats *stats = &t->dev->stats; struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct ipv6_tel_txoption opt; @@ -996,6 +996,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, goto tx_err_dst_release; } + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + /* * Okay, now see if we can stuff it in the buffer as-is. */ @@ -1013,7 +1015,6 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - skb_dst_drop(skb); if (fl6->flowi6_mark) { skb_dst_set(skb, dst); ndst = NULL; @@ -1208,7 +1209,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); - struct rt6_info *rt = rt6_lookup(dev_net(dev), + struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, strict); @@ -1257,7 +1258,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { - struct net *net = dev_net(t->dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); int err; @@ -1469,8 +1470,10 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->mtu-=8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + /* This perm addr will be used as interface identifier by IPv6 */ + dev->addr_assign_type = NET_ADDR_RANDOM; + eth_random_addr(dev->perm_addr); } @@ -1485,6 +1488,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; + t->net = dev_net(dev); dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; @@ -1602,9 +1606,9 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct ip6_tnl *t; + struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; - struct net *net = dev_net(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) @@ -1705,14 +1709,24 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n) { + struct net *net = dev_net(ip6n->fb_tnl_dev); + struct net_device *dev, *aux; int h; struct ip6_tnl *t; LIST_HEAD(list); + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &ip6_link_ops) + unregister_netdevice_queue(dev, &list); + for (h = 0; h < HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t != NULL) { - unregister_netdevice_queue(t->dev, &list); + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, &list); t = rtnl_dereference(t->next); } } @@ -1738,6 +1752,10 @@ static int __net_init ip6_tnl_init_net(struct net *net) if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 03986d31fa4..f365310bfcc 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -110,8 +110,8 @@ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr6_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr6_table *mrt); -static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache); +static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, @@ -672,9 +672,8 @@ static int pim6_rcv(struct sk_buff *skb) skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; - skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, reg_dev); + skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); @@ -2074,8 +2073,8 @@ static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev) return ct; } -static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache) +static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache) { int psend = -1; int vif, ct; @@ -2156,12 +2155,11 @@ forward: last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, cache, psend); - return 0; + return; } dont_forward: kfree_skb(skb); - return 0; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 7af5aee75d9..5636a912074 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -76,7 +76,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, 0, 0); + ip6_redirect(skb, net, skb->dev->ifindex, 0); else ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 99cd65c715c..096cd67b737 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -44,6 +44,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/pkt_sched.h> #include <net/mld.h> #include <linux/netfilter.h> @@ -94,6 +95,7 @@ static void mld_ifc_event(struct inet6_dev *idev); static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); static void mld_clear_delrec(struct inet6_dev *idev); +static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); @@ -106,14 +108,15 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); - -#define IGMP6_UNSOLICITED_IVAL (10*HZ) #define MLD_QRV_DEFAULT 2 +/* RFC3810, 9.2. Query Interval */ +#define MLD_QI_DEFAULT (125 * HZ) +/* RFC3810, 9.3. Query Response Interval */ +#define MLD_QRI_DEFAULT (10 * HZ) -#define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \ - (idev)->cnf.force_mld_version == 1 || \ - ((idev)->mc_v1_seen && \ - time_before(jiffies, (idev)->mc_v1_seen))) +/* RFC3810, 8.1 Query Version Distinctions */ +#define MLD_V1_QUERY_LEN 24 +#define MLD_V2_QUERY_LEN_MIN 28 #define IPV6_MLD_MAX_MSF 64 @@ -128,6 +131,18 @@ int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; pmc != NULL; \ pmc = rcu_dereference(pmc->next)) +static int unsolicited_report_interval(struct inet6_dev *idev) +{ + int iv; + + if (mld_in_v1_mode(idev)) + iv = idev->cnf.mldv1_unsolicited_report_interval; + else + iv = idev->cnf.mldv2_unsolicited_report_interval; + + return iv > 0 ? iv : 1; +} + int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct net_device *dev = NULL; @@ -676,7 +691,7 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; - if (MLD_V1_SEEN(mc->idev)) { + if (mld_in_v1_mode(mc->idev)) { igmp6_join_group(mc); return; } @@ -984,29 +999,49 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, static void mld_gq_start_timer(struct inet6_dev *idev) { - int tv = net_random() % idev->mc_maxdelay; + unsigned long tv = net_random() % idev->mc_maxdelay; idev->mc_gq_running = 1; if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) in6_dev_hold(idev); } -static void mld_ifc_start_timer(struct inet6_dev *idev, int delay) +static void mld_gq_stop_timer(struct inet6_dev *idev) { - int tv = net_random() % delay; + idev->mc_gq_running = 0; + if (del_timer(&idev->mc_gq_timer)) + __in6_dev_put(idev); +} + +static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay) +{ + unsigned long tv = net_random() % delay; if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) in6_dev_hold(idev); } -static void mld_dad_start_timer(struct inet6_dev *idev, int delay) +static void mld_ifc_stop_timer(struct inet6_dev *idev) { - int tv = net_random() % delay; + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); +} + +static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay) +{ + unsigned long tv = net_random() % delay; if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2)) in6_dev_hold(idev); } +static void mld_dad_stop_timer(struct inet6_dev *idev) +{ + if (del_timer(&idev->mc_dad_timer)) + __in6_dev_put(idev); +} + /* * IGMP handling (alias multicast ICMPv6 messages) */ @@ -1025,12 +1060,9 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) delay = ma->mca_timer.expires - jiffies; } - if (delay >= resptime) { - if (resptime) - delay = net_random() % resptime; - else - delay = 1; - } + if (delay >= resptime) + delay = net_random() % resptime; + ma->mca_timer.expires = jiffies + delay; if (!mod_timer(&ma->mca_timer, jiffies + delay)) atomic_inc(&ma->mca_refcnt); @@ -1097,6 +1129,158 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, return true; } +static int mld_force_mld_version(const struct inet6_dev *idev) +{ + /* Normally, both are 0 here. If enforcement to a particular is + * being used, individual device enforcement will have a lower + * precedence over 'all' device (.../conf/all/force_mld_version). + */ + + if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0) + return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version; + else + return idev->cnf.force_mld_version; +} + +static bool mld_in_v2_mode_only(const struct inet6_dev *idev) +{ + return mld_force_mld_version(idev) == 2; +} + +static bool mld_in_v1_mode_only(const struct inet6_dev *idev) +{ + return mld_force_mld_version(idev) == 1; +} + +static bool mld_in_v1_mode(const struct inet6_dev *idev) +{ + if (mld_in_v2_mode_only(idev)) + return false; + if (mld_in_v1_mode_only(idev)) + return true; + if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen)) + return true; + + return false; +} + +static void mld_set_v1_mode(struct inet6_dev *idev) +{ + /* RFC3810, relevant sections: + * - 9.1. Robustness Variable + * - 9.2. Query Interval + * - 9.3. Query Response Interval + * - 9.12. Older Version Querier Present Timeout + */ + unsigned long switchback; + + switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; + + idev->mc_v1_seen = jiffies + switchback; +} + +static void mld_update_qrv(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.8. QRV (Querier's Robustness Variable) + * - 9.1. Robustness Variable + */ + + /* The value of the Robustness Variable MUST NOT be zero, + * and SHOULD NOT be one. Catch this here if we ever run + * into such a case in future. + */ + WARN_ON(idev->mc_qrv == 0); + + if (mlh2->mld2q_qrv > 0) + idev->mc_qrv = mlh2->mld2q_qrv; + + if (unlikely(idev->mc_qrv < 2)) { + net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", + idev->mc_qrv, MLD_QRV_DEFAULT); + idev->mc_qrv = MLD_QRV_DEFAULT; + } +} + +static void mld_update_qi(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.9. QQIC (Querier's Query Interval Code) + * - 9.2. Query Interval + * - 9.12. Older Version Querier Present Timeout + * (the [Query Interval] in the last Query received) + */ + unsigned long mc_qqi; + + if (mlh2->mld2q_qqic < 128) { + mc_qqi = mlh2->mld2q_qqic; + } else { + unsigned long mc_man, mc_exp; + + mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); + mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); + + mc_qqi = (mc_man | 0x10) << (mc_exp + 3); + } + + idev->mc_qi = mc_qqi * HZ; +} + +static void mld_update_qri(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.3. Maximum Response Code + * - 9.3. Query Response Interval + */ + idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); +} + +static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, + unsigned long *max_delay) +{ + unsigned long mldv1_md; + + /* Ignore v1 queries */ + if (mld_in_v2_mode_only(idev)) + return -EINVAL; + + /* MLDv1 router present */ + mldv1_md = ntohs(mld->mld_maxdelay); + *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); + + mld_set_v1_mode(idev); + + /* cancel MLDv2 report timer */ + mld_gq_stop_timer(idev); + /* cancel the interface change timer */ + mld_ifc_stop_timer(idev); + /* clear deleted report items */ + mld_clear_delrec(idev); + + return 0; +} + +static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, + unsigned long *max_delay) +{ + /* hosts need to stay in MLDv1 mode, discard MLDv2 queries */ + if (mld_in_v1_mode(idev)) + return -EINVAL; + + *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); + + mld_update_qrv(idev, mld); + mld_update_qi(idev, mld); + mld_update_qri(idev, mld); + + idev->mc_maxdelay = *max_delay; + + return 0; +} + /* called with rcu_read_lock() */ int igmp6_event_query(struct sk_buff *skb) { @@ -1108,7 +1292,7 @@ int igmp6_event_query(struct sk_buff *skb) struct mld_msg *mld; int group_type; int mark = 0; - int len; + int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) return -EINVAL; @@ -1122,7 +1306,6 @@ int igmp6_event_query(struct sk_buff *skb) return -EINVAL; idev = __in6_dev_get(skb->dev); - if (idev == NULL) return 0; @@ -1134,35 +1317,23 @@ int igmp6_event_query(struct sk_buff *skb) !(group_type&IPV6_ADDR_MULTICAST)) return -EINVAL; - if (len == 24) { - int switchback; - /* MLDv1 router present */ - - /* Translate milliseconds to jiffies */ - max_delay = (ntohs(mld->mld_maxdelay)*HZ)/1000; - - switchback = (idev->mc_qrv + 1) * max_delay; - idev->mc_v1_seen = jiffies + switchback; - - /* cancel the interface change timer */ - idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) - __in6_dev_put(idev); - /* clear deleted report items */ - mld_clear_delrec(idev); - } else if (len >= 28) { + if (len == MLD_V1_QUERY_LEN) { + err = mld_process_v1(idev, mld, &max_delay); + if (err < 0) + return err; + } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, srcs_offset)) return -EINVAL; mlh2 = (struct mld2_query *)skb_transport_header(skb); - max_delay = (MLDV2_MRC(ntohs(mlh2->mld2q_mrc))*HZ)/1000; - if (!max_delay) - max_delay = 1; - idev->mc_maxdelay = max_delay; - if (mlh2->mld2q_qrv) - idev->mc_qrv = mlh2->mld2q_qrv; + + err = mld_process_v2(idev, mlh2, &max_delay); + if (err < 0) + return err; + if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) return -EINVAL; /* no sources allowed */ @@ -1376,6 +1547,7 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) if (!skb) return NULL; + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { @@ -1769,7 +1941,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) rcu_read_unlock(); return; } - + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { @@ -1827,7 +1999,7 @@ err_out: static void mld_resend_report(struct inet6_dev *idev) { - if (MLD_V1_SEEN(idev)) { + if (mld_in_v1_mode(idev)) { struct ifmcaddr6 *mcaddr; read_lock_bh(&idev->lock); for (mcaddr = idev->mc_list; mcaddr; mcaddr = mcaddr->next) { @@ -1891,7 +2063,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, else pmc->mca_sources = psf->sf_next; if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && - !MLD_V1_SEEN(idev)) { + !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; psf->sf_next = pmc->mca_tomb; pmc->mca_tomb = psf; @@ -2156,7 +2328,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = net_random() % IGMP6_UNSOLICITED_IVAL; + delay = net_random() % unsolicited_report_interval(ma->idev); spin_lock_bh(&ma->mca_lock); if (del_timer(&ma->mca_timer)) { @@ -2191,7 +2363,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, static void igmp6_leave_group(struct ifmcaddr6 *ma) { - if (MLD_V1_SEEN(ma->idev)) { + if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); @@ -2225,7 +2397,7 @@ static void mld_ifc_timer_expire(unsigned long data) static void mld_ifc_event(struct inet6_dev *idev) { - if (MLD_V1_SEEN(idev)) + if (mld_in_v1_mode(idev)) return; idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_timer(idev, 1); @@ -2236,7 +2408,7 @@ static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; - if (MLD_V1_SEEN(ma->idev)) + if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); @@ -2276,14 +2448,9 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Withdraw multicast list */ read_lock_bh(&idev->lock); - idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) - __in6_dev_put(idev); - idev->mc_gq_running = 0; - if (del_timer(&idev->mc_gq_timer)) - __in6_dev_put(idev); - if (del_timer(&idev->mc_dad_timer)) - __in6_dev_put(idev); + mld_ifc_stop_timer(idev); + mld_gq_stop_timer(idev); + mld_dad_stop_timer(idev); for (i = idev->mc_list; i; i=i->next) igmp6_group_dropped(i); @@ -2322,8 +2489,12 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) (unsigned long)idev); setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, (unsigned long)idev); + idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_qi = MLD_QI_DEFAULT; + idev->mc_qri = MLD_QRI_DEFAULT; + + idev->mc_maxdelay = unsolicited_report_interval(idev); idev->mc_v1_seen = 0; write_unlock_bh(&idev->lock); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 04d31c2fbef..12179457b2c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -372,14 +372,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int tlen = dev->needed_tailroom; struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; - int err; - skb = sock_alloc_send_skb(sk, - hlen + sizeof(struct ipv6hdr) + len + tlen, - 1, &err); + skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n", - __func__, err); + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", + __func__); return NULL; } @@ -389,6 +386,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); + /* Manually assign socket ownership as we avoid calling + * sock_alloc_send_pskb() to bypass wmem buffer limits + */ + skb_set_owner_w(skb, sk); + return skb; } @@ -428,7 +430,6 @@ static void ndisc_send_skb(struct sk_buff *skb, type = icmp6h->icmp6_type; if (!dst) { - struct sock *sk = net->ipv6.ndisc_sk; struct flowi6 fl6; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex); @@ -462,10 +463,10 @@ static void ndisc_send_skb(struct sk_buff *skb, rcu_read_unlock(); } -static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, - const struct in6_addr *daddr, - const struct in6_addr *solicited_addr, - bool router, bool solicited, bool override, bool inc_opt) +void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; struct in6_addr tmpaddr; @@ -663,9 +664,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) } ndisc_send_ns(dev, neigh, target, target, saddr); } else if ((probes -= neigh->parms->app_probes) < 0) { -#ifdef CONFIG_ARPD neigh_app_ns(neigh); -#endif } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); @@ -1370,7 +1369,8 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; if (!ndopts.nd_opts_rh) { - ip6_redirect_no_header(skb, dev_net(skb->dev), 0, 0); + ip6_redirect_no_header(skb, dev_net(skb->dev), + skb->dev->ifindex, 0); return; } @@ -1519,10 +1519,27 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) +{ + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return true; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && + idev->cnf.suppress_frag_ndisc) { + net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); + return true; + } + return false; +} + int ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; + if (ndisc_suppress_frag_ndisc(skb)) + return 0; + if (skb_linearize(skb)) return 0; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 4433ab40e7d..a7f842b29b6 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -153,6 +153,19 @@ config IP6_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_MANGLE tristate "Packet mangling" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2d11fcc2cf3..2b53738f798 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o # l3 independent conntrack -obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o @@ -37,3 +37,4 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o +obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 47bff610751..3e4e92d5e15 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -76,7 +76,7 @@ static int masq_device_event(struct notifier_block *this, if (event == NETDEV_DOWN) nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex); + (void *)(long)dev->ifindex, 0, 0); return NOTIFY_DONE; } diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 70f9abc0efe..56eef30ee5f 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -169,7 +169,25 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) nf_ct_attach(nskb, oldskb); - ip6_local_out(nskb); +#ifdef CONFIG_BRIDGE_NETFILTER + /* If we use ip6_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + nskb->protocol = htons(ETH_P_IPV6); + ip6h->payload_len = htons(sizeof(struct tcphdr)); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + return; + dev_queue_xmit(nskb); + } else +#endif + ip6_local_out(nskb); } static inline void diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c new file mode 100644 index 00000000000..19cfea8dbca --- /dev/null +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -0,0 +1,499 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip6_checksum.h> +#include <net/ip6_route.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +static struct ipv6hdr * +synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = 64; //XXX + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct net *net = nf_ct_net((struct nf_conn *)nfct); + struct dst_entry *dst; + struct flowi6 fl6; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + goto free_nskb; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip6_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + synproxy_parse_options(skb, par->thoff, th, &opts); + + if (th->syn && !(th->ack || th->fin || th->rst)) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + return NF_DROP; + + } else if (th->ack && !(th->fin || th->rst || th->syn)) { + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + return NF_DROP; + } + + return XT_CONTINUE; +} + +static unsigned int ipv6_synproxy_hook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + synproxy_parse_options(skb, thoff, th, &opts); + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + synproxy_parse_options(skb, thoff, th, &opts); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_entry *e = par->entryinfo; + + if (!(e->ipv6.flags & IP6T_F_PROTO) || + e->ipv6.proto != IPPROTO_TCP || + e->ipv6.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg6_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV6, + .target = synproxy_tg6, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg6_check, + .destroy = synproxy_tg6_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg6_init(void) +{ + int err; + + err = nf_register_hooks(ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg6_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg6_exit(void) +{ + xt_unregister_target(&synproxy_tg6_reg); + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +} + +module_init(synproxy_tg6_init); +module_exit(synproxy_tg6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index c9b6a6e6a1e..d6e4dd8b58d 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -28,6 +28,7 @@ #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <net/netfilter/nf_nat_helper.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> @@ -158,11 +159,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, protoff)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index ab92a3673fb..827f795209c 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -5,6 +5,7 @@ #include <linux/export.h> #include <net/ipv6.h> #include <net/ip6_fib.h> +#include <net/addrconf.h> void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) { @@ -75,3 +76,50 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } EXPORT_SYMBOL(ip6_find_1stfragopt); + +#if IS_ENABLED(CONFIG_IPV6) +int ip6_dst_hoplimit(struct dst_entry *dst) +{ + int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + if (hoplimit == 0) { + struct net_device *dev = dst->dev; + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) + hoplimit = idev->cnf.hop_limit; + else + hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + rcu_read_unlock(); + } + return hoplimit; +} +EXPORT_SYMBOL(ip6_dst_hoplimit); +#endif + +int __ip6_local_out(struct sk_buff *skb) +{ + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + ipv6_hdr(skb)->payload_len = htons(len); + + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); +} +EXPORT_SYMBOL_GPL(__ip6_local_out); + +int ip6_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip6_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip6_local_out); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 51c3285b5d9..091d066a57b 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -91,6 +91,10 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ + SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index cdaed47ba93..58916bbb172 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -63,6 +63,8 @@ #include <linux/seq_file.h> #include <linux/export.h> +#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ + static struct raw_hashinfo raw_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock), }; @@ -108,11 +110,14 @@ found: */ static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) { - struct icmp6hdr *_hdr; + struct icmp6hdr _hdr; const struct icmp6hdr *hdr; + /* We require only the four bytes of the ICMPv6 header, not any + * additional bytes of message body in "struct icmp6hdr". + */ hdr = skb_header_pointer(skb, skb_transport_offset(skb), - sizeof(_hdr), &_hdr); + ICMPV6_HDRLEN, &_hdr); if (hdr) { const __u32 *data = &raw6_sk(sk)->filter.data[0]; unsigned int type = hdr->icmp6_type; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8d9a93ed9c5..c979dd96d82 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -283,9 +283,8 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); - rt->rt6i_genid = rt_genid(net); + rt->rt6i_genid = rt_genid_ipv6(net); INIT_LIST_HEAD(&rt->rt6i_siblings); - rt->rt6i_nsiblings = 0; } return rt; } @@ -1062,7 +1061,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev))) + if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) return NULL; if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) @@ -1157,6 +1156,77 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); +/* Handle redirects */ +struct ip6rd_flowi { + struct flowi6 fl6; + struct in6_addr gateway; +}; + +static struct rt6_info *__ip6_route_redirect(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + int flags) +{ + struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; + struct rt6_info *rt; + struct fib6_node *fn; + + /* Get the "current" route for this destination and + * check if the redirect has come from approriate router. + * + * RFC 4861 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way the routes are chosen, this notion + * is a bit fuzzy and one might need to check all possible + * routes. + */ + + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt6_check_expired(rt)) + continue; + if (rt->dst.error) + break; + if (!(rt->rt6i_flags & RTF_GATEWAY)) + continue; + if (fl6->flowi6_oif != rt->dst.dev->ifindex) + continue; + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + continue; + break; + } + + if (!rt) + rt = net->ipv6.ip6_null_entry; + else if (rt->dst.error) { + rt = net->ipv6.ip6_null_entry; + goto out; + } + BACKTRACK(net, &fl6->saddr); +out: + dst_hold(&rt->dst); + + read_unlock_bh(&table->tb6_lock); + + return rt; +}; + +static struct dst_entry *ip6_route_redirect(struct net *net, + const struct flowi6 *fl6, + const struct in6_addr *gateway) +{ + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip6rd_flowi rdfl; + + rdfl.fl6 = *fl6; + rdfl.gateway = *gateway; + + return fib6_rule_lookup(net, &rdfl.fl6, + flags, __ip6_route_redirect); +} + void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; @@ -1171,9 +1241,8 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); - dst = ip6_route_output(net, NULL, &fl6); - if (!dst->error) - rt6_do_redirect(dst, NULL, skb); + dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + rt6_do_redirect(dst, NULL, skb); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_redirect); @@ -1193,9 +1262,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, fl6.daddr = msg->dest; fl6.saddr = iph->daddr; - dst = ip6_route_output(net, NULL, &fl6); - if (!dst->error) - rt6_do_redirect(dst, NULL, skb); + dst = ip6_route_redirect(net, &fl6, &iph->saddr); + rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -1355,25 +1423,6 @@ out: return entries > rt_max_size; } -int ip6_dst_hoplimit(struct dst_entry *dst) -{ - int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - if (hoplimit == 0) { - struct net_device *dev = dst->dev; - struct inet6_dev *idev; - - rcu_read_lock(); - idev = __in6_dev_get(dev); - if (idev) - hoplimit = idev->cnf.hop_limit; - else - hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; - rcu_read_unlock(); - } - return hoplimit; -} -EXPORT_SYMBOL(ip6_dst_hoplimit); - /* * */ diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 21b25dd8466..7ee5cb96db3 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -581,12 +581,10 @@ static int ipip6_rcv(struct sk_buff *skb) tunnel->parms.iph.protocol != 0) goto out; - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; if (tunnel->dev->priv_flags & IFF_ISATAP) { if (!isatap_chksrc(skb, iph, tunnel)) { @@ -603,7 +601,7 @@ static int ipip6_rcv(struct sk_buff *skb) } } - __skb_tunnel_rx(skb, tunnel->dev); + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { @@ -621,8 +619,6 @@ static int ipip6_rcv(struct sk_buff *skb) tstats->rx_packets++; tstats->rx_bytes += skb->len; - if (tunnel->net != dev_net(tunnel->dev)) - skb_scrub_packet(skb); netif_rx(skb); return 0; @@ -858,9 +854,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tunnel->err_count = 0; } - if (tunnel->net != dev_net(dev)) - skb_scrub_packet(skb); - /* * Okay, now see if we can stuff it in the buffer as-is. */ @@ -891,8 +884,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb->encapsulation = 1; } - err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr, - IPPROTO_IPV6, tos, ttl, df); + err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, + ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; @@ -1592,7 +1585,7 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ - if (dev_net(t->dev) != net) + if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); t = rtnl_dereference(t->next); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index d5dda20bd71..bf63ac8a49b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -112,32 +112,38 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, & COOKIEMASK; } -__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, __u16 *mssp) { - const struct ipv6hdr *iph = ipv6_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tcp_synq_overflow(sk); - for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); - return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, th->dest, ntohl(th->seq), jiffies / (HZ * 60), mssind); } +EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); -static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) +__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return __cookie_v6_init_sequence(iph, th, mssp); +} + +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, + __u32 cookie) +{ __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, th->source, th->dest, seq, @@ -145,6 +151,7 @@ static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } +EXPORT_SYMBOL_GPL(__cookie_v6_check); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { @@ -167,7 +174,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; if (tcp_synq_no_recent_overflow(sk) || - (mss = cookie_check(skb, cookie)) == 0) { + (mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6e1649d5853..5c71501fc91 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -963,7 +963,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; - if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); if (!want_cookie) goto drop; @@ -1237,8 +1238,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; tcp_initialize_rcv_mss(newsk); - tcp_synack_rtt_meas(newsk, req); - newtp->total_retrans = req->num_retrans; newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; @@ -1361,8 +1360,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) - goto reset; + tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); if (opt_skb) goto ipv6_pktoptions; return 0; @@ -1427,7 +1425,7 @@ ipv6_pktoptions: if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; if (np->rxopt.bits.rxtclass) - np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); @@ -1732,7 +1730,7 @@ static void get_openreq6(struct seq_file *seq, seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], @@ -1783,7 +1781,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %lu %lu %u %u %d\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, @@ -1926,6 +1924,7 @@ struct proto tcpv6_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 5d1b8d7ac99..60559511bd9 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -21,26 +21,25 @@ static int udp6_ufo_send_check(struct sk_buff *skb) const struct ipv6hdr *ipv6h; struct udphdr *uh; - /* UDP Tunnel offload on ipv6 is not yet supported. */ - if (skb->encapsulation) - return -EINVAL; - if (!pskb_may_pull(skb, sizeof(*uh))) return -EINVAL; - ipv6h = ipv6_hdr(skb); - uh = udp_hdr(skb); + if (likely(!skb->encapsulation)) { + ipv6h = ipv6_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; return 0; } static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, - netdev_features_t features) + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; @@ -75,47 +74,51 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, goto out; } - /* Do software UFO. Complete and fill in the UDP checksum as HW cannot - * do checksum of UDP packets sent as multiple IP fragments. - */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - - /* Check if there is enough headroom to insert fragment header. */ - tnl_hlen = skb_tnl_header_len(skb); - if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) { - if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) - goto out; + if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) + segs = skb_udp_tunnel_segment(skb, features); + else { + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Check if there is enough headroom to insert fragment header. */ + tnl_hlen = skb_tnl_header_len(skb); + if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb)); + + /* Fragment the skb. ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); } - /* Find the unfragmentable header and shift it left by frag_hdr_sz - * bytes to insert fragment header. - */ - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - nexthdr = *prevhdr; - *prevhdr = NEXTHDR_FRAGMENT; - unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + - unfrag_ip6hlen + tnl_hlen; - packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; - memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); - - SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; - skb->mac_header -= frag_hdr_sz; - skb->network_header -= frag_hdr_sz; - - fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); - fptr->nexthdr = nexthdr; - fptr->reserved = 0; - ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb)); - - /* Fragment the skb. ipv6 header and the remaining fields of the - * fragment header are updated in ipv6_gso_segment() - */ - segs = skb_segment(skb, features); - out: return segs; } |