diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-01-29 22:54:01 +1100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-01-29 22:54:01 +1100 |
commit | 0ba6c33bcddc64a54b5f1c25a696c4767dc76292 (patch) | |
tree | 62e616f97a4762d8e75bf732e4827af2d15d52c5 /net | |
parent | 21af0297c7e56024a5ccc4d8ad2a590f9ec371ba (diff) | |
parent | 85040bcb4643cba578839e953f25e2d1965d83d0 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6.25
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6.25: (1470 commits)
[IPV6] ADDRLABEL: Fix double free on label deletion.
[PPP]: Sparse warning fixes.
[IPV4] fib_trie: remove unneeded NULL check
[IPV4] fib_trie: More whitespace cleanup.
[NET_SCHED]: Use nla_policy for attribute validation in ematches
[NET_SCHED]: Use nla_policy for attribute validation in actions
[NET_SCHED]: Use nla_policy for attribute validation in classifiers
[NET_SCHED]: Use nla_policy for attribute validation in packet schedulers
[NET_SCHED]: sch_api: introduce constant for rate table size
[NET_SCHED]: Use typeful attribute parsing helpers
[NET_SCHED]: Use typeful attribute construction helpers
[NET_SCHED]: Use NLA_PUT_STRING for string dumping
[NET_SCHED]: Use nla_nest_start/nla_nest_end
[NET_SCHED]: Propagate nla_parse return value
[NET_SCHED]: act_api: use PTR_ERR in tcf_action_init/tcf_action_get
[NET_SCHED]: act_api: use nlmsg_parse
[NET_SCHED]: act_api: fix netlink API conversion bug
[NET_SCHED]: sch_netem: use nla_parse_nested_compat
[NET_SCHED]: sch_atm: fix format string warning
[NETNS]: Add namespace for ICMP replying code.
...
Diffstat (limited to 'net')
520 files changed, 28870 insertions, 16810 deletions
diff --git a/net/802/Makefile b/net/802/Makefile index 977704a54f6..68569ffddea 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -3,9 +3,8 @@ # # Check the p8022 selections against net/core/Makefile. -obj-$(CONFIG_SYSCTL) += sysctl_net_802.o obj-$(CONFIG_LLC) += p8022.o psnap.o -obj-$(CONFIG_TR) += p8022.o psnap.o tr.o sysctl_net_802.o +obj-$(CONFIG_TR) += p8022.o psnap.o tr.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c deleted file mode 100644 index ead56037398..00000000000 --- a/net/802/sysctl_net_802.c +++ /dev/null @@ -1,33 +0,0 @@ -/* -*- linux-c -*- - * sysctl_net_802.c: sysctl interface to net 802 subsystem. - * - * Begun April 1, 1996, Mike Shaver. - * Added /proc/sys/net/802 directory entry (empty =) ). [MS] - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/mm.h> -#include <linux/if_tr.h> -#include <linux/sysctl.h> - -#ifdef CONFIG_TR -extern int sysctl_tr_rif_timeout; -#endif - -struct ctl_table tr_table[] = { -#ifdef CONFIG_TR - { - .ctl_name = NET_TR_RIF_TIMEOUT, - .procname = "rif_timeout", - .data = &sysctl_tr_rif_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, -#endif /* CONFIG_TR */ - { 0 }, -}; diff --git a/net/802/tr.c b/net/802/tr.c index 1e115e5beab..3f16b172055 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -35,6 +35,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> +#include <linux/sysctl.h> #include <net/arp.h> #include <net/net_namespace.h> @@ -634,6 +635,26 @@ struct net_device *alloc_trdev(int sizeof_priv) return alloc_netdev(sizeof_priv, "tr%d", tr_setup); } +#ifdef CONFIG_SYSCTL +static struct ctl_table tr_table[] = { + { + .ctl_name = NET_TR_RIF_TIMEOUT, + .procname = "rif_timeout", + .data = &sysctl_tr_rif_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { 0 }, +}; + +static __initdata struct ctl_path tr_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "token-ring", .ctl_name = NET_TR, }, + { } +}; +#endif + /* * Called during bootup. We don't actually have to initialise * too much for this. @@ -641,12 +662,12 @@ struct net_device *alloc_trdev(int sizeof_priv) static int __init rif_init(void) { - init_timer(&rif_timer); rif_timer.expires = jiffies + sysctl_tr_rif_timeout; - rif_timer.data = 0L; - rif_timer.function = rif_check_expire; + setup_timer(&rif_timer, rif_check_expire, 0); add_timer(&rif_timer); - +#ifdef CONFIG_SYSCTL + register_sysctl_paths(tr_path, tr_table); +#endif proc_net_fops_create(&init_net, "tr_rif", S_IRUGO, &rif_seq_fops); return 0; } diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 032bf44eca5..dbc81b96509 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -3,7 +3,7 @@ * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> - * Please send support related email to: vlan@scry.wanfear.com + * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: @@ -43,23 +43,12 @@ /* Our listing of VLAN group(s) */ static struct hlist_head vlan_group_hash[VLAN_GRP_HASH_SIZE]; -#define vlan_grp_hashfn(IDX) ((((IDX) >> VLAN_GRP_HASH_SHIFT) ^ (IDX)) & VLAN_GRP_HASH_MASK) static char vlan_fullname[] = "802.1Q VLAN Support"; static char vlan_version[] = DRV_VERSION; static char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>"; static char vlan_buggyright[] = "David S. Miller <davem@redhat.com>"; -static int vlan_device_event(struct notifier_block *, unsigned long, void *); -static int vlan_ioctl_handler(struct net *net, void __user *); -static int unregister_vlan_dev(struct net_device *, unsigned short ); - -static struct notifier_block vlan_notifier_block = { - .notifier_call = vlan_device_event, -}; - -/* These may be changed at run-time through IOCTLs */ - /* Determines interface naming scheme. */ unsigned short vlan_name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; @@ -70,82 +59,11 @@ static struct packet_type vlan_packet_type = { /* End of global variables definitions. */ -/* - * Function vlan_proto_init (pro) - * - * Initialize VLAN protocol layer, - * - */ -static int __init vlan_proto_init(void) +static inline unsigned int vlan_grp_hashfn(unsigned int idx) { - int err; - - printk(VLAN_INF "%s v%s %s\n", - vlan_fullname, vlan_version, vlan_copyright); - printk(VLAN_INF "All bugs added by %s\n", - vlan_buggyright); - - /* proc file system initialization */ - err = vlan_proc_init(); - if (err < 0) { - printk(KERN_ERR - "%s %s: can't create entry in proc filesystem!\n", - __FUNCTION__, VLAN_NAME); - return err; - } - - dev_add_pack(&vlan_packet_type); - - /* Register us to receive netdevice events */ - err = register_netdevice_notifier(&vlan_notifier_block); - if (err < 0) - goto err1; - - err = vlan_netlink_init(); - if (err < 0) - goto err2; - - vlan_ioctl_set(vlan_ioctl_handler); - return 0; - -err2: - unregister_netdevice_notifier(&vlan_notifier_block); -err1: - vlan_proc_cleanup(); - dev_remove_pack(&vlan_packet_type); - return err; + return ((idx >> VLAN_GRP_HASH_SHIFT) ^ idx) & VLAN_GRP_HASH_MASK; } -/* - * Module 'remove' entry point. - * o delete /proc/net/router directory and static entries. - */ -static void __exit vlan_cleanup_module(void) -{ - int i; - - vlan_ioctl_set(NULL); - vlan_netlink_fini(); - - /* Un-register us from receiving netdevice events */ - unregister_netdevice_notifier(&vlan_notifier_block); - - dev_remove_pack(&vlan_packet_type); - - /* This table must be empty if there are no module - * references left. - */ - for (i = 0; i < VLAN_GRP_HASH_SIZE; i++) { - BUG_ON(!hlist_empty(&vlan_group_hash[i])); - } - vlan_proc_cleanup(); - - synchronize_net(); -} - -module_init(vlan_proto_init); -module_exit(vlan_cleanup_module); - /* Must be invoked with RCU read lock (no preempt) */ static struct vlan_group *__vlan_find_group(int real_dev_ifindex) { @@ -180,7 +98,7 @@ static void vlan_group_free(struct vlan_group *grp) { int i; - for (i=0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++) + for (i = 0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++) kfree(grp->vlan_devices_arrays[i]); kfree(grp); } @@ -218,179 +136,50 @@ static void vlan_rcu_free(struct rcu_head *rcu) vlan_group_free(container_of(rcu, struct vlan_group, rcu)); } - -/* This returns 0 if everything went fine. - * It will return 1 if the group was killed as a result. - * A negative return indicates failure. - * - * The RTNL lock must be held. - */ -static int unregister_vlan_dev(struct net_device *real_dev, - unsigned short vlan_id) +void unregister_vlan_dev(struct net_device *dev) { - struct net_device *dev = NULL; - int real_dev_ifindex = real_dev->ifindex; + struct vlan_dev_info *vlan = vlan_dev_info(dev); + struct net_device *real_dev = vlan->real_dev; struct vlan_group *grp; - int i, ret; - -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: VID: %i\n", __FUNCTION__, vlan_id); -#endif - - /* sanity check */ - if (vlan_id >= VLAN_VID_MASK) - return -EINVAL; + unsigned short vlan_id = vlan->vlan_id; ASSERT_RTNL(); - grp = __vlan_find_group(real_dev_ifindex); - - ret = 0; - - if (grp) { - dev = vlan_group_get_device(grp, vlan_id); - if (dev) { - /* Remove proc entry */ - vlan_proc_rem_dev(dev); - /* Take it out of our own structures, but be sure to - * interlock with HW accelerating devices or SW vlan - * input packet processing. - */ - if (real_dev->features & NETIF_F_HW_VLAN_FILTER) - real_dev->vlan_rx_kill_vid(real_dev, vlan_id); - - vlan_group_set_device(grp, vlan_id, NULL); - synchronize_net(); + grp = __vlan_find_group(real_dev->ifindex); + BUG_ON(!grp); + vlan_proc_rem_dev(dev); - /* Caller unregisters (and if necessary, puts) - * VLAN device, but we get rid of the reference to - * real_dev here. - */ - dev_put(real_dev); + /* Take it out of our own structures, but be sure to interlock with + * HW accelerating devices or SW vlan input packet processing. + */ + if (real_dev->features & NETIF_F_HW_VLAN_FILTER) + real_dev->vlan_rx_kill_vid(real_dev, vlan_id); - /* If the group is now empty, kill off the - * group. - */ - for (i = 0; i < VLAN_VID_MASK; i++) - if (vlan_group_get_device(grp, i)) - break; + vlan_group_set_device(grp, vlan_id, NULL); + grp->nr_vlans--; - if (i == VLAN_VID_MASK) { - if (real_dev->features & NETIF_F_HW_VLAN_RX) - real_dev->vlan_rx_register(real_dev, NULL); + synchronize_net(); - hlist_del_rcu(&grp->hlist); + /* If the group is now empty, kill off the group. */ + if (grp->nr_vlans == 0) { + if (real_dev->features & NETIF_F_HW_VLAN_RX) + real_dev->vlan_rx_register(real_dev, NULL); - /* Free the group, after all cpu's are done. */ - call_rcu(&grp->rcu, vlan_rcu_free); + hlist_del_rcu(&grp->hlist); - grp = NULL; - ret = 1; - } - } + /* Free the group, after all cpu's are done. */ + call_rcu(&grp->rcu, vlan_rcu_free); } - return ret; -} + /* Get rid of the vlan's reference to real_dev */ + dev_put(real_dev); -int unregister_vlan_device(struct net_device *dev) -{ - int ret; - - ret = unregister_vlan_dev(VLAN_DEV_INFO(dev)->real_dev, - VLAN_DEV_INFO(dev)->vlan_id); unregister_netdevice(dev); - - if (ret == 1) - ret = 0; - return ret; -} - -/* - * vlan network devices have devices nesting below it, and are a special - * "super class" of normal network devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key vlan_netdev_xmit_lock_key; - -static const struct header_ops vlan_header_ops = { - .create = vlan_dev_hard_header, - .rebuild = vlan_dev_rebuild_header, - .parse = eth_header_parse, -}; - -static int vlan_dev_init(struct net_device *dev) -{ - struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev; - int subclass = 0; - - /* IFF_BROADCAST|IFF_MULTICAST; ??? */ - dev->flags = real_dev->flags & ~IFF_UP; - dev->iflink = real_dev->ifindex; - dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | - (1<<__LINK_STATE_DORMANT))) | - (1<<__LINK_STATE_PRESENT); - - /* ipv6 shared card related stuff */ - dev->dev_id = real_dev->dev_id; - - if (is_zero_ether_addr(dev->dev_addr)) - memcpy(dev->dev_addr, real_dev->dev_addr, dev->addr_len); - if (is_zero_ether_addr(dev->broadcast)) - memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); - - if (real_dev->features & NETIF_F_HW_VLAN_TX) { - dev->header_ops = real_dev->header_ops; - dev->hard_header_len = real_dev->hard_header_len; - dev->hard_start_xmit = vlan_dev_hwaccel_hard_start_xmit; - } else { - dev->header_ops = &vlan_header_ops; - dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; - dev->hard_start_xmit = vlan_dev_hard_start_xmit; - } - - if (real_dev->priv_flags & IFF_802_1Q_VLAN) - subclass = 1; - - lockdep_set_class_and_subclass(&dev->_xmit_lock, - &vlan_netdev_xmit_lock_key, subclass); - return 0; -} - -void vlan_setup(struct net_device *new_dev) -{ - ether_setup(new_dev); - - /* new_dev->ifindex = 0; it will be set when added to - * the global list. - * iflink is set as well. - */ - new_dev->get_stats = vlan_dev_get_stats; - - /* Make this thing known as a VLAN device */ - new_dev->priv_flags |= IFF_802_1Q_VLAN; - - /* Set us up to have no queue, as the underlying Hardware device - * can do all the queueing we could want. - */ - new_dev->tx_queue_len = 0; - - /* set up method calls */ - new_dev->change_mtu = vlan_dev_change_mtu; - new_dev->init = vlan_dev_init; - new_dev->open = vlan_dev_open; - new_dev->stop = vlan_dev_stop; - new_dev->set_mac_address = vlan_set_mac_address; - new_dev->set_multicast_list = vlan_dev_set_multicast_list; - new_dev->change_rx_flags = vlan_change_rx_flags; - new_dev->destructor = free_netdev; - new_dev->do_ioctl = vlan_dev_ioctl; - - memset(new_dev->broadcast, 0, ETH_ALEN); } -static void vlan_transfer_operstate(const struct net_device *dev, struct net_device *vlandev) +static void vlan_transfer_operstate(const struct net_device *dev, + struct net_device *vlandev) { /* Have to respect userspace enforced dormant state * of real device, also must allow supplicant running @@ -412,23 +201,22 @@ static void vlan_transfer_operstate(const struct net_device *dev, struct net_dev int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_id) { + char *name = real_dev->name; + if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { - printk(VLAN_DBG "%s: VLANs not supported on %s.\n", - __FUNCTION__, real_dev->name); + pr_info("8021q: VLANs not supported on %s\n", name); return -EOPNOTSUPP; } if ((real_dev->features & NETIF_F_HW_VLAN_RX) && !real_dev->vlan_rx_register) { - printk(VLAN_DBG "%s: Device %s has buggy VLAN hw accel.\n", - __FUNCTION__, real_dev->name); + pr_info("8021q: device %s has buggy VLAN hw accel\n", name); return -EOPNOTSUPP; } if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) && (!real_dev->vlan_rx_add_vid || !real_dev->vlan_rx_kill_vid)) { - printk(VLAN_DBG "%s: Device %s has buggy VLAN hw accel.\n", - __FUNCTION__, real_dev->name); + pr_info("8021q: Device %s has buggy VLAN hw accel\n", name); return -EOPNOTSUPP; } @@ -438,18 +226,15 @@ int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_id) if (!(real_dev->flags & IFF_UP)) return -ENETDOWN; - if (__find_vlan_dev(real_dev, vlan_id) != NULL) { - /* was already registered. */ - printk(VLAN_DBG "%s: ALREADY had VLAN registered\n", __FUNCTION__); + if (__find_vlan_dev(real_dev, vlan_id) != NULL) return -EEXIST; - } return 0; } int register_vlan_dev(struct net_device *dev) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); struct net_device *real_dev = vlan->real_dev; unsigned short vlan_id = vlan->vlan_id; struct vlan_group *grp, *ngrp = NULL; @@ -476,14 +261,16 @@ int register_vlan_dev(struct net_device *dev) * it into our local structure. */ vlan_group_set_device(grp, vlan_id, dev); + grp->nr_vlans++; + if (ngrp && real_dev->features & NETIF_F_HW_VLAN_RX) real_dev->vlan_rx_register(real_dev, ngrp); if (real_dev->features & NETIF_F_HW_VLAN_FILTER) real_dev->vlan_rx_add_vid(real_dev, vlan_id); if (vlan_proc_add_dev(dev) < 0) - printk(KERN_WARNING "VLAN: failed to add proc entry for %s\n", - dev->name); + pr_warning("8021q: failed to add proc entry for %s\n", + dev->name); return 0; out_free_group: @@ -502,11 +289,6 @@ static int register_vlan_device(struct net_device *real_dev, char name[IFNAMSIZ]; int err; -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: if_name -:%s:- vid: %i\n", - __FUNCTION__, eth_IF_name, VLAN_ID); -#endif - if (VLAN_ID >= VLAN_VID_MASK) return -ERANGE; @@ -515,10 +297,6 @@ static int register_vlan_device(struct net_device *real_dev, return err; /* Gotta set up the fields for the device. */ -#ifdef VLAN_DEBUG - printk(VLAN_DBG "About to allocate name, vlan_name_type: %i\n", - vlan_name_type); -#endif switch (vlan_name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ @@ -555,26 +333,16 @@ static int register_vlan_device(struct net_device *real_dev, */ new_dev->mtu = real_dev->mtu; -#ifdef VLAN_DEBUG - printk(VLAN_DBG "Allocated new name -:%s:-\n", new_dev->name); - VLAN_MEM_DBG("new_dev->priv malloc, addr: %p size: %i\n", - new_dev->priv, - sizeof(struct vlan_dev_info)); -#endif - - VLAN_DEV_INFO(new_dev)->vlan_id = VLAN_ID; /* 1 through VLAN_VID_MASK */ - VLAN_DEV_INFO(new_dev)->real_dev = real_dev; - VLAN_DEV_INFO(new_dev)->dent = NULL; - VLAN_DEV_INFO(new_dev)->flags = VLAN_FLAG_REORDER_HDR; + vlan_dev_info(new_dev)->vlan_id = VLAN_ID; /* 1 through VLAN_VID_MASK */ + vlan_dev_info(new_dev)->real_dev = real_dev; + vlan_dev_info(new_dev)->dent = NULL; + vlan_dev_info(new_dev)->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev); if (err < 0) goto out_free_newdev; -#ifdef VLAN_DEBUG - printk(VLAN_DBG "Allocated new device successfully, returning.\n"); -#endif return 0; out_free_newdev: @@ -585,7 +353,7 @@ out_free_newdev: static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(vlandev); + struct vlan_dev_info *vlan = vlan_dev_info(vlandev); /* May be called without an actual change */ if (!compare_ether_addr(vlan->real_dev_addr, dev->dev_addr)) @@ -606,7 +374,8 @@ static void vlan_sync_address(struct net_device *dev, memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN); } -static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +static int vlan_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) { struct net_device *dev = ptr; struct vlan_group *grp = __vlan_find_group(dev->ifindex); @@ -683,20 +452,16 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_UNREGISTER: /* Delete all VLANs for this dev. */ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { - int ret; - vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; - ret = unregister_vlan_dev(dev, - VLAN_DEV_INFO(vlandev)->vlan_id); + /* unregistration of last vlan destroys group, abort + * afterwards */ + if (grp->nr_vlans == 1) + i = VLAN_GROUP_ARRAY_LEN; - unregister_netdevice(vlandev); - - /* Group was destroyed? */ - if (ret == 1) - break; + unregister_vlan_dev(vlandev); } break; } @@ -705,6 +470,10 @@ out: return NOTIFY_DONE; } +static struct notifier_block vlan_notifier_block __read_mostly = { + .notifier_call = vlan_device_event, +}; + /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver @@ -724,10 +493,6 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) args.device1[23] = 0; args.u.device2[23] = 0; -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: args.cmd: %x\n", __FUNCTION__, args.cmd); -#endif - rtnl_lock(); switch (args.cmd) { @@ -802,36 +567,16 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) err = -EPERM; if (!capable(CAP_NET_ADMIN)) break; - err = unregister_vlan_device(dev); + unregister_vlan_dev(dev); + err = 0; break; - case GET_VLAN_INGRESS_PRIORITY_CMD: - /* TODO: Implement - err = vlan_dev_get_ingress_priority(args); - if (copy_to_user((void*)arg, &args, - sizeof(struct vlan_ioctl_args))) { - err = -EFAULT; - } - */ - err = -EINVAL; - break; - case GET_VLAN_EGRESS_PRIORITY_CMD: - /* TODO: Implement - err = vlan_dev_get_egress_priority(args.device1, &(args.args); - if (copy_to_user((void*)arg, &args, - sizeof(struct vlan_ioctl_args))) { - err = -EFAULT; - } - */ - err = -EINVAL; - break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2); if (copy_to_user(arg, &args, - sizeof(struct vlan_ioctl_args))) { + sizeof(struct vlan_ioctl_args))) err = -EFAULT; - } break; case GET_VLAN_VID_CMD: @@ -839,16 +584,12 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) vlan_dev_get_vid(dev, &vid); args.u.VID = vid; if (copy_to_user(arg, &args, - sizeof(struct vlan_ioctl_args))) { + sizeof(struct vlan_ioctl_args))) err = -EFAULT; - } break; default: - /* pass on to underlying device instead?? */ - printk(VLAN_DBG "%s: Unknown VLAN CMD: %x \n", - __FUNCTION__, args.cmd); - err = -EINVAL; + err = -EOPNOTSUPP; break; } out: @@ -856,5 +597,59 @@ out: return err; } +static int __init vlan_proto_init(void) +{ + int err; + + pr_info("%s v%s %s\n", vlan_fullname, vlan_version, vlan_copyright); + pr_info("All bugs added by %s\n", vlan_buggyright); + + err = vlan_proc_init(); + if (err < 0) + goto err1; + + err = register_netdevice_notifier(&vlan_notifier_block); + if (err < 0) + goto err2; + + err = vlan_netlink_init(); + if (err < 0) + goto err3; + + dev_add_pack(&vlan_packet_type); + vlan_ioctl_set(vlan_ioctl_handler); + return 0; + +err3: + unregister_netdevice_notifier(&vlan_notifier_block); +err2: + vlan_proc_cleanup(); +err1: + return err; +} + +static void __exit vlan_cleanup_module(void) +{ + unsigned int i; + + vlan_ioctl_set(NULL); + vlan_netlink_fini(); + + unregister_netdevice_notifier(&vlan_notifier_block); + + dev_remove_pack(&vlan_packet_type); + + /* This table must be empty if there are no module references left. */ + for (i = 0; i < VLAN_GRP_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&vlan_group_hash[i])); + + vlan_proc_cleanup(); + + synchronize_net(); +} + +module_init(vlan_proto_init); +module_exit(vlan_cleanup_module); + MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 2cd1393073e..73efcc715cc 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -3,31 +3,6 @@ #include <linux/if_vlan.h> -/* Uncomment this if you want debug traces to be shown. */ -/* #define VLAN_DEBUG */ - -#define VLAN_ERR KERN_ERR -#define VLAN_INF KERN_INFO -#define VLAN_DBG KERN_ALERT /* change these... to debug, having a hard time - * changing the log level at run-time..for some reason. - */ - -/* - -These I use for memory debugging. I feared a leak at one time, but -I never found it..and the problem seems to have dissappeared. Still, -I'll bet they might prove useful again... --Ben - - -#define VLAN_MEM_DBG(x, y, z) printk(VLAN_DBG "%s: " x, __FUNCTION__, y, z); -#define VLAN_FMEM_DBG(x, y) printk(VLAN_DBG "%s: " x, __FUNCTION__, y); -*/ - -/* This way they don't do anything! */ -#define VLAN_MEM_DBG(x, y, z) -#define VLAN_FMEM_DBG(x, y) - - extern unsigned short vlan_name_type; #define VLAN_GRP_HASH_SHIFT 5 @@ -45,23 +20,12 @@ extern unsigned short vlan_name_type; * Must be invoked with rcu_read_lock (ie preempt disabled) * or with RTNL. */ -struct net_device *__find_vlan_dev(struct net_device* real_dev, +struct net_device *__find_vlan_dev(struct net_device *real_dev, unsigned short VID); /* vlan.c */ /* found in vlan_dev.c */ -int vlan_dev_rebuild_header(struct sk_buff *skb); int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); -int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned len); -int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev); -int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev); -int vlan_dev_change_mtu(struct net_device *dev, int new_mtu); -int vlan_dev_open(struct net_device* dev); -int vlan_dev_stop(struct net_device* dev); -int vlan_set_mac_address(struct net_device *dev, void *p); -int vlan_dev_ioctl(struct net_device* dev, struct ifreq *ifr, int cmd); void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, short vlan_prio); int vlan_dev_set_egress_priority(const struct net_device *dev, @@ -70,13 +34,11 @@ int vlan_dev_set_vlan_flag(const struct net_device *dev, u32 flag, short flag_val); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); void vlan_dev_get_vid(const struct net_device *dev, unsigned short *result); -void vlan_change_rx_flags(struct net_device *dev, int change); -void vlan_dev_set_multicast_list(struct net_device *vlan_dev); int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_id); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev); -int unregister_vlan_device(struct net_device *dev); +void unregister_vlan_dev(struct net_device *dev); int vlan_netlink_init(void); void vlan_netlink_fini(void); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4f99bb86af5..8059fa42b08 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -3,7 +3,7 @@ * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> - * Please send support related email to: vlan@scry.wanfear.com + * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com> @@ -47,7 +47,7 @@ * * TODO: This needs a checkup, I'm ignorant here. --BLG */ -int vlan_dev_rebuild_header(struct sk_buff *skb) +static int vlan_dev_rebuild_header(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); @@ -60,9 +60,8 @@ int vlan_dev_rebuild_header(struct sk_buff *skb) return arp_find(veth->h_dest, skb); #endif default: - printk(VLAN_DBG - "%s: unable to resolve type %X addresses.\n", - dev->name, ntohs(veth->h_vlan_encapsulated_proto)); + pr_debug("%s: unable to resolve type %X addresses.\n", + dev->name, ntohs(veth->h_vlan_encapsulated_proto)); memcpy(veth->h_source, dev->dev_addr, ETH_ALEN); break; @@ -73,7 +72,7 @@ int vlan_dev_rebuild_header(struct sk_buff *skb) static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) { - if (VLAN_DEV_INFO(skb->dev)->flags & VLAN_FLAG_REORDER_HDR) { + if (vlan_dev_info(skb->dev)->flags & VLAN_FLAG_REORDER_HDR) { if (skb_shared(skb) || skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); kfree_skb(skb); @@ -90,6 +89,40 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) return skb; } +static inline void vlan_set_encap_proto(struct sk_buff *skb, + struct vlan_hdr *vhdr) +{ + __be16 proto; + unsigned char *rawp; + + /* + * Was a VLAN packet, grab the encapsulated protocol, which the layer + * three protocols care about. + */ + + proto = vhdr->h_vlan_encapsulated_proto; + if (ntohs(proto) >= 1536) { + skb->protocol = proto; + return; + } + + rawp = skb->data; + if (*(unsigned short *)rawp == 0xFFFF) + /* + * This is a magic hack to spot IPX packets. Older Novell + * breaks the protocol design and runs IPX over 802.3 without + * an 802.2 LLC layer. We look for FFFF which isn't a used + * 802.2 SSAP/DSAP. This won't work for fault tolerant netware + * but does for the rest. + */ + skb->protocol = htons(ETH_P_802_3); + else + /* + * Real 802.2 LLC + */ + skb->protocol = htons(ETH_P_802_2); +} + /* * Determine the packet's protocol ID. The rule here is that we * assume 802.3 if the type field is short enough to be a length. @@ -107,115 +140,58 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) * SANITY NOTE 2: We are referencing to the VLAN_HDR frields, which MAY be * stored UNALIGNED in the memory. RISC systems don't like * such cases very much... - * SANITY NOTE 2a: According to Dave Miller & Alexey, it will always be aligned, - * so there doesn't need to be any of the unaligned stuff. It has - * been commented out now... --Ben + * SANITY NOTE 2a: According to Dave Miller & Alexey, it will always be + * aligned, so there doesn't need to be any of the unaligned + * stuff. It has been commented out now... --Ben * */ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type* ptype, struct net_device *orig_dev) + struct packet_type *ptype, struct net_device *orig_dev) { - unsigned char *rawp = NULL; struct vlan_hdr *vhdr; unsigned short vid; struct net_device_stats *stats; unsigned short vlan_TCI; - __be16 proto; - if (dev->nd_net != &init_net) { - kfree_skb(skb); - return -1; - } + if (dev->nd_net != &init_net) + goto err_free; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - return -1; + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) + goto err_free; - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) { - kfree_skb(skb); - return -1; - } - - vhdr = (struct vlan_hdr *)(skb->data); + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + goto err_free; - /* vlan_TCI = ntohs(get_unaligned(&vhdr->h_vlan_TCI)); */ + vhdr = (struct vlan_hdr *)skb->data; vlan_TCI = ntohs(vhdr->h_vlan_TCI); - vid = (vlan_TCI & VLAN_VID_MASK); -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: skb: %p vlan_id: %hx\n", - __FUNCTION__, skb, vid); -#endif - - /* Ok, we will find the correct VLAN device, strip the header, - * and then go on as usual. - */ - - /* We have 12 bits of vlan ID. - * - * We must not drop allow preempt until we hold a - * reference to the device (netif_rx does that) or we - * fail. - */ - rcu_read_lock(); skb->dev = __find_vlan_dev(dev, vid); if (!skb->dev) { - rcu_read_unlock(); - -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: ERROR: No net_device for VID: %i on dev: %s [%i]\n", - __FUNCTION__, (unsigned int)(vid), dev->name, dev->ifindex); -#endif - kfree_skb(skb); - return -1; + pr_debug("%s: ERROR: No net_device for VID: %u on dev: %s\n", + __FUNCTION__, (unsigned int)vid, dev->name); + goto err_unlock; } skb->dev->last_rx = jiffies; - /* Bump the rx counters for the VLAN device. */ - stats = vlan_dev_get_stats(skb->dev); + stats = &skb->dev->stats; stats->rx_packets++; stats->rx_bytes += skb->len; - /* Take off the VLAN header (4 bytes currently) */ skb_pull_rcsum(skb, VLAN_HLEN); - /* Ok, lets check to make sure the device (dev) we - * came in on is what this VLAN is attached to. - */ - - if (dev != VLAN_DEV_INFO(skb->dev)->real_dev) { - rcu_read_unlock(); - -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: dropping skb: %p because came in on wrong device, dev: %s real_dev: %s, skb_dev: %s\n", - __FUNCTION__, skb, dev->name, - VLAN_DEV_INFO(skb->dev)->real_dev->name, - skb->dev->name); -#endif - kfree_skb(skb); - stats->rx_errors++; - return -1; - } - - /* - * Deal with ingress priority mapping. - */ - skb->priority = vlan_get_ingress_priority(skb->dev, ntohs(vhdr->h_vlan_TCI)); + skb->priority = vlan_get_ingress_priority(skb->dev, + ntohs(vhdr->h_vlan_TCI)); -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: priority: %lu for TCI: %hu (hbo)\n", - __FUNCTION__, (unsigned long)(skb->priority), - ntohs(vhdr->h_vlan_TCI)); -#endif + pr_debug("%s: priority: %u for TCI: %hu\n", + __FUNCTION__, skb->priority, ntohs(vhdr->h_vlan_TCI)); - /* The ethernet driver already did the pkt_type calculations - * for us... - */ switch (skb->pkt_type) { case PACKET_BROADCAST: /* Yeah, stats collect these together.. */ - // stats->broadcast ++; // no such counter :-( + /* stats->broadcast ++; // no such counter :-( */ break; case PACKET_MULTICAST: @@ -224,109 +200,47 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, case PACKET_OTHERHOST: /* Our lower layer thinks this is not local, let's make sure. - * This allows the VLAN to have a different MAC than the underlying - * device, and still route correctly. + * This allows the VLAN to have a different MAC than the + * underlying device, and still route correctly. */ - if (!compare_ether_addr(eth_hdr(skb)->h_dest, skb->dev->dev_addr)) { - /* It is for our (changed) MAC-address! */ + if (!compare_ether_addr(eth_hdr(skb)->h_dest, + skb->dev->dev_addr)) skb->pkt_type = PACKET_HOST; - } break; default: break; } - /* Was a VLAN packet, grab the encapsulated protocol, which the layer - * three protocols care about. - */ - /* proto = get_unaligned(&vhdr->h_vlan_encapsulated_proto); */ - proto = vhdr->h_vlan_encapsulated_proto; - - skb->protocol = proto; - if (ntohs(proto) >= 1536) { - /* place it back on the queue to be handled by - * true layer 3 protocols. - */ - - /* See if we are configured to re-write the VLAN header - * to make it look like ethernet... - */ - skb = vlan_check_reorder_header(skb); - - /* Can be null if skb-clone fails when re-ordering */ - if (skb) { - netif_rx(skb); - } else { - /* TODO: Add a more specific counter here. */ - stats->rx_errors++; - } - rcu_read_unlock(); - return 0; - } - - rawp = skb->data; - - /* - * This is a magic hack to spot IPX packets. Older Novell breaks - * the protocol design and runs IPX over 802.3 without an 802.2 LLC - * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This - * won't work for fault tolerant netware but does for the rest. - */ - if (*(unsigned short *)rawp == 0xFFFF) { - skb->protocol = htons(ETH_P_802_3); - /* place it back on the queue to be handled by true layer 3 protocols. - */ - - /* See if we are configured to re-write the VLAN header - * to make it look like ethernet... - */ - skb = vlan_check_reorder_header(skb); + vlan_set_encap_proto(skb, vhdr); - /* Can be null if skb-clone fails when re-ordering */ - if (skb) { - netif_rx(skb); - } else { - /* TODO: Add a more specific counter here. */ - stats->rx_errors++; - } - rcu_read_unlock(); - return 0; - } - - /* - * Real 802.2 LLC - */ - skb->protocol = htons(ETH_P_802_2); - /* place it back on the queue to be handled by upper layer protocols. - */ - - /* See if we are configured to re-write the VLAN header - * to make it look like ethernet... - */ skb = vlan_check_reorder_header(skb); - - /* Can be null if skb-clone fails when re-ordering */ - if (skb) { - netif_rx(skb); - } else { - /* TODO: Add a more specific counter here. */ + if (!skb) { stats->rx_errors++; + goto err_unlock; } + + netif_rx(skb); rcu_read_unlock(); - return 0; + return NET_RX_SUCCESS; + +err_unlock: + rcu_read_unlock(); +err_free: + kfree_skb(skb); + return NET_RX_DROP; } -static inline unsigned short vlan_dev_get_egress_qos_mask(struct net_device* dev, - struct sk_buff* skb) +static inline unsigned short +vlan_dev_get_egress_qos_mask(struct net_device *dev, struct sk_buff *skb) { - struct vlan_priority_tci_mapping *mp = - VLAN_DEV_INFO(dev)->egress_priority_map[(skb->priority & 0xF)]; + struct vlan_priority_tci_mapping *mp; + mp = vlan_dev_info(dev)->egress_priority_map[(skb->priority & 0xF)]; while (mp) { if (mp->priority == skb->priority) { - return mp->vlan_qos; /* This should already be shifted to mask - * correctly with the VLAN's TCI - */ + return mp->vlan_qos; /* This should already be shifted + * to mask correctly with the + * VLAN's TCI */ } mp = mp->next; } @@ -342,20 +256,20 @@ static inline unsigned short vlan_dev_get_egress_qos_mask(struct net_device* dev * This is called when the SKB is moving down the stack towards the * physical devices. */ -int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned len) +static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, + unsigned int len) { struct vlan_hdr *vhdr; unsigned short veth_TCI = 0; int rc = 0; int build_vlan_header = 0; - struct net_device *vdev = dev; /* save this for the bottom of the method */ + struct net_device *vdev = dev; -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: skb: %p type: %hx len: %x vlan_id: %hx, daddr: %p\n", - __FUNCTION__, skb, type, len, VLAN_DEV_INFO(dev)->vlan_id, daddr); -#endif + pr_debug("%s: skb: %p type: %hx len: %u vlan_id: %hx, daddr: %p\n", + __FUNCTION__, skb, type, len, vlan_dev_info(dev)->vlan_id, + daddr); /* build vlan header only if re_order_header flag is NOT set. This * fixes some programs that get confused when they see a VLAN device @@ -365,7 +279,7 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, * header shuffling in the hard_start_xmit. Users can turn off this * REORDER behaviour with the vconfig tool. */ - if (!(VLAN_DEV_INFO(dev)->flags & VLAN_FLAG_REORDER_HDR)) + if (!(vlan_dev_info(dev)->flags & VLAN_FLAG_REORDER_HDR)) build_vlan_header = 1; if (build_vlan_header) { @@ -373,29 +287,28 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, /* build the four bytes that make this a VLAN header. */ - /* Now, construct the second two bytes. This field looks something - * like: + /* Now, construct the second two bytes. This field looks + * something like: * usr_priority: 3 bits (high bits) * CFI 1 bit * VLAN ID 12 bits (low bits) * */ - veth_TCI = VLAN_DEV_INFO(dev)->vlan_id; + veth_TCI = vlan_dev_info(dev)->vlan_id; veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb); vhdr->h_vlan_TCI = htons(veth_TCI); /* - * Set the protocol type. - * For a packet of type ETH_P_802_3 we put the length in here instead. - * It is up to the 802.2 layer to carry protocol information. + * Set the protocol type. For a packet of type ETH_P_802_3 we + * put the length in here instead. It is up to the 802.2 + * layer to carry protocol information. */ - if (type != ETH_P_802_3) { + if (type != ETH_P_802_3) vhdr->h_vlan_encapsulated_proto = htons(type); - } else { + else vhdr->h_vlan_encapsulated_proto = htons(len); - } skb->protocol = htons(ETH_P_8021Q); skb_reset_network_header(skb); @@ -405,16 +318,16 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, if (saddr == NULL) saddr = dev->dev_addr; - dev = VLAN_DEV_INFO(dev)->real_dev; + dev = vlan_dev_info(dev)->real_dev; - /* MPLS can send us skbuffs w/out enough space. This check will grow the - * skb if it doesn't have enough headroom. Not a beautiful solution, so - * I'll tick a counter so that users can know it's happening... If they - * care... + /* MPLS can send us skbuffs w/out enough space. This check will grow + * the skb if it doesn't have enough headroom. Not a beautiful solution, + * so I'll tick a counter so that users can know it's happening... + * If they care... */ - /* NOTE: This may still break if the underlying device is not the final - * device (and thus there are more headers to add...) It should work for + /* NOTE: This may still break if the underlying device is not the final + * device (and thus there are more headers to add...) It should work for * good-ole-ethernet though. */ if (skb_headroom(skb) < dev->hard_header_len) { @@ -422,14 +335,12 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, skb = skb_realloc_headroom(sk_tmp, dev->hard_header_len); kfree_skb(sk_tmp); if (skb == NULL) { - struct net_device_stats *stats = vlan_dev_get_stats(vdev); + struct net_device_stats *stats = &vdev->stats; stats->tx_dropped++; return -ENOMEM; } - VLAN_DEV_INFO(vdev)->cnt_inc_headroom_on_tx++; -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: %s: had to grow skb.\n", __FUNCTION__, vdev->name); -#endif + vlan_dev_info(vdev)->cnt_inc_headroom_on_tx++; + pr_debug("%s: %s: had to grow skb\n", __FUNCTION__, vdev->name); } if (build_vlan_header) { @@ -441,19 +352,19 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, else if (rc < 0) rc -= VLAN_HLEN; } else - /* If here, then we'll just make a normal looking ethernet frame, - * but, the hard_start_xmit method will insert the tag (it has to - * be able to do this for bridged and other skbs that don't come - * down the protocol stack in an orderly manner. + /* If here, then we'll just make a normal looking ethernet + * frame, but, the hard_start_xmit method will insert the tag + * (it has to be able to do this for bridged and other skbs + * that don't come down the protocol stack in an orderly manner. */ rc = dev_hard_header(skb, dev, type, daddr, saddr, len); return rc; } -int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct net_device_stats *stats = vlan_dev_get_stats(dev); + struct net_device_stats *stats = &dev->stats; struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); /* Handle non-VLAN frames if they are sent to us, for example by DHCP. @@ -463,24 +374,22 @@ int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) */ if (veth->h_vlan_proto != htons(ETH_P_8021Q) || - VLAN_DEV_INFO(dev)->flags & VLAN_FLAG_REORDER_HDR) { + vlan_dev_info(dev)->flags & VLAN_FLAG_REORDER_HDR) { int orig_headroom = skb_headroom(skb); unsigned short veth_TCI; /* This is not a VLAN frame...but we can fix that! */ - VLAN_DEV_INFO(dev)->cnt_encap_on_xmit++; + vlan_dev_info(dev)->cnt_encap_on_xmit++; -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: proto to encap: 0x%hx (hbo)\n", - __FUNCTION__, htons(veth->h_vlan_proto)); -#endif + pr_debug("%s: proto to encap: 0x%hx\n", + __FUNCTION__, htons(veth->h_vlan_proto)); /* Construct the second two bytes. This field looks something * like: * usr_priority: 3 bits (high bits) * CFI 1 bit * VLAN ID 12 bits (low bits) */ - veth_TCI = VLAN_DEV_INFO(dev)->vlan_id; + veth_TCI = vlan_dev_info(dev)->vlan_id; veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb); skb = __vlan_put_tag(skb, veth_TCI); @@ -489,32 +398,33 @@ int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) return 0; } - if (orig_headroom < VLAN_HLEN) { - VLAN_DEV_INFO(dev)->cnt_inc_headroom_on_tx++; - } + if (orig_headroom < VLAN_HLEN) + vlan_dev_info(dev)->cnt_inc_headroom_on_tx++; } -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: about to send skb: %p to dev: %s\n", + pr_debug("%s: about to send skb: %p to dev: %s\n", __FUNCTION__, skb, skb->dev->name); - printk(VLAN_DBG " %2hx.%2hx.%2hx.%2xh.%2hx.%2hx %2hx.%2hx.%2hx.%2hx.%2hx.%2hx %4hx %4hx %4hx\n", - veth->h_dest[0], veth->h_dest[1], veth->h_dest[2], veth->h_dest[3], veth->h_dest[4], veth->h_dest[5], - veth->h_source[0], veth->h_source[1], veth->h_source[2], veth->h_source[3], veth->h_source[4], veth->h_source[5], - veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto); -#endif + pr_debug(" " MAC_FMT " " MAC_FMT " %4hx %4hx %4hx\n", + veth->h_dest[0], veth->h_dest[1], veth->h_dest[2], + veth->h_dest[3], veth->h_dest[4], veth->h_dest[5], + veth->h_source[0], veth->h_source[1], veth->h_source[2], + veth->h_source[3], veth->h_source[4], veth->h_source[5], + veth->h_vlan_proto, veth->h_vlan_TCI, + veth->h_vlan_encapsulated_proto); stats->tx_packets++; /* for statics only */ stats->tx_bytes += skb->len; - skb->dev = VLAN_DEV_INFO(dev)->real_dev; + skb->dev = vlan_dev_info(dev)->real_dev; dev_queue_xmit(skb); return 0; } -int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev) { - struct net_device_stats *stats = vlan_dev_get_stats(dev); + struct net_device_stats *stats = &dev->stats; unsigned short veth_TCI; /* Construct the second two bytes. This field looks something @@ -523,25 +433,25 @@ int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev * CFI 1 bit * VLAN ID 12 bits (low bits) */ - veth_TCI = VLAN_DEV_INFO(dev)->vlan_id; + veth_TCI = vlan_dev_info(dev)->vlan_id; veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb); skb = __vlan_hwaccel_put_tag(skb, veth_TCI); stats->tx_packets++; stats->tx_bytes += skb->len; - skb->dev = VLAN_DEV_INFO(dev)->real_dev; + skb->dev = vlan_dev_info(dev)->real_dev; dev_queue_xmit(skb); return 0; } -int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) +static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) { /* TODO: gotta make sure the underlying layer can handle it, * maybe an IFF_VLAN_CAPABLE flag for devices? */ - if (VLAN_DEV_INFO(dev)->real_dev->mtu < new_mtu) + if (vlan_dev_info(dev)->real_dev->mtu < new_mtu) return -ERANGE; dev->mtu = new_mtu; @@ -552,7 +462,7 @@ int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, short vlan_prio) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio) vlan->nr_ingress_mappings--; @@ -565,7 +475,7 @@ void vlan_dev_set_ingress_priority(const struct net_device *dev, int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, short vlan_prio) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); struct vlan_priority_tci_mapping *mp = NULL; struct vlan_priority_tci_mapping *np; u32 vlan_qos = (vlan_prio << 13) & 0xE000; @@ -605,30 +515,28 @@ int vlan_dev_set_vlan_flag(const struct net_device *dev, { /* verify flag is supported */ if (flag == VLAN_FLAG_REORDER_HDR) { - if (flag_val) { - VLAN_DEV_INFO(dev)->flags |= VLAN_FLAG_REORDER_HDR; - } else { - VLAN_DEV_INFO(dev)->flags &= ~VLAN_FLAG_REORDER_HDR; - } + if (flag_val) + vlan_dev_info(dev)->flags |= VLAN_FLAG_REORDER_HDR; + else + vlan_dev_info(dev)->flags &= ~VLAN_FLAG_REORDER_HDR; return 0; } - printk(KERN_ERR "%s: flag %i is not valid.\n", __FUNCTION__, flag); return -EINVAL; } void vlan_dev_get_realdev_name(const struct net_device *dev, char *result) { - strncpy(result, VLAN_DEV_INFO(dev)->real_dev->name, 23); + strncpy(result, vlan_dev_info(dev)->real_dev->name, 23); } void vlan_dev_get_vid(const struct net_device *dev, unsigned short *result) { - *result = VLAN_DEV_INFO(dev)->vlan_id; + *result = vlan_dev_info(dev)->vlan_id; } -int vlan_dev_open(struct net_device *dev) +static int vlan_dev_open(struct net_device *dev) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); struct net_device *real_dev = vlan->real_dev; int err; @@ -650,9 +558,9 @@ int vlan_dev_open(struct net_device *dev) return 0; } -int vlan_dev_stop(struct net_device *dev) +static int vlan_dev_stop(struct net_device *dev) { - struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev; + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; dev_mc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) @@ -666,9 +574,9 @@ int vlan_dev_stop(struct net_device *dev) return 0; } -int vlan_set_mac_address(struct net_device *dev, void *p) +static int vlan_dev_set_mac_address(struct net_device *dev, void *p) { - struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev; + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; struct sockaddr *addr = p; int err; @@ -692,16 +600,16 @@ out: return 0; } -int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev; + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; struct ifreq ifrr; int err = -EOPNOTSUPP; strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; - switch(cmd) { + switch (cmd) { case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: @@ -716,9 +624,9 @@ int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return err; } -void vlan_change_rx_flags(struct net_device *dev, int change) +static void vlan_dev_change_rx_flags(struct net_device *dev, int change) { - struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev; + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); @@ -726,8 +634,78 @@ void vlan_change_rx_flags(struct net_device *dev, int change) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); } -/** Taken from Gleb + Lennert's VLAN code, and modified... */ -void vlan_dev_set_multicast_list(struct net_device *vlan_dev) +static void vlan_dev_set_multicast_list(struct net_device *vlan_dev) +{ + dev_mc_sync(vlan_dev_info(vlan_dev)->real_dev, vlan_dev); +} + +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key vlan_netdev_xmit_lock_key; + +static const struct header_ops vlan_header_ops = { + .create = vlan_dev_hard_header, + .rebuild = vlan_dev_rebuild_header, + .parse = eth_header_parse, +}; + +static int vlan_dev_init(struct net_device *dev) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + int subclass = 0; + + /* IFF_BROADCAST|IFF_MULTICAST; ??? */ + dev->flags = real_dev->flags & ~IFF_UP; + dev->iflink = real_dev->ifindex; + dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | + (1<<__LINK_STATE_DORMANT))) | + (1<<__LINK_STATE_PRESENT); + + /* ipv6 shared card related stuff */ + dev->dev_id = real_dev->dev_id; + + if (is_zero_ether_addr(dev->dev_addr)) + memcpy(dev->dev_addr, real_dev->dev_addr, dev->addr_len); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); + + if (real_dev->features & NETIF_F_HW_VLAN_TX) { + dev->header_ops = real_dev->header_ops; + dev->hard_header_len = real_dev->hard_header_len; + dev->hard_start_xmit = vlan_dev_hwaccel_hard_start_xmit; + } else { + dev->header_ops = &vlan_header_ops; + dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; + dev->hard_start_xmit = vlan_dev_hard_start_xmit; + } + + if (real_dev->priv_flags & IFF_802_1Q_VLAN) + subclass = 1; + + lockdep_set_class_and_subclass(&dev->_xmit_lock, + &vlan_netdev_xmit_lock_key, subclass); + return 0; +} + +void vlan_setup(struct net_device *dev) { - dev_mc_sync(VLAN_DEV_INFO(vlan_dev)->real_dev, vlan_dev); + ether_setup(dev); + + dev->priv_flags |= IFF_802_1Q_VLAN; + dev->tx_queue_len = 0; + + dev->change_mtu = vlan_dev_change_mtu; + dev->init = vlan_dev_init; + dev->open = vlan_dev_open; + dev->stop = vlan_dev_stop; + dev->set_mac_address = vlan_dev_set_mac_address; + dev->set_multicast_list = vlan_dev_set_multicast_list; + dev->change_rx_flags = vlan_dev_change_rx_flags; + dev->do_ioctl = vlan_dev_ioctl; + dev->destructor = free_netdev; + + memset(dev->broadcast, 0, ETH_ALEN); } diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 0996185e2ed..e32eeb37987 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -75,7 +75,7 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[]) static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct nlattr *attr; @@ -104,7 +104,7 @@ static int vlan_changelink(struct net_device *dev, static int vlan_newlink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); struct net_device *real_dev; int err; @@ -137,11 +137,6 @@ static int vlan_newlink(struct net_device *dev, return register_vlan_dev(dev); } -static void vlan_dellink(struct net_device *dev) -{ - unregister_vlan_device(dev); -} - static inline size_t vlan_qos_map_size(unsigned int n) { if (n == 0) @@ -153,7 +148,7 @@ static inline size_t vlan_qos_map_size(unsigned int n) static size_t vlan_get_size(const struct net_device *dev) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); return nla_total_size(2) + /* IFLA_VLAN_ID */ vlan_qos_map_size(vlan->nr_ingress_mappings) + @@ -162,14 +157,14 @@ static size_t vlan_get_size(const struct net_device *dev) static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { - struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev); + struct vlan_dev_info *vlan = vlan_dev_info(dev); struct vlan_priority_tci_mapping *pm; struct ifla_vlan_flags f; struct ifla_vlan_qos_mapping m; struct nlattr *nest; unsigned int i; - NLA_PUT_U16(skb, IFLA_VLAN_ID, VLAN_DEV_INFO(dev)->vlan_id); + NLA_PUT_U16(skb, IFLA_VLAN_ID, vlan_dev_info(dev)->vlan_id); if (vlan->flags) { f.flags = vlan->flags; f.mask = ~0; @@ -226,7 +221,7 @@ struct rtnl_link_ops vlan_link_ops __read_mostly = { .validate = vlan_validate, .newlink = vlan_newlink, .changelink = vlan_changelink, - .dellink = vlan_dellink, + .dellink = unregister_vlan_dev, .get_size = vlan_get_size, .fill_info = vlan_fill_info, }; diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 6cefdf8e381..a0ec4792559 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -125,10 +125,10 @@ static struct proc_dir_entry *proc_vlan_conf; /* Strings */ static const char *vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { - [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", - [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", - [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD]= "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", - [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", + [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", + [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", + [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", + [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", }; /* * Interface functions @@ -158,15 +158,18 @@ void vlan_proc_cleanup(void) int __init vlan_proc_init(void) { proc_vlan_dir = proc_mkdir(name_root, init_net.proc_net); - if (proc_vlan_dir) { - proc_vlan_conf = create_proc_entry(name_conf, - S_IFREG|S_IRUSR|S_IWUSR, - proc_vlan_dir); - if (proc_vlan_conf) { - proc_vlan_conf->proc_fops = &vlan_fops; - return 0; - } - } + if (!proc_vlan_dir) + goto err; + + proc_vlan_conf = create_proc_entry(name_conf, S_IFREG|S_IRUSR|S_IWUSR, + proc_vlan_dir); + if (!proc_vlan_conf) + goto err; + proc_vlan_conf->proc_fops = &vlan_fops; + return 0; + +err: + pr_err("%s: can't create entry in proc filesystem!\n", __FUNCTION__); vlan_proc_cleanup(); return -ENOBUFS; } @@ -175,16 +178,9 @@ int __init vlan_proc_init(void) * Add directory entry for VLAN device. */ -int vlan_proc_add_dev (struct net_device *vlandev) +int vlan_proc_add_dev(struct net_device *vlandev) { - struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev); - - if (!(vlandev->priv_flags & IFF_802_1Q_VLAN)) { - printk(KERN_ERR - "ERROR: vlan_proc_add, device -:%s:- is NOT a VLAN\n", - vlandev->name); - return -EINVAL; - } + struct vlan_dev_info *dev_info = vlan_dev_info(vlandev); dev_info->dent = create_proc_entry(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR, @@ -194,11 +190,6 @@ int vlan_proc_add_dev (struct net_device *vlandev) dev_info->dent->proc_fops = &vlandev_fops; dev_info->dent->data = vlandev; - -#ifdef VLAN_DEBUG - printk(KERN_ERR "vlan_proc_add, device -:%s:- being added.\n", - vlandev->name); -#endif return 0; } @@ -207,28 +198,12 @@ int vlan_proc_add_dev (struct net_device *vlandev) */ int vlan_proc_rem_dev(struct net_device *vlandev) { - if (!vlandev) { - printk(VLAN_ERR "%s: invalid argument: %p\n", - __FUNCTION__, vlandev); - return -EINVAL; - } - - if (!(vlandev->priv_flags & IFF_802_1Q_VLAN)) { - printk(VLAN_DBG "%s: invalid argument, device: %s is not a VLAN device, priv_flags: 0x%4hX.\n", - __FUNCTION__, vlandev->name, vlandev->priv_flags); - return -EINVAL; - } - -#ifdef VLAN_DEBUG - printk(VLAN_DBG "%s: dev: %p\n", __FUNCTION__, vlandev); -#endif - /** NOTE: This will consume the memory pointed to by dent, it seems. */ - if (VLAN_DEV_INFO(vlandev)->dent) { - remove_proc_entry(VLAN_DEV_INFO(vlandev)->dent->name, proc_vlan_dir); - VLAN_DEV_INFO(vlandev)->dent = NULL; + if (vlan_dev_info(vlandev)->dent) { + remove_proc_entry(vlan_dev_info(vlandev)->dent->name, + proc_vlan_dir); + vlan_dev_info(vlandev)->dent = NULL; } - return 0; } @@ -245,6 +220,7 @@ static inline int is_vlan_dev(struct net_device *dev) /* start read of /proc/net/vlan/config */ static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { struct net_device *dev; loff_t i = 1; @@ -286,6 +262,7 @@ static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void vlan_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { read_unlock(&dev_base_lock); } @@ -301,10 +278,10 @@ static int vlan_seq_show(struct seq_file *seq, void *v) nmtype = vlan_name_type_str[vlan_name_type]; seq_printf(seq, "Name-Type: %s\n", - nmtype ? nmtype : "UNKNOWN" ); + nmtype ? nmtype : "UNKNOWN"); } else { const struct net_device *vlandev = v; - const struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev); + const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev); seq_printf(seq, "%-15s| %d | %s\n", vlandev->name, dev_info->vlan_id, dev_info->real_dev->name); @@ -315,20 +292,18 @@ static int vlan_seq_show(struct seq_file *seq, void *v) static int vlandev_seq_show(struct seq_file *seq, void *offset) { struct net_device *vlandev = (struct net_device *) seq->private; - const struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev); - struct net_device_stats *stats; + const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev); + struct net_device_stats *stats = &vlandev->stats; static const char fmt[] = "%30s %12lu\n"; int i; if (!(vlandev->priv_flags & IFF_802_1Q_VLAN)) return 0; - seq_printf(seq, "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n", - vlandev->name, dev_info->vlan_id, - (int)(dev_info->flags & 1), vlandev->priv_flags); - - - stats = vlan_dev_get_stats(vlandev); + seq_printf(seq, + "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n", + vlandev->name, dev_info->vlan_id, + (int)(dev_info->flags & 1), vlandev->priv_flags); seq_printf(seq, fmt, "total frames received", stats->rx_packets); seq_printf(seq, fmt, "total bytes received", stats->rx_bytes); @@ -342,16 +317,16 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset) dev_info->cnt_encap_on_xmit); seq_printf(seq, "Device: %s", dev_info->real_dev->name); /* now show all PRIORITY mappings relating to this VLAN */ - seq_printf(seq, - "\nINGRESS priority mappings: 0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n", - dev_info->ingress_priority_map[0], - dev_info->ingress_priority_map[1], - dev_info->ingress_priority_map[2], - dev_info->ingress_priority_map[3], - dev_info->ingress_priority_map[4], - dev_info->ingress_priority_map[5], - dev_info->ingress_priority_map[6], - dev_info->ingress_priority_map[7]); + seq_printf(seq, "\nINGRESS priority mappings: " + "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n", + dev_info->ingress_priority_map[0], + dev_info->ingress_priority_map[1], + dev_info->ingress_priority_map[2], + dev_info->ingress_priority_map[3], + dev_info->ingress_priority_map[4], + dev_info->ingress_priority_map[5], + dev_info->ingress_priority_map[6], + dev_info->ingress_priority_map[7]); seq_printf(seq, "EGRESSS priority Mappings: "); for (i = 0; i < 16; i++) { diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h index f908ee332fd..da542cacc5a 100644 --- a/net/8021q/vlanproc.h +++ b/net/8021q/vlanproc.h @@ -4,16 +4,15 @@ #ifdef CONFIG_PROC_FS int vlan_proc_init(void); int vlan_proc_rem_dev(struct net_device *vlandev); -int vlan_proc_add_dev (struct net_device *vlandev); -void vlan_proc_cleanup (void); +int vlan_proc_add_dev(struct net_device *vlandev); +void vlan_proc_cleanup(void); #else /* No CONFIG_PROC_FS */ #define vlan_proc_init() (0) -#define vlan_proc_cleanup() do {} while(0) -#define vlan_proc_add_dev(dev) ({(void)(dev), 0;}) -#define vlan_proc_rem_dev(dev) ({(void)(dev), 0;}) - +#define vlan_proc_cleanup() do {} while (0) +#define vlan_proc_add_dev(dev) ({(void)(dev), 0; }) +#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; }) #endif #endif /* !(__BEN_VLAN_PROC_INC__) */ diff --git a/net/Kconfig b/net/Kconfig index ab4e6da5012..b6a5d454f2f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -144,9 +144,21 @@ config NETFILTER_DEBUG You can say Y here if you want to get additional messages useful in debugging the netfilter code. +config NETFILTER_ADVANCED + bool "Advanced netfilter configuration" + depends on NETFILTER + default y + help + If you say Y here you can select between all the netfilter modules. + If you say N the more ununsual ones will not be shown and the + basic ones needed by most people will default to 'M'. + + If unsure, say Y. + config BRIDGE_NETFILTER bool "Bridged IP/ARP packets filtering" depends on BRIDGE && NETFILTER && INET + depends on NETFILTER_ADVANCED default y ---help--- Enabling this option will let arptables resp. iptables see bridged @@ -218,6 +230,7 @@ endmenu endmenu source "net/ax25/Kconfig" +source "net/can/Kconfig" source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" diff --git a/net/Makefile b/net/Makefile index bbe7d2a4148..b7a13643b54 100644 --- a/net/Makefile +++ b/net/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_LAPB) += lapb/ obj-$(CONFIG_NETROM) += netrom/ obj-$(CONFIG_ROSE) += rose/ obj-$(CONFIG_AX25) += ax25/ +obj-$(CONFIG_CAN) += can/ obj-$(CONFIG_IRDA) += irda/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 6c5c6dc098e..18058bbc796 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -874,9 +874,7 @@ void __init aarp_proto_init(void) aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv); if (!aarp_dl) printk(KERN_CRIT "Unable to register AARP with SNAP.\n"); - init_timer(&aarp_timer); - aarp_timer.function = aarp_expire_timeout; - aarp_timer.data = 0; + setup_timer(&aarp_timer, aarp_expire_timeout, 0); aarp_timer.expires = jiffies + sysctl_aarp_expiry_time; add_timer(&aarp_timer); register_netdevice_notifier(&aarp_notifier); @@ -943,6 +941,7 @@ static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos) } static void *aarp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(aarp_lock) { struct aarp_iter_state *iter = seq->private; @@ -977,6 +976,7 @@ static void *aarp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void aarp_seq_stop(struct seq_file *seq, void *v) + __releases(aarp_lock) { read_unlock_bh(&aarp_lock); } diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index 05d9652afcb..8e8dcfd532d 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -27,6 +27,7 @@ static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos) } static void *atalk_seq_interface_start(struct seq_file *seq, loff_t *pos) + __acquires(atalk_interfaces_lock) { loff_t l = *pos; @@ -52,6 +53,7 @@ out: } static void atalk_seq_interface_stop(struct seq_file *seq, void *v) + __releases(atalk_interfaces_lock) { read_unlock_bh(&atalk_interfaces_lock); } @@ -86,6 +88,7 @@ static __inline__ struct atalk_route *atalk_get_route_idx(loff_t pos) } static void *atalk_seq_route_start(struct seq_file *seq, loff_t *pos) + __acquires(atalk_routes_lock) { loff_t l = *pos; @@ -111,6 +114,7 @@ out: } static void atalk_seq_route_stop(struct seq_file *seq, void *v) + __releases(atalk_routes_lock) { read_unlock_bh(&atalk_routes_lock); } @@ -154,6 +158,7 @@ found: } static void *atalk_seq_socket_start(struct seq_file *seq, loff_t *pos) + __acquires(atalk_sockets_lock) { loff_t l = *pos; @@ -176,6 +181,7 @@ out: } static void atalk_seq_socket_stop(struct seq_file *seq, void *v) + __releases(atalk_sockets_lock) { read_unlock_bh(&atalk_sockets_lock); } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index e0d37d6dc1f..3be55c8ca4e 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -177,10 +177,9 @@ static inline void atalk_destroy_socket(struct sock *sk) if (atomic_read(&sk->sk_wmem_alloc) || atomic_read(&sk->sk_rmem_alloc)) { - init_timer(&sk->sk_timer); + setup_timer(&sk->sk_timer, atalk_destroy_timer, + (unsigned long)sk); sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; - sk->sk_timer.function = atalk_destroy_timer; - sk->sk_timer.data = (unsigned long)sk; add_timer(&sk->sk_timer); } else sock_put(sk); diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c index 7df1778e221..621805dfa2f 100644 --- a/net/appletalk/sysctl_net_atalk.c +++ b/net/appletalk/sysctl_net_atalk.c @@ -49,31 +49,17 @@ static struct ctl_table atalk_table[] = { { 0 }, }; -static struct ctl_table atalk_dir_table[] = { - { - .ctl_name = NET_ATALK, - .procname = "appletalk", - .mode = 0555, - .child = atalk_table, - }, - { 0 }, -}; - -static struct ctl_table atalk_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = atalk_dir_table, - }, - { 0 }, +static struct ctl_path atalk_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "appletalk", .ctl_name = NET_ATALK, }, + { } }; static struct ctl_table_header *atalk_table_header; void atalk_register_sysctl(void) { - atalk_table_header = register_sysctl_table(atalk_root_table); + atalk_table_header = register_sysctl_paths(atalk_path, atalk_table); } void atalk_unregister_sysctl(void) diff --git a/net/atm/Kconfig b/net/atm/Kconfig index 21ff276b2d8..754ea103b37 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -1,10 +1,9 @@ # -# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL) +# Asynchronous Transfer Mode (ATM) # config ATM - tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "Asynchronous Transfer Mode (ATM)" ---help--- ATM is a high-speed networking technology for Local Area Networks and Wide Area Networks. It uses a fixed packet size and is @@ -20,7 +19,7 @@ config ATM further details. config ATM_CLIP - tristate "Classical IP over ATM (EXPERIMENTAL)" + tristate "Classical IP over ATM" depends on ATM && INET help Classical IP over ATM for PVCs and SVCs, supporting InARP and @@ -29,7 +28,7 @@ config ATM_CLIP (LANE)" below. config ATM_CLIP_NO_ICMP - bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)" + bool "Do NOT send ICMP if no neighbour" depends on ATM_CLIP help Normally, an "ICMP host unreachable" message is sent if a neighbour @@ -39,7 +38,7 @@ config ATM_CLIP_NO_ICMP such neighbours are silently discarded instead. config ATM_LANE - tristate "LAN Emulation (LANE) support (EXPERIMENTAL)" + tristate "LAN Emulation (LANE) support" depends on ATM help LAN Emulation emulates services of existing LANs across an ATM @@ -48,7 +47,7 @@ config ATM_LANE ELAN and Ethernet segments. You need LANE if you want to try MPOA. config ATM_MPOA - tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)" + tristate "Multi-Protocol Over ATM (MPOA) support" depends on ATM && INET && ATM_LANE!=n help Multi-Protocol Over ATM allows ATM edge devices such as routers, diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 9ef07eda2c4..1b88311f213 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -9,13 +9,15 @@ #define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev) -static ssize_t show_type(struct class_device *cdev, char *buf) +static ssize_t show_type(struct device *cdev, + struct device_attribute *attr, char *buf) { struct atm_dev *adev = to_atm_dev(cdev); return sprintf(buf, "%s\n", adev->type); } -static ssize_t show_address(struct class_device *cdev, char *buf) +static ssize_t show_address(struct device *cdev, + struct device_attribute *attr, char *buf) { char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); @@ -28,7 +30,8 @@ static ssize_t show_address(struct class_device *cdev, char *buf) return pos - buf; } -static ssize_t show_atmaddress(struct class_device *cdev, char *buf) +static ssize_t show_atmaddress(struct device *cdev, + struct device_attribute *attr, char *buf) { unsigned long flags; char *pos = buf; @@ -54,7 +57,8 @@ static ssize_t show_atmaddress(struct class_device *cdev, char *buf) return pos - buf; } -static ssize_t show_carrier(struct class_device *cdev, char *buf) +static ssize_t show_carrier(struct device *cdev, + struct device_attribute *attr, char *buf) { char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); @@ -65,7 +69,8 @@ static ssize_t show_carrier(struct class_device *cdev, char *buf) return pos - buf; } -static ssize_t show_link_rate(struct class_device *cdev, char *buf) +static ssize_t show_link_rate(struct device *cdev, + struct device_attribute *attr, char *buf) { char *pos = buf; struct atm_dev *adev = to_atm_dev(cdev); @@ -90,22 +95,23 @@ static ssize_t show_link_rate(struct class_device *cdev, char *buf) return pos - buf; } -static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); -static CLASS_DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL); -static CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); -static CLASS_DEVICE_ATTR(type, S_IRUGO, show_type, NULL); -static CLASS_DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL); - -static struct class_device_attribute *atm_attrs[] = { - &class_device_attr_atmaddress, - &class_device_attr_address, - &class_device_attr_carrier, - &class_device_attr_type, - &class_device_attr_link_rate, +static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL); +static DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); +static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); +static DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL); + +static struct device_attribute *atm_attrs[] = { + &dev_attr_atmaddress, + &dev_attr_address, + &dev_attr_carrier, + &dev_attr_type, + &dev_attr_link_rate, NULL }; -static int atm_uevent(struct class_device *cdev, struct kobj_uevent_env *env) + +static int atm_uevent(struct device *cdev, struct kobj_uevent_env *env) { struct atm_dev *adev; @@ -122,7 +128,7 @@ static int atm_uevent(struct class_device *cdev, struct kobj_uevent_env *env) return 0; } -static void atm_release(struct class_device *cdev) +static void atm_release(struct device *cdev) { struct atm_dev *adev = to_atm_dev(cdev); @@ -131,25 +137,25 @@ static void atm_release(struct class_device *cdev) static struct class atm_class = { .name = "atm", - .release = atm_release, - .uevent = atm_uevent, + .dev_release = atm_release, + .dev_uevent = atm_uevent, }; int atm_register_sysfs(struct atm_dev *adev) { - struct class_device *cdev = &adev->class_dev; + struct device *cdev = &adev->class_dev; int i, j, err; cdev->class = &atm_class; - class_set_devdata(cdev, adev); + dev_set_drvdata(cdev, adev); - snprintf(cdev->class_id, BUS_ID_SIZE, "%s%d", adev->type, adev->number); - err = class_device_register(cdev); + snprintf(cdev->bus_id, BUS_ID_SIZE, "%s%d", adev->type, adev->number); + err = device_register(cdev); if (err < 0) return err; for (i = 0; atm_attrs[i]; i++) { - err = class_device_create_file(cdev, atm_attrs[i]); + err = device_create_file(cdev, atm_attrs[i]); if (err) goto err_out; } @@ -158,16 +164,16 @@ int atm_register_sysfs(struct atm_dev *adev) err_out: for (j = 0; j < i; j++) - class_device_remove_file(cdev, atm_attrs[j]); - class_device_del(cdev); + device_remove_file(cdev, atm_attrs[j]); + device_del(cdev); return err; } void atm_unregister_sysfs(struct atm_dev *adev) { - struct class_device *cdev = &adev->class_dev; + struct device *cdev = &adev->class_dev; - class_device_del(cdev); + device_del(cdev); } int __init atm_sysfs_init(void) diff --git a/net/atm/br2684.c b/net/atm/br2684.c index ba6428f204f..574d9a96417 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -1,8 +1,10 @@ /* -Experimental ethernet netdevice using ATM AAL5 as underlying carrier -(RFC1483 obsoleted by RFC2684) for Linux 2.4 -Author: Marcell GAL, 2000, XDSL Ltd, Hungary -*/ + * Ethernet netdevice using ATM AAL5 as underlying carrier + * (RFC1483 obsoleted by RFC2684) for Linux + * + * Authors: Marcell GAL, 2000, XDSL Ltd, Hungary + * Eric Kinzie, 2006-2007, US Naval Research Laboratory + */ #include <linux/module.h> #include <linux/init.h> @@ -39,21 +41,35 @@ static void skb_debug(const struct sk_buff *skb) #define skb_debug(skb) do {} while (0) #endif +#define BR2684_ETHERTYPE_LEN 2 +#define BR2684_PAD_LEN 2 + +#define LLC 0xaa, 0xaa, 0x03 +#define SNAP_BRIDGED 0x00, 0x80, 0xc2 +#define SNAP_ROUTED 0x00, 0x00, 0x00 +#define PID_ETHERNET 0x00, 0x07 +#define ETHERTYPE_IPV4 0x08, 0x00 +#define ETHERTYPE_IPV6 0x86, 0xdd +#define PAD_BRIDGED 0x00, 0x00 + +static unsigned char ethertype_ipv4[] = { ETHERTYPE_IPV4 }; +static unsigned char ethertype_ipv6[] = { ETHERTYPE_IPV6 }; static unsigned char llc_oui_pid_pad[] = - { 0xAA, 0xAA, 0x03, 0x00, 0x80, 0xC2, 0x00, 0x07, 0x00, 0x00 }; -#define PADLEN (2) + { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED }; +static unsigned char llc_oui_ipv4[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 }; +static unsigned char llc_oui_ipv6[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 }; enum br2684_encaps { - e_vc = BR2684_ENCAPS_VC, + e_vc = BR2684_ENCAPS_VC, e_llc = BR2684_ENCAPS_LLC, }; struct br2684_vcc { - struct atm_vcc *atmvcc; + struct atm_vcc *atmvcc; struct net_device *device; - /* keep old push,pop functions for chaining */ - void (*old_push)(struct atm_vcc *vcc,struct sk_buff *skb); - /* void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); */ + /* keep old push, pop functions for chaining */ + void (*old_push) (struct atm_vcc * vcc, struct sk_buff * skb); + /* void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); */ enum br2684_encaps encaps; struct list_head brvccs; #ifdef CONFIG_ATM_BR2684_IPFILTER @@ -66,9 +82,10 @@ struct br2684_dev { struct net_device *net_dev; struct list_head br2684_devs; int number; - struct list_head brvccs; /* one device <=> one vcc (before xmas) */ + struct list_head brvccs; /* one device <=> one vcc (before xmas) */ struct net_device_stats stats; int mac_was_set; + enum br2684_payload payload; }; /* @@ -84,7 +101,7 @@ static LIST_HEAD(br2684_devs); static inline struct br2684_dev *BRPRIV(const struct net_device *net_dev) { - return (struct br2684_dev *) net_dev->priv; + return (struct br2684_dev *)net_dev->priv; } static inline struct net_device *list_entry_brdev(const struct list_head *le) @@ -94,7 +111,7 @@ static inline struct net_device *list_entry_brdev(const struct list_head *le) static inline struct br2684_vcc *BR2684_VCC(const struct atm_vcc *atmvcc) { - return (struct br2684_vcc *) (atmvcc->user_back); + return (struct br2684_vcc *)(atmvcc->user_back); } static inline struct br2684_vcc *list_entry_brvcc(const struct list_head *le) @@ -132,10 +149,11 @@ static struct net_device *br2684_find_dev(const struct br2684_if_spec *s) * otherwise false */ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev, - struct br2684_vcc *brvcc) + struct br2684_vcc *brvcc) { struct atm_vcc *atmvcc; int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2; + if (skb_headroom(skb) < minheadroom) { struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom); brvcc->copies_needed++; @@ -146,23 +164,48 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev, } skb = skb2; } - skb_push(skb, minheadroom); - if (brvcc->encaps == e_llc) - skb_copy_to_linear_data(skb, llc_oui_pid_pad, 10); - else - memset(skb->data, 0, 2); + + if (brvcc->encaps == e_llc) { + if (brdev->payload == p_bridged) { + skb_push(skb, sizeof(llc_oui_pid_pad)); + skb_copy_to_linear_data(skb, llc_oui_pid_pad, + sizeof(llc_oui_pid_pad)); + } else if (brdev->payload == p_routed) { + unsigned short prot = ntohs(skb->protocol); + + skb_push(skb, sizeof(llc_oui_ipv4)); + switch (prot) { + case ETH_P_IP: + skb_copy_to_linear_data(skb, llc_oui_ipv4, + sizeof(llc_oui_ipv4)); + break; + case ETH_P_IPV6: + skb_copy_to_linear_data(skb, llc_oui_ipv6, + sizeof(llc_oui_ipv6)); + break; + default: + dev_kfree_skb(skb); + return 0; + } + } + } else { + skb_push(skb, 2); + if (brdev->payload == p_bridged) + memset(skb->data, 0, 2); + } skb_debug(skb); ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc; pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev); if (!atm_may_send(atmvcc, skb->truesize)) { - /* we free this here for now, because we cannot know in a higher - layer whether the skb point it supplied wasn't freed yet. - now, it always is. - */ + /* + * We free this here for now, because we cannot know in a higher + * layer whether the skb pointer it supplied wasn't freed yet. + * Now, it always is. + */ dev_kfree_skb(skb); return 0; - } + } atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc); ATM_SKB(skb)->atm_options = atmvcc->atm_options; brdev->stats.tx_packets++; @@ -172,10 +215,9 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev, } static inline struct br2684_vcc *pick_outgoing_vcc(struct sk_buff *skb, - struct br2684_dev *brdev) + struct br2684_dev *brdev) { - return list_empty(&brdev->brvccs) ? NULL : - list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */ + return list_empty(&brdev->brvccs) ? NULL : list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */ } static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -199,11 +241,10 @@ static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev) /* * We should probably use netif_*_queue() here, but that * involves added complication. We need to walk before - * we can run + * we can run. + * + * Don't free here! this pointer might be no longer valid! */ - /* don't free here! this pointer might be no longer valid! - dev_kfree_skb(skb); - */ brdev->stats.tx_errors++; brdev->stats.tx_fifo_errors++; } @@ -217,12 +258,11 @@ static struct net_device_stats *br2684_get_stats(struct net_device *dev) return &BRPRIV(dev)->stats; } - /* * We remember when the MAC gets set, so we don't override it later with * the ESI of the ATM card of the first VC */ -static int (*my_eth_mac_addr)(struct net_device *, void *); +static int (*my_eth_mac_addr) (struct net_device *, void *); static int br2684_mac_addr(struct net_device *dev, void *p) { int err = my_eth_mac_addr(dev, p); @@ -233,7 +273,7 @@ static int br2684_mac_addr(struct net_device *dev, void *p) #ifdef CONFIG_ATM_BR2684_IPFILTER /* this IOCTL is experimental. */ -static int br2684_setfilt(struct atm_vcc *atmvcc, void __user *arg) +static int br2684_setfilt(struct atm_vcc *atmvcc, void __user * arg) { struct br2684_vcc *brvcc; struct br2684_filter_set fs; @@ -243,13 +283,12 @@ static int br2684_setfilt(struct atm_vcc *atmvcc, void __user *arg) if (fs.ifspec.method != BR2684_FIND_BYNOTHING) { /* * This is really a per-vcc thing, but we can also search - * by device + * by device. */ struct br2684_dev *brdev; read_lock(&devs_lock); brdev = BRPRIV(br2684_find_dev(&fs.ifspec)); - if (brdev == NULL || list_empty(&brdev->brvccs) || - brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */ + if (brdev == NULL || list_empty(&brdev->brvccs) || brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */ brvcc = NULL; else brvcc = list_entry_brvcc(brdev->brvccs.next); @@ -267,15 +306,16 @@ static inline int packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb) { if (brvcc->filter.netmask == 0) - return 0; /* no filter in place */ + return 0; /* no filter in place */ if (type == htons(ETH_P_IP) && - (((struct iphdr *) (skb->data))->daddr & brvcc->filter. + (((struct iphdr *)(skb->data))->daddr & brvcc->filter. netmask) == brvcc->filter.prefix) return 0; if (type == htons(ETH_P_ARP)) return 0; - /* TODO: we should probably filter ARPs too.. don't want to have - * them returning values that don't make sense, or is that ok? + /* + * TODO: we should probably filter ARPs too.. don't want to have + * them returning values that don't make sense, or is that ok? */ return 1; /* drop */ } @@ -299,7 +339,6 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) struct br2684_vcc *brvcc = BR2684_VCC(atmvcc); struct net_device *net_dev = brvcc->device; struct br2684_dev *brdev = BRPRIV(net_dev); - int plen = sizeof(llc_oui_pid_pad) + ETH_HLEN; pr_debug("br2684_push\n"); @@ -320,35 +359,58 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) atm_return(atmvcc, skb->truesize); pr_debug("skb from brdev %p\n", brdev); if (brvcc->encaps == e_llc) { - /* let us waste some time for checking the encapsulation. - Note, that only 7 char is checked so frames with a valid FCS - are also accepted (but FCS is not checked of course) */ - if (memcmp(skb->data, llc_oui_pid_pad, 7)) { + + if (skb->len > 7 && skb->data[7] == 0x01) + __skb_trim(skb, skb->len - 4); + + /* accept packets that have "ipv[46]" in the snap header */ + if ((skb->len >= (sizeof(llc_oui_ipv4))) + && + (memcmp + (skb->data, llc_oui_ipv4, + sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) { + if (memcmp + (skb->data + 6, ethertype_ipv6, + sizeof(ethertype_ipv6)) == 0) + skb->protocol = __constant_htons(ETH_P_IPV6); + else if (memcmp + (skb->data + 6, ethertype_ipv4, + sizeof(ethertype_ipv4)) == 0) + skb->protocol = __constant_htons(ETH_P_IP); + else { + brdev->stats.rx_errors++; + dev_kfree_skb(skb); + return; + } + skb_pull(skb, sizeof(llc_oui_ipv4)); + skb_reset_network_header(skb); + skb->pkt_type = PACKET_HOST; + /* + * Let us waste some time for checking the encapsulation. + * Note, that only 7 char is checked so frames with a valid FCS + * are also accepted (but FCS is not checked of course). + */ + } else if ((skb->len >= sizeof(llc_oui_pid_pad)) && + (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) { + skb_pull(skb, sizeof(llc_oui_pid_pad)); + skb->protocol = eth_type_trans(skb, net_dev); + } else { brdev->stats.rx_errors++; dev_kfree_skb(skb); return; } - /* Strip FCS if present */ - if (skb->len > 7 && skb->data[7] == 0x01) - __skb_trim(skb, skb->len - 4); } else { - plen = PADLEN + ETH_HLEN; /* pad, dstmac,srcmac, ethtype */ /* first 2 chars should be 0 */ if (*((u16 *) (skb->data)) != 0) { brdev->stats.rx_errors++; dev_kfree_skb(skb); return; } - } - if (skb->len < plen) { - brdev->stats.rx_errors++; - dev_kfree_skb(skb); /* dev_ not needed? */ - return; + skb_pull(skb, BR2684_PAD_LEN + ETH_HLEN); /* pad, dstmac, srcmac, ethtype */ + skb->protocol = eth_type_trans(skb, net_dev); } - skb_pull(skb, plen - ETH_HLEN); - skb->protocol = eth_type_trans(skb, net_dev); #ifdef CONFIG_ATM_BR2684_IPFILTER if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb))) { brdev->stats.rx_dropped++; @@ -372,11 +434,12 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) netif_rx(skb); } -static int br2684_regvcc(struct atm_vcc *atmvcc, void __user *arg) +/* + * Assign a vcc to a dev + * Note: we do not have explicit unassign, but look at _push() + */ +static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) { -/* assign a vcc to a dev -Note: we do not have explicit unassign, but look at _push() -*/ int err; struct br2684_vcc *brvcc; struct sk_buff *skb; @@ -395,7 +458,7 @@ Note: we do not have explicit unassign, but look at _push() net_dev = br2684_find_dev(&be.ifspec); if (net_dev == NULL) { printk(KERN_ERR - "br2684: tried to attach to non-existant device\n"); + "br2684: tried to attach to non-existant device\n"); err = -ENXIO; goto error; } @@ -411,13 +474,15 @@ Note: we do not have explicit unassign, but look at _push() } if (be.fcs_in != BR2684_FCSIN_NO || be.fcs_out != BR2684_FCSOUT_NO || be.fcs_auto || be.has_vpiid || be.send_padding || (be.encaps != - BR2684_ENCAPS_VC && be.encaps != BR2684_ENCAPS_LLC) || - be.min_size != 0) { + BR2684_ENCAPS_VC + && be.encaps != + BR2684_ENCAPS_LLC) + || be.min_size != 0) { err = -EINVAL; goto error; } - pr_debug("br2684_regvcc vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, - brvcc); + pr_debug("br2684_regvcc vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, + be.encaps, brvcc); if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) { unsigned char *esi = atmvcc->dev->esi; if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5]) @@ -430,7 +495,7 @@ Note: we do not have explicit unassign, but look at _push() brvcc->device = net_dev; brvcc->atmvcc = atmvcc; atmvcc->user_back = brvcc; - brvcc->encaps = (enum br2684_encaps) be.encaps; + brvcc->encaps = (enum br2684_encaps)be.encaps; brvcc->old_push = atmvcc->push; barrier(); atmvcc->push = br2684_push; @@ -461,7 +526,7 @@ Note: we do not have explicit unassign, but look at _push() } __module_get(THIS_MODULE); return 0; - error: + error: write_unlock_irq(&devs_lock); kfree(brvcc); return err; @@ -482,25 +547,52 @@ static void br2684_setup(struct net_device *netdev) INIT_LIST_HEAD(&brdev->brvccs); } -static int br2684_create(void __user *arg) +static void br2684_setup_routed(struct net_device *netdev) +{ + struct br2684_dev *brdev = BRPRIV(netdev); + brdev->net_dev = netdev; + + netdev->hard_header_len = 0; + my_eth_mac_addr = netdev->set_mac_address; + netdev->set_mac_address = br2684_mac_addr; + netdev->hard_start_xmit = br2684_start_xmit; + netdev->get_stats = br2684_get_stats; + netdev->addr_len = 0; + netdev->mtu = 1500; + netdev->type = ARPHRD_PPP; + netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + netdev->tx_queue_len = 100; + INIT_LIST_HEAD(&brdev->brvccs); +} + +static int br2684_create(void __user * arg) { int err; struct net_device *netdev; struct br2684_dev *brdev; struct atm_newif_br2684 ni; + enum br2684_payload payload; pr_debug("br2684_create\n"); if (copy_from_user(&ni, arg, sizeof ni)) { return -EFAULT; } + + if (ni.media & BR2684_FLAG_ROUTED) + payload = p_routed; + else + payload = p_bridged; + ni.media &= 0xffff; /* strip flags */ + if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500) { return -EINVAL; } netdev = alloc_netdev(sizeof(struct br2684_dev), ni.ifname[0] ? ni.ifname : "nas%d", - br2684_setup); + (payload == p_routed) ? + br2684_setup_routed : br2684_setup); if (!netdev) return -ENOMEM; @@ -516,6 +608,7 @@ static int br2684_create(void __user *arg) } write_lock_irq(&devs_lock); + brdev->payload = payload; brdev->number = list_empty(&br2684_devs) ? 1 : BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1; list_add_tail(&brdev->br2684_devs, &br2684_devs); @@ -528,16 +621,16 @@ static int br2684_create(void __user *arg) * -ENOIOCTLCMD for any unrecognized ioctl */ static int br2684_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg) + unsigned long arg) { struct atm_vcc *atmvcc = ATM_SD(sock); void __user *argp = (void __user *)arg; + atm_backend_t b; int err; - switch(cmd) { + switch (cmd) { case ATM_SETBACKEND: - case ATM_NEWBACKENDIF: { - atm_backend_t b; + case ATM_NEWBACKENDIF: err = get_user(b, (atm_backend_t __user *) argp); if (err) return -EFAULT; @@ -549,7 +642,6 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd, return br2684_regvcc(atmvcc, argp); else return br2684_create(argp); - } #ifdef CONFIG_ATM_BR2684_IPFILTER case BR2684_SETFILT: if (atmvcc->push != br2684_push) @@ -557,6 +649,7 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd, if (!capable(CAP_NET_ADMIN)) return -EPERM; err = br2684_setfilt(atmvcc, argp); + return err; #endif /* CONFIG_ATM_BR2684_IPFILTER */ } @@ -564,24 +657,25 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd, } static struct atm_ioctl br2684_ioctl_ops = { - .owner = THIS_MODULE, - .ioctl = br2684_ioctl, + .owner = THIS_MODULE, + .ioctl = br2684_ioctl, }; - #ifdef CONFIG_PROC_FS -static void *br2684_seq_start(struct seq_file *seq, loff_t *pos) +static void *br2684_seq_start(struct seq_file *seq, loff_t * pos) + __acquires(devs_lock) { read_lock(&devs_lock); return seq_list_start(&br2684_devs, *pos); } -static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t * pos) { return seq_list_next(v, &br2684_devs, pos); } static void br2684_seq_stop(struct seq_file *seq, void *v) + __releases(devs_lock) { read_unlock(&devs_lock); } @@ -589,7 +683,7 @@ static void br2684_seq_stop(struct seq_file *seq, void *v) static int br2684_seq_show(struct seq_file *seq, void *v) { const struct br2684_dev *brdev = list_entry(v, struct br2684_dev, - br2684_devs); + br2684_devs); const struct net_device *net_dev = brdev->net_dev; const struct br2684_vcc *brvcc; DECLARE_MAC_BUF(mac); @@ -601,21 +695,19 @@ static int br2684_seq_show(struct seq_file *seq, void *v) brdev->mac_was_set ? "set" : "auto"); list_for_each_entry(brvcc, &brdev->brvccs, brvccs) { - seq_printf(seq, " vcc %d.%d.%d: encaps=%s" - ", failed copies %u/%u" - "\n", brvcc->atmvcc->dev->number, - brvcc->atmvcc->vpi, brvcc->atmvcc->vci, - (brvcc->encaps == e_llc) ? "LLC" : "VC" - , brvcc->copies_failed - , brvcc->copies_needed - ); + seq_printf(seq, " vcc %d.%d.%d: encaps=%s payload=%s" + ", failed copies %u/%u" + "\n", brvcc->atmvcc->dev->number, + brvcc->atmvcc->vpi, brvcc->atmvcc->vci, + (brvcc->encaps == e_llc) ? "LLC" : "VC", + (brdev->payload == p_bridged) ? "bridged" : "routed", + brvcc->copies_failed, brvcc->copies_needed); #ifdef CONFIG_ATM_BR2684_IPFILTER #define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte] #define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3) - if (brvcc->filter.netmask != 0) - seq_printf(seq, " filter=%d.%d.%d.%d/" - "%d.%d.%d.%d\n", - bs(prefix), bs(netmask)); + if (brvcc->filter.netmask != 0) + seq_printf(seq, " filter=%d.%d.%d.%d/" + "%d.%d.%d.%d\n", bs(prefix), bs(netmask)); #undef bs #undef b1 #endif /* CONFIG_ATM_BR2684_IPFILTER */ @@ -625,9 +717,9 @@ static int br2684_seq_show(struct seq_file *seq, void *v) static const struct seq_operations br2684_seq_ops = { .start = br2684_seq_start, - .next = br2684_seq_next, - .stop = br2684_seq_stop, - .show = br2684_seq_show, + .next = br2684_seq_next, + .stop = br2684_seq_stop, + .show = br2684_seq_show, }; static int br2684_proc_open(struct inode *inode, struct file *file) @@ -636,15 +728,15 @@ static int br2684_proc_open(struct inode *inode, struct file *file) } static const struct file_operations br2684_proc_ops = { - .owner = THIS_MODULE, - .open = br2684_proc_open, - .read = seq_read, - .llseek = seq_lseek, + .owner = THIS_MODULE, + .open = br2684_proc_open, + .read = seq_read, + .llseek = seq_lseek, .release = seq_release, }; extern struct proc_dir_entry *atm_proc_root; /* from proc.c */ -#endif +#endif /* CONFIG_PROC_FS */ static int __init br2684_init(void) { diff --git a/net/atm/clip.c b/net/atm/clip.c index 741742f0079..86b885ec1cb 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -285,7 +285,7 @@ static int clip_constructor(struct neighbour *neigh) struct neigh_parms *parms; pr_debug("clip_constructor (neigh %p, entry %p)\n", neigh, entry); - neigh->type = inet_addr_type(entry->ip); + neigh->type = inet_addr_type(&init_net, entry->ip); if (neigh->type != RTN_UNICAST) return -EINVAL; @@ -534,7 +534,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) unlink_clip_vcc(clip_vcc); return 0; } - error = ip_route_output_key(&rt, &fl); + error = ip_route_output_key(&init_net, &rt, &fl); if (error) return error; neigh = __neigh_lookup(&clip_tbl, &ip, rt->u.dst.dev, 1); @@ -903,6 +903,8 @@ static void *clip_seq_sub_iter(struct neigh_seq_state *_state, static void *clip_seq_start(struct seq_file *seq, loff_t * pos) { + struct clip_seq_state *state = seq->private; + state->ns.neigh_sub_iter = clip_seq_sub_iter; return neigh_seq_start(seq, pos, &clip_tbl, NEIGH_SEQ_NEIGH_ONLY); } @@ -932,36 +934,15 @@ static const struct seq_operations arp_seq_ops = { static int arp_seq_open(struct inode *inode, struct file *file) { - struct clip_seq_state *state; - struct seq_file *seq; - int rc = -EAGAIN; - - state = kzalloc(sizeof(*state), GFP_KERNEL); - if (!state) { - rc = -ENOMEM; - goto out_kfree; - } - state->ns.neigh_sub_iter = clip_seq_sub_iter; - - rc = seq_open(file, &arp_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = state; -out: - return rc; - -out_kfree: - kfree(state); - goto out; + return seq_open_net(inode, file, &arp_seq_ops, + sizeof(struct clip_seq_state)); } static const struct file_operations arp_seq_fops = { .open = arp_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, .owner = THIS_MODULE }; #endif diff --git a/net/atm/common.c b/net/atm/common.c index eba09a04f6b..c865517ba44 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -113,7 +113,7 @@ static void vcc_write_space(struct sock *sk) if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); - sk_wake_async(sk, 2, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } read_unlock(&sk->sk_callback_lock); diff --git a/net/atm/lec.c b/net/atm/lec.c index 7eb1b21a0e9..1a8c4c6c0cd 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -176,7 +176,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc) { struct trh_hdr *trh; - int riflen, num_rdsc; + unsigned int riflen, num_rdsc; trh = (struct trh_hdr *)packet; if (trh->daddr[0] & (uint8_t) 0x80) @@ -1789,9 +1789,8 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, } memcpy(to_return->mac_addr, mac_addr, ETH_ALEN); INIT_HLIST_NODE(&to_return->next); - init_timer(&to_return->timer); - to_return->timer.function = lec_arp_expire_arp; - to_return->timer.data = (unsigned long)to_return; + setup_timer(&to_return->timer, lec_arp_expire_arp, + (unsigned long)to_return); to_return->last_used = jiffies; to_return->priv = priv; skb_queue_head_init(&to_return->tx_wait); diff --git a/net/atm/proc.c b/net/atm/proc.c index 5d9d5ffba14..49125110bb8 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -142,6 +142,7 @@ static int vcc_seq_release(struct inode *inode, struct file *file) } static void *vcc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(vcc_sklist_lock) { struct vcc_state *state = seq->private; loff_t left = *pos; @@ -152,6 +153,7 @@ static void *vcc_seq_start(struct seq_file *seq, loff_t *pos) } static void vcc_seq_stop(struct seq_file *seq, void *v) + __releases(vcc_sklist_lock) { read_unlock(&vcc_sklist_lock); } @@ -476,7 +478,7 @@ static void atm_proc_dirs_remove(void) if (e->dirent) remove_proc_entry(e->name, atm_proc_root); } - remove_proc_entry("atm", init_net.proc_net); + proc_net_remove(&init_net, "atm"); } int __init atm_proc_init(void) @@ -484,7 +486,7 @@ int __init atm_proc_init(void) static struct atm_proc_entry *e; int ret; - atm_proc_root = proc_mkdir("atm", init_net.proc_net); + atm_proc_root = proc_net_mkdir(&init_net, "atm", init_net.proc_net); if (!atm_proc_root) goto err_out; for (e = atm_proc_ents; e->name; e++) { diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index b4725ff317c..1bc0e85f04a 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -330,10 +330,9 @@ void ax25_destroy_socket(ax25_cb *ax25) if (atomic_read(&ax25->sk->sk_wmem_alloc) || atomic_read(&ax25->sk->sk_rmem_alloc)) { /* Defer: outstanding buffers */ - init_timer(&ax25->dtimer); + setup_timer(&ax25->dtimer, ax25_destroy_timer, + (unsigned long)ax25); ax25->dtimer.expires = jiffies + 2 * HZ; - ax25->dtimer.function = ax25_destroy_timer; - ax25->dtimer.data = (unsigned long)ax25; add_timer(&ax25->dtimer); } else { struct sock *sk=ax25->sk; @@ -571,7 +570,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, res = -EINVAL; break; } - ax25->rtt = (opt * HZ) / 2; + ax25->rtt = (opt * HZ) >> 1; ax25->t1 = opt * HZ; break; @@ -1864,6 +1863,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #ifdef CONFIG_PROC_FS static void *ax25_info_start(struct seq_file *seq, loff_t *pos) + __acquires(ax25_list_lock) { struct ax25_cb *ax25; struct hlist_node *node; @@ -1887,6 +1887,7 @@ static void *ax25_info_next(struct seq_file *seq, void *v, loff_t *pos) } static void ax25_info_stop(struct seq_file *seq, void *v) + __releases(ax25_list_lock) { spin_unlock_bh(&ax25_list_lock); } diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 4f44185955c..c4e3b025d21 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -130,7 +130,7 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25) */ if (sk != NULL) { if (atomic_read(&sk->sk_rmem_alloc) < - (sk->sk_rcvbuf / 2) && + (sk->sk_rcvbuf >> 1) && (ax25->condition & AX25_COND_OWN_RX_BUSY)) { ax25->condition &= ~AX25_COND_OWN_RX_BUSY; ax25->condition &= ~AX25_COND_ACK_PENDING; diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 9ecf6f1df86..38c7f3087ec 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -249,6 +249,7 @@ int ax25_rt_ioctl(unsigned int cmd, void __user *arg) #ifdef CONFIG_PROC_FS static void *ax25_rt_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(ax25_route_lock) { struct ax25_route *ax25_rt; int i = 1; @@ -274,6 +275,7 @@ static void *ax25_rt_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ax25_rt_seq_stop(struct seq_file *seq, void *v) + __releases(ax25_route_lock) { read_unlock(&ax25_route_lock); } diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c index f2f6918ac9b..96e4b927325 100644 --- a/net/ax25/ax25_std_timer.c +++ b/net/ax25/ax25_std_timer.c @@ -32,7 +32,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25) { - struct sock *sk=ax25->sk; + struct sock *sk = ax25->sk; if (sk) bh_lock_sock(sk); @@ -62,7 +62,7 @@ void ax25_std_heartbeat_expiry(ax25_cb *ax25) */ if (sk != NULL) { if (atomic_read(&sk->sk_rmem_alloc) < - (sk->sk_rcvbuf / 2) && + (sk->sk_rcvbuf >> 1) && (ax25->condition & AX25_COND_OWN_RX_BUSY)) { ax25->condition &= ~AX25_COND_OWN_RX_BUSY; ax25->condition &= ~AX25_COND_ACK_PENDING; diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index ce0b13d4438..5f4eb73fb9d 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -43,10 +43,10 @@ * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines. */ -HLIST_HEAD(ax25_uid_list); +static HLIST_HEAD(ax25_uid_list); static DEFINE_RWLOCK(ax25_uid_lock); -int ax25_uid_policy = 0; +int ax25_uid_policy; EXPORT_SYMBOL(ax25_uid_policy); @@ -144,6 +144,7 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) #ifdef CONFIG_PROC_FS static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(ax25_uid_lock) { struct ax25_uid_assoc *pt; struct hlist_node *node; @@ -167,6 +168,7 @@ static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ax25_uid_seq_stop(struct seq_file *seq, void *v) + __releases(ax25_uid_lock) { read_unlock(&ax25_uid_lock); } diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index 443a8367663..f597987b242 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -31,25 +31,11 @@ static struct ctl_table_header *ax25_table_header; static ctl_table *ax25_table; static int ax25_table_size; -static ctl_table ax25_dir_table[] = { - { - .ctl_name = NET_AX25, - .procname = "ax25", - .mode = 0555, - }, - { .ctl_name = 0 } -}; - -static ctl_table ax25_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ax25_dir_table - }, - { .ctl_name = 0 } +static struct ctl_path ax25_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ax25", .ctl_name = NET_AX25, }, + { } }; - static const ctl_table ax25_param_table[] = { { .ctl_name = NET_AX25_IP_DEFAULT_MODE, @@ -243,9 +229,7 @@ void ax25_register_sysctl(void) } spin_unlock_bh(&ax25_dev_lock); - ax25_dir_table[0].child = ax25_table; - - ax25_table_header = register_sysctl_table(ax25_root_table); + ax25_table_header = register_sysctl_paths(ax25_path, ax25_table); } void ax25_unregister_sysctl(void) @@ -253,7 +237,6 @@ void ax25_unregister_sysctl(void) ctl_table *p; unregister_sysctl_table(ax25_table_header); - ax25_dir_table[0].child = NULL; for (p = ax25_table; p->ctl_name; p++) kfree(p->child); kfree(ax25_table); diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 9ebd3c64474..81065e548a1 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -94,7 +94,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return err; if (nsock->sk->sk_state != BT_CONNECTED) { - fput(nsock->file); + sockfd_put(nsock); return -EBADFD; } @@ -103,7 +103,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long if (copy_to_user(argp, &ca, sizeof(ca))) err = -EFAULT; } else - fput(nsock->file); + sockfd_put(nsock); return err; diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index 783edab12ce..8c7f7bc4e0b 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -88,7 +88,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return err; if (nsock->sk->sk_state != BT_CONNECTED) { - fput(nsock->file); + sockfd_put(nsock); return -EBADFD; } @@ -97,7 +97,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long if (copy_to_user(argp, &ca, sizeof(ca))) err = -EFAULT; } else - fput(nsock->file); + sockfd_put(nsock); return err; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 34d1a3c822b..5fc7be206f6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -208,13 +208,8 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) skb_queue_head_init(&conn->data_q); - init_timer(&conn->disc_timer); - conn->disc_timer.function = hci_conn_timeout; - conn->disc_timer.data = (unsigned long) conn; - - init_timer(&conn->idle_timer); - conn->idle_timer.function = hci_conn_idle; - conn->idle_timer.data = (unsigned long) conn; + setup_timer(&conn->disc_timer, hci_conn_timeout, (unsigned long)conn); + setup_timer(&conn->idle_timer, hci_conn_idle, (unsigned long)conn); atomic_set(&conn->refcnt, 0); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 4bbacddeb49..782a22602b8 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -811,10 +811,7 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, session->intr_sock = intr_sock; session->state = BT_CONNECTED; - init_timer(&session->timer); - - session->timer.function = hidp_idle_timeout; - session->timer.data = (unsigned long) session; + setup_timer(&session->timer, hidp_idle_timeout, (unsigned long)session); skb_queue_head_init(&session->ctrl_transmit); skb_queue_head_init(&session->intr_transmit); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 3292b956a7c..f4dd02ca9a9 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -86,13 +86,13 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long isock = sockfd_lookup(ca.intr_sock, &err); if (!isock) { - fput(csock->file); + sockfd_put(csock); return err; } if (csock->sk->sk_state != BT_CONNECTED || isock->sk->sk_state != BT_CONNECTED) { - fput(csock->file); - fput(isock->file); + sockfd_put(csock); + sockfd_put(isock); return -EBADFD; } @@ -101,8 +101,8 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long if (copy_to_user(argp, &ca, sizeof(ca))) err = -EFAULT; } else { - fput(csock->file); - fput(isock->file); + sockfd_put(csock); + sockfd_put(isock); } return err; diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 477e052b17b..a8811c0a0ce 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -99,13 +99,6 @@ static void l2cap_sock_clear_timer(struct sock *sk) sk_stop_timer(sk, &sk->sk_timer); } -static void l2cap_sock_init_timer(struct sock *sk) -{ - init_timer(&sk->sk_timer); - sk->sk_timer.function = l2cap_sock_timeout; - sk->sk_timer.data = (unsigned long)sk; -} - /* ---- L2CAP channels ---- */ static struct sock *__l2cap_get_chan_by_dcid(struct l2cap_chan_list *l, u16 cid) { @@ -395,9 +388,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) conn->feat_mask = 0; - init_timer(&conn->info_timer); - conn->info_timer.function = l2cap_info_timeout; - conn->info_timer.data = (unsigned long) conn; + setup_timer(&conn->info_timer, l2cap_info_timeout, (unsigned long)conn); spin_lock_init(&conn->lock); rwlock_init(&conn->chan_list.lock); @@ -622,7 +613,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p sk->sk_protocol = proto; sk->sk_state = BT_OPEN; - l2cap_sock_init_timer(sk); + setup_timer(&sk->sk_timer, l2cap_sock_timeout, (unsigned long)sk); bt_sock_link(&l2cap_sk_list, sk); return sk; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index e7ac6ba7eca..d3e4e1877e6 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -279,9 +279,7 @@ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio) if (!d) return NULL; - init_timer(&d->timer); - d->timer.function = rfcomm_dlc_timeout; - d->timer.data = (unsigned long) d; + setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d); skb_queue_head_init(&d->tx_queue); spin_lock_init(&d->lock); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 93ad1aae3f3..b91d3c81a73 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -97,13 +97,6 @@ static void sco_sock_clear_timer(struct sock *sk) sk_stop_timer(sk, &sk->sk_timer); } -static void sco_sock_init_timer(struct sock *sk) -{ - init_timer(&sk->sk_timer); - sk->sk_timer.function = sco_sock_timeout; - sk->sk_timer.data = (unsigned long)sk; -} - /* ---- SCO connections ---- */ static struct sco_conn *sco_conn_add(struct hci_conn *hcon, __u8 status) { @@ -436,7 +429,7 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int pro sk->sk_protocol = proto; sk->sk_state = BT_OPEN; - sco_sock_init_timer(sk); + setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk); bt_sock_link(&sco_sk_list, sk); return sk; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 0ee79a726d9..255c00f60ce 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -109,7 +109,7 @@ static inline int is_link_local(const unsigned char *dest) { __be16 *a = (__be16 *)dest; static const __be16 *b = (const __be16 *)br_group_address; - static const __be16 m = __constant_cpu_to_be16(0xfff0); + static const __be16 m = cpu_to_be16(0xfff0); return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 9f78a69d6b8..80014bab81b 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -353,7 +353,7 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) goto free_skb; - if (!ip_route_output_key(&rt, &fl)) { + if (!ip_route_output_key(&init_net, &rt, &fl)) { /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ if (((struct dst_entry *)rt)->dev == dev) { @@ -511,7 +511,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (!setup_pre_routing(skb)) return NF_DROP; - NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, br_nf_pre_routing_finish_ipv6); return NF_STOLEN; @@ -584,7 +584,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, return NF_DROP; store_orig_dstaddr(skb); - NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, br_nf_pre_routing_finish); return NF_STOLEN; @@ -681,7 +681,7 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, nf_bridge->mask |= BRNF_BRIDGED; nf_bridge->physoutdev = skb->dev; - NF_HOOK(pf, NF_IP_FORWARD, skb, bridge_parent(in), parent, + NF_HOOK(pf, NF_INET_FORWARD, skb, bridge_parent(in), parent, br_nf_forward_finish); return NF_STOLEN; @@ -832,7 +832,7 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, if (nf_bridge->netoutdev) realoutdev = nf_bridge->netoutdev; #endif - NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev, + NF_HOOK(pf, NF_INET_POST_ROUTING, skb, NULL, realoutdev, br_nf_dev_queue_xmit); return NF_STOLEN; @@ -871,7 +871,7 @@ static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff *skb, * PF_BRIDGE/NF_BR_LOCAL_OUT functions don't get bridged traffic as input. * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * ip_refrag() can return NF_STOLEN. */ -static struct nf_hook_ops br_nf_ops[] = { +static struct nf_hook_ops br_nf_ops[] __read_mostly = { { .hook = br_nf_pre_routing, .owner = THIS_MODULE, .pf = PF_BRIDGE, @@ -905,12 +905,12 @@ static struct nf_hook_ops br_nf_ops[] = { { .hook = ip_sabotage_in, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_FIRST, }, { .hook = ip_sabotage_in, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_FIRST, }, }; @@ -967,24 +967,10 @@ static ctl_table brnf_table[] = { { .ctl_name = 0 } }; -static ctl_table brnf_bridge_table[] = { - { - .ctl_name = NET_BRIDGE, - .procname = "bridge", - .mode = 0555, - .child = brnf_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table brnf_net_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = brnf_bridge_table, - }, - { .ctl_name = 0 } +static struct ctl_path brnf_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "bridge", .ctl_name = NET_BRIDGE, }, + { } }; #endif @@ -996,7 +982,7 @@ int __init br_netfilter_init(void) if (ret < 0) return ret; #ifdef CONFIG_SYSCTL - brnf_sysctl_header = register_sysctl_table(brnf_net_table); + brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 53ab8e0cb51..f5d69336d97 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <net/sock.h> #include "br_private.h" static inline size_t br_nlmsg_size(void) @@ -96,10 +97,10 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + err = rtnl_notify(skb, &init_net,0, RTNLGRP_LINK, NULL, GFP_ATOMIC); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_LINK, err); + rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err); } /* @@ -107,9 +108,13 @@ errout: */ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; struct net_device *dev; int idx; + if (net != &init_net) + return 0; + idx = 0; for_each_netdev(&init_net, dev) { /* not a bridge port */ @@ -135,12 +140,16 @@ skip: */ static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct ifinfomsg *ifm; struct nlattr *protinfo; struct net_device *dev; struct net_bridge_port *p; u8 new_state; + if (net != &init_net) + return -EINVAL; + if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index b84fc6075fe..4a3e2bf892c 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -3,7 +3,7 @@ # menu "Bridge: Netfilter Configuration" - depends on BRIDGE && NETFILTER + depends on BRIDGE && BRIDGE_NETFILTER config BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 457815fb558..3be9e989855 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -17,6 +17,7 @@ #include <linux/in.h> #include <linux/if_arp.h> #include <linux/spinlock.h> +#include <net/netfilter/nf_log.h> static DEFINE_SPINLOCK(ebt_log_lock); @@ -182,7 +183,7 @@ static struct ebt_watcher log = .me = THIS_MODULE, }; -static struct nf_logger ebt_log_logger = { +static const struct nf_logger ebt_log_logger = { .name = "ebt_log", .logfn = &ebt_log_packet, .me = THIS_MODULE, diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index e7cfd30bac7..8e7b00b68d3 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -38,6 +38,7 @@ #include <linux/netdevice.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ulog.h> +#include <net/netfilter/nf_log.h> #include <net/sock.h> #include "../br_private.h" @@ -278,7 +279,7 @@ static struct ebt_watcher ulog = { .me = THIS_MODULE, }; -static struct nf_logger ebt_ulog_logger = { +static const struct nf_logger ebt_ulog_logger = { .name = EBT_ULOG_WATCHER, .logfn = &ebt_log_packet, .me = THIS_MODULE, @@ -306,7 +307,7 @@ static int __init ebt_ulog_init(void) if (!ebtulognl) ret = -ENOMEM; else if ((ret = ebt_register_watcher(&ulog))) - sock_release(ebtulognl->sk_socket); + netlink_kernel_release(ebtulognl); if (ret == 0) nf_log_register(PF_BRIDGE, &ebt_ulog_logger); @@ -332,7 +333,7 @@ static void __exit ebt_ulog_fini(void) } spin_unlock_bh(&ub->lock); } - sock_release(ebtulognl->sk_socket); + netlink_kernel_release(ebtulognl); } module_init(ebt_ulog_init); diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index a43c697d3d7..0ddf7499d49 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -37,9 +37,7 @@ MODULE_LICENSE("GPL"); #define DEBUG_MSG(args...) if (debug) printk (KERN_DEBUG "ebt_vlan: " args) -#define INV_FLAG(_inv_flag_) (info->invflags & _inv_flag_) ? "!" : "" #define GET_BITMASK(_BIT_MASK_) info->bitmask & _BIT_MASK_ -#define SET_BITMASK(_BIT_MASK_) info->bitmask |= _BIT_MASK_ #define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return EBT_NOMATCH;} static int diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 210493f99bc..fb810908732 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -67,7 +67,7 @@ ebt_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in, return ebt_do_table(hook, skb, in, out, &frame_filter); } -static struct nf_hook_ops ebt_ops_filter[] = { +static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { { .hook = ebt_hook, .owner = THIS_MODULE, diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 3e58c2e5ee2..bc712730c54 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -74,7 +74,7 @@ ebt_nat_src(unsigned int hook, struct sk_buff *skb, const struct net_device *in return ebt_do_table(hook, skb, in, out, &frame_nat); } -static struct nf_hook_ops ebt_ops_nat[] = { +static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { { .hook = ebt_nat_dst, .owner = THIS_MODULE, diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 817169e718c..32afff859e4 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -15,8 +15,6 @@ * 2 of the License, or (at your option) any later version. */ -/* used for print_string */ -#include <linux/tty.h> #include <linux/kmod.h> #include <linux/module.h> diff --git a/net/can/Kconfig b/net/can/Kconfig new file mode 100644 index 00000000000..89395b2c8bc --- /dev/null +++ b/net/can/Kconfig @@ -0,0 +1,44 @@ +# +# Controller Area Network (CAN) network layer core configuration +# + +menuconfig CAN + depends on NET + tristate "CAN bus subsystem support" + ---help--- + Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial + communications protocol which was developed by Bosch in + 1991, mainly for automotive, but now widely used in marine + (NMEA2000), industrial, and medical applications. + More information on the CAN network protocol family PF_CAN + is contained in <Documentation/networking/can.txt>. + + If you want CAN support you should say Y here and also to the + specific driver for your controller(s) below. + +config CAN_RAW + tristate "Raw CAN Protocol (raw access with CAN-ID filtering)" + depends on CAN + default N + ---help--- + The raw CAN protocol option offers access to the CAN bus via + the BSD socket API. You probably want to use the raw socket in + most cases where no higher level protocol is being used. The raw + socket has several filter options e.g. ID masking / error frames. + To receive/send raw CAN messages, use AF_CAN with protocol CAN_RAW. + +config CAN_BCM + tristate "Broadcast Manager CAN Protocol (with content filtering)" + depends on CAN + default N + ---help--- + The Broadcast Manager offers content filtering, timeout monitoring, + sending of RTR frames, and cyclic CAN messages without permanent user + interaction. The BCM can be 'programmed' via the BSD socket API and + informs you on demand e.g. only on content updates / timeouts. + You probably want to use the bcm socket in most cases where cyclic + CAN messages are used on the bus (e.g. in automotive environments). + To use the Broadcast Manager, use AF_CAN with protocol CAN_BCM. + + +source "drivers/net/can/Kconfig" diff --git a/net/can/Makefile b/net/can/Makefile new file mode 100644 index 00000000000..9cd3c4b3abd --- /dev/null +++ b/net/can/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Linux Controller Area Network core. +# + +obj-$(CONFIG_CAN) += can.o +can-objs := af_can.o proc.o + +obj-$(CONFIG_CAN_RAW) += can-raw.o +can-raw-objs := raw.o + +obj-$(CONFIG_CAN_BCM) += can-bcm.o +can-bcm-objs := bcm.o diff --git a/net/can/af_can.c b/net/can/af_can.c new file mode 100644 index 00000000000..5158e886630 --- /dev/null +++ b/net/can/af_can.c @@ -0,0 +1,861 @@ +/* + * af_can.c - Protocol family CAN core module + * (used by different CAN protocol modules) + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to <socketcan-users@lists.berlios.de> + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/rcupdate.h> +#include <linux/uaccess.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/socket.h> +#include <linux/if_ether.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <linux/can.h> +#include <linux/can/core.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +#include "af_can.h" + +static __initdata const char banner[] = KERN_INFO + "can: controller area network core (" CAN_VERSION_STRING ")\n"; + +MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, " + "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); + +MODULE_ALIAS_NETPROTO(PF_CAN); + +static int stats_timer __read_mostly = 1; +module_param(stats_timer, int, S_IRUGO); +MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); + +HLIST_HEAD(can_rx_dev_list); +static struct dev_rcv_lists can_rx_alldev_list; +static DEFINE_SPINLOCK(can_rcvlists_lock); + +static struct kmem_cache *rcv_cache __read_mostly; + +/* table of registered CAN protocols */ +static struct can_proto *proto_tab[CAN_NPROTO] __read_mostly; +static DEFINE_SPINLOCK(proto_tab_lock); + +struct timer_list can_stattimer; /* timer for statistics update */ +struct s_stats can_stats; /* packet statistics */ +struct s_pstats can_pstats; /* receive list statistics */ + +/* + * af_can socket functions + */ + +static int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + + switch (cmd) { + + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + + default: + return -ENOIOCTLCMD; + } +} + +static void can_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + +static int can_create(struct net *net, struct socket *sock, int protocol) +{ + struct sock *sk; + struct can_proto *cp; + char module_name[sizeof("can-proto-000")]; + int err = 0; + + sock->state = SS_UNCONNECTED; + + if (protocol < 0 || protocol >= CAN_NPROTO) + return -EINVAL; + + if (net != &init_net) + return -EAFNOSUPPORT; + + /* try to load protocol module, when CONFIG_KMOD is defined */ + if (!proto_tab[protocol]) { + sprintf(module_name, "can-proto-%d", protocol); + err = request_module(module_name); + + /* + * In case of error we only print a message but don't + * return the error code immediately. Below we will + * return -EPROTONOSUPPORT + */ + if (err == -ENOSYS) { + if (printk_ratelimit()) + printk(KERN_INFO "can: request_module(%s)" + " not implemented.\n", module_name); + } else if (err) { + if (printk_ratelimit()) + printk(KERN_ERR "can: request_module(%s)" + " failed.\n", module_name); + } + } + + spin_lock(&proto_tab_lock); + cp = proto_tab[protocol]; + if (cp && !try_module_get(cp->prot->owner)) + cp = NULL; + spin_unlock(&proto_tab_lock); + + /* check for available protocol and correct usage */ + + if (!cp) + return -EPROTONOSUPPORT; + + if (cp->type != sock->type) { + err = -EPROTONOSUPPORT; + goto errout; + } + + if (cp->capability >= 0 && !capable(cp->capability)) { + err = -EPERM; + goto errout; + } + + sock->ops = cp->ops; + + sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot); + if (!sk) { + err = -ENOMEM; + goto errout; + } + + sock_init_data(sock, sk); + sk->sk_destruct = can_sock_destruct; + + if (sk->sk_prot->init) + err = sk->sk_prot->init(sk); + + if (err) { + /* release sk on errors */ + sock_orphan(sk); + sock_put(sk); + } + + errout: + module_put(cp->prot->owner); + return err; +} + +/* + * af_can tx path + */ + +/** + * can_send - transmit a CAN frame (optional with local loopback) + * @skb: pointer to socket buffer with CAN frame in data section + * @loop: loopback for listeners on local CAN sockets (recommended default!) + * + * Return: + * 0 on success + * -ENETDOWN when the selected interface is down + * -ENOBUFS on full driver queue (see net_xmit_errno()) + * -ENOMEM when local loopback failed at calling skb_clone() + * -EPERM when trying to send on a non-CAN interface + */ +int can_send(struct sk_buff *skb, int loop) +{ + int err; + + if (skb->dev->type != ARPHRD_CAN) { + kfree_skb(skb); + return -EPERM; + } + + if (!(skb->dev->flags & IFF_UP)) { + kfree_skb(skb); + return -ENETDOWN; + } + + skb->protocol = htons(ETH_P_CAN); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + if (loop) { + /* local loopback of sent CAN frames */ + + /* indication for the CAN driver: do loopback */ + skb->pkt_type = PACKET_LOOPBACK; + + /* + * The reference to the originating sock may be required + * by the receiving socket to check whether the frame is + * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS + * Therefore we have to ensure that skb->sk remains the + * reference to the originating sock by restoring skb->sk + * after each skb_clone() or skb_orphan() usage. + */ + + if (!(skb->dev->flags & IFF_ECHO)) { + /* + * If the interface is not capable to do loopback + * itself, we do it here. + */ + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + + if (!newskb) { + kfree_skb(skb); + return -ENOMEM; + } + + newskb->sk = skb->sk; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + newskb->pkt_type = PACKET_BROADCAST; + netif_rx(newskb); + } + } else { + /* indication for the CAN driver: no loopback required */ + skb->pkt_type = PACKET_HOST; + } + + /* send to netdevice */ + err = dev_queue_xmit(skb); + if (err > 0) + err = net_xmit_errno(err); + + /* update statistics */ + can_stats.tx_frames++; + can_stats.tx_frames_delta++; + + return err; +} +EXPORT_SYMBOL(can_send); + +/* + * af_can rx path + */ + +static struct dev_rcv_lists *find_dev_rcv_lists(struct net_device *dev) +{ + struct dev_rcv_lists *d = NULL; + struct hlist_node *n; + + /* + * find receive list for this device + * + * The hlist_for_each_entry*() macros curse through the list + * using the pointer variable n and set d to the containing + * struct in each list iteration. Therefore, after list + * iteration, d is unmodified when the list is empty, and it + * points to last list element, when the list is non-empty + * but no match in the loop body is found. I.e. d is *not* + * NULL when no match is found. We can, however, use the + * cursor variable n to decide if a match was found. + */ + + hlist_for_each_entry_rcu(d, n, &can_rx_dev_list, list) { + if (d->dev == dev) + break; + } + + return n ? d : NULL; +} + +static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, + struct dev_rcv_lists *d) +{ + canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ + + /* filter error frames */ + if (*mask & CAN_ERR_FLAG) { + /* clear CAN_ERR_FLAG in list entry */ + *mask &= CAN_ERR_MASK; + return &d->rx[RX_ERR]; + } + + /* ensure valid values in can_mask */ + if (*mask & CAN_EFF_FLAG) + *mask &= (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG); + else + *mask &= (CAN_SFF_MASK | CAN_RTR_FLAG); + + /* reduce condition testing at receive time */ + *can_id &= *mask; + + /* inverse can_id/can_mask filter */ + if (inv) + return &d->rx[RX_INV]; + + /* mask == 0 => no condition testing at receive time */ + if (!(*mask)) + return &d->rx[RX_ALL]; + + /* use extra filterset for the subscription of exactly *ONE* can_id */ + if (*can_id & CAN_EFF_FLAG) { + if (*mask == (CAN_EFF_MASK | CAN_EFF_FLAG)) { + /* RFC: a use-case for hash-tables in the future? */ + return &d->rx[RX_EFF]; + } + } else { + if (*mask == CAN_SFF_MASK) + return &d->rx_sff[*can_id]; + } + + /* default: filter via can_id/can_mask */ + return &d->rx[RX_FIL]; +} + +/** + * can_rx_register - subscribe CAN frames from a specific interface + * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list) + * @can_id: CAN identifier (see description) + * @mask: CAN mask (see description) + * @func: callback function on filter match + * @data: returned parameter for callback function + * @ident: string for calling module indentification + * + * Description: + * Invokes the callback function with the received sk_buff and the given + * parameter 'data' on a matching receive filter. A filter matches, when + * + * <received_can_id> & mask == can_id & mask + * + * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can + * filter for error frames (CAN_ERR_FLAG bit set in mask). + * + * Return: + * 0 on success + * -ENOMEM on missing cache mem to create subscription entry + * -ENODEV unknown device + */ +int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, + void (*func)(struct sk_buff *, void *), void *data, + char *ident) +{ + struct receiver *r; + struct hlist_head *rl; + struct dev_rcv_lists *d; + int err = 0; + + /* insert new receiver (dev,canid,mask) -> (func,data) */ + + r = kmem_cache_alloc(rcv_cache, GFP_KERNEL); + if (!r) + return -ENOMEM; + + spin_lock(&can_rcvlists_lock); + + d = find_dev_rcv_lists(dev); + if (d) { + rl = find_rcv_list(&can_id, &mask, d); + + r->can_id = can_id; + r->mask = mask; + r->matches = 0; + r->func = func; + r->data = data; + r->ident = ident; + + hlist_add_head_rcu(&r->list, rl); + d->entries++; + + can_pstats.rcv_entries++; + if (can_pstats.rcv_entries_max < can_pstats.rcv_entries) + can_pstats.rcv_entries_max = can_pstats.rcv_entries; + } else { + kmem_cache_free(rcv_cache, r); + err = -ENODEV; + } + + spin_unlock(&can_rcvlists_lock); + + return err; +} +EXPORT_SYMBOL(can_rx_register); + +/* + * can_rx_delete_device - rcu callback for dev_rcv_lists structure removal + */ +static void can_rx_delete_device(struct rcu_head *rp) +{ + struct dev_rcv_lists *d = container_of(rp, struct dev_rcv_lists, rcu); + + kfree(d); +} + +/* + * can_rx_delete_receiver - rcu callback for single receiver entry removal + */ +static void can_rx_delete_receiver(struct rcu_head *rp) +{ + struct receiver *r = container_of(rp, struct receiver, rcu); + + kmem_cache_free(rcv_cache, r); +} + +/** + * can_rx_unregister - unsubscribe CAN frames from a specific interface + * @dev: pointer to netdevice (NULL => unsubcribe from 'all' CAN devices list) + * @can_id: CAN identifier + * @mask: CAN mask + * @func: callback function on filter match + * @data: returned parameter for callback function + * + * Description: + * Removes subscription entry depending on given (subscription) values. + */ +void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, + void (*func)(struct sk_buff *, void *), void *data) +{ + struct receiver *r = NULL; + struct hlist_head *rl; + struct hlist_node *next; + struct dev_rcv_lists *d; + + spin_lock(&can_rcvlists_lock); + + d = find_dev_rcv_lists(dev); + if (!d) { + printk(KERN_ERR "BUG: receive list not found for " + "dev %s, id %03X, mask %03X\n", + DNAME(dev), can_id, mask); + goto out; + } + + rl = find_rcv_list(&can_id, &mask, d); + + /* + * Search the receiver list for the item to delete. This should + * exist, since no receiver may be unregistered that hasn't + * been registered before. + */ + + hlist_for_each_entry_rcu(r, next, rl, list) { + if (r->can_id == can_id && r->mask == mask + && r->func == func && r->data == data) + break; + } + + /* + * Check for bugs in CAN protocol implementations: + * If no matching list item was found, the list cursor variable next + * will be NULL, while r will point to the last item of the list. + */ + + if (!next) { + printk(KERN_ERR "BUG: receive list entry not found for " + "dev %s, id %03X, mask %03X\n", + DNAME(dev), can_id, mask); + r = NULL; + d = NULL; + goto out; + } + + hlist_del_rcu(&r->list); + d->entries--; + + if (can_pstats.rcv_entries > 0) + can_pstats.rcv_entries--; + + /* remove device structure requested by NETDEV_UNREGISTER */ + if (d->remove_on_zero_entries && !d->entries) + hlist_del_rcu(&d->list); + else + d = NULL; + + out: + spin_unlock(&can_rcvlists_lock); + + /* schedule the receiver item for deletion */ + if (r) + call_rcu(&r->rcu, can_rx_delete_receiver); + + /* schedule the device structure for deletion */ + if (d) + call_rcu(&d->rcu, can_rx_delete_device); +} +EXPORT_SYMBOL(can_rx_unregister); + +static inline void deliver(struct sk_buff *skb, struct receiver *r) +{ + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + if (clone) { + clone->sk = skb->sk; + r->func(clone, r->data); + r->matches++; + } +} + +static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) +{ + struct receiver *r; + struct hlist_node *n; + int matches = 0; + struct can_frame *cf = (struct can_frame *)skb->data; + canid_t can_id = cf->can_id; + + if (d->entries == 0) + return 0; + + if (can_id & CAN_ERR_FLAG) { + /* check for error frame entries only */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_ERR], list) { + if (can_id & r->mask) { + deliver(skb, r); + matches++; + } + } + return matches; + } + + /* check for unfiltered entries */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_ALL], list) { + deliver(skb, r); + matches++; + } + + /* check for can_id/mask entries */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_FIL], list) { + if ((can_id & r->mask) == r->can_id) { + deliver(skb, r); + matches++; + } + } + + /* check for inverted can_id/mask entries */ + hlist_for_each_entry_rcu(r, n, &d->rx[RX_INV], list) { + if ((can_id & r->mask) != r->can_id) { + deliver(skb, r); + matches++; + } + } + + /* check CAN_ID specific entries */ + if (can_id & CAN_EFF_FLAG) { + hlist_for_each_entry_rcu(r, n, &d->rx[RX_EFF], list) { + if (r->can_id == can_id) { + deliver(skb, r); + matches++; + } + } + } else { + can_id &= CAN_SFF_MASK; + hlist_for_each_entry_rcu(r, n, &d->rx_sff[can_id], list) { + deliver(skb, r); + matches++; + } + } + + return matches; +} + +static int can_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dev_rcv_lists *d; + int matches; + + if (dev->type != ARPHRD_CAN || dev->nd_net != &init_net) { + kfree_skb(skb); + return 0; + } + + /* update statistics */ + can_stats.rx_frames++; + can_stats.rx_frames_delta++; + + rcu_read_lock(); + + /* deliver the packet to sockets listening on all devices */ + matches = can_rcv_filter(&can_rx_alldev_list, skb); + + /* find receive list for this device */ + d = find_dev_rcv_lists(dev); + if (d) + matches += can_rcv_filter(d, skb); + + rcu_read_unlock(); + + /* free the skbuff allocated by the netdevice driver */ + kfree_skb(skb); + + if (matches > 0) { + can_stats.matches++; + can_stats.matches_delta++; + } + + return 0; +} + +/* + * af_can protocol functions + */ + +/** + * can_proto_register - register CAN transport protocol + * @cp: pointer to CAN protocol structure + * + * Return: + * 0 on success + * -EINVAL invalid (out of range) protocol number + * -EBUSY protocol already in use + * -ENOBUF if proto_register() fails + */ +int can_proto_register(struct can_proto *cp) +{ + int proto = cp->protocol; + int err = 0; + + if (proto < 0 || proto >= CAN_NPROTO) { + printk(KERN_ERR "can: protocol number %d out of range\n", + proto); + return -EINVAL; + } + + spin_lock(&proto_tab_lock); + if (proto_tab[proto]) { + printk(KERN_ERR "can: protocol %d already registered\n", + proto); + err = -EBUSY; + goto errout; + } + + err = proto_register(cp->prot, 0); + if (err < 0) + goto errout; + + proto_tab[proto] = cp; + + /* use generic ioctl function if the module doesn't bring its own */ + if (!cp->ops->ioctl) + cp->ops->ioctl = can_ioctl; + + errout: + spin_unlock(&proto_tab_lock); + + return err; +} +EXPORT_SYMBOL(can_proto_register); + +/** + * can_proto_unregister - unregister CAN transport protocol + * @cp: pointer to CAN protocol structure + */ +void can_proto_unregister(struct can_proto *cp) +{ + int proto = cp->protocol; + + spin_lock(&proto_tab_lock); + if (!proto_tab[proto]) { + printk(KERN_ERR "BUG: can: protocol %d is not registered\n", + proto); + } + proto_unregister(cp->prot); + proto_tab[proto] = NULL; + spin_unlock(&proto_tab_lock); +} +EXPORT_SYMBOL(can_proto_unregister); + +/* + * af_can notifier to create/remove CAN netdevice specific structs + */ +static int can_notifier(struct notifier_block *nb, unsigned long msg, + void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct dev_rcv_lists *d; + + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + switch (msg) { + + case NETDEV_REGISTER: + + /* + * create new dev_rcv_lists for this device + * + * N.B. zeroing the struct is the correct initialization + * for the embedded hlist_head structs. + * Another list type, e.g. list_head, would require + * explicit initialization. + */ + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + printk(KERN_ERR + "can: allocation of receive list failed\n"); + return NOTIFY_DONE; + } + d->dev = dev; + + spin_lock(&can_rcvlists_lock); + hlist_add_head_rcu(&d->list, &can_rx_dev_list); + spin_unlock(&can_rcvlists_lock); + + break; + + case NETDEV_UNREGISTER: + spin_lock(&can_rcvlists_lock); + + d = find_dev_rcv_lists(dev); + if (d) { + if (d->entries) { + d->remove_on_zero_entries = 1; + d = NULL; + } else + hlist_del_rcu(&d->list); + } else + printk(KERN_ERR "can: notifier: receive list not " + "found for dev %s\n", dev->name); + + spin_unlock(&can_rcvlists_lock); + + if (d) + call_rcu(&d->rcu, can_rx_delete_device); + + break; + } + + return NOTIFY_DONE; +} + +/* + * af_can module init/exit functions + */ + +static struct packet_type can_packet __read_mostly = { + .type = __constant_htons(ETH_P_CAN), + .dev = NULL, + .func = can_rcv, +}; + +static struct net_proto_family can_family_ops __read_mostly = { + .family = PF_CAN, + .create = can_create, + .owner = THIS_MODULE, +}; + +/* notifier block for netdevice event */ +static struct notifier_block can_netdev_notifier __read_mostly = { + .notifier_call = can_notifier, +}; + +static __init int can_init(void) +{ + printk(banner); + + rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), + 0, 0, NULL); + if (!rcv_cache) + return -ENOMEM; + + /* + * Insert can_rx_alldev_list for reception on all devices. + * This struct is zero initialized which is correct for the + * embedded hlist heads, the dev pointer, and the entries counter. + */ + + spin_lock(&can_rcvlists_lock); + hlist_add_head_rcu(&can_rx_alldev_list.list, &can_rx_dev_list); + spin_unlock(&can_rcvlists_lock); + + if (stats_timer) { + /* the statistics are updated every second (timer triggered) */ + setup_timer(&can_stattimer, can_stat_update, 0); + mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); + } else + can_stattimer.function = NULL; + + can_init_proc(); + + /* protocol register */ + sock_register(&can_family_ops); + register_netdevice_notifier(&can_netdev_notifier); + dev_add_pack(&can_packet); + + return 0; +} + +static __exit void can_exit(void) +{ + struct dev_rcv_lists *d; + struct hlist_node *n, *next; + + if (stats_timer) + del_timer(&can_stattimer); + + can_remove_proc(); + + /* protocol unregister */ + dev_remove_pack(&can_packet); + unregister_netdevice_notifier(&can_netdev_notifier); + sock_unregister(PF_CAN); + + /* remove can_rx_dev_list */ + spin_lock(&can_rcvlists_lock); + hlist_del(&can_rx_alldev_list.list); + hlist_for_each_entry_safe(d, n, next, &can_rx_dev_list, list) { + hlist_del(&d->list); + kfree(d); + } + spin_unlock(&can_rcvlists_lock); + + kmem_cache_destroy(rcv_cache); +} + +module_init(can_init); +module_exit(can_exit); diff --git a/net/can/af_can.h b/net/can/af_can.h new file mode 100644 index 00000000000..18f91e37cc3 --- /dev/null +++ b/net/can/af_can.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to <socketcan-users@lists.berlios.de> + * + */ + +#ifndef AF_CAN_H +#define AF_CAN_H + +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/can.h> + +/* af_can rx dispatcher structures */ + +struct receiver { + struct hlist_node list; + struct rcu_head rcu; + canid_t can_id; + canid_t mask; + unsigned long matches; + void (*func)(struct sk_buff *, void *); + void *data; + char *ident; +}; + +enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_EFF, RX_MAX }; + +struct dev_rcv_lists { + struct hlist_node list; + struct rcu_head rcu; + struct net_device *dev; + struct hlist_head rx[RX_MAX]; + struct hlist_head rx_sff[0x800]; + int remove_on_zero_entries; + int entries; +}; + +/* statistic structures */ + +/* can be reset e.g. by can_init_stats() */ +struct s_stats { + unsigned long jiffies_init; + + unsigned long rx_frames; + unsigned long tx_frames; + unsigned long matches; + + unsigned long total_rx_rate; + unsigned long total_tx_rate; + unsigned long total_rx_match_ratio; + + unsigned long current_rx_rate; + unsigned long current_tx_rate; + unsigned long current_rx_match_ratio; + + unsigned long max_rx_rate; + unsigned long max_tx_rate; + unsigned long max_rx_match_ratio; + + unsigned long rx_frames_delta; + unsigned long tx_frames_delta; + unsigned long matches_delta; +}; + +/* persistent statistics */ +struct s_pstats { + unsigned long stats_reset; + unsigned long user_reset; + unsigned long rcv_entries; + unsigned long rcv_entries_max; +}; + +/* function prototypes for the CAN networklayer procfs (proc.c) */ +extern void can_init_proc(void); +extern void can_remove_proc(void); +extern void can_stat_update(unsigned long data); + +/* structures and variables from af_can.c needed in proc.c for reading */ +extern struct timer_list can_stattimer; /* timer for statistics update */ +extern struct s_stats can_stats; /* packet statistics */ +extern struct s_pstats can_pstats; /* receive list statistics */ +extern struct hlist_head can_rx_dev_list; /* rx dispatcher structures */ + +#endif /* AF_CAN_H */ diff --git a/net/can/bcm.c b/net/can/bcm.c new file mode 100644 index 00000000000..bd4282dae75 --- /dev/null +++ b/net/can/bcm.c @@ -0,0 +1,1561 @@ +/* + * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to <socketcan-users@lists.berlios.de> + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/proc_fs.h> +#include <linux/uio.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/socket.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <linux/can.h> +#include <linux/can/core.h> +#include <linux/can/bcm.h> +#include <net/sock.h> +#include <net/net_namespace.h> + +/* use of last_frames[index].can_dlc */ +#define RX_RECV 0x40 /* received data for this element */ +#define RX_THR 0x80 /* element not been sent due to throttle feature */ +#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */ + +/* get best masking value for can_rx_register() for a given single can_id */ +#define REGMASK(id) ((id & CAN_RTR_FLAG) | ((id & CAN_EFF_FLAG) ? \ + (CAN_EFF_MASK | CAN_EFF_FLAG) : CAN_SFF_MASK)) + +#define CAN_BCM_VERSION CAN_VERSION +static __initdata const char banner[] = KERN_INFO + "can: broadcast manager protocol (rev " CAN_BCM_VERSION ")\n"; + +MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); + +/* easy access to can_frame payload */ +static inline u64 GET_U64(const struct can_frame *cp) +{ + return *(u64 *)cp->data; +} + +struct bcm_op { + struct list_head list; + int ifindex; + canid_t can_id; + int flags; + unsigned long j_ival1, j_ival2, j_lastmsg; + unsigned long frames_abs, frames_filtered; + struct timer_list timer, thrtimer; + struct timeval ival1, ival2; + ktime_t rx_stamp; + int rx_ifindex; + int count; + int nframes; + int currframe; + struct can_frame *frames; + struct can_frame *last_frames; + struct can_frame sframe; + struct can_frame last_sframe; + struct sock *sk; + struct net_device *rx_reg_dev; +}; + +static struct proc_dir_entry *proc_dir; + +struct bcm_sock { + struct sock sk; + int bound; + int ifindex; + struct notifier_block notifier; + struct list_head rx_ops; + struct list_head tx_ops; + unsigned long dropped_usr_msgs; + struct proc_dir_entry *bcm_proc_read; + char procname [9]; /* pointer printed in ASCII with \0 */ +}; + +static inline struct bcm_sock *bcm_sk(const struct sock *sk) +{ + return (struct bcm_sock *)sk; +} + +#define CFSIZ sizeof(struct can_frame) +#define OPSIZ sizeof(struct bcm_op) +#define MHSIZ sizeof(struct bcm_msg_head) + +/* + * rounded_tv2jif - calculate jiffies from timeval including optional up + * @tv: pointer to timeval + * + * Description: + * Unlike timeval_to_jiffies() provided in include/linux/jiffies.h, this + * function is intentionally more relaxed on precise timer ticks to get + * exact one jiffy for requested 1000us on a 1000HZ machine. + * This code is to be removed when upgrading to kernel hrtimer. + * + * Return: + * calculated jiffies (max: ULONG_MAX) + */ +static unsigned long rounded_tv2jif(const struct timeval *tv) +{ + unsigned long sec = tv->tv_sec; + unsigned long usec = tv->tv_usec; + unsigned long jif; + + if (sec > ULONG_MAX / HZ) + return ULONG_MAX; + + /* round up to get at least the requested time */ + usec += 1000000 / HZ - 1; + + jif = usec / (1000000 / HZ); + + if (sec * HZ > ULONG_MAX - jif) + return ULONG_MAX; + + return jif + sec * HZ; +} + +/* + * procfs functions + */ +static char *bcm_proc_getifname(int ifindex) +{ + struct net_device *dev; + + if (!ifindex) + return "any"; + + /* no usage counting */ + dev = __dev_get_by_index(&init_net, ifindex); + if (dev) + return dev->name; + + return "???"; +} + +static int bcm_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = 0; + struct sock *sk = (struct sock *)data; + struct bcm_sock *bo = bcm_sk(sk); + struct bcm_op *op; + + len += snprintf(page + len, PAGE_SIZE - len, ">>> socket %p", + sk->sk_socket); + len += snprintf(page + len, PAGE_SIZE - len, " / sk %p", sk); + len += snprintf(page + len, PAGE_SIZE - len, " / bo %p", bo); + len += snprintf(page + len, PAGE_SIZE - len, " / dropped %lu", + bo->dropped_usr_msgs); + len += snprintf(page + len, PAGE_SIZE - len, " / bound %s", + bcm_proc_getifname(bo->ifindex)); + len += snprintf(page + len, PAGE_SIZE - len, " <<<\n"); + + list_for_each_entry(op, &bo->rx_ops, list) { + + unsigned long reduction; + + /* print only active entries & prevent division by zero */ + if (!op->frames_abs) + continue; + + len += snprintf(page + len, PAGE_SIZE - len, + "rx_op: %03X %-5s ", + op->can_id, bcm_proc_getifname(op->ifindex)); + len += snprintf(page + len, PAGE_SIZE - len, "[%d]%c ", + op->nframes, + (op->flags & RX_CHECK_DLC)?'d':' '); + if (op->j_ival1) + len += snprintf(page + len, PAGE_SIZE - len, + "timeo=%ld ", op->j_ival1); + + if (op->j_ival2) + len += snprintf(page + len, PAGE_SIZE - len, + "thr=%ld ", op->j_ival2); + + len += snprintf(page + len, PAGE_SIZE - len, + "# recv %ld (%ld) => reduction: ", + op->frames_filtered, op->frames_abs); + + reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; + + len += snprintf(page + len, PAGE_SIZE - len, "%s%ld%%\n", + (reduction == 100)?"near ":"", reduction); + + if (len > PAGE_SIZE - 200) { + /* mark output cut off */ + len += snprintf(page + len, PAGE_SIZE - len, "(..)\n"); + break; + } + } + + list_for_each_entry(op, &bo->tx_ops, list) { + + len += snprintf(page + len, PAGE_SIZE - len, + "tx_op: %03X %s [%d] ", + op->can_id, bcm_proc_getifname(op->ifindex), + op->nframes); + if (op->j_ival1) + len += snprintf(page + len, PAGE_SIZE - len, "t1=%ld ", + op->j_ival1); + + if (op->j_ival2) + len += snprintf(page + len, PAGE_SIZE - len, "t2=%ld ", + op->j_ival2); + + len += snprintf(page + len, PAGE_SIZE - len, "# sent %ld\n", + op->frames_abs); + + if (len > PAGE_SIZE - 100) { + /* mark output cut off */ + len += snprintf(page + len, PAGE_SIZE - len, "(..)\n"); + break; + } + } + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + + *eof = 1; + return len; +} + +/* + * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface + * of the given bcm tx op + */ +static void bcm_can_tx(struct bcm_op *op) +{ + struct sk_buff *skb; + struct net_device *dev; + struct can_frame *cf = &op->frames[op->currframe]; + + /* no target device? => exit */ + if (!op->ifindex) + return; + + dev = dev_get_by_index(&init_net, op->ifindex); + if (!dev) { + /* RFC: should this bcm_op remove itself here? */ + return; + } + + skb = alloc_skb(CFSIZ, gfp_any()); + if (!skb) + goto out; + + memcpy(skb_put(skb, CFSIZ), cf, CFSIZ); + + /* send with loopback */ + skb->dev = dev; + skb->sk = op->sk; + can_send(skb, 1); + + /* update statistics */ + op->currframe++; + op->frames_abs++; + + /* reached last frame? */ + if (op->currframe >= op->nframes) + op->currframe = 0; + out: + dev_put(dev); +} + +/* + * bcm_send_to_user - send a BCM message to the userspace + * (consisting of bcm_msg_head + x CAN frames) + */ +static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, + struct can_frame *frames, int has_timestamp) +{ + struct sk_buff *skb; + struct can_frame *firstframe; + struct sockaddr_can *addr; + struct sock *sk = op->sk; + int datalen = head->nframes * CFSIZ; + int err; + + skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); + if (!skb) + return; + + memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head)); + + if (head->nframes) { + /* can_frames starting here */ + firstframe = (struct can_frame *) skb_tail_pointer(skb); + + memcpy(skb_put(skb, datalen), frames, datalen); + + /* + * the BCM uses the can_dlc-element of the can_frame + * structure for internal purposes. This is only + * relevant for updates that are generated by the + * BCM, where nframes is 1 + */ + if (head->nframes == 1) + firstframe->can_dlc &= BCM_CAN_DLC_MASK; + } + + if (has_timestamp) { + /* restore rx timestamp */ + skb->tstamp = op->rx_stamp; + } + + /* + * Put the datagram to the queue so that bcm_recvmsg() can + * get it from there. We need to pass the interface index to + * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb + * containing the interface index. + */ + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + addr = (struct sockaddr_can *)skb->cb; + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = op->rx_ifindex; + + err = sock_queue_rcv_skb(sk, skb); + if (err < 0) { + struct bcm_sock *bo = bcm_sk(sk); + + kfree_skb(skb); + /* don't care about overflows in this statistic */ + bo->dropped_usr_msgs++; + } +} + +/* + * bcm_tx_timeout_handler - performes cyclic CAN frame transmissions + */ +static void bcm_tx_timeout_handler(unsigned long data) +{ + struct bcm_op *op = (struct bcm_op *)data; + + if (op->j_ival1 && (op->count > 0)) { + + op->count--; + if (!op->count && (op->flags & TX_COUNTEVT)) { + struct bcm_msg_head msg_head; + + /* create notification to user */ + msg_head.opcode = TX_EXPIRED; + msg_head.flags = op->flags; + msg_head.count = op->count; + msg_head.ival1 = op->ival1; + msg_head.ival2 = op->ival2; + msg_head.can_id = op->can_id; + msg_head.nframes = 0; + + bcm_send_to_user(op, &msg_head, NULL, 0); + } + } + + if (op->j_ival1 && (op->count > 0)) { + + /* send (next) frame */ + bcm_can_tx(op); + mod_timer(&op->timer, jiffies + op->j_ival1); + + } else { + if (op->j_ival2) { + + /* send (next) frame */ + bcm_can_tx(op); + mod_timer(&op->timer, jiffies + op->j_ival2); + } + } + + return; +} + +/* + * bcm_rx_changed - create a RX_CHANGED notification due to changed content + */ +static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data) +{ + struct bcm_msg_head head; + + op->j_lastmsg = jiffies; + + /* update statistics */ + op->frames_filtered++; + + /* prevent statistics overflow */ + if (op->frames_filtered > ULONG_MAX/100) + op->frames_filtered = op->frames_abs = 0; + + head.opcode = RX_CHANGED; + head.flags = op->flags; + head.count = op->count; + head.ival1 = op->ival1; + head.ival2 = op->ival2; + head.can_id = op->can_id; + head.nframes = 1; + + bcm_send_to_user(op, &head, data, 1); +} + +/* + * bcm_rx_update_and_send - process a detected relevant receive content change + * 1. update the last received data + * 2. send a notification to the user (if possible) + */ +static void bcm_rx_update_and_send(struct bcm_op *op, + struct can_frame *lastdata, + struct can_frame *rxdata) +{ + unsigned long nexttx = op->j_lastmsg + op->j_ival2; + + memcpy(lastdata, rxdata, CFSIZ); + + /* mark as used */ + lastdata->can_dlc |= RX_RECV; + + /* throttle bcm_rx_changed ? */ + if ((op->thrtimer.expires) || + ((op->j_ival2) && (nexttx > jiffies))) { + /* we are already waiting OR we have to start waiting */ + + /* mark as 'throttled' */ + lastdata->can_dlc |= RX_THR; + + if (!(op->thrtimer.expires)) { + /* start the timer only the first time */ + mod_timer(&op->thrtimer, nexttx); + } + + } else { + /* send RX_CHANGED to the user immediately */ + bcm_rx_changed(op, rxdata); + } +} + +/* + * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly + * received data stored in op->last_frames[] + */ +static void bcm_rx_cmp_to_index(struct bcm_op *op, int index, + struct can_frame *rxdata) +{ + /* + * no one uses the MSBs of can_dlc for comparation, + * so we use it here to detect the first time of reception + */ + + if (!(op->last_frames[index].can_dlc & RX_RECV)) { + /* received data for the first time => send update to user */ + bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); + return; + } + + /* do a real check in can_frame data section */ + + if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) != + (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) { + bcm_rx_update_and_send(op, &op->last_frames[index], rxdata); + return; + } + + if (op->flags & RX_CHECK_DLC) { + /* do a real check in can_frame dlc */ + if (rxdata->can_dlc != (op->last_frames[index].can_dlc & + BCM_CAN_DLC_MASK)) { + bcm_rx_update_and_send(op, &op->last_frames[index], + rxdata); + return; + } + } +} + +/* + * bcm_rx_starttimer - enable timeout monitoring for CAN frame receiption + */ +static void bcm_rx_starttimer(struct bcm_op *op) +{ + if (op->flags & RX_NO_AUTOTIMER) + return; + + if (op->j_ival1) + mod_timer(&op->timer, jiffies + op->j_ival1); +} + +/* + * bcm_rx_timeout_handler - when the (cyclic) CAN frame receiption timed out + */ +static void bcm_rx_timeout_handler(unsigned long data) +{ + struct bcm_op *op = (struct bcm_op *)data; + struct bcm_msg_head msg_head; + + msg_head.opcode = RX_TIMEOUT; + msg_head.flags = op->flags; + msg_head.count = op->count; + msg_head.ival1 = op->ival1; + msg_head.ival2 = op->ival2; + msg_head.can_id = op->can_id; + msg_head.nframes = 0; + + bcm_send_to_user(op, &msg_head, NULL, 0); + + /* no restart of the timer is done here! */ + + /* if user wants to be informed, when cyclic CAN-Messages come back */ + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { + /* clear received can_frames to indicate 'nothing received' */ + memset(op->last_frames, 0, op->nframes * CFSIZ); + } +} + +/* + * bcm_rx_thr_handler - the time for blocked content updates is over now: + * Check for throttled data and send it to the userspace + */ +static void bcm_rx_thr_handler(unsigned long data) +{ + struct bcm_op *op = (struct bcm_op *)data; + int i = 0; + + /* mark disabled / consumed timer */ + op->thrtimer.expires = 0; + + if (op->nframes > 1) { + /* for MUX filter we start at index 1 */ + for (i = 1; i < op->nframes; i++) { + if ((op->last_frames) && + (op->last_frames[i].can_dlc & RX_THR)) { + op->last_frames[i].can_dlc &= ~RX_THR; + bcm_rx_changed(op, &op->last_frames[i]); + } + } + + } else { + /* for RX_FILTER_ID and simple filter */ + if (op->last_frames && (op->last_frames[0].can_dlc & RX_THR)) { + op->last_frames[0].can_dlc &= ~RX_THR; + bcm_rx_changed(op, &op->last_frames[0]); + } + } +} + +/* + * bcm_rx_handler - handle a CAN frame receiption + */ +static void bcm_rx_handler(struct sk_buff *skb, void *data) +{ + struct bcm_op *op = (struct bcm_op *)data; + struct can_frame rxframe; + int i; + + /* disable timeout */ + del_timer(&op->timer); + + if (skb->len == sizeof(rxframe)) { + memcpy(&rxframe, skb->data, sizeof(rxframe)); + /* save rx timestamp */ + op->rx_stamp = skb->tstamp; + /* save originator for recvfrom() */ + op->rx_ifindex = skb->dev->ifindex; + /* update statistics */ + op->frames_abs++; + kfree_skb(skb); + + } else { + kfree_skb(skb); + return; + } + + if (op->can_id != rxframe.can_id) + return; + + if (op->flags & RX_RTR_FRAME) { + /* send reply for RTR-request (placed in op->frames[0]) */ + bcm_can_tx(op); + return; + } + + if (op->flags & RX_FILTER_ID) { + /* the easiest case */ + bcm_rx_update_and_send(op, &op->last_frames[0], &rxframe); + bcm_rx_starttimer(op); + return; + } + + if (op->nframes == 1) { + /* simple compare with index 0 */ + bcm_rx_cmp_to_index(op, 0, &rxframe); + bcm_rx_starttimer(op); + return; + } + + if (op->nframes > 1) { + /* + * multiplex compare + * + * find the first multiplex mask that fits. + * Remark: The MUX-mask is stored in index 0 + */ + + for (i = 1; i < op->nframes; i++) { + if ((GET_U64(&op->frames[0]) & GET_U64(&rxframe)) == + (GET_U64(&op->frames[0]) & + GET_U64(&op->frames[i]))) { + bcm_rx_cmp_to_index(op, i, &rxframe); + break; + } + } + bcm_rx_starttimer(op); + } +} + +/* + * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements + */ +static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id, + int ifindex) +{ + struct bcm_op *op; + + list_for_each_entry(op, ops, list) { + if ((op->can_id == can_id) && (op->ifindex == ifindex)) + return op; + } + + return NULL; +} + +static void bcm_remove_op(struct bcm_op *op) +{ + del_timer(&op->timer); + del_timer(&op->thrtimer); + + if ((op->frames) && (op->frames != &op->sframe)) + kfree(op->frames); + + if ((op->last_frames) && (op->last_frames != &op->last_sframe)) + kfree(op->last_frames); + + kfree(op); + + return; +} + +static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) +{ + if (op->rx_reg_dev == dev) { + can_rx_unregister(dev, op->can_id, REGMASK(op->can_id), + bcm_rx_handler, op); + + /* mark as removed subscription */ + op->rx_reg_dev = NULL; + } else + printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device " + "mismatch %p %p\n", op->rx_reg_dev, dev); +} + +/* + * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) + */ +static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex) +{ + struct bcm_op *op, *n; + + list_for_each_entry_safe(op, n, ops, list) { + if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + + /* + * Don't care if we're bound or not (due to netdev + * problems) can_rx_unregister() is always a save + * thing to do here. + */ + if (op->ifindex) { + /* + * Only remove subscriptions that had not + * been removed due to NETDEV_UNREGISTER + * in bcm_notifier() + */ + if (op->rx_reg_dev) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, + op->ifindex); + if (dev) { + bcm_rx_unreg(dev, op); + dev_put(dev); + } + } + } else + can_rx_unregister(NULL, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op); + + list_del(&op->list); + bcm_remove_op(op); + return 1; /* done */ + } + } + + return 0; /* not found */ +} + +/* + * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) + */ +static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex) +{ + struct bcm_op *op, *n; + + list_for_each_entry_safe(op, n, ops, list) { + if ((op->can_id == can_id) && (op->ifindex == ifindex)) { + list_del(&op->list); + bcm_remove_op(op); + return 1; /* done */ + } + } + + return 0; /* not found */ +} + +/* + * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg) + */ +static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, + int ifindex) +{ + struct bcm_op *op = bcm_find_op(ops, msg_head->can_id, ifindex); + + if (!op) + return -EINVAL; + + /* put current values into msg_head */ + msg_head->flags = op->flags; + msg_head->count = op->count; + msg_head->ival1 = op->ival1; + msg_head->ival2 = op->ival2; + msg_head->nframes = op->nframes; + + bcm_send_to_user(op, msg_head, op->frames, 0); + + return MHSIZ; +} + +/* + * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg) + */ +static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, + int ifindex, struct sock *sk) +{ + struct bcm_sock *bo = bcm_sk(sk); + struct bcm_op *op; + int i, err; + + /* we need a real device to send frames */ + if (!ifindex) + return -ENODEV; + + /* we need at least one can_frame */ + if (msg_head->nframes < 1) + return -EINVAL; + + /* check the given can_id */ + op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex); + + if (op) { + /* update existing BCM operation */ + + /* + * Do we need more space for the can_frames than currently + * allocated? -> This is a _really_ unusual use-case and + * therefore (complexity / locking) it is not supported. + */ + if (msg_head->nframes > op->nframes) + return -E2BIG; + + /* update can_frames content */ + for (i = 0; i < msg_head->nframes; i++) { + err = memcpy_fromiovec((u8 *)&op->frames[i], + msg->msg_iov, CFSIZ); + if (err < 0) + return err; + + if (msg_head->flags & TX_CP_CAN_ID) { + /* copy can_id into frame */ + op->frames[i].can_id = msg_head->can_id; + } + } + + } else { + /* insert new BCM operation for the given can_id */ + + op = kzalloc(OPSIZ, GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->can_id = msg_head->can_id; + + /* create array for can_frames and copy the data */ + if (msg_head->nframes > 1) { + op->frames = kmalloc(msg_head->nframes * CFSIZ, + GFP_KERNEL); + if (!op->frames) { + kfree(op); + return -ENOMEM; + } + } else + op->frames = &op->sframe; + + for (i = 0; i < msg_head->nframes; i++) { + err = memcpy_fromiovec((u8 *)&op->frames[i], + msg->msg_iov, CFSIZ); + if (err < 0) { + if (op->frames != &op->sframe) + kfree(op->frames); + kfree(op); + return err; + } + + if (msg_head->flags & TX_CP_CAN_ID) { + /* copy can_id into frame */ + op->frames[i].can_id = msg_head->can_id; + } + } + + /* tx_ops never compare with previous received messages */ + op->last_frames = NULL; + + /* bcm_can_tx / bcm_tx_timeout_handler needs this */ + op->sk = sk; + op->ifindex = ifindex; + + /* initialize uninitialized (kzalloc) structure */ + setup_timer(&op->timer, bcm_tx_timeout_handler, + (unsigned long)op); + + /* currently unused in tx_ops */ + init_timer(&op->thrtimer); + + /* add this bcm_op to the list of the tx_ops */ + list_add(&op->list, &bo->tx_ops); + + } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ + + if (op->nframes != msg_head->nframes) { + op->nframes = msg_head->nframes; + /* start multiple frame transmission with index 0 */ + op->currframe = 0; + } + + /* check flags */ + + op->flags = msg_head->flags; + + if (op->flags & TX_RESET_MULTI_IDX) { + /* start multiple frame transmission with index 0 */ + op->currframe = 0; + } + + if (op->flags & SETTIMER) { + /* set timer values */ + op->count = msg_head->count; + op->ival1 = msg_head->ival1; + op->ival2 = msg_head->ival2; + op->j_ival1 = rounded_tv2jif(&msg_head->ival1); + op->j_ival2 = rounded_tv2jif(&msg_head->ival2); + + /* disable an active timer due to zero values? */ + if (!op->j_ival1 && !op->j_ival2) + del_timer(&op->timer); + } + + if ((op->flags & STARTTIMER) && + ((op->j_ival1 && op->count) || op->j_ival2)) { + + /* spec: send can_frame when starting timer */ + op->flags |= TX_ANNOUNCE; + + if (op->j_ival1 && (op->count > 0)) { + /* op->count-- is done in bcm_tx_timeout_handler */ + mod_timer(&op->timer, jiffies + op->j_ival1); + } else + mod_timer(&op->timer, jiffies + op->j_ival2); + } + + if (op->flags & TX_ANNOUNCE) + bcm_can_tx(op); + + return msg_head->nframes * CFSIZ + MHSIZ; +} + +/* + * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg) + */ +static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, + int ifindex, struct sock *sk) +{ + struct bcm_sock *bo = bcm_sk(sk); + struct bcm_op *op; + int do_rx_register; + int err = 0; + + if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) { + /* be robust against wrong usage ... */ + msg_head->flags |= RX_FILTER_ID; + /* ignore trailing garbage */ + msg_head->nframes = 0; + } + + if ((msg_head->flags & RX_RTR_FRAME) && + ((msg_head->nframes != 1) || + (!(msg_head->can_id & CAN_RTR_FLAG)))) + return -EINVAL; + + /* check the given can_id */ + op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex); + if (op) { + /* update existing BCM operation */ + + /* + * Do we need more space for the can_frames than currently + * allocated? -> This is a _really_ unusual use-case and + * therefore (complexity / locking) it is not supported. + */ + if (msg_head->nframes > op->nframes) + return -E2BIG; + + if (msg_head->nframes) { + /* update can_frames content */ + err = memcpy_fromiovec((u8 *)op->frames, + msg->msg_iov, + msg_head->nframes * CFSIZ); + if (err < 0) + return err; + + /* clear last_frames to indicate 'nothing received' */ + memset(op->last_frames, 0, msg_head->nframes * CFSIZ); + } + + op->nframes = msg_head->nframes; + + /* Only an update -> do not call can_rx_register() */ + do_rx_register = 0; + + } else { + /* insert new BCM operation for the given can_id */ + op = kzalloc(OPSIZ, GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->can_id = msg_head->can_id; + op->nframes = msg_head->nframes; + + if (msg_head->nframes > 1) { + /* create array for can_frames and copy the data */ + op->frames = kmalloc(msg_head->nframes * CFSIZ, + GFP_KERNEL); + if (!op->frames) { + kfree(op); + return -ENOMEM; + } + + /* create and init array for received can_frames */ + op->last_frames = kzalloc(msg_head->nframes * CFSIZ, + GFP_KERNEL); + if (!op->last_frames) { + kfree(op->frames); + kfree(op); + return -ENOMEM; + } + + } else { + op->frames = &op->sframe; + op->last_frames = &op->last_sframe; + } + + if (msg_head->nframes) { + err = memcpy_fromiovec((u8 *)op->frames, msg->msg_iov, + msg_head->nframes * CFSIZ); + if (err < 0) { + if (op->frames != &op->sframe) + kfree(op->frames); + if (op->last_frames != &op->last_sframe) + kfree(op->last_frames); + kfree(op); + return err; + } + } + + /* bcm_can_tx / bcm_tx_timeout_handler needs this */ + op->sk = sk; + op->ifindex = ifindex; + + /* initialize uninitialized (kzalloc) structure */ + setup_timer(&op->timer, bcm_rx_timeout_handler, + (unsigned long)op); + + /* init throttle timer for RX_CHANGED */ + setup_timer(&op->thrtimer, bcm_rx_thr_handler, + (unsigned long)op); + + /* mark disabled timer */ + op->thrtimer.expires = 0; + + /* add this bcm_op to the list of the rx_ops */ + list_add(&op->list, &bo->rx_ops); + + /* call can_rx_register() */ + do_rx_register = 1; + + } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ + + /* check flags */ + op->flags = msg_head->flags; + + if (op->flags & RX_RTR_FRAME) { + + /* no timers in RTR-mode */ + del_timer(&op->thrtimer); + del_timer(&op->timer); + + /* + * funny feature in RX(!)_SETUP only for RTR-mode: + * copy can_id into frame BUT without RTR-flag to + * prevent a full-load-loopback-test ... ;-] + */ + if ((op->flags & TX_CP_CAN_ID) || + (op->frames[0].can_id == op->can_id)) + op->frames[0].can_id = op->can_id & ~CAN_RTR_FLAG; + + } else { + if (op->flags & SETTIMER) { + + /* set timer value */ + op->ival1 = msg_head->ival1; + op->ival2 = msg_head->ival2; + op->j_ival1 = rounded_tv2jif(&msg_head->ival1); + op->j_ival2 = rounded_tv2jif(&msg_head->ival2); + + /* disable an active timer due to zero value? */ + if (!op->j_ival1) + del_timer(&op->timer); + + /* free currently blocked msgs ? */ + if (op->thrtimer.expires) { + /* send blocked msgs hereafter */ + mod_timer(&op->thrtimer, jiffies + 2); + } + + /* + * if (op->j_ival2) is zero, no (new) throttling + * will happen. For details see functions + * bcm_rx_update_and_send() and bcm_rx_thr_handler() + */ + } + + if ((op->flags & STARTTIMER) && op->j_ival1) + mod_timer(&op->timer, jiffies + op->j_ival1); + } + + /* now we can register for can_ids, if we added a new bcm_op */ + if (do_rx_register) { + if (ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ifindex); + if (dev) { + err = can_rx_register(dev, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op, + "bcm"); + + op->rx_reg_dev = dev; + dev_put(dev); + } + + } else + err = can_rx_register(NULL, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op, "bcm"); + if (err) { + /* this bcm rx op is broken -> remove it */ + list_del(&op->list); + bcm_remove_op(op); + return err; + } + } + + return msg_head->nframes * CFSIZ + MHSIZ; +} + +/* + * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) + */ +static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) +{ + struct sk_buff *skb; + struct net_device *dev; + int err; + + /* we need a real device to send frames */ + if (!ifindex) + return -ENODEV; + + skb = alloc_skb(CFSIZ, GFP_KERNEL); + + if (!skb) + return -ENOMEM; + + err = memcpy_fromiovec(skb_put(skb, CFSIZ), msg->msg_iov, CFSIZ); + if (err < 0) { + kfree_skb(skb); + return err; + } + + dev = dev_get_by_index(&init_net, ifindex); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + + skb->dev = dev; + skb->sk = sk; + can_send(skb, 1); /* send with loopback */ + dev_put(dev); + + return CFSIZ + MHSIZ; +} + +/* + * bcm_sendmsg - process BCM commands (opcodes) from the userspace + */ +static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct bcm_sock *bo = bcm_sk(sk); + int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ + struct bcm_msg_head msg_head; + int ret; /* read bytes or error codes as return value */ + + if (!bo->bound) + return -ENOTCONN; + + /* check for alternative ifindex for this bcm_op */ + + if (!ifindex && msg->msg_name) { + /* no bound device as default => check msg_name */ + struct sockaddr_can *addr = + (struct sockaddr_can *)msg->msg_name; + + if (addr->can_family != AF_CAN) + return -EINVAL; + + /* ifindex from sendto() */ + ifindex = addr->can_ifindex; + + if (ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ifindex); + if (!dev) + return -ENODEV; + + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + return -ENODEV; + } + + dev_put(dev); + } + } + + /* read message head information */ + + ret = memcpy_fromiovec((u8 *)&msg_head, msg->msg_iov, MHSIZ); + if (ret < 0) + return ret; + + lock_sock(sk); + + switch (msg_head.opcode) { + + case TX_SETUP: + ret = bcm_tx_setup(&msg_head, msg, ifindex, sk); + break; + + case RX_SETUP: + ret = bcm_rx_setup(&msg_head, msg, ifindex, sk); + break; + + case TX_DELETE: + if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex)) + ret = MHSIZ; + else + ret = -EINVAL; + break; + + case RX_DELETE: + if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex)) + ret = MHSIZ; + else + ret = -EINVAL; + break; + + case TX_READ: + /* reuse msg_head for the reply to TX_READ */ + msg_head.opcode = TX_STATUS; + ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex); + break; + + case RX_READ: + /* reuse msg_head for the reply to RX_READ */ + msg_head.opcode = RX_STATUS; + ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex); + break; + + case TX_SEND: + /* we need at least one can_frame */ + if (msg_head.nframes < 1) + ret = -EINVAL; + else + ret = bcm_tx_send(msg, ifindex, sk); + break; + + default: + ret = -EINVAL; + break; + } + + release_sock(sk); + + return ret; +} + +/* + * notification handler for netdevice status changes + */ +static int bcm_notifier(struct notifier_block *nb, unsigned long msg, + void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier); + struct sock *sk = &bo->sk; + struct bcm_op *op; + int notify_enodev = 0; + + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + switch (msg) { + + case NETDEV_UNREGISTER: + lock_sock(sk); + + /* remove device specific receive entries */ + list_for_each_entry(op, &bo->rx_ops, list) + if (op->rx_reg_dev == dev) + bcm_rx_unreg(dev, op); + + /* remove device reference, if this is our bound device */ + if (bo->bound && bo->ifindex == dev->ifindex) { + bo->bound = 0; + bo->ifindex = 0; + notify_enodev = 1; + } + + release_sock(sk); + + if (notify_enodev) { + sk->sk_err = ENODEV; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + break; + + case NETDEV_DOWN: + if (bo->bound && bo->ifindex == dev->ifindex) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + } + + return NOTIFY_DONE; +} + +/* + * initial settings for all BCM sockets to be set at socket creation time + */ +static int bcm_init(struct sock *sk) +{ + struct bcm_sock *bo = bcm_sk(sk); + + bo->bound = 0; + bo->ifindex = 0; + bo->dropped_usr_msgs = 0; + bo->bcm_proc_read = NULL; + + INIT_LIST_HEAD(&bo->tx_ops); + INIT_LIST_HEAD(&bo->rx_ops); + + /* set notifier */ + bo->notifier.notifier_call = bcm_notifier; + + register_netdevice_notifier(&bo->notifier); + + return 0; +} + +/* + * standard socket functions + */ +static int bcm_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct bcm_sock *bo = bcm_sk(sk); + struct bcm_op *op, *next; + + /* remove bcm_ops, timer, rx_unregister(), etc. */ + + unregister_netdevice_notifier(&bo->notifier); + + lock_sock(sk); + + list_for_each_entry_safe(op, next, &bo->tx_ops, list) + bcm_remove_op(op); + + list_for_each_entry_safe(op, next, &bo->rx_ops, list) { + /* + * Don't care if we're bound or not (due to netdev problems) + * can_rx_unregister() is always a save thing to do here. + */ + if (op->ifindex) { + /* + * Only remove subscriptions that had not + * been removed due to NETDEV_UNREGISTER + * in bcm_notifier() + */ + if (op->rx_reg_dev) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, op->ifindex); + if (dev) { + bcm_rx_unreg(dev, op); + dev_put(dev); + } + } + } else + can_rx_unregister(NULL, op->can_id, + REGMASK(op->can_id), + bcm_rx_handler, op); + + bcm_remove_op(op); + } + + /* remove procfs entry */ + if (proc_dir && bo->bcm_proc_read) + remove_proc_entry(bo->procname, proc_dir); + + /* remove device reference */ + if (bo->bound) { + bo->bound = 0; + bo->ifindex = 0; + } + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, + int flags) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct bcm_sock *bo = bcm_sk(sk); + + if (bo->bound) + return -EISCONN; + + /* bind a device to this socket */ + if (addr->can_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, addr->can_ifindex); + if (!dev) + return -ENODEV; + + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + return -ENODEV; + } + + bo->ifindex = dev->ifindex; + dev_put(dev); + + } else { + /* no interface reference for ifindex = 0 ('any' CAN device) */ + bo->ifindex = 0; + } + + bo->bound = 1; + + if (proc_dir) { + /* unique socket address as filename */ + sprintf(bo->procname, "%p", sock); + bo->bcm_proc_read = create_proc_read_entry(bo->procname, 0644, + proc_dir, + bcm_read_proc, sk); + } + + return 0; +} + +static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int error = 0; + int noblock; + int err; + + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; + skb = skb_recv_datagram(sk, flags, noblock, &error); + if (!skb) + return error; + + if (skb->len < size) + size = skb->len; + + err = memcpy_toiovec(msg->msg_iov, skb->data, size); + if (err < 0) { + skb_free_datagram(sk, skb); + return err; + } + + sock_recv_timestamp(msg, sk, skb); + + if (msg->msg_name) { + msg->msg_namelen = sizeof(struct sockaddr_can); + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + } + + skb_free_datagram(sk, skb); + + return size; +} + +static struct proto_ops bcm_ops __read_mostly = { + .family = PF_CAN, + .release = bcm_release, + .bind = sock_no_bind, + .connect = bcm_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = NULL, /* use can_ioctl() from af_can.c */ + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = bcm_sendmsg, + .recvmsg = bcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto bcm_proto __read_mostly = { + .name = "CAN_BCM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bcm_sock), + .init = bcm_init, +}; + +static struct can_proto bcm_can_proto __read_mostly = { + .type = SOCK_DGRAM, + .protocol = CAN_BCM, + .capability = -1, + .ops = &bcm_ops, + .prot = &bcm_proto, +}; + +static int __init bcm_module_init(void) +{ + int err; + + printk(banner); + + err = can_proto_register(&bcm_can_proto); + if (err < 0) { + printk(KERN_ERR "can: registration of bcm protocol failed\n"); + return err; + } + + /* create /proc/net/can-bcm directory */ + proc_dir = proc_mkdir("can-bcm", init_net.proc_net); + + if (proc_dir) + proc_dir->owner = THIS_MODULE; + + return 0; +} + +static void __exit bcm_module_exit(void) +{ + can_proto_unregister(&bcm_can_proto); + + if (proc_dir) + proc_net_remove(&init_net, "can-bcm"); +} + +module_init(bcm_module_init); +module_exit(bcm_module_exit); diff --git a/net/can/proc.c b/net/can/proc.c new file mode 100644 index 00000000000..520fef5e539 --- /dev/null +++ b/net/can/proc.c @@ -0,0 +1,533 @@ +/* + * proc.c - procfs support for Protocol family CAN core module + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to <socketcan-users@lists.berlios.de> + * + */ + +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/can/core.h> + +#include "af_can.h" + +/* + * proc filenames for the PF_CAN core + */ + +#define CAN_PROC_VERSION "version" +#define CAN_PROC_STATS "stats" +#define CAN_PROC_RESET_STATS "reset_stats" +#define CAN_PROC_RCVLIST_ALL "rcvlist_all" +#define CAN_PROC_RCVLIST_FIL "rcvlist_fil" +#define CAN_PROC_RCVLIST_INV "rcvlist_inv" +#define CAN_PROC_RCVLIST_SFF "rcvlist_sff" +#define CAN_PROC_RCVLIST_EFF "rcvlist_eff" +#define CAN_PROC_RCVLIST_ERR "rcvlist_err" + +static struct proc_dir_entry *can_dir; +static struct proc_dir_entry *pde_version; +static struct proc_dir_entry *pde_stats; +static struct proc_dir_entry *pde_reset_stats; +static struct proc_dir_entry *pde_rcvlist_all; +static struct proc_dir_entry *pde_rcvlist_fil; +static struct proc_dir_entry *pde_rcvlist_inv; +static struct proc_dir_entry *pde_rcvlist_sff; +static struct proc_dir_entry *pde_rcvlist_eff; +static struct proc_dir_entry *pde_rcvlist_err; + +static int user_reset; + +static const char rx_list_name[][8] = { + [RX_ERR] = "rx_err", + [RX_ALL] = "rx_all", + [RX_FIL] = "rx_fil", + [RX_INV] = "rx_inv", + [RX_EFF] = "rx_eff", +}; + +/* + * af_can statistics stuff + */ + +static void can_init_stats(void) +{ + /* + * This memset function is called from a timer context (when + * can_stattimer is active which is the default) OR in a process + * context (reading the proc_fs when can_stattimer is disabled). + */ + memset(&can_stats, 0, sizeof(can_stats)); + can_stats.jiffies_init = jiffies; + + can_pstats.stats_reset++; + + if (user_reset) { + user_reset = 0; + can_pstats.user_reset++; + } +} + +static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, + unsigned long count) +{ + unsigned long rate; + + if (oldjif == newjif) + return 0; + + /* see can_stat_update() - this should NEVER happen! */ + if (count > (ULONG_MAX / HZ)) { + printk(KERN_ERR "can: calc_rate: count exceeded! %ld\n", + count); + return 99999999; + } + + rate = (count * HZ) / (newjif - oldjif); + + return rate; +} + +void can_stat_update(unsigned long data) +{ + unsigned long j = jiffies; /* snapshot */ + + /* restart counting in timer context on user request */ + if (user_reset) + can_init_stats(); + + /* restart counting on jiffies overflow */ + if (j < can_stats.jiffies_init) + can_init_stats(); + + /* prevent overflow in calc_rate() */ + if (can_stats.rx_frames > (ULONG_MAX / HZ)) + can_init_stats(); + + /* prevent overflow in calc_rate() */ + if (can_stats.tx_frames > (ULONG_MAX / HZ)) + can_init_stats(); + + /* matches overflow - very improbable */ + if (can_stats.matches > (ULONG_MAX / 100)) + can_init_stats(); + + /* calc total values */ + if (can_stats.rx_frames) + can_stats.total_rx_match_ratio = (can_stats.matches * 100) / + can_stats.rx_frames; + + can_stats.total_tx_rate = calc_rate(can_stats.jiffies_init, j, + can_stats.tx_frames); + can_stats.total_rx_rate = calc_rate(can_stats.jiffies_init, j, + can_stats.rx_frames); + + /* calc current values */ + if (can_stats.rx_frames_delta) + can_stats.current_rx_match_ratio = + (can_stats.matches_delta * 100) / + can_stats.rx_frames_delta; + + can_stats.current_tx_rate = calc_rate(0, HZ, can_stats.tx_frames_delta); + can_stats.current_rx_rate = calc_rate(0, HZ, can_stats.rx_frames_delta); + + /* check / update maximum values */ + if (can_stats.max_tx_rate < can_stats.current_tx_rate) + can_stats.max_tx_rate = can_stats.current_tx_rate; + + if (can_stats.max_rx_rate < can_stats.current_rx_rate) + can_stats.max_rx_rate = can_stats.current_rx_rate; + + if (can_stats.max_rx_match_ratio < can_stats.current_rx_match_ratio) + can_stats.max_rx_match_ratio = can_stats.current_rx_match_ratio; + + /* clear values for 'current rate' calculation */ + can_stats.tx_frames_delta = 0; + can_stats.rx_frames_delta = 0; + can_stats.matches_delta = 0; + + /* restart timer (one second) */ + mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); +} + +/* + * proc read functions + * + * From known use-cases we expect about 10 entries in a receive list to be + * printed in the proc_fs. So PAGE_SIZE is definitely enough space here. + * + */ + +static int can_print_rcvlist(char *page, int len, struct hlist_head *rx_list, + struct net_device *dev) +{ + struct receiver *r; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(r, n, rx_list, list) { + char *fmt = (r->can_id & CAN_EFF_FLAG)? + " %-5s %08X %08x %08x %08x %8ld %s\n" : + " %-5s %03X %08x %08lx %08lx %8ld %s\n"; + + len += snprintf(page + len, PAGE_SIZE - len, fmt, + DNAME(dev), r->can_id, r->mask, + (unsigned long)r->func, (unsigned long)r->data, + r->matches, r->ident); + + /* does a typical line fit into the current buffer? */ + + /* 100 Bytes before end of buffer */ + if (len > PAGE_SIZE - 100) { + /* mark output cut off */ + len += snprintf(page + len, PAGE_SIZE - len, + " (..)\n"); + break; + } + } + rcu_read_unlock(); + + return len; +} + +static int can_print_recv_banner(char *page, int len) +{ + /* + * can1. 00000000 00000000 00000000 + * ....... 0 tp20 + */ + len += snprintf(page + len, PAGE_SIZE - len, + " device can_id can_mask function" + " userdata matches ident\n"); + + return len; +} + +static int can_proc_read_stats(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = 0; + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld transmitted frames (TXF)\n", + can_stats.tx_frames); + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld received frames (RXF)\n", can_stats.rx_frames); + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld matched frames (RXMF)\n", can_stats.matches); + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + + if (can_stattimer.function == can_stat_update) { + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld %% total match ratio (RXMR)\n", + can_stats.total_rx_match_ratio); + + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld frames/s total tx rate (TXR)\n", + can_stats.total_tx_rate); + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld frames/s total rx rate (RXR)\n", + can_stats.total_rx_rate); + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld %% current match ratio (CRXMR)\n", + can_stats.current_rx_match_ratio); + + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld frames/s current tx rate (CTXR)\n", + can_stats.current_tx_rate); + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld frames/s current rx rate (CRXR)\n", + can_stats.current_rx_rate); + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld %% max match ratio (MRXMR)\n", + can_stats.max_rx_match_ratio); + + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld frames/s max tx rate (MTXR)\n", + can_stats.max_tx_rate); + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld frames/s max rx rate (MRXR)\n", + can_stats.max_rx_rate); + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + } + + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld current receive list entries (CRCV)\n", + can_pstats.rcv_entries); + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld maximum receive list entries (MRCV)\n", + can_pstats.rcv_entries_max); + + if (can_pstats.stats_reset) + len += snprintf(page + len, PAGE_SIZE - len, + "\n %8ld statistic resets (STR)\n", + can_pstats.stats_reset); + + if (can_pstats.user_reset) + len += snprintf(page + len, PAGE_SIZE - len, + " %8ld user statistic resets (USTR)\n", + can_pstats.user_reset); + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + + *eof = 1; + return len; +} + +static int can_proc_read_reset_stats(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = 0; + + user_reset = 1; + + if (can_stattimer.function == can_stat_update) { + len += snprintf(page + len, PAGE_SIZE - len, + "Scheduled statistic reset #%ld.\n", + can_pstats.stats_reset + 1); + + } else { + if (can_stats.jiffies_init != jiffies) + can_init_stats(); + + len += snprintf(page + len, PAGE_SIZE - len, + "Performed statistic reset #%ld.\n", + can_pstats.stats_reset); + } + + *eof = 1; + return len; +} + +static int can_proc_read_version(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = 0; + + len += snprintf(page + len, PAGE_SIZE - len, "%s\n", + CAN_VERSION_STRING); + *eof = 1; + return len; +} + +static int can_proc_read_rcvlist(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + /* double cast to prevent GCC warning */ + int idx = (int)(long)data; + int len = 0; + struct dev_rcv_lists *d; + struct hlist_node *n; + + len += snprintf(page + len, PAGE_SIZE - len, + "\nreceive list '%s':\n", rx_list_name[idx]); + + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &can_rx_dev_list, list) { + + if (!hlist_empty(&d->rx[idx])) { + len = can_print_recv_banner(page, len); + len = can_print_rcvlist(page, len, &d->rx[idx], d->dev); + } else + len += snprintf(page + len, PAGE_SIZE - len, + " (%s: no entry)\n", DNAME(d->dev)); + + /* exit on end of buffer? */ + if (len > PAGE_SIZE - 100) + break; + } + rcu_read_unlock(); + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + + *eof = 1; + return len; +} + +static int can_proc_read_rcvlist_sff(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = 0; + struct dev_rcv_lists *d; + struct hlist_node *n; + + /* RX_SFF */ + len += snprintf(page + len, PAGE_SIZE - len, + "\nreceive list 'rx_sff':\n"); + + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &can_rx_dev_list, list) { + int i, all_empty = 1; + /* check wether at least one list is non-empty */ + for (i = 0; i < 0x800; i++) + if (!hlist_empty(&d->rx_sff[i])) { + all_empty = 0; + break; + } + + if (!all_empty) { + len = can_print_recv_banner(page, len); + for (i = 0; i < 0x800; i++) { + if (!hlist_empty(&d->rx_sff[i]) && + len < PAGE_SIZE - 100) + len = can_print_rcvlist(page, len, + &d->rx_sff[i], + d->dev); + } + } else + len += snprintf(page + len, PAGE_SIZE - len, + " (%s: no entry)\n", DNAME(d->dev)); + + /* exit on end of buffer? */ + if (len > PAGE_SIZE - 100) + break; + } + rcu_read_unlock(); + + len += snprintf(page + len, PAGE_SIZE - len, "\n"); + + *eof = 1; + return len; +} + +/* + * proc utility functions + */ + +static struct proc_dir_entry *can_create_proc_readentry(const char *name, + mode_t mode, + read_proc_t *read_proc, + void *data) +{ + if (can_dir) + return create_proc_read_entry(name, mode, can_dir, read_proc, + data); + else + return NULL; +} + +static void can_remove_proc_readentry(const char *name) +{ + if (can_dir) + remove_proc_entry(name, can_dir); +} + +/* + * can_init_proc - create main CAN proc directory and procfs entries + */ +void can_init_proc(void) +{ + /* create /proc/net/can directory */ + can_dir = proc_mkdir("can", init_net.proc_net); + + if (!can_dir) { + printk(KERN_INFO "can: failed to create /proc/net/can . " + "CONFIG_PROC_FS missing?\n"); + return; + } + + can_dir->owner = THIS_MODULE; + + /* own procfs entries from the AF_CAN core */ + pde_version = can_create_proc_readentry(CAN_PROC_VERSION, 0644, + can_proc_read_version, NULL); + pde_stats = can_create_proc_readentry(CAN_PROC_STATS, 0644, + can_proc_read_stats, NULL); + pde_reset_stats = can_create_proc_readentry(CAN_PROC_RESET_STATS, 0644, + can_proc_read_reset_stats, NULL); + pde_rcvlist_err = can_create_proc_readentry(CAN_PROC_RCVLIST_ERR, 0644, + can_proc_read_rcvlist, (void *)RX_ERR); + pde_rcvlist_all = can_create_proc_readentry(CAN_PROC_RCVLIST_ALL, 0644, + can_proc_read_rcvlist, (void *)RX_ALL); + pde_rcvlist_fil = can_create_proc_readentry(CAN_PROC_RCVLIST_FIL, 0644, + can_proc_read_rcvlist, (void *)RX_FIL); + pde_rcvlist_inv = can_create_proc_readentry(CAN_PROC_RCVLIST_INV, 0644, + can_proc_read_rcvlist, (void *)RX_INV); + pde_rcvlist_eff = can_create_proc_readentry(CAN_PROC_RCVLIST_EFF, 0644, + can_proc_read_rcvlist, (void *)RX_EFF); + pde_rcvlist_sff = can_create_proc_readentry(CAN_PROC_RCVLIST_SFF, 0644, + can_proc_read_rcvlist_sff, NULL); +} + +/* + * can_remove_proc - remove procfs entries and main CAN proc directory + */ +void can_remove_proc(void) +{ + if (pde_version) + can_remove_proc_readentry(CAN_PROC_VERSION); + + if (pde_stats) + can_remove_proc_readentry(CAN_PROC_STATS); + + if (pde_reset_stats) + can_remove_proc_readentry(CAN_PROC_RESET_STATS); + + if (pde_rcvlist_err) + can_remove_proc_readentry(CAN_PROC_RCVLIST_ERR); + + if (pde_rcvlist_all) + can_remove_proc_readentry(CAN_PROC_RCVLIST_ALL); + + if (pde_rcvlist_fil) + can_remove_proc_readentry(CAN_PROC_RCVLIST_FIL); + + if (pde_rcvlist_inv) + can_remove_proc_readentry(CAN_PROC_RCVLIST_INV); + + if (pde_rcvlist_eff) + can_remove_proc_readentry(CAN_PROC_RCVLIST_EFF); + + if (pde_rcvlist_sff) + can_remove_proc_readentry(CAN_PROC_RCVLIST_SFF); + + if (can_dir) + proc_net_remove(&init_net, "can"); +} diff --git a/net/can/raw.c b/net/can/raw.c new file mode 100644 index 00000000000..aeefd1419d0 --- /dev/null +++ b/net/can/raw.c @@ -0,0 +1,763 @@ +/* + * raw.c - Raw sockets for protocol family CAN + * + * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to <socketcan-users@lists.berlios.de> + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/uio.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/socket.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <linux/can.h> +#include <linux/can/core.h> +#include <linux/can/raw.h> +#include <net/sock.h> +#include <net/net_namespace.h> + +#define CAN_RAW_VERSION CAN_VERSION +static __initdata const char banner[] = + KERN_INFO "can: raw protocol (rev " CAN_RAW_VERSION ")\n"; + +MODULE_DESCRIPTION("PF_CAN raw protocol"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); + +#define MASK_ALL 0 + +/* + * A raw socket has a list of can_filters attached to it, each receiving + * the CAN frames matching that filter. If the filter list is empty, + * no CAN frames will be received by the socket. The default after + * opening the socket, is to have one filter which receives all frames. + * The filter list is allocated dynamically with the exception of the + * list containing only one item. This common case is optimized by + * storing the single filter in dfilter, to avoid using dynamic memory. + */ + +struct raw_sock { + struct sock sk; + int bound; + int ifindex; + struct notifier_block notifier; + int loopback; + int recv_own_msgs; + int count; /* number of active filters */ + struct can_filter dfilter; /* default/single filter */ + struct can_filter *filter; /* pointer to filter(s) */ + can_err_mask_t err_mask; +}; + +static inline struct raw_sock *raw_sk(const struct sock *sk) +{ + return (struct raw_sock *)sk; +} + +static void raw_rcv(struct sk_buff *skb, void *data) +{ + struct sock *sk = (struct sock *)data; + struct raw_sock *ro = raw_sk(sk); + struct sockaddr_can *addr; + int error; + + if (!ro->recv_own_msgs) { + /* check the received tx sock reference */ + if (skb->sk == sk) { + kfree_skb(skb); + return; + } + } + + /* + * Put the datagram to the queue so that raw_recvmsg() can + * get it from there. We need to pass the interface index to + * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb + * containing the interface index. + */ + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + addr = (struct sockaddr_can *)skb->cb; + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = skb->dev->ifindex; + + error = sock_queue_rcv_skb(sk, skb); + if (error < 0) + kfree_skb(skb); +} + +static int raw_enable_filters(struct net_device *dev, struct sock *sk, + struct can_filter *filter, + int count) +{ + int err = 0; + int i; + + for (i = 0; i < count; i++) { + err = can_rx_register(dev, filter[i].can_id, + filter[i].can_mask, + raw_rcv, sk, "raw"); + if (err) { + /* clean up successfully registered filters */ + while (--i >= 0) + can_rx_unregister(dev, filter[i].can_id, + filter[i].can_mask, + raw_rcv, sk); + break; + } + } + + return err; +} + +static int raw_enable_errfilter(struct net_device *dev, struct sock *sk, + can_err_mask_t err_mask) +{ + int err = 0; + + if (err_mask) + err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, + raw_rcv, sk, "raw"); + + return err; +} + +static void raw_disable_filters(struct net_device *dev, struct sock *sk, + struct can_filter *filter, + int count) +{ + int i; + + for (i = 0; i < count; i++) + can_rx_unregister(dev, filter[i].can_id, filter[i].can_mask, + raw_rcv, sk); +} + +static inline void raw_disable_errfilter(struct net_device *dev, + struct sock *sk, + can_err_mask_t err_mask) + +{ + if (err_mask) + can_rx_unregister(dev, 0, err_mask | CAN_ERR_FLAG, + raw_rcv, sk); +} + +static inline void raw_disable_allfilters(struct net_device *dev, + struct sock *sk) +{ + struct raw_sock *ro = raw_sk(sk); + + raw_disable_filters(dev, sk, ro->filter, ro->count); + raw_disable_errfilter(dev, sk, ro->err_mask); +} + +static int raw_enable_allfilters(struct net_device *dev, struct sock *sk) +{ + struct raw_sock *ro = raw_sk(sk); + int err; + + err = raw_enable_filters(dev, sk, ro->filter, ro->count); + if (!err) { + err = raw_enable_errfilter(dev, sk, ro->err_mask); + if (err) + raw_disable_filters(dev, sk, ro->filter, ro->count); + } + + return err; +} + +static int raw_notifier(struct notifier_block *nb, + unsigned long msg, void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); + struct sock *sk = &ro->sk; + + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + if (ro->ifindex != dev->ifindex) + return NOTIFY_DONE; + + switch (msg) { + + case NETDEV_UNREGISTER: + lock_sock(sk); + /* remove current filters & unregister */ + if (ro->bound) + raw_disable_allfilters(dev, sk); + + if (ro->count > 1) + kfree(ro->filter); + + ro->ifindex = 0; + ro->bound = 0; + ro->count = 0; + release_sock(sk); + + sk->sk_err = ENODEV; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + + case NETDEV_DOWN: + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + } + + return NOTIFY_DONE; +} + +static int raw_init(struct sock *sk) +{ + struct raw_sock *ro = raw_sk(sk); + + ro->bound = 0; + ro->ifindex = 0; + + /* set default filter to single entry dfilter */ + ro->dfilter.can_id = 0; + ro->dfilter.can_mask = MASK_ALL; + ro->filter = &ro->dfilter; + ro->count = 1; + + /* set default loopback behaviour */ + ro->loopback = 1; + ro->recv_own_msgs = 0; + + /* set notifier */ + ro->notifier.notifier_call = raw_notifier; + + register_netdevice_notifier(&ro->notifier); + + return 0; +} + +static int raw_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + + unregister_netdevice_notifier(&ro->notifier); + + lock_sock(sk); + + /* remove current filters & unregister */ + if (ro->bound) { + if (ro->ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ro->ifindex); + if (dev) { + raw_disable_allfilters(dev, sk); + dev_put(dev); + } + } else + raw_disable_allfilters(NULL, sk); + } + + if (ro->count > 1) + kfree(ro->filter); + + ro->ifindex = 0; + ro->bound = 0; + ro->count = 0; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + int ifindex; + int err = 0; + int notify_enetdown = 0; + + if (len < sizeof(*addr)) + return -EINVAL; + + lock_sock(sk); + + if (ro->bound && addr->can_ifindex == ro->ifindex) + goto out; + + if (addr->can_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, addr->can_ifindex); + if (!dev) { + err = -ENODEV; + goto out; + } + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + err = -ENODEV; + goto out; + } + if (!(dev->flags & IFF_UP)) + notify_enetdown = 1; + + ifindex = dev->ifindex; + + /* filters set by default/setsockopt */ + err = raw_enable_allfilters(dev, sk); + dev_put(dev); + + } else { + ifindex = 0; + + /* filters set by default/setsockopt */ + err = raw_enable_allfilters(NULL, sk); + } + + if (!err) { + if (ro->bound) { + /* unregister old filters */ + if (ro->ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, ro->ifindex); + if (dev) { + raw_disable_allfilters(dev, sk); + dev_put(dev); + } + } else + raw_disable_allfilters(NULL, sk); + } + ro->ifindex = ifindex; + ro->bound = 1; + } + + out: + release_sock(sk); + + if (notify_enetdown) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + + return err; +} + +static int raw_getname(struct socket *sock, struct sockaddr *uaddr, + int *len, int peer) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + + if (peer) + return -EOPNOTSUPP; + + addr->can_family = AF_CAN; + addr->can_ifindex = ro->ifindex; + + *len = sizeof(*addr); + + return 0; +} + +static int raw_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + struct can_filter *filter = NULL; /* dyn. alloc'ed filters */ + struct can_filter sfilter; /* single filter */ + struct net_device *dev = NULL; + can_err_mask_t err_mask = 0; + int count = 0; + int err = 0; + + if (level != SOL_CAN_RAW) + return -EINVAL; + if (optlen < 0) + return -EINVAL; + + switch (optname) { + + case CAN_RAW_FILTER: + if (optlen % sizeof(struct can_filter) != 0) + return -EINVAL; + + count = optlen / sizeof(struct can_filter); + + if (count > 1) { + /* filter does not fit into dfilter => alloc space */ + filter = kmalloc(optlen, GFP_KERNEL); + if (!filter) + return -ENOMEM; + + err = copy_from_user(filter, optval, optlen); + if (err) { + kfree(filter); + return err; + } + } else if (count == 1) { + err = copy_from_user(&sfilter, optval, optlen); + if (err) + return err; + } + + lock_sock(sk); + + if (ro->bound && ro->ifindex) + dev = dev_get_by_index(&init_net, ro->ifindex); + + if (ro->bound) { + /* (try to) register the new filters */ + if (count == 1) + err = raw_enable_filters(dev, sk, &sfilter, 1); + else + err = raw_enable_filters(dev, sk, filter, + count); + if (err) { + if (count > 1) + kfree(filter); + + goto out_fil; + } + + /* remove old filter registrations */ + raw_disable_filters(dev, sk, ro->filter, ro->count); + } + + /* remove old filter space */ + if (ro->count > 1) + kfree(ro->filter); + + /* link new filters to the socket */ + if (count == 1) { + /* copy filter data for single filter */ + ro->dfilter = sfilter; + filter = &ro->dfilter; + } + ro->filter = filter; + ro->count = count; + + out_fil: + if (dev) + dev_put(dev); + + release_sock(sk); + + break; + + case CAN_RAW_ERR_FILTER: + if (optlen != sizeof(err_mask)) + return -EINVAL; + + err = copy_from_user(&err_mask, optval, optlen); + if (err) + return err; + + err_mask &= CAN_ERR_MASK; + + lock_sock(sk); + + if (ro->bound && ro->ifindex) + dev = dev_get_by_index(&init_net, ro->ifindex); + + /* remove current error mask */ + if (ro->bound) { + /* (try to) register the new err_mask */ + err = raw_enable_errfilter(dev, sk, err_mask); + + if (err) + goto out_err; + + /* remove old err_mask registration */ + raw_disable_errfilter(dev, sk, ro->err_mask); + } + + /* link new err_mask to the socket */ + ro->err_mask = err_mask; + + out_err: + if (dev) + dev_put(dev); + + release_sock(sk); + + break; + + case CAN_RAW_LOOPBACK: + if (optlen != sizeof(ro->loopback)) + return -EINVAL; + + err = copy_from_user(&ro->loopback, optval, optlen); + + break; + + case CAN_RAW_RECV_OWN_MSGS: + if (optlen != sizeof(ro->recv_own_msgs)) + return -EINVAL; + + err = copy_from_user(&ro->recv_own_msgs, optval, optlen); + + break; + + default: + return -ENOPROTOOPT; + } + return err; +} + +static int raw_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + int len; + void *val; + int err = 0; + + if (level != SOL_CAN_RAW) + return -EINVAL; + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + + case CAN_RAW_FILTER: + lock_sock(sk); + if (ro->count > 0) { + int fsize = ro->count * sizeof(struct can_filter); + if (len > fsize) + len = fsize; + err = copy_to_user(optval, ro->filter, len); + } else + len = 0; + release_sock(sk); + + if (!err) + err = put_user(len, optlen); + return err; + + case CAN_RAW_ERR_FILTER: + if (len > sizeof(can_err_mask_t)) + len = sizeof(can_err_mask_t); + val = &ro->err_mask; + break; + + case CAN_RAW_LOOPBACK: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->loopback; + break; + + case CAN_RAW_RECV_OWN_MSGS: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->recv_own_msgs; + break; + + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, val, len)) + return -EFAULT; + return 0; +} + +static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct raw_sock *ro = raw_sk(sk); + struct sk_buff *skb; + struct net_device *dev; + int ifindex; + int err; + + if (msg->msg_name) { + struct sockaddr_can *addr = + (struct sockaddr_can *)msg->msg_name; + + if (addr->can_family != AF_CAN) + return -EINVAL; + + ifindex = addr->can_ifindex; + } else + ifindex = ro->ifindex; + + dev = dev_get_by_index(&init_net, ifindex); + if (!dev) + return -ENXIO; + + skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, + &err); + if (!skb) { + dev_put(dev); + return err; + } + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err < 0) { + kfree_skb(skb); + dev_put(dev); + return err; + } + skb->dev = dev; + skb->sk = sk; + + err = can_send(skb, ro->loopback); + + dev_put(dev); + + if (err) + return err; + + return size; +} + +static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int error = 0; + int noblock; + + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; + + skb = skb_recv_datagram(sk, flags, noblock, &error); + if (!skb) + return error; + + if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + else + size = skb->len; + + error = memcpy_toiovec(msg->msg_iov, skb->data, size); + if (error < 0) { + skb_free_datagram(sk, skb); + return error; + } + + sock_recv_timestamp(msg, sk, skb); + + if (msg->msg_name) { + msg->msg_namelen = sizeof(struct sockaddr_can); + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + } + + skb_free_datagram(sk, skb); + + return size; +} + +static struct proto_ops raw_ops __read_mostly = { + .family = PF_CAN, + .release = raw_release, + .bind = raw_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = raw_getname, + .poll = datagram_poll, + .ioctl = NULL, /* use can_ioctl() from af_can.c */ + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = raw_setsockopt, + .getsockopt = raw_getsockopt, + .sendmsg = raw_sendmsg, + .recvmsg = raw_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto raw_proto __read_mostly = { + .name = "CAN_RAW", + .owner = THIS_MODULE, + .obj_size = sizeof(struct raw_sock), + .init = raw_init, +}; + +static struct can_proto raw_can_proto __read_mostly = { + .type = SOCK_RAW, + .protocol = CAN_RAW, + .capability = -1, + .ops = &raw_ops, + .prot = &raw_proto, +}; + +static __init int raw_module_init(void) +{ + int err; + + printk(banner); + + err = can_proto_register(&raw_can_proto); + if (err < 0) + printk(KERN_ERR "can: registration of raw protocol failed\n"); + + return err; +} + +static __exit void raw_module_exit(void) +{ + can_proto_unregister(&raw_can_proto); +} + +module_init(raw_module_init); +module_exit(raw_module_exit); diff --git a/net/compat.c b/net/compat.c index 377e560ab5c..80013fb69a6 100644 --- a/net/compat.c +++ b/net/compat.c @@ -20,7 +20,6 @@ #include <linux/syscalls.h> #include <linux/filter.h> #include <linux/compat.h> -#include <linux/netfilter_ipv4/ip_tables.h> #include <linux/security.h> #include <net/scm.h> @@ -317,107 +316,6 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) } /* - * For now, we assume that the compatibility and native version - * of struct ipt_entry are the same - sfr. FIXME - */ -struct compat_ipt_replace { - char name[IPT_TABLE_MAXNAMELEN]; - u32 valid_hooks; - u32 num_entries; - u32 size; - u32 hook_entry[NF_IP_NUMHOOKS]; - u32 underflow[NF_IP_NUMHOOKS]; - u32 num_counters; - compat_uptr_t counters; /* struct ipt_counters * */ - struct ipt_entry entries[0]; -}; - -static int do_netfilter_replace(int fd, int level, int optname, - char __user *optval, int optlen) -{ - struct compat_ipt_replace __user *urepl; - struct ipt_replace __user *repl_nat; - char name[IPT_TABLE_MAXNAMELEN]; - u32 origsize, tmp32, num_counters; - unsigned int repl_nat_size; - int ret; - int i; - compat_uptr_t ucntrs; - - urepl = (struct compat_ipt_replace __user *)optval; - if (get_user(origsize, &urepl->size)) - return -EFAULT; - - /* Hack: Causes ipchains to give correct error msg --RR */ - if (optlen != sizeof(*urepl) + origsize) - return -ENOPROTOOPT; - - /* XXX Assumes that size of ipt_entry is the same both in - * native and compat environments. - */ - repl_nat_size = sizeof(*repl_nat) + origsize; - repl_nat = compat_alloc_user_space(repl_nat_size); - - ret = -EFAULT; - if (put_user(origsize, &repl_nat->size)) - goto out; - - if (!access_ok(VERIFY_READ, urepl, optlen) || - !access_ok(VERIFY_WRITE, repl_nat, optlen)) - goto out; - - if (__copy_from_user(name, urepl->name, sizeof(urepl->name)) || - __copy_to_user(repl_nat->name, name, sizeof(repl_nat->name))) - goto out; - - if (__get_user(tmp32, &urepl->valid_hooks) || - __put_user(tmp32, &repl_nat->valid_hooks)) - goto out; - - if (__get_user(tmp32, &urepl->num_entries) || - __put_user(tmp32, &repl_nat->num_entries)) - goto out; - - if (__get_user(num_counters, &urepl->num_counters) || - __put_user(num_counters, &repl_nat->num_counters)) - goto out; - - if (__get_user(ucntrs, &urepl->counters) || - __put_user(compat_ptr(ucntrs), &repl_nat->counters)) - goto out; - - if (__copy_in_user(&repl_nat->entries[0], - &urepl->entries[0], - origsize)) - goto out; - - for (i = 0; i < NF_IP_NUMHOOKS; i++) { - if (__get_user(tmp32, &urepl->hook_entry[i]) || - __put_user(tmp32, &repl_nat->hook_entry[i]) || - __get_user(tmp32, &urepl->underflow[i]) || - __put_user(tmp32, &repl_nat->underflow[i])) - goto out; - } - - /* - * Since struct ipt_counters just contains two u_int64_t members - * we can just do the access_ok check here and pass the (converted) - * pointer into the standard syscall. We hope that the pointer is - * not misaligned ... - */ - if (!access_ok(VERIFY_WRITE, compat_ptr(ucntrs), - num_counters * sizeof(struct ipt_counters))) - goto out; - - - ret = sys_setsockopt(fd, level, optname, - (char __user *)repl_nat, repl_nat_size); - -out: - return ret; -} - -/* * A struct sock_filter is architecture independent. */ struct compat_sock_fprog { @@ -485,10 +383,6 @@ asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, int err; struct socket *sock; - if (level == SOL_IPV6 && optname == IPT_SO_SET_REPLACE) - return do_netfilter_replace(fd, level, optname, - optval, optlen); - if (optlen < 0) return -EINVAL; diff --git a/net/core/datagram.c b/net/core/datagram.c index 029b93e246b..8a28fc93b72 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -115,10 +115,10 @@ out_noerr: } /** - * skb_recv_datagram - Receive a datagram skbuff + * __skb_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags - * @noblock: blocking operation? + * @peeked: returns non-zero if this packet has been seen before * @err: error code returned * * Get a datagram skbuff, understands the peeking, nonblocking wakeups @@ -143,8 +143,8 @@ out_noerr: * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, - int noblock, int *err) +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, + int *peeked, int *err) { struct sk_buff *skb; long timeo; @@ -156,7 +156,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, if (error) goto no_packet; - timeo = sock_rcvtimeo(sk, noblock); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { /* Again only user level code calls this function, so nothing @@ -165,18 +165,19 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, * Look at current nfs client by the way... * However, this function was corrent in any case. 8) */ - if (flags & MSG_PEEK) { - unsigned long cpu_flags; - - spin_lock_irqsave(&sk->sk_receive_queue.lock, - cpu_flags); - skb = skb_peek(&sk->sk_receive_queue); - if (skb) + unsigned long cpu_flags; + + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) { + *peeked = skb->peeked; + if (flags & MSG_PEEK) { + skb->peeked = 1; atomic_inc(&skb->users); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, - cpu_flags); - } else - skb = skb_dequeue(&sk->sk_receive_queue); + } else + __skb_unlink(skb, &sk->sk_receive_queue); + } + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); if (skb) return skb; @@ -194,10 +195,21 @@ no_packet: *err = error; return NULL; } +EXPORT_SYMBOL(__skb_recv_datagram); + +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, + int noblock, int *err) +{ + int peeked; + + return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, err); +} void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); + sk_mem_reclaim(sk); } /** @@ -217,20 +229,28 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) * This function currently only disables BH when acquiring the * sk_receive_queue lock. Therefore it must not be used in a * context where that lock is acquired in an IRQ context. + * + * It returns 0 if the packet was removed by us. */ -void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { + int err = 0; + if (flags & MSG_PEEK) { + err = -ENOENT; spin_lock_bh(&sk->sk_receive_queue.lock); if (skb == skb_peek(&sk->sk_receive_queue)) { __skb_unlink(skb, &sk->sk_receive_queue); atomic_dec(&skb->users); + err = 0; } spin_unlock_bh(&sk->sk_receive_queue.lock); } kfree_skb(skb); + sk_mem_reclaim(sk); + return err; } EXPORT_SYMBOL(skb_kill_datagram); diff --git a/net/core/dev.c b/net/core/dev.c index 0879f52115e..c9c593e1ba6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -150,8 +150,11 @@ * 86DD IPv6 */ +#define PTYPE_HASH_SIZE (16) +#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) + static DEFINE_SPINLOCK(ptype_lock); -static struct list_head ptype_base[16] __read_mostly; /* 16 way hashed list */ +static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static struct list_head ptype_all __read_mostly; /* Taps */ #ifdef CONFIG_NET_DMA @@ -362,7 +365,7 @@ void dev_add_pack(struct packet_type *pt) if (pt->type == htons(ETH_P_ALL)) list_add_rcu(&pt->list, &ptype_all); else { - hash = ntohs(pt->type) & 15; + hash = ntohs(pt->type) & PTYPE_HASH_MASK; list_add_rcu(&pt->list, &ptype_base[hash]); } spin_unlock_bh(&ptype_lock); @@ -391,7 +394,7 @@ void __dev_remove_pack(struct packet_type *pt) if (pt->type == htons(ETH_P_ALL)) head = &ptype_all; else - head = &ptype_base[ntohs(pt->type) & 15]; + head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; list_for_each_entry(pt1, head, list) { if (pt == pt1) { @@ -672,7 +675,7 @@ struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *h ASSERT_RTNL(); - for_each_netdev(&init_net, dev) + for_each_netdev(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; @@ -1420,7 +1423,8 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) } rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) { + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { if (ptype->type == type && !ptype->dev && ptype->gso_segment) { if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { err = ptype->gso_send_check(skb); @@ -2077,7 +2081,8 @@ ncls: goto out; type = skb->protocol; - list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) { + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev)) { if (pt_prev) @@ -2363,8 +2368,9 @@ static int dev_ifconf(struct net *net, char __user *arg) * in detail. */ void *dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { - struct net *net = seq->private; + struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; @@ -2382,13 +2388,14 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net *net = seq->private; + struct net *net = seq_file_net(seq); ++*pos; return v == SEQ_START_TOKEN ? first_net_device(net) : next_net_device((struct net_device *)v); } void dev_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { read_unlock(&dev_base_lock); } @@ -2481,26 +2488,8 @@ static const struct seq_operations dev_seq_ops = { static int dev_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int res; - res = seq_open(file, &dev_seq_ops); - if (!res) { - seq = file->private_data; - seq->private = get_proc_net(inode); - if (!seq->private) { - seq_release(inode, file); - res = -ENXIO; - } - } - return res; -} - -static int dev_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct net *net = seq->private; - put_net(net); - return seq_release(inode, file); + return seq_open_net(inode, file, &dev_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations dev_seq_fops = { @@ -2508,7 +2497,7 @@ static const struct file_operations dev_seq_fops = { .open = dev_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = dev_seq_release, + .release = seq_release_net, }; static const struct seq_operations softnet_seq_ops = { @@ -2543,7 +2532,7 @@ static void *ptype_get_idx(loff_t pos) ++i; } - for (t = 0; t < 16; t++) { + for (t = 0; t < PTYPE_HASH_SIZE; t++) { list_for_each_entry_rcu(pt, &ptype_base[t], list) { if (i == pos) return pt; @@ -2554,6 +2543,7 @@ static void *ptype_get_idx(loff_t pos) } static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { rcu_read_lock(); return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; @@ -2577,10 +2567,10 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) hash = 0; nxt = ptype_base[0].next; } else - hash = ntohs(pt->type) & 15; + hash = ntohs(pt->type) & PTYPE_HASH_MASK; while (nxt == &ptype_base[hash]) { - if (++hash >= 16) + if (++hash >= PTYPE_HASH_SIZE) return NULL; nxt = ptype_base[hash].next; } @@ -2589,6 +2579,7 @@ found: } static void ptype_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { rcu_read_unlock(); } @@ -3505,7 +3496,7 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static DEFINE_SPINLOCK(net_todo_list_lock); -static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); +static LIST_HEAD(net_todo_list); static void net_set_todo(struct net_device *dev) { @@ -3984,6 +3975,8 @@ void synchronize_net(void) void unregister_netdevice(struct net_device *dev) { + ASSERT_RTNL(); + rollback_registered(dev); /* Finish processing unregister after unlock */ net_set_todo(dev); @@ -4416,7 +4409,7 @@ static int __init net_dev_init(void) goto out; INIT_LIST_HEAD(&ptype_all); - for (i = 0; i < 16; i++) + for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); if (register_pernet_subsys(&netdev_net_ops)) diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 69fff16ece1..cadbfbf7e7f 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -186,8 +186,9 @@ EXPORT_SYMBOL(dev_mc_unsync); #ifdef CONFIG_PROC_FS static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { - struct net *net = seq->private; + struct net *net = seq_file_net(seq); struct net_device *dev; loff_t off = 0; @@ -206,6 +207,7 @@ static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void dev_mc_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { read_unlock(&dev_base_lock); } @@ -241,26 +243,8 @@ static const struct seq_operations dev_mc_seq_ops = { static int dev_mc_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int res; - res = seq_open(file, &dev_mc_seq_ops); - if (!res) { - seq = file->private_data; - seq->private = get_proc_net(inode); - if (!seq->private) { - seq_release(inode, file); - res = -ENXIO; - } - } - return res; -} - -static int dev_mc_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct net *net = seq->private; - put_net(net); - return seq_release(inode, file); + return seq_open_net(inode, file, &dev_mc_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations dev_mc_seq_fops = { @@ -268,7 +252,7 @@ static const struct file_operations dev_mc_seq_fops = { .open = dev_mc_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = dev_mc_seq_release, + .release = seq_release_net, }; #endif diff --git a/net/core/dst.c b/net/core/dst.c index 03daead3592..7deef483c79 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -153,18 +153,19 @@ loop: #endif } -static int dst_discard(struct sk_buff *skb) +int dst_discard(struct sk_buff *skb) { kfree_skb(skb); return 0; } +EXPORT_SYMBOL(dst_discard); void * dst_alloc(struct dst_ops * ops) { struct dst_entry * dst; if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { - if (ops->gc()) + if (ops->gc(ops)) return NULL; } dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC); @@ -278,13 +279,13 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst->output = dst_discard; } else { - dst->dev = init_net.loopback_dev; + dst->dev = dst->dev->nd_net->loopback_dev; dev_hold(dst->dev); dev_put(dev); if (dst->neighbour && dst->neighbour->dev == dev) { - dst->neighbour->dev = init_net.loopback_dev; + dst->neighbour->dev = dst->dev; + dev_hold(dst->dev); dev_put(dev); - dev_hold(dst->neighbour->dev); } } } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 848132b6cb7..42ccaf5b850 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -15,9 +15,6 @@ #include <net/sock.h> #include <net/fib_rules.h> -static LIST_HEAD(rules_ops); -static DEFINE_SPINLOCK(rules_mod_lock); - int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -32,6 +29,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->pref = pref; r->table = table; r->flags = flags; + r->fr_net = ops->fro_net; /* The lock is not required here, the list in unreacheable * at the moment this function is called */ @@ -44,12 +42,12 @@ static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -static struct fib_rules_ops *lookup_rules_ops(int family) +static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); - list_for_each_entry_rcu(ops, &rules_ops, list) { + list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; @@ -78,6 +76,9 @@ int fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; + struct net *net; + + net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; @@ -87,22 +88,23 @@ int fib_rules_register(struct fib_rules_ops *ops) ops->action == NULL) return -EINVAL; - spin_lock(&rules_mod_lock); - list_for_each_entry(o, &rules_ops, list) + spin_lock(&net->rules_mod_lock); + list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; - list_add_tail_rcu(&ops->list, &rules_ops); + hold_net(net); + list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: - spin_unlock(&rules_mod_lock); + spin_unlock(&net->rules_mod_lock); return err; } EXPORT_SYMBOL_GPL(fib_rules_register); -static void cleanup_ops(struct fib_rules_ops *ops) +void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; @@ -111,28 +113,19 @@ static void cleanup_ops(struct fib_rules_ops *ops) fib_rule_put(rule); } } +EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops); -int fib_rules_unregister(struct fib_rules_ops *ops) +void fib_rules_unregister(struct fib_rules_ops *ops) { - int err = 0; - struct fib_rules_ops *o; - - spin_lock(&rules_mod_lock); - list_for_each_entry(o, &rules_ops, list) { - if (o == ops) { - list_del_rcu(&o->list); - cleanup_ops(ops); - goto out; - } - } + struct net *net = ops->fro_net; - err = -ENOENT; -out: - spin_unlock(&rules_mod_lock); + spin_lock(&net->rules_mod_lock); + list_del_rcu(&ops->list); + fib_rules_cleanup_ops(ops); + spin_unlock(&net->rules_mod_lock); synchronize_rcu(); - - return err; + release_net(net); } EXPORT_SYMBOL_GPL(fib_rules_unregister); @@ -231,7 +224,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) goto errout; - ops = lookup_rules_ops(frh->family); + ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = EAFNOSUPPORT; goto errout; @@ -250,6 +243,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) err = -ENOMEM; goto errout; } + rule->fr_net = net; if (tb[FRA_PRIORITY]) rule->pref = nla_get_u32(tb[FRA_PRIORITY]); @@ -281,7 +275,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->table = frh_get_table(frh, tb); if (!rule->pref && ops->default_pref) - rule->pref = ops->default_pref(); + rule->pref = ops->default_pref(ops); err = -EINVAL; if (tb[FRA_GOTO]) { @@ -358,6 +352,7 @@ errout: static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *tmp; @@ -367,7 +362,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) goto errout; - ops = lookup_rules_ops(frh->family); + ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = EAFNOSUPPORT; goto errout; @@ -539,13 +534,14 @@ skip: static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; struct fib_rules_ops *ops; int idx = 0, family; family = rtnl_msg_family(cb->nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ - ops = lookup_rules_ops(family); + ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; @@ -553,7 +549,7 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_lock(); - list_for_each_entry_rcu(ops, &rules_ops, list) { + list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; @@ -574,9 +570,11 @@ static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { + struct net *net; struct sk_buff *skb; int err = -ENOBUFS; + net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; @@ -588,10 +586,11 @@ static void notify_rule_change(int event, struct fib_rule *rule, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, pid, ops->nlgroup, nlh, GFP_KERNEL); + + err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(ops->nlgroup, err); + rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) @@ -619,22 +618,20 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; + struct net *net = dev->nd_net; struct fib_rules_ops *ops; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - ASSERT_RTNL(); rcu_read_lock(); switch (event) { case NETDEV_REGISTER: - list_for_each_entry(ops, &rules_ops, list) + list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_UNREGISTER: - list_for_each_entry(ops, &rules_ops, list) + list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } @@ -648,13 +645,40 @@ static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; +static int fib_rules_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->rules_ops); + spin_lock_init(&net->rules_mod_lock); + return 0; +} + +static struct pernet_operations fib_rules_net_ops = { + .init = fib_rules_net_init, +}; + static int __init fib_rules_init(void) { + int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL); rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule); - return register_netdevice_notifier(&fib_rules_notifier); + err = register_netdevice_notifier(&fib_rules_notifier); + if (err < 0) + goto fail; + + err = register_pernet_subsys(&fib_rules_net_ops); + if (err < 0) + goto fail_unregister; + return 0; + +fail_unregister: + unregister_netdevice_notifier(&fib_rules_notifier); +fail: + rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); + rtnl_unregister(PF_UNSPEC, RTM_DELRULE); + rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + return err; } subsys_initcall(fib_rules_init); diff --git a/net/core/flow.c b/net/core/flow.c index 6489f4e24ec..46b38e06e0d 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -352,8 +352,7 @@ static int __init flow_cache_init(void) flow_lwm = 2 * flow_hash_size; flow_hwm = 4 * flow_hash_size; - init_timer(&flow_hash_rnd_timer); - flow_hash_rnd_timer.function = flow_cache_new_hashrnd; + setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; add_timer(&flow_hash_rnd_timer); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index daadbcc4e8d..57abe8266be 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -135,7 +135,7 @@ skip: } if (!list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); + mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); rcu_read_unlock(); } @@ -159,13 +159,13 @@ skip: int gen_new_estimator(struct gnet_stats_basic *bstats, struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, - struct rtattr *opt) + struct nlattr *opt) { struct gen_estimator *est; - struct gnet_estimator *parm = RTA_DATA(opt); + struct gnet_estimator *parm = nla_data(opt); int idx; - if (RTA_PAYLOAD(opt) < sizeof(*parm)) + if (nla_len(opt) < sizeof(*parm)) return -EINVAL; if (parm->interval < -2 || parm->interval > 3) @@ -191,7 +191,7 @@ int gen_new_estimator(struct gnet_stats_basic *bstats, } if (list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); + mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); list_add_rcu(&est->list, &elist[idx].list); return 0; @@ -241,7 +241,7 @@ void gen_kill_estimator(struct gnet_stats_basic *bstats, } /** - * gen_replace_estimator - replace rate estimator configruation + * gen_replace_estimator - replace rate estimator configuration * @bstats: basic statistics * @rate_est: rate estimator statistics * @stats_lock: statistics lock @@ -252,13 +252,12 @@ void gen_kill_estimator(struct gnet_stats_basic *bstats, * * Returns 0 on success or a negative error code. */ -int -gen_replace_estimator(struct gnet_stats_basic *bstats, - struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, - struct rtattr *opt) +int gen_replace_estimator(struct gnet_stats_basic *bstats, + struct gnet_stats_rate_est *rate_est, + spinlock_t *stats_lock, struct nlattr *opt) { - gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, rate_est, stats_lock, opt); + gen_kill_estimator(bstats, rate_est); + return gen_new_estimator(bstats, rate_est, stats_lock, opt); } diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index bcc25591d8a..c3d0ffeac24 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -20,16 +20,17 @@ #include <linux/socket.h> #include <linux/rtnetlink.h> #include <linux/gen_stats.h> +#include <net/netlink.h> #include <net/gen_stats.h> static inline int gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) { - RTA_PUT(d->skb, type, size, buf); + NLA_PUT(d->skb, type, size, buf); return 0; -rtattr_failure: +nla_put_failure: spin_unlock_bh(d->lock); return -1; } @@ -55,13 +56,14 @@ rtattr_failure: int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, int xstats_type, spinlock_t *lock, struct gnet_dump *d) + __acquires(lock) { memset(d, 0, sizeof(*d)); spin_lock_bh(lock); d->lock = lock; if (type) - d->tail = (struct rtattr *)skb_tail_pointer(skb); + d->tail = (struct nlattr *)skb_tail_pointer(skb); d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; @@ -212,7 +214,7 @@ int gnet_stats_finish_copy(struct gnet_dump *d) { if (d->tail) - d->tail->rta_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; + d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 29b8ee4e35d..a16cf1ec5e5 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -59,7 +59,6 @@ static void neigh_timer_handler(unsigned long arg); static void __neigh_notify(struct neighbour *n, int type, int flags); static void neigh_update_notify(struct neighbour *neigh); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); -void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); static struct neigh_table *neigh_tables; #ifdef CONFIG_PROC_FS @@ -165,6 +164,16 @@ static int neigh_forced_gc(struct neigh_table *tbl) return shrunk; } +static void neigh_add_timer(struct neighbour *n, unsigned long when) +{ + neigh_hold(n); + if (unlikely(mod_timer(&n->timer, when))) { + printk("NEIGH: BUG, double timer add, state is %x\n", + n->nud_state); + dump_stack(); + } +} + static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && @@ -270,9 +279,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->parms = neigh_parms_clone(&tbl->parms); - init_timer(&n->timer); - n->timer.function = neigh_timer_handler; - n->timer.data = (unsigned long)n; + setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; @@ -367,7 +374,8 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, return n; } -struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, + const void *pkey) { struct neighbour *n; int key_len = tbl->key_len; @@ -377,7 +385,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { - if (!memcmp(n->primary_key, pkey, key_len)) { + if (!memcmp(n->primary_key, pkey, key_len) && + (net == n->dev->nd_net)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); break; @@ -455,7 +464,8 @@ out_neigh_release: goto out; } -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, struct net_device *dev, int creat) { struct pneigh_entry *n; @@ -471,6 +481,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { if (!memcmp(n->key, pkey, key_len) && + (n->net == net) && (n->dev == dev || !n->dev)) { read_unlock_bh(&tbl->lock); goto out; @@ -487,6 +498,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, if (!n) goto out; + n->net = hold_net(net); memcpy(n->key, pkey, key_len); n->dev = dev; if (dev) @@ -509,7 +521,7 @@ out: } -int pneigh_delete(struct neigh_table *tbl, const void *pkey, +int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; @@ -524,13 +536,15 @@ int pneigh_delete(struct neigh_table *tbl, const void *pkey, write_lock_bh(&tbl->lock); for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; np = &n->next) { - if (!memcmp(n->key, pkey, key_len) && n->dev == dev) { + if (!memcmp(n->key, pkey, key_len) && n->dev == dev && + (n->net == net)) { *np = n->next; write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); + release_net(n->net); kfree(n); return 0; } @@ -553,6 +567,7 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); + release_net(n->net); kfree(n); continue; } @@ -562,6 +577,13 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) return -ENOENT; } +static void neigh_parms_destroy(struct neigh_parms *parms); + +static inline void neigh_parms_put(struct neigh_parms *parms) +{ + if (atomic_dec_and_test(&parms->refcnt)) + neigh_parms_destroy(parms); +} /* * neighbour must already be out of the table; @@ -718,15 +740,6 @@ static __inline__ int neigh_max_probes(struct neighbour *n) p->ucast_probes + p->app_probes + p->mcast_probes); } -static inline void neigh_add_timer(struct neighbour *n, unsigned long when) -{ - if (unlikely(mod_timer(&n->timer, when))) { - printk("NEIGH: BUG, double timer add, state is %x\n", - n->nud_state); - dump_stack(); - } -} - /* Called when a timer expires for a neighbour entry. */ static void neigh_timer_handler(unsigned long arg) @@ -858,7 +871,6 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, neigh->parms->ucast_probes); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = jiffies; - neigh_hold(neigh); neigh_add_timer(neigh, now + 1); } else { neigh->nud_state = NUD_FAILED; @@ -871,7 +883,6 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) } } else if (neigh->nud_state & NUD_STALE) { NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); - neigh_hold(neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, @@ -1015,13 +1026,11 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (new != old) { neigh_del_timer(neigh); - if (new & NUD_IN_TIMER) { - neigh_hold(neigh); + if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - } neigh->nud_state = new; } @@ -1266,27 +1275,49 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, spin_unlock(&tbl->proxy_queue.lock); } +static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl, + struct net *net, int ifindex) +{ + struct neigh_parms *p; + + for (p = &tbl->parms; p; p = p->next) { + if (p->net != net) + continue; + if ((p->dev && p->dev->ifindex == ifindex) || + (!p->dev && !ifindex)) + return p; + } + + return NULL; +} struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { - struct neigh_parms *p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); + struct neigh_parms *p, *ref; + struct net *net; + + net = dev->nd_net; + ref = lookup_neigh_params(tbl, net, 0); + if (!ref) + return NULL; + p = kmemdup(ref, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; atomic_set(&p->refcnt, 1); INIT_RCU_HEAD(&p->rcu_head); p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); - if (dev) { - if (dev->neigh_setup && dev->neigh_setup(dev, p)) { - kfree(p); - return NULL; - } - dev_hold(dev); - p->dev = dev; + if (dev->neigh_setup && dev->neigh_setup(dev, p)) { + kfree(p); + return NULL; } + + dev_hold(dev); + p->dev = dev; + p->net = hold_net(net); p->sysctl_table = NULL; write_lock_bh(&tbl->lock); p->next = tbl->parms.next; @@ -1326,8 +1357,9 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) NEIGH_PRINTK1("neigh_parms_release: not found\n"); } -void neigh_parms_destroy(struct neigh_parms *parms) +static void neigh_parms_destroy(struct neigh_parms *parms) { + release_net(parms->net); kfree(parms); } @@ -1338,6 +1370,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) unsigned long now = jiffies; unsigned long phsize; + tbl->parms.net = &init_net; atomic_set(&tbl->parms.refcnt, 1); INIT_RCU_HEAD(&tbl->parms.rcu_head); tbl->parms.reachable_time = @@ -1372,15 +1405,11 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); rwlock_init(&tbl->lock); - init_timer(&tbl->gc_timer); - tbl->gc_timer.data = (unsigned long)tbl; - tbl->gc_timer.function = neigh_periodic_timer; + setup_timer(&tbl->gc_timer, neigh_periodic_timer, (unsigned long)tbl); tbl->gc_timer.expires = now + 1; add_timer(&tbl->gc_timer); - init_timer(&tbl->proxy_timer); - tbl->proxy_timer.data = (unsigned long)tbl; - tbl->proxy_timer.function = neigh_proxy_process; + setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); @@ -1483,7 +1512,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) goto out_dev_put; if (ndm->ndm_flags & NTF_PROXY) { - err = pneigh_delete(tbl, nla_data(dst_attr), dev); + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out_dev_put; } @@ -1560,7 +1589,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct pneigh_entry *pn; err = -ENOBUFS; - pn = pneigh_lookup(tbl, dst, dev, 1); + pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm->ndm_flags; err = 0; @@ -1755,19 +1784,6 @@ errout: return -EMSGSIZE; } -static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl, - int ifindex) -{ - struct neigh_parms *p; - - for (p = &tbl->parms; p; p = p->next) - if ((p->dev && p->dev->ifindex == ifindex) || - (!p->dev && !ifindex)) - return p; - - return NULL; -} - static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, @@ -1795,6 +1811,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; @@ -1844,7 +1861,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); - p = lookup_neigh_params(tbl, ifindex); + p = lookup_neigh_params(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; @@ -1919,6 +1936,7 @@ errout: static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; @@ -1938,8 +1956,11 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) NLM_F_MULTI) <= 0) break; - for (nidx = 0, p = tbl->parms.next; p; p = p->next, nidx++) { - if (nidx < neigh_skip) + for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + if (net != p->net) + continue; + + if (nidx++ < neigh_skip) continue; if (neightbl_fill_param_info(skb, tbl, p, @@ -2015,6 +2036,7 @@ static void neigh_update_notify(struct neighbour *neigh) static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { + struct net * net = skb->sk->sk_net; struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; @@ -2025,8 +2047,12 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, continue; if (h > s_h) s_idx = 0; - for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) { - if (idx < s_idx) + for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { + int lidx; + if (n->dev->nd_net != net) + continue; + lidx = idx++; + if (lidx < s_idx) continue; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, @@ -2118,6 +2144,7 @@ EXPORT_SYMBOL(__neigh_for_each_release); static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; + struct net *net = state->p.net; struct neigh_table *tbl = state->tbl; struct neighbour *n = NULL; int bucket = state->bucket; @@ -2127,6 +2154,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) n = tbl->hash_buckets[bucket]; while (n) { + if (n->dev->nd_net != net) + goto next; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; @@ -2156,6 +2185,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; + struct net *net = state->p.net; struct neigh_table *tbl = state->tbl; if (state->neigh_sub_iter) { @@ -2167,6 +2197,8 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, while (1) { while (n) { + if (n->dev->nd_net != net) + goto next; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) @@ -2213,6 +2245,7 @@ static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; + struct net * net = state->p.net; struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket = state->bucket; @@ -2220,6 +2253,8 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = tbl->phash_buckets[bucket]; + while (pn && (pn->net != net)) + pn = pn->next; if (pn) break; } @@ -2233,6 +2268,7 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; + struct net * net = state->p.net; struct neigh_table *tbl = state->tbl; pn = pn->next; @@ -2240,6 +2276,8 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, if (++state->bucket > PNEIGH_HASHMASK) break; pn = tbl->phash_buckets[state->bucket]; + while (pn && (pn->net != net)) + pn = pn->next; if (pn) break; } @@ -2277,6 +2315,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) + __acquires(tbl->lock) { struct neigh_seq_state *state = seq->private; loff_t pos_minus_one; @@ -2320,6 +2359,7 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) + __releases(tbl->lock) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; @@ -2441,6 +2481,7 @@ static inline size_t neigh_nlmsg_size(void) static void __neigh_notify(struct neighbour *n, int type, int flags) { + struct net *net = n->dev->nd_net; struct sk_buff *skb; int err = -ENOBUFS; @@ -2455,10 +2496,10 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_NEIGH, err); + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } #ifdef CONFIG_ARPD @@ -2472,11 +2513,8 @@ void neigh_app_ns(struct neighbour *n) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table neigh_vars[__NET_NEIGH_MAX]; - ctl_table neigh_dev[2]; - ctl_table neigh_neigh_dir[2]; - ctl_table neigh_proto_dir[2]; - ctl_table neigh_root_dir[2]; + struct ctl_table neigh_vars[__NET_NEIGH_MAX]; + char *dev_name; } neigh_sysctl_template __read_mostly = { .neigh_vars = { { @@ -2607,32 +2645,7 @@ static struct neigh_sysctl_table { .mode = 0644, .proc_handler = &proc_dointvec, }, - {} - }, - .neigh_dev = { - { - .ctl_name = NET_PROTO_CONF_DEFAULT, - .procname = "default", - .mode = 0555, - }, - }, - .neigh_neigh_dir = { - { - .procname = "neigh", - .mode = 0555, - }, - }, - .neigh_proto_dir = { - { - .mode = 0555, - }, - }, - .neigh_root_dir = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - }, + {}, }, }; @@ -2640,14 +2653,26 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, int p_id, int pdev_id, char *p_name, proc_handler *handler, ctl_handler *strategy) { - struct neigh_sysctl_table *t = kmemdup(&neigh_sysctl_template, - sizeof(*t), GFP_KERNEL); + struct neigh_sysctl_table *t; const char *dev_name_source = NULL; - char *dev_name = NULL; - int err = 0; +#define NEIGH_CTL_PATH_ROOT 0 +#define NEIGH_CTL_PATH_PROTO 1 +#define NEIGH_CTL_PATH_NEIGH 2 +#define NEIGH_CTL_PATH_DEV 3 + + struct ctl_path neigh_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "proto", .ctl_name = 0, }, + { .procname = "neigh", .ctl_name = 0, }, + { .procname = "default", .ctl_name = NET_PROTO_CONF_DEFAULT, }, + { }, + }; + + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); if (!t) - return -ENOBUFS; + goto err; + t->neigh_vars[0].data = &p->mcast_probes; t->neigh_vars[1].data = &p->ucast_probes; t->neigh_vars[2].data = &p->app_probes; @@ -2665,11 +2690,11 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; - t->neigh_dev[0].ctl_name = dev->ifindex; + neigh_path[NEIGH_CTL_PATH_DEV].ctl_name = dev->ifindex; /* Terminate the table early */ memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); } else { - dev_name_source = t->neigh_dev[0].procname; + dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; t->neigh_vars[14].data = (int *)(p + 1); t->neigh_vars[15].data = (int *)(p + 1) + 1; t->neigh_vars[16].data = (int *)(p + 1) + 2; @@ -2704,39 +2729,28 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[13].ctl_name = CTL_UNNUMBERED; } - dev_name = kstrdup(dev_name_source, GFP_KERNEL); - if (!dev_name) { - err = -ENOBUFS; + t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); + if (!t->dev_name) goto free; - } - - t->neigh_dev[0].procname = dev_name; - - t->neigh_neigh_dir[0].ctl_name = pdev_id; - t->neigh_proto_dir[0].procname = p_name; - t->neigh_proto_dir[0].ctl_name = p_id; + neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name; + neigh_path[NEIGH_CTL_PATH_NEIGH].ctl_name = pdev_id; + neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name; + neigh_path[NEIGH_CTL_PATH_PROTO].ctl_name = p_id; - t->neigh_dev[0].child = t->neigh_vars; - t->neigh_neigh_dir[0].child = t->neigh_dev; - t->neigh_proto_dir[0].child = t->neigh_neigh_dir; - t->neigh_root_dir[0].child = t->neigh_proto_dir; - - t->sysctl_header = register_sysctl_table(t->neigh_root_dir); - if (!t->sysctl_header) { - err = -ENOBUFS; + t->sysctl_header = register_sysctl_paths(neigh_path, t->neigh_vars); + if (!t->sysctl_header) goto free_procname; - } + p->sysctl_table = t; return 0; - /* error path */ - free_procname: - kfree(dev_name); - free: +free_procname: + kfree(t->dev_name); +free: kfree(t); - - return err; +err: + return -ENOBUFS; } void neigh_sysctl_unregister(struct neigh_parms *p) @@ -2745,7 +2759,7 @@ void neigh_sysctl_unregister(struct neigh_parms *p) struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_sysctl_table(t->sysctl_header); - kfree(t->neigh_dev[0].procname); + kfree(t->dev_name); kfree(t); } } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 61ead1d1113..7635d3f7272 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -95,17 +95,6 @@ NETDEVICE_SHOW(type, fmt_dec); NETDEVICE_SHOW(link_mode, fmt_dec); /* use same locking rules as GIFHWADDR ioctl's */ -static ssize_t format_addr(char *buf, const unsigned char *addr, int len) -{ - int i; - char *cp = buf; - - for (i = 0; i < len; i++) - cp += sprintf(cp, "%02x%c", addr[i], - i == (len - 1) ? '\n' : ':'); - return cp - buf; -} - static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf) { @@ -114,7 +103,7 @@ static ssize_t show_address(struct device *dev, struct device_attribute *attr, read_lock(&dev_base_lock); if (dev_isalive(net)) - ret = format_addr(buf, net->dev_addr, net->addr_len); + ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); read_unlock(&dev_base_lock); return ret; } @@ -124,7 +113,7 @@ static ssize_t show_broadcast(struct device *dev, { struct net_device *net = to_net_dev(dev); if (dev_isalive(net)) - return format_addr(buf, net->broadcast, net->addr_len); + return sysfs_format_mac(buf, net->broadcast, net->addr_len); return -EINVAL; } @@ -247,9 +236,8 @@ static ssize_t netstat_show(const struct device *d, struct net_device_stats *stats; ssize_t ret = -EINVAL; - if (offset > sizeof(struct net_device_stats) || - offset % sizeof(unsigned long) != 0) - WARN_ON(1); + WARN_ON(offset > sizeof(struct net_device_stats) || + offset % sizeof(unsigned long) != 0); read_lock(&dev_base_lock); if (dev_isalive(dev) && dev->get_stats && diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ec936ae9245..26e941d912e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -58,6 +58,7 @@ out_undo: #ifdef CONFIG_NET_NS static struct kmem_cache *net_cachep; +static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) { @@ -149,7 +150,7 @@ void __put_net(struct net *net) { /* Cleanup the network namespace in process context */ INIT_WORK(&net->work, cleanup_net); - schedule_work(&net->work); + queue_work(netns_wq, &net->work); } EXPORT_SYMBOL_GPL(__put_net); @@ -171,7 +172,13 @@ static int __init net_ns_init(void) net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC, NULL); + + /* Create workqueue for cleanup */ + netns_wq = create_singlethread_workqueue("netns"); + if (!netns_wq) + panic("Could not create netns workq"); #endif + mutex_lock(&net_mutex); err = setup_net(&init_net); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c499b5c69be..6faa128a4c8 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -39,8 +39,6 @@ static struct sk_buff_head skb_pool; static atomic_t trapped; #define USEC_PER_POLL 50 -#define NETPOLL_RX_ENABLED 1 -#define NETPOLL_RX_DROP 2 #define MAX_SKB_SIZE \ (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ @@ -128,27 +126,24 @@ static int poll_one_napi(struct netpoll_info *npinfo, if (!test_bit(NAPI_STATE_SCHED, &napi->state)) return budget; - npinfo->rx_flags |= NETPOLL_RX_DROP; atomic_inc(&trapped); work = napi->poll(napi, budget); atomic_dec(&trapped); - npinfo->rx_flags &= ~NETPOLL_RX_DROP; return budget - work; } -static void poll_napi(struct netpoll *np) +static void poll_napi(struct net_device *dev) { - struct netpoll_info *npinfo = np->dev->npinfo; struct napi_struct *napi; int budget = 16; - list_for_each_entry(napi, &np->dev->napi_list, dev_list) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(npinfo, napi, budget); + budget = poll_one_napi(dev->npinfo, napi, budget); spin_unlock(&napi->poll_lock); if (!budget) @@ -159,30 +154,27 @@ static void poll_napi(struct netpoll *np) static void service_arp_queue(struct netpoll_info *npi) { - struct sk_buff *skb; - - if (unlikely(!npi)) - return; - - skb = skb_dequeue(&npi->arp_tx); + if (npi) { + struct sk_buff *skb; - while (skb != NULL) { - arp_reply(skb); - skb = skb_dequeue(&npi->arp_tx); + while ((skb = skb_dequeue(&npi->arp_tx))) + arp_reply(skb); } } void netpoll_poll(struct netpoll *np) { - if (!np->dev || !netif_running(np->dev) || !np->dev->poll_controller) + struct net_device *dev = np->dev; + + if (!dev || !netif_running(dev) || !dev->poll_controller) return; /* Process pending work on NIC */ - np->dev->poll_controller(np->dev); - if (!list_empty(&np->dev->napi_list)) - poll_napi(np); + dev->poll_controller(dev); + + poll_napi(dev); - service_arp_queue(np->dev->npinfo); + service_arp_queue(dev->npinfo); zap_completion_queue(); } @@ -364,8 +356,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IP); - memcpy(eth->h_source, np->local_mac, 6); - memcpy(eth->h_dest, np->remote_mac, 6); + memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); + memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); skb->dev = np->dev; @@ -418,7 +410,8 @@ static void arp_reply(struct sk_buff *skb) memcpy(&tip, arp_ptr, 4); /* Should we ignore arp? */ - if (tip != htonl(np->local_ip) || LOOPBACK(tip) || MULTICAST(tip)) + if (tip != htonl(np->local_ip) || + ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) return; size = sizeof(struct arphdr) + 2 * (skb->dev->addr_len + 4); @@ -435,7 +428,7 @@ static void arp_reply(struct sk_buff *skb) /* Fill the device header for the ARP frame */ if (dev_hard_header(send_skb, skb->dev, ptype, - sha, np->local_mac, + sha, np->dev->dev_addr, send_skb->len) < 0) { kfree_skb(send_skb); return; @@ -479,7 +472,7 @@ int __netpoll_rx(struct sk_buff *skb) if (skb->dev->type != ARPHRD_ETHER) goto out; - /* check if netpoll clients need ARP */ + /* if receive ARP during middle of NAPI poll, then queue */ if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) { skb_queue_tail(&npi->arp_tx, skb); @@ -541,6 +534,9 @@ int __netpoll_rx(struct sk_buff *skb) return 1; out: + /* If packet received while already in poll then just + * silently drop. + */ if (atomic_read(&trapped)) { kfree_skb(skb); return 1; @@ -679,7 +675,6 @@ int netpoll_setup(struct netpoll *np) goto release; } - npinfo->rx_flags = 0; npinfo->rx_np = NULL; spin_lock_init(&npinfo->rx_lock); @@ -741,9 +736,6 @@ int netpoll_setup(struct netpoll *np) } } - if (is_zero_ether_addr(np->local_mac) && ndev->dev_addr) - memcpy(np->local_mac, ndev->dev_addr, 6); - if (!np->local_ip) { rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); @@ -764,7 +756,6 @@ int netpoll_setup(struct netpoll *np) if (np->rx_hook) { spin_lock_irqsave(&npinfo->rx_lock, flags); - npinfo->rx_flags |= NETPOLL_RX_ENABLED; npinfo->rx_np = np; spin_unlock_irqrestore(&npinfo->rx_lock, flags); } @@ -806,7 +797,6 @@ void netpoll_cleanup(struct netpoll *np) if (npinfo->rx_np == np) { spin_lock_irqsave(&npinfo->rx_lock, flags); npinfo->rx_np = NULL; - npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; spin_unlock_irqrestore(&npinfo->rx_lock, flags); } @@ -816,11 +806,7 @@ void netpoll_cleanup(struct netpoll *np) cancel_rearming_delayed_work(&npinfo->tx_work); /* clean after last, unfinished work */ - if (!skb_queue_empty(&npinfo->txq)) { - struct sk_buff *skb; - skb = __skb_dequeue(&npinfo->txq); - kfree_skb(skb); - } + __skb_queue_purge(&npinfo->txq); kfree(npinfo); np->dev->npinfo = NULL; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 285ec3ed9b3..eebccdbdbac 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -397,62 +397,6 @@ struct pktgen_thread { #define REMOVE 1 #define FIND 0 -/* This code works around the fact that do_div cannot handle two 64-bit - numbers, and regular 64-bit division doesn't work on x86 kernels. - --Ben -*/ - -#define PG_DIV 0 - -/* This was emailed to LMKL by: Chris Caputo <ccaputo@alt.net> - * Function copied/adapted/optimized from: - * - * nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html - * - * Copyright 1994, University of Cambridge Computer Laboratory - * All Rights Reserved. - * - */ -static inline s64 divremdi3(s64 x, s64 y, int type) -{ - u64 a = (x < 0) ? -x : x; - u64 b = (y < 0) ? -y : y; - u64 res = 0, d = 1; - - if (b > 0) { - while (b < a) { - b <<= 1; - d <<= 1; - } - } - - do { - if (a >= b) { - a -= b; - res += d; - } - b >>= 1; - d >>= 1; - } - while (d); - - if (PG_DIV == type) { - return (((x ^ y) & (1ll << 63)) == 0) ? res : -(s64) res; - } else { - return ((x & (1ll << 63)) == 0) ? a : -(s64) a; - } -} - -/* End of hacks to deal with 64-bit math on x86 */ - -/** Convert to milliseconds */ -static inline __u64 tv_to_ms(const struct timeval *tv) -{ - __u64 ms = tv->tv_usec / 1000; - ms += (__u64) tv->tv_sec * (__u64) 1000; - return ms; -} - /** Convert to micro-seconds */ static inline __u64 tv_to_us(const struct timeval *tv) { @@ -461,51 +405,13 @@ static inline __u64 tv_to_us(const struct timeval *tv) return us; } -static inline __u64 pg_div(__u64 n, __u32 base) -{ - __u64 tmp = n; - do_div(tmp, base); - /* printk("pktgen: pg_div, n: %llu base: %d rv: %llu\n", - n, base, tmp); */ - return tmp; -} - -static inline __u64 pg_div64(__u64 n, __u64 base) -{ - __u64 tmp = n; -/* - * How do we know if the architecture we are running on - * supports division with 64 bit base? - * - */ -#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__) - - do_div(tmp, base); -#else - tmp = divremdi3(n, base, PG_DIV); -#endif - return tmp; -} - -static inline __u64 getCurMs(void) -{ - struct timeval tv; - do_gettimeofday(&tv); - return tv_to_ms(&tv); -} - -static inline __u64 getCurUs(void) +static __u64 getCurUs(void) { struct timeval tv; do_gettimeofday(&tv); return tv_to_us(&tv); } -static inline __u64 tv_diff(const struct timeval *a, const struct timeval *b) -{ - return tv_to_us(a) - tv_to_us(b); -} - /* old include end */ static char version[] __initdata = VERSION; @@ -2358,9 +2264,11 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) t = random32() % (imx - imn) + imn; s = htonl(t); - while (LOOPBACK(s) || MULTICAST(s) - || BADCLASS(s) || ZERONET(s) - || LOCAL_MCAST(s)) { + while (ipv4_is_loopback(s) || + ipv4_is_multicast(s) || + ipv4_is_lbcast(s) || + ipv4_is_zeronet(s) || + ipv4_is_local_multicast(s)) { t = random32() % (imx - imn) + imn; s = htonl(t); } diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 45aed75cb57..2d3035d3abd 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -69,8 +69,6 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, return 0; } -EXPORT_SYMBOL(reqsk_queue_alloc); - void __reqsk_queue_destroy(struct request_sock_queue *queue) { struct listen_sock *lopt; @@ -91,8 +89,6 @@ void __reqsk_queue_destroy(struct request_sock_queue *queue) kfree(lopt); } -EXPORT_SYMBOL(__reqsk_queue_destroy); - static inline struct listen_sock *reqsk_queue_yank_listen_sk( struct request_sock_queue *queue) { @@ -134,4 +130,3 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) kfree(lopt); } -EXPORT_SYMBOL(reqsk_queue_destroy); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fed95a323b2..ddbdde82a70 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -60,7 +60,6 @@ struct rtnl_link }; static DEFINE_MUTEX(rtnl_mutex); -static struct sock *rtnl; void rtnl_lock(void) { @@ -458,8 +457,9 @@ size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) return ret; } -int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) { + struct sock *rtnl = net->rtnl; int err = 0; NETLINK_CB(skb).dst_group = group; @@ -471,14 +471,17 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) return err; } -int rtnl_unicast(struct sk_buff *skb, u32 pid) +int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { + struct sock *rtnl = net->rtnl; + return nlmsg_unicast(rtnl, skb, pid); } -int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group, +int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) { + struct sock *rtnl = net->rtnl; int report = 0; if (nlh) @@ -487,8 +490,10 @@ int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group, return nlmsg_notify(rtnl, skb, pid, group, report, flags); } -void rtnl_set_sk_err(u32 group, int error) +void rtnl_set_sk_err(struct net *net, u32 group, int error) { + struct sock *rtnl = net->rtnl; + netlink_set_err(rtnl, 0, group, error); } @@ -1186,7 +1191,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) kfree_skb(nskb); goto errout; } - err = rtnl_unicast(nskb, NETLINK_CB(skb).pid); + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); errout: dev_put(dev); @@ -1219,6 +1224,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) { + struct net *net = dev->nd_net; struct sk_buff *skb; int err = -ENOBUFS; @@ -1233,10 +1239,10 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_LINK, err); + rtnl_set_sk_err(net, RTNLGRP_LINK, err); } /* Protected by RTNL sempahore. */ @@ -1247,6 +1253,7 @@ static int rtattr_max; static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { + struct net *net = skb->sk->sk_net; rtnl_doit_func doit; int sz_idx, kind; int min_len; @@ -1275,6 +1282,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + struct sock *rtnl; rtnl_dumpit_func dumpit; dumpit = rtnl_get_dumpit(family, type); @@ -1282,6 +1290,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; __rtnl_unlock(); + rtnl = net->rtnl; err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); rtnl_lock(); return err; @@ -1326,9 +1335,6 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi { struct net_device *dev = ptr; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - switch (event) { case NETDEV_UNREGISTER: rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); @@ -1354,6 +1360,29 @@ static struct notifier_block rtnetlink_dev_notifier = { .notifier_call = rtnetlink_event, }; + +static int rtnetlink_net_init(struct net *net) +{ + struct sock *sk; + sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, + rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + if (!sk) + return -ENOMEM; + net->rtnl = sk; + return 0; +} + +static void rtnetlink_net_exit(struct net *net) +{ + netlink_kernel_release(net->rtnl); + net->rtnl = NULL; +} + +static struct pernet_operations rtnetlink_net_ops = { + .init = rtnetlink_net_init, + .exit = rtnetlink_net_exit, +}; + void __init rtnetlink_init(void) { int i; @@ -1366,10 +1395,9 @@ void __init rtnetlink_init(void) if (!rta_buf) panic("rtnetlink_init: cannot allocate rta_buf\n"); - rtnl = netlink_kernel_create(&init_net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); - if (rtnl == NULL) + if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); + netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); register_netdevice_notifier(&rtnetlink_dev_notifier); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b6283779e93..98420f9c4b6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -52,6 +52,7 @@ #endif #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> #include <linux/init.h> @@ -71,6 +72,40 @@ static struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct sk_buff *skb = (struct sk_buff *) buf->private; + + kfree_skb(skb); +} + +static void sock_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct sk_buff *skb = (struct sk_buff *) buf->private; + + skb_get(skb); +} + +static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + + +/* Pipe buffer operations for a socket. */ +static struct pipe_buf_operations sock_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = sock_pipe_buf_release, + .steal = sock_pipe_buf_steal, + .get = sock_pipe_buf_get, +}; + /* * Keep out-of-line to prevent kernel bloat. * __builtin_return_address is not used because it is not always @@ -1122,6 +1157,217 @@ fault: return -EFAULT; } +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. + */ +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private; + + kfree_skb(skb); +} + +/* + * Fill page/offset/length into spd, if it can hold more pages. + */ +static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, + unsigned int len, unsigned int offset, + struct sk_buff *skb) +{ + if (unlikely(spd->nr_pages == PIPE_BUFFERS)) + return 1; + + spd->pages[spd->nr_pages] = page; + spd->partial[spd->nr_pages].len = len; + spd->partial[spd->nr_pages].offset = offset; + spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb); + spd->nr_pages++; + return 0; +} + +/* + * Map linear and fragment data from the skb to spd. Returns number of + * pages mapped. + */ +static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, + unsigned int *total_len, + struct splice_pipe_desc *spd) +{ + unsigned int nr_pages = spd->nr_pages; + unsigned int poff, plen, len, toff, tlen; + int headlen, seg; + + toff = *offset; + tlen = *total_len; + if (!tlen) + goto err; + + /* + * if the offset is greater than the linear part, go directly to + * the fragments. + */ + headlen = skb_headlen(skb); + if (toff >= headlen) { + toff -= headlen; + goto map_frag; + } + + /* + * first map the linear region into the pages/partial map, skipping + * any potential initial offset. + */ + len = 0; + while (len < headlen) { + void *p = skb->data + len; + + poff = (unsigned long) p & (PAGE_SIZE - 1); + plen = min_t(unsigned int, headlen - len, PAGE_SIZE - poff); + len += plen; + + if (toff) { + if (plen <= toff) { + toff -= plen; + continue; + } + plen -= toff; + poff += toff; + toff = 0; + } + + plen = min(plen, tlen); + if (!plen) + break; + + /* + * just jump directly to update and return, no point + * in going over fragments when the output is full. + */ + if (spd_fill_page(spd, virt_to_page(p), plen, poff, skb)) + goto done; + + tlen -= plen; + } + + /* + * then map the fragments + */ +map_frag: + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + + plen = f->size; + poff = f->page_offset; + + if (toff) { + if (plen <= toff) { + toff -= plen; + continue; + } + plen -= toff; + poff += toff; + toff = 0; + } + + plen = min(plen, tlen); + if (!plen) + break; + + if (spd_fill_page(spd, f->page, plen, poff, skb)) + break; + + tlen -= plen; + } + +done: + if (spd->nr_pages - nr_pages) { + *offset = 0; + *total_len = tlen; + return 0; + } +err: + return 1; +} + +/* + * Map data from the skb to a pipe. Should handle both the linear part, + * the fragments, and the frag list. It does NOT handle frag lists within + * the frag list, if such a thing exists. We'd probably need to recurse to + * handle that cleanly. + */ +int skb_splice_bits(struct sk_buff *__skb, unsigned int offset, + struct pipe_inode_info *pipe, unsigned int tlen, + unsigned int flags) +{ + struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &sock_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + struct sk_buff *skb; + + /* + * I'd love to avoid the clone here, but tcp_read_sock() + * ignores reference counts and unconditonally kills the sk_buff + * on return from the actor. + */ + skb = skb_clone(__skb, GFP_KERNEL); + if (unlikely(!skb)) + return -ENOMEM; + + /* + * __skb_splice_bits() only fails if the output has no room left, + * so no point in going over the frag_list for the error case. + */ + if (__skb_splice_bits(skb, &offset, &tlen, &spd)) + goto done; + else if (!tlen) + goto done; + + /* + * now see if we have a frag_list to map + */ + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list && tlen; list = list->next) { + if (__skb_splice_bits(list, &offset, &tlen, &spd)) + break; + } + } + +done: + /* + * drop our reference to the clone, the pipe consumption will + * drop the rest. + */ + kfree_skb(skb); + + if (spd.nr_pages) { + int ret; + + /* + * Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(__skb->sk); + ret = splice_to_pipe(pipe, &spd); + lock_sock(__skb->sk); + return ret; + } + + return 0; +} + /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer diff --git a/net/core/sock.c b/net/core/sock.c index c519b439b8b..1c4b1cd16d6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -154,7 +154,7 @@ static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , - "sk_lock-27" , "sk_lock-28" , "sk_lock-29" , + "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX" }; @@ -168,7 +168,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" , "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , - "slock-27" , "slock-28" , "slock-29" , + "slock-27" , "slock-28" , "slock-AF_CAN" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_MAX" }; @@ -282,6 +282,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (err) goto out; + if (!sk_rmem_schedule(sk, skb->truesize)) { + err = -ENOBUFS; + goto out; + } + skb->dev = NULL; skb_set_owner_r(skb, sk); @@ -419,6 +424,14 @@ out: return ret; } +static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +{ + if (valbool) + sock_set_flag(sk, bit); + else + sock_reset_flag(sk, bit); +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -463,11 +476,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname, case SO_DEBUG: if (val && !capable(CAP_NET_ADMIN)) { ret = -EACCES; - } - else if (valbool) - sock_set_flag(sk, SOCK_DBG); - else - sock_reset_flag(sk, SOCK_DBG); + } else + sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: sk->sk_reuse = valbool; @@ -477,10 +487,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ret = -ENOPROTOOPT; break; case SO_DONTROUTE: - if (valbool) - sock_set_flag(sk, SOCK_LOCALROUTE); - else - sock_reset_flag(sk, SOCK_LOCALROUTE); + sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); @@ -1105,7 +1112,9 @@ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; + skb_truesize_check(skb); atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(skb->sk, skb->truesize); } @@ -1382,6 +1391,103 @@ int sk_wait_data(struct sock *sk, long *timeo) EXPORT_SYMBOL(sk_wait_data); +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. + */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + struct proto *prot = sk->sk_prot; + int amt = sk_mem_pages(size); + int allocated; + + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + allocated = atomic_add_return(amt, prot->memory_allocated); + + /* Under limit. */ + if (allocated <= prot->sysctl_mem[0]) { + if (prot->memory_pressure && *prot->memory_pressure) + *prot->memory_pressure = 0; + return 1; + } + + /* Under pressure. */ + if (allocated > prot->sysctl_mem[1]) + if (prot->enter_memory_pressure) + prot->enter_memory_pressure(); + + /* Over hard limit. */ + if (allocated > prot->sysctl_mem[2]) + goto suppress_allocation; + + /* guarantee minimum buffer size under pressure */ + if (kind == SK_MEM_RECV) { + if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + return 1; + } else { /* SK_MEM_SEND */ + if (sk->sk_type == SOCK_STREAM) { + if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + return 1; + } else if (atomic_read(&sk->sk_wmem_alloc) < + prot->sysctl_wmem[0]) + return 1; + } + + if (prot->memory_pressure) { + if (!*prot->memory_pressure || + prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) * + sk_mem_pages(sk->sk_wmem_queued + + atomic_read(&sk->sk_rmem_alloc) + + sk->sk_forward_alloc)) + return 1; + } + +suppress_allocation: + + if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { + sk_stream_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + return 1; + } + + /* Alas. Undo changes. */ + sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; + atomic_sub(amt, prot->memory_allocated); + return 0; +} + +EXPORT_SYMBOL(__sk_mem_schedule); + +/** + * __sk_reclaim - reclaim memory_allocated + * @sk: socket + */ +void __sk_mem_reclaim(struct sock *sk) +{ + struct proto *prot = sk->sk_prot; + + atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, + prot->memory_allocated); + sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + + if (prot->memory_pressure && *prot->memory_pressure && + (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) + *prot->memory_pressure = 0; +} + +EXPORT_SYMBOL(__sk_mem_reclaim); + + /* * Set of default routines for initialising struct proto_ops when * the protocol does not support a particular function. In certain @@ -1496,7 +1602,7 @@ static void sock_def_error_report(struct sock *sk) read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); - sk_wake_async(sk,0,POLL_ERR); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); read_unlock(&sk->sk_callback_lock); } @@ -1505,7 +1611,7 @@ static void sock_def_readable(struct sock *sk, int len) read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); - sk_wake_async(sk,1,POLL_IN); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); read_unlock(&sk->sk_callback_lock); } @@ -1522,7 +1628,7 @@ static void sock_def_write_space(struct sock *sk) /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, 2, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } read_unlock(&sk->sk_callback_lock); @@ -1537,7 +1643,7 @@ void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) if (send_sigurg(&sk->sk_socket->file->f_owner)) - sk_wake_async(sk, 3, POLL_PRI); + sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } void sk_reset_timer(struct sock *sk, struct timer_list* timer, @@ -1611,6 +1717,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, -1L); atomic_set(&sk->sk_refcnt, 1); + atomic_set(&sk->sk_drops, 0); } void fastcall lock_sock_nested(struct sock *sk, int subclass) @@ -1801,65 +1908,15 @@ EXPORT_SYMBOL(sk_common_release); static DEFINE_RWLOCK(proto_list_lock); static LIST_HEAD(proto_list); -#ifdef CONFIG_SMP -/* - * Define default functions to keep track of inuse sockets per protocol - * Note that often used protocols use dedicated functions to get a speed increase. - * (see DEFINE_PROTO_INUSE/REF_PROTO_INUSE) - */ -static void inuse_add(struct proto *prot, int inc) -{ - per_cpu_ptr(prot->inuse_ptr, smp_processor_id())[0] += inc; -} - -static int inuse_get(const struct proto *prot) -{ - int res = 0, cpu; - for_each_possible_cpu(cpu) - res += per_cpu_ptr(prot->inuse_ptr, cpu)[0]; - return res; -} - -static int inuse_init(struct proto *prot) -{ - if (!prot->inuse_getval || !prot->inuse_add) { - prot->inuse_ptr = alloc_percpu(int); - if (prot->inuse_ptr == NULL) - return -ENOBUFS; - - prot->inuse_getval = inuse_get; - prot->inuse_add = inuse_add; - } - return 0; -} - -static void inuse_fini(struct proto *prot) -{ - if (prot->inuse_ptr != NULL) { - free_percpu(prot->inuse_ptr); - prot->inuse_ptr = NULL; - prot->inuse_getval = NULL; - prot->inuse_add = NULL; - } -} -#else -static inline int inuse_init(struct proto *prot) -{ - return 0; -} - -static inline void inuse_fini(struct proto *prot) -{ -} -#endif - int proto_register(struct proto *prot, int alloc_slab) { char *request_sock_slab_name = NULL; char *timewait_sock_slab_name; - if (inuse_init(prot)) + if (sock_prot_inuse_init(prot) != 0) { + printk(KERN_CRIT "%s: Can't alloc inuse counters!\n", prot->name); goto out; + } if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, @@ -1927,7 +1984,7 @@ out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; out_free_inuse: - inuse_fini(prot); + sock_prot_inuse_free(prot); out: return -ENOBUFS; } @@ -1940,7 +1997,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); write_unlock(&proto_list_lock); - inuse_fini(prot); + sock_prot_inuse_free(prot); + if (prot->slab != NULL) { kmem_cache_destroy(prot->slab); prot->slab = NULL; @@ -1967,6 +2025,7 @@ EXPORT_SYMBOL(proto_unregister); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(proto_list_lock) { read_lock(&proto_list_lock); return seq_list_start_head(&proto_list, *pos); @@ -1978,6 +2037,7 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void proto_seq_stop(struct seq_file *seq, void *v) + __releases(proto_list_lock) { read_unlock(&proto_list_lock); } diff --git a/net/core/stream.c b/net/core/stream.c index 755bacbcb32..4a0ad152c9c 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -35,7 +35,7 @@ void sk_stream_write_space(struct sock *sk) if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, 2, POLL_OUT); + sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); } } @@ -172,17 +172,6 @@ do_interrupted: EXPORT_SYMBOL(sk_stream_wait_memory); -void sk_stream_rfree(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - skb_truesize_check(skb); - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); - sk->sk_forward_alloc += skb->truesize; -} - -EXPORT_SYMBOL(sk_stream_rfree); - int sk_stream_error(struct sock *sk, int flags, int err) { if (err == -EPIPE) @@ -194,76 +183,6 @@ int sk_stream_error(struct sock *sk, int flags, int err) EXPORT_SYMBOL(sk_stream_error); -void __sk_stream_mem_reclaim(struct sock *sk) -{ - atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM, - sk->sk_prot->memory_allocated); - sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1; - if (*sk->sk_prot->memory_pressure && - (atomic_read(sk->sk_prot->memory_allocated) < - sk->sk_prot->sysctl_mem[0])) - *sk->sk_prot->memory_pressure = 0; -} - -EXPORT_SYMBOL(__sk_stream_mem_reclaim); - -int sk_stream_mem_schedule(struct sock *sk, int size, int kind) -{ - int amt = sk_stream_pages(size); - - sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM; - atomic_add(amt, sk->sk_prot->memory_allocated); - - /* Under limit. */ - if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) { - if (*sk->sk_prot->memory_pressure) - *sk->sk_prot->memory_pressure = 0; - return 1; - } - - /* Over hard limit. */ - if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) { - sk->sk_prot->enter_memory_pressure(); - goto suppress_allocation; - } - - /* Under pressure. */ - if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1]) - sk->sk_prot->enter_memory_pressure(); - - if (kind) { - if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_prot->sysctl_rmem[0]) - return 1; - } else if (sk->sk_wmem_queued < sk->sk_prot->sysctl_wmem[0]) - return 1; - - if (!*sk->sk_prot->memory_pressure || - sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) * - sk_stream_pages(sk->sk_wmem_queued + - atomic_read(&sk->sk_rmem_alloc) + - sk->sk_forward_alloc)) - return 1; - -suppress_allocation: - - if (!kind) { - sk_stream_moderate_sndbuf(sk); - - /* Fail only if socket is _under_ its sndbuf. - * In this case we cannot block, so that we have to fail. - */ - if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) - return 1; - } - - /* Alas. Undo changes. */ - sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM; - atomic_sub(amt, sk->sk_prot->memory_allocated); - return 0; -} - -EXPORT_SYMBOL(sk_stream_mem_schedule); - void sk_stream_kill_queues(struct sock *sk) { /* First the read buffer. */ @@ -276,7 +195,7 @@ void sk_stream_kill_queues(struct sock *sk) BUG_TRAP(skb_queue_empty(&sk->sk_write_queue)); /* Account for returned memory. */ - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); BUG_TRAP(!sk->sk_wmem_queued); BUG_TRAP(!sk->sk_forward_alloc); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 113cc728dc3..130338f83ae 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -10,12 +10,11 @@ #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> +#include <linux/init.h> #include <net/sock.h> #include <net/xfrm.h> -#ifdef CONFIG_SYSCTL - -ctl_table core_table[] = { +static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { .ctl_name = NET_CORE_WMEM_MAX, @@ -128,7 +127,7 @@ ctl_table core_table[] = { { .ctl_name = NET_CORE_SOMAXCONN, .procname = "somaxconn", - .data = &sysctl_somaxconn, + .data = &init_net.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec @@ -152,4 +151,65 @@ ctl_table core_table[] = { { .ctl_name = 0 } }; -#endif +static __net_initdata struct ctl_path net_core_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "core", .ctl_name = NET_CORE, }, + { }, +}; + +static __net_init int sysctl_core_net_init(struct net *net) +{ + struct ctl_table *tbl, *tmp; + + net->sysctl_somaxconn = SOMAXCONN; + + tbl = net_core_table; + if (net != &init_net) { + tbl = kmemdup(tbl, sizeof(net_core_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + + for (tmp = tbl; tmp->procname; tmp++) { + if (tmp->data >= (void *)&init_net && + tmp->data < (void *)(&init_net + 1)) + tmp->data += (char *)net - (char *)&init_net; + else + tmp->mode &= ~0222; + } + } + + net->sysctl_core_hdr = register_net_sysctl_table(net, + net_core_path, tbl); + if (net->sysctl_core_hdr == NULL) + goto err_reg; + + return 0; + +err_reg: + if (tbl != net_core_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_core_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->sysctl_core_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->sysctl_core_hdr); + BUG_ON(tbl == net_core_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_core_ops = { + .init = sysctl_core_net_init, + .exit = sysctl_core_net_exit, +}; + +static __init int sysctl_core_init(void) +{ + return register_pernet_subsys(&sysctl_core_ops); +} + +__initcall(sysctl_core_init); diff --git a/net/core/utils.c b/net/core/utils.c index 0bf17da40d5..8031eb59054 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -91,17 +91,6 @@ EXPORT_SYMBOL(in_aton); #define IN6PTON_NULL 0x20000000 /* first/tail */ #define IN6PTON_UNKNOWN 0x40000000 -static inline int digit2bin(char c, int delim) -{ - if (c == delim || c == '\0') - return IN6PTON_DELIM; - if (c == '.') - return IN6PTON_DOT; - if (c >= '0' && c <= '9') - return (IN6PTON_DIGIT | (c - '0')); - return IN6PTON_UNKNOWN; -} - static inline int xdigit2bin(char c, int delim) { if (c == delim || c == '\0') @@ -293,3 +282,19 @@ out: } EXPORT_SYMBOL(in6_pton); + +void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, + __be32 from, __be32 to, int pseudohdr) +{ + __be32 diff[] = { ~from, to }; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_partial(diff, sizeof(diff), + ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_partial(diff, sizeof(diff), + ~skb->csum); + } else if (pseudohdr) + *sum = ~csum_fold(csum_partial(diff, sizeof(diff), + csum_unfold(*sum))); +} +EXPORT_SYMBOL(inet_proto_csum_replace4); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 0549e4719b1..7aa2a7acc7e 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -1,6 +1,7 @@ menuconfig IP_DCCP tristate "The DCCP Protocol (EXPERIMENTAL)" depends on INET && EXPERIMENTAL + select IP_DCCP_CCID2 ---help--- Datagram Congestion Control Protocol (RFC 4340) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 83378f379f7..6de4bd195d2 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -30,7 +30,7 @@ static struct dccp_ackvec_record *dccp_ackvec_record_new(void) kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); if (avr != NULL) - INIT_LIST_HEAD(&avr->dccpavr_node); + INIT_LIST_HEAD(&avr->avr_node); return avr; } @@ -40,7 +40,7 @@ static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) if (unlikely(avr == NULL)) return; /* Check if deleting a linked record */ - WARN_ON(!list_empty(&avr->dccpavr_node)); + WARN_ON(!list_empty(&avr->avr_node)); kmem_cache_free(dccp_ackvec_record_slab, avr); } @@ -52,16 +52,15 @@ static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, * just add the AVR at the head of the list. * -sorbo. */ - if (!list_empty(&av->dccpav_records)) { + if (!list_empty(&av->av_records)) { const struct dccp_ackvec_record *head = - list_entry(av->dccpav_records.next, + list_entry(av->av_records.next, struct dccp_ackvec_record, - dccpavr_node); - BUG_ON(before48(avr->dccpavr_ack_seqno, - head->dccpavr_ack_seqno)); + avr_node); + BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); } - list_add(&avr->dccpavr_node, &av->dccpav_records); + list_add(&avr->avr_node, &av->av_records); } int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) @@ -69,9 +68,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av->dccpav_vec_len, - DCCP_MAX_ACKVEC_OPT_LEN); - u16 len = av->dccpav_vec_len + 2 * nr_opts, i; + const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + u16 len = av->av_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; unsigned char *to; @@ -81,7 +79,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) return -1; - delta = ktime_us_delta(ktime_get_real(), av->dccpav_time); + delta = ktime_us_delta(ktime_get_real(), av->av_time); elapsed_time = delta / 10; if (elapsed_time != 0 && @@ -95,9 +93,9 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_opt_len += len; to = skb_push(skb, len); - len = av->dccpav_vec_len; - from = av->dccpav_buf + av->dccpav_buf_head; - tail = av->dccpav_buf + DCCP_MAX_ACKVEC_LEN; + len = av->av_vec_len; + from = av->av_buf + av->av_buf_head; + tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; for (i = 0; i < nr_opts; ++i) { int copylen = len; @@ -116,7 +114,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) to += tailsize; len -= tailsize; copylen -= tailsize; - from = av->dccpav_buf; + from = av->av_buf; } memcpy(to, from, copylen); @@ -134,19 +132,19 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will * equal buf_nonce. */ - avr->dccpavr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - avr->dccpavr_ack_ptr = av->dccpav_buf_head; - avr->dccpavr_ack_ackno = av->dccpav_buf_ackno; - avr->dccpavr_ack_nonce = av->dccpav_buf_nonce; - avr->dccpavr_sent_len = av->dccpav_vec_len; + avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = av->av_buf_nonce; + avr->avr_sent_len = av->av_vec_len; dccp_ackvec_insert_avr(av, avr); dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " "ack_ackno=%llu\n", - dccp_role(sk), avr->dccpavr_sent_len, - (unsigned long long)avr->dccpavr_ack_seqno, - (unsigned long long)avr->dccpavr_ack_ackno); + dccp_role(sk), avr->avr_sent_len, + (unsigned long long)avr->avr_ack_seqno, + (unsigned long long)avr->avr_ack_ackno); return 0; } @@ -155,12 +153,12 @@ struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); if (av != NULL) { - av->dccpav_buf_head = DCCP_MAX_ACKVEC_LEN - 1; - av->dccpav_buf_ackno = UINT48_MAX + 1; - av->dccpav_buf_nonce = av->dccpav_buf_nonce = 0; - av->dccpav_time = ktime_set(0, 0); - av->dccpav_vec_len = 0; - INIT_LIST_HEAD(&av->dccpav_records); + av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; + av->av_buf_ackno = UINT48_MAX + 1; + av->av_buf_nonce = 0; + av->av_time = ktime_set(0, 0); + av->av_vec_len = 0; + INIT_LIST_HEAD(&av->av_records); } return av; @@ -171,12 +169,11 @@ void dccp_ackvec_free(struct dccp_ackvec *av) if (unlikely(av == NULL)) return; - if (!list_empty(&av->dccpav_records)) { + if (!list_empty(&av->av_records)) { struct dccp_ackvec_record *avr, *next; - list_for_each_entry_safe(avr, next, &av->dccpav_records, - dccpavr_node) { - list_del_init(&avr->dccpavr_node); + list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); dccp_ackvec_record_delete(avr); } } @@ -187,13 +184,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av) static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, const u32 index) { - return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; + return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; } static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, const u32 index) { - return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; + return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; } /* @@ -208,29 +205,29 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, unsigned int gap; long new_head; - if (av->dccpav_vec_len + packets > DCCP_MAX_ACKVEC_LEN) + if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) return -ENOBUFS; gap = packets - 1; - new_head = av->dccpav_buf_head - packets; + new_head = av->av_buf_head - packets; if (new_head < 0) { if (gap > 0) { - memset(av->dccpav_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, + memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, gap + new_head + 1); gap = -new_head; } new_head += DCCP_MAX_ACKVEC_LEN; } - av->dccpav_buf_head = new_head; + av->av_buf_head = new_head; if (gap > 0) - memset(av->dccpav_buf + av->dccpav_buf_head + 1, + memset(av->av_buf + av->av_buf_head + 1, DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); - av->dccpav_buf[av->dccpav_buf_head] = state; - av->dccpav_vec_len += packets; + av->av_buf[av->av_buf_head] = state; + av->av_vec_len += packets; return 0; } @@ -243,7 +240,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, /* * Check at the right places if the buffer is full, if it is, tell the * caller to start dropping packets till the HC-Sender acks our ACK - * vectors, when we will free up space in dccpav_buf. + * vectors, when we will free up space in av_buf. * * We may well decide to do buffer compression, etc, but for now lets * just drop. @@ -263,22 +260,20 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, */ /* See if this is the first ackno being inserted */ - if (av->dccpav_vec_len == 0) { - av->dccpav_buf[av->dccpav_buf_head] = state; - av->dccpav_vec_len = 1; - } else if (after48(ackno, av->dccpav_buf_ackno)) { - const u64 delta = dccp_delta_seqno(av->dccpav_buf_ackno, - ackno); + if (av->av_vec_len == 0) { + av->av_buf[av->av_buf_head] = state; + av->av_vec_len = 1; + } else if (after48(ackno, av->av_buf_ackno)) { + const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); /* * Look if the state of this packet is the same as the * previous ackno and if so if we can bump the head len. */ if (delta == 1 && - dccp_ackvec_state(av, av->dccpav_buf_head) == state && - (dccp_ackvec_len(av, av->dccpav_buf_head) < - DCCP_ACKVEC_LEN_MASK)) - av->dccpav_buf[av->dccpav_buf_head]++; + dccp_ackvec_state(av, av->av_buf_head) == state && + dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) + av->av_buf[av->av_buf_head]++; else if (dccp_ackvec_set_buf_head_state(av, delta, state)) return -ENOBUFS; } else { @@ -290,14 +285,14 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * the byte corresponding to S. (Indexing structures * could reduce the complexity of this scan.) */ - u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); - u32 index = av->dccpav_buf_head; + u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); + u32 index = av->av_buf_head; while (1) { const u8 len = dccp_ackvec_len(av, index); const u8 state = dccp_ackvec_state(av, index); /* - * valid packets not yet in dccpav_buf have a reserved + * valid packets not yet in av_buf have a reserved * entry, with a len equal to 0. */ if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED && @@ -305,7 +300,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, reserved seat! */ dccp_pr_debug("Found %llu reserved seat!\n", (unsigned long long)ackno); - av->dccpav_buf[index] = state; + av->av_buf[index] = state; goto out; } /* len == 0 means one packet */ @@ -318,8 +313,8 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, } } - av->dccpav_buf_ackno = ackno; - av->dccpav_time = ktime_get_real(); + av->av_buf_ackno = ackno; + av->av_time = ktime_get_real(); out: return 0; @@ -349,9 +344,9 @@ void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len) void dccp_ackvec_print(const struct dccp_ackvec *av) { - dccp_ackvector_print(av->dccpav_buf_ackno, - av->dccpav_buf + av->dccpav_buf_head, - av->dccpav_vec_len); + dccp_ackvector_print(av->av_buf_ackno, + av->av_buf + av->av_buf_head, + av->av_vec_len); } #endif @@ -361,17 +356,15 @@ static void dccp_ackvec_throw_record(struct dccp_ackvec *av, struct dccp_ackvec_record *next; /* sort out vector length */ - if (av->dccpav_buf_head <= avr->dccpavr_ack_ptr) - av->dccpav_vec_len = avr->dccpavr_ack_ptr - av->dccpav_buf_head; + if (av->av_buf_head <= avr->avr_ack_ptr) + av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; else - av->dccpav_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - - av->dccpav_buf_head - + avr->dccpavr_ack_ptr; + av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - + av->av_buf_head + avr->avr_ack_ptr; /* free records */ - list_for_each_entry_safe_from(avr, next, &av->dccpav_records, - dccpavr_node) { - list_del_init(&avr->dccpavr_node); + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); dccp_ackvec_record_delete(avr); } } @@ -386,16 +379,16 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, * windows. We will be receiving ACKs for stuff we sent a while back * -sorbo. */ - list_for_each_entry_reverse(avr, &av->dccpav_records, dccpavr_node) { - if (ackno == avr->dccpavr_ack_seqno) { + list_for_each_entry_reverse(avr, &av->av_records, avr_node) { + if (ackno == avr->avr_ack_seqno) { dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " "ack_ackno=%llu, ACKED!\n", dccp_role(sk), 1, - (unsigned long long)avr->dccpavr_ack_seqno, - (unsigned long long)avr->dccpavr_ack_ackno); + (unsigned long long)avr->avr_ack_seqno, + (unsigned long long)avr->avr_ack_ackno); dccp_ackvec_throw_record(av, avr); break; - } else if (avr->dccpavr_ack_seqno > ackno) + } else if (avr->avr_ack_seqno > ackno) break; /* old news */ } } @@ -409,7 +402,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, struct dccp_ackvec_record *avr; /* Check if we actually sent an ACK vector */ - if (list_empty(&av->dccpav_records)) + if (list_empty(&av->av_records)) return; i = len; @@ -418,8 +411,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, * I think it might be more efficient to work backwards. See comment on * rcv_ackno. -sorbo. */ - avr = list_entry(av->dccpav_records.next, struct dccp_ackvec_record, - dccpavr_node); + avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); while (i--) { const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; u64 ackno_end_rl; @@ -430,15 +422,14 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, * If our AVR sequence number is greater than the ack, go * forward in the AVR list until it is not so. */ - list_for_each_entry_from(avr, &av->dccpav_records, - dccpavr_node) { - if (!after48(avr->dccpavr_ack_seqno, *ackno)) + list_for_each_entry_from(avr, &av->av_records, avr_node) { + if (!after48(avr->avr_ack_seqno, *ackno)) goto found; } - /* End of the dccpav_records list, not found, exit */ + /* End of the av_records list, not found, exit */ break; found: - if (between48(avr->dccpavr_ack_seqno, ackno_end_rl, *ackno)) { + if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { dccp_pr_debug("%s ACK vector 0, len=%d, " @@ -446,9 +437,9 @@ found: "ACKED!\n", dccp_role(sk), len, (unsigned long long) - avr->dccpavr_ack_seqno, + avr->avr_ack_seqno, (unsigned long long) - avr->dccpavr_ack_ackno); + avr->avr_ack_ackno); dccp_ackvec_throw_record(av, avr); break; } diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 9671ecd17e0..bcb64fb4ace 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -32,54 +32,54 @@ * * This data structure is the one defined in RFC 4340, Appendix A. * - * @dccpav_buf_head - circular buffer head - * @dccpav_buf_tail - circular buffer tail - * @dccpav_buf_ackno - ack # of the most recent packet acknowledgeable in the - * buffer (i.e. %dccpav_buf_head) - * @dccpav_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked + * @av_buf_head - circular buffer head + * @av_buf_tail - circular buffer tail + * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the + * buffer (i.e. %av_buf_head) + * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked * by the buffer with State 0 * * Additionally, the HC-Receiver must keep some information about the * Ack Vectors it has recently sent. For each packet sent carrying an * Ack Vector, it remembers four variables: * - * @dccpav_records - list of dccp_ackvec_record - * @dccpav_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * @av_records - list of dccp_ackvec_record + * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. * - * @dccpav_time - the time in usecs - * @dccpav_buf - circular buffer of acknowledgeable packets + * @av_time - the time in usecs + * @av_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - u64 dccpav_buf_ackno; - struct list_head dccpav_records; - ktime_t dccpav_time; - u16 dccpav_buf_head; - u16 dccpav_vec_len; - u8 dccpav_buf_nonce; - u8 dccpav_ack_nonce; - u8 dccpav_buf[DCCP_MAX_ACKVEC_LEN]; + u64 av_buf_ackno; + struct list_head av_records; + ktime_t av_time; + u16 av_buf_head; + u16 av_vec_len; + u8 av_buf_nonce; + u8 av_ack_nonce; + u8 av_buf[DCCP_MAX_ACKVEC_LEN]; }; /** struct dccp_ackvec_record - ack vector record * * ACK vector record as defined in Appendix A of spec. * - * The list is sorted by dccpavr_ack_seqno + * The list is sorted by avr_ack_seqno * - * @dccpavr_node - node in dccpav_records - * @dccpavr_ack_seqno - sequence number of the packet this record was sent on - * @dccpavr_ack_ackno - sequence number being acknowledged - * @dccpavr_ack_ptr - pointer into dccpav_buf where this record starts - * @dccpavr_ack_nonce - dccpav_ack_nonce at the time this record was sent - * @dccpavr_sent_len - length of the record in dccpav_buf + * @avr_node - node in av_records + * @avr_ack_seqno - sequence number of the packet this record was sent on + * @avr_ack_ackno - sequence number being acknowledged + * @avr_ack_ptr - pointer into av_buf where this record starts + * @avr_ack_nonce - av_ack_nonce at the time this record was sent + * @avr_sent_len - lenght of the record in av_buf */ struct dccp_ackvec_record { - struct list_head dccpavr_node; - u64 dccpavr_ack_seqno; - u64 dccpavr_ack_ackno; - u16 dccpavr_ack_ptr; - u16 dccpavr_sent_len; - u8 dccpavr_ack_nonce; + struct list_head avr_node; + u64 avr_ack_seqno; + u64 avr_ack_ackno; + u16 avr_ack_ptr; + u16 avr_sent_len; + u8 avr_ack_nonce; }; struct sock; @@ -105,7 +105,7 @@ extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) { - return av->dccpav_vec_len; + return av->av_vec_len; } #else /* CONFIG_IP_DCCP_ACKVEC */ static inline int dccp_ackvec_init(void) diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index c45088b5e6f..4809753d12a 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -92,15 +92,15 @@ int ccid_register(struct ccid_operations *ccid_ops) ccid_ops->ccid_hc_rx_slab = ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size, - "%s_hc_rx_sock", - ccid_ops->ccid_name); + "ccid%u_hc_rx_sock", + ccid_ops->ccid_id); if (ccid_ops->ccid_hc_rx_slab == NULL) goto out; ccid_ops->ccid_hc_tx_slab = ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size, - "%s_hc_tx_sock", - ccid_ops->ccid_name); + "ccid%u_hc_tx_sock", + ccid_ops->ccid_id); if (ccid_ops->ccid_hc_tx_slab == NULL) goto out_free_rx_slab; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c65cb2453e4..fdeae7b5731 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -23,14 +23,37 @@ struct tcp_info; +/** + * struct ccid_operations - Interface to Congestion-Control Infrastructure + * + * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) + * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) + * @ccid_name: alphabetical identifier string for @ccid_id + * @ccid_owner: module which implements/owns this CCID + * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection + * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket + * + * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) + * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) + * @ccid_hc_rx_packet_recv: implements the HC-receiver side + * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options + * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options + * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender + * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender + * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender + * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender + * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender + */ struct ccid_operations { - unsigned char ccid_id; - const char *ccid_name; - struct module *ccid_owner; - struct kmem_cache *ccid_hc_rx_slab; - __u32 ccid_hc_rx_obj_size; - struct kmem_cache *ccid_hc_tx_slab; - __u32 ccid_hc_tx_obj_size; + unsigned char ccid_id; + __u32 ccid_ccmps; + const char *ccid_name; + struct module *ccid_owner; + struct kmem_cache *ccid_hc_rx_slab, + *ccid_hc_tx_slab; + __u32 ccid_hc_rx_obj_size, + ccid_hc_tx_obj_size; + /* Interface Routines */ int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); void (*ccid_hc_rx_exit)(struct sock *sk); diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 80f46988769..12275943eab 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,9 +1,8 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" - depends on IP_DCCP && EXPERIMENTAL + depends on EXPERIMENTAL config IP_DCCP_CCID2 tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" - depends on IP_DCCP def_tristate IP_DCCP select IP_DCCP_ACKVEC ---help--- @@ -20,18 +19,9 @@ config IP_DCCP_CCID2 to the user. For example, a hypothetical application that transferred files over DCCP, using application-level retransmissions for lost packets, would prefer CCID 2 to CCID 3. On-line games may - also prefer CCID 2. + also prefer CCID 2. See RFC 4341 for further details. - CCID 2 is further described in RFC 4341, - http://www.ietf.org/rfc/rfc4341.txt - - This text was extracted from RFC 4340 (sec. 10.1), - http://www.ietf.org/rfc/rfc4340.txt - - To compile this CCID as a module, choose M here: the module will be - called dccp_ccid2. - - If in doubt, say M. + CCID2 is the default CCID used by DCCP. config IP_DCCP_CCID2_DEBUG bool "CCID2 debugging messages" @@ -47,8 +37,8 @@ config IP_DCCP_CCID2_DEBUG config IP_DCCP_CCID3 tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" - depends on IP_DCCP def_tristate IP_DCCP + select IP_DCCP_TFRC_LIB ---help--- CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based rate-controlled congestion control mechanism. TFRC is designed to @@ -74,10 +64,6 @@ config IP_DCCP_CCID3 If in doubt, say M. -config IP_DCCP_TFRC_LIB - depends on IP_DCCP_CCID3 - def_tristate IP_DCCP_CCID3 - config IP_DCCP_CCID3_DEBUG bool "CCID3 debugging messages" depends on IP_DCCP_CCID3 @@ -121,5 +107,13 @@ config IP_DCCP_CCID3_RTO is serious network congestion: experimenting with larger values should therefore not be performed on WANs. +config IP_DCCP_TFRC_LIB + tristate + default n + +config IP_DCCP_TFRC_DEBUG + bool + depends on IP_DCCP_TFRC_LIB + default y if IP_DCCP_CCID3_DEBUG endmenu diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index d694656b880..b5b52ebb269 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -24,9 +24,6 @@ /* * This implementation should follow RFC 4341 - * - * BUGS: - * - sequence number wrapping */ #include "../ccid.h" @@ -129,50 +126,35 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - ccid2_pr_debug("pipe=%d cwnd=%d\n", hctx->ccid2hctx_pipe, - hctx->ccid2hctx_cwnd); - - if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) { - /* OK we can send... make sure previous packet was sent off */ - if (!hctx->ccid2hctx_sendwait) { - hctx->ccid2hctx_sendwait = 1; - return 0; - } - } + if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) + return 0; return 1; /* XXX CCID should dequeue when ready instead of polling */ } -static void ccid2_change_l_ack_ratio(struct sock *sk, int val) +static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { struct dccp_sock *dp = dccp_sk(sk); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); + /* - * XXX I don't really agree with val != 2. If cwnd is 1, ack ratio - * should be 1... it shouldn't be allowed to become 2. - * -sorbo. + * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from + * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always + * acceptable since this causes starvation/deadlock whenever cwnd < 2. + * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled). */ - if (val != 2) { - const struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - int max = hctx->ccid2hctx_cwnd / 2; - - /* round up */ - if (hctx->ccid2hctx_cwnd & 1) - max++; - - if (val > max) - val = max; + if (val == 0 || val > max_ratio) { + DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); + val = max_ratio; } + if (val > 0xFFFF) /* RFC 4340, 11.3 */ + val = 0xFFFF; - ccid2_pr_debug("changing local ack ratio to %d\n", val); - WARN_ON(val <= 0); - dp->dccps_l_ack_ratio = val; -} + if (val == dp->dccps_l_ack_ratio) + return; -static void ccid2_change_cwnd(struct ccid2_hc_tx_sock *hctx, u32 val) -{ - /* XXX do we need to change ack ratio? */ - hctx->ccid2hctx_cwnd = val? : 1; - ccid2_pr_debug("changed cwnd to %u\n", hctx->ccid2hctx_cwnd); + ccid2_pr_debug("changing local ack ratio to %u\n", val); + dp->dccps_l_ack_ratio = val; } static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) @@ -181,11 +163,6 @@ static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) hctx->ccid2hctx_srtt = val; } -static void ccid2_change_pipe(struct ccid2_hc_tx_sock *hctx, long val) -{ - hctx->ccid2hctx_pipe = val; -} - static void ccid2_start_rto_timer(struct sock *sk); static void ccid2_hc_tx_rto_expire(unsigned long data) @@ -215,21 +192,17 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_start_rto_timer(sk); /* adjust pipe, cwnd etc */ - ccid2_change_pipe(hctx, 0); - hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd >> 1; + hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; if (hctx->ccid2hctx_ssthresh < 2) hctx->ccid2hctx_ssthresh = 2; - ccid2_change_cwnd(hctx, 1); + hctx->ccid2hctx_cwnd = 1; + hctx->ccid2hctx_pipe = 0; /* clear state about stuff we sent */ - hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; - hctx->ccid2hctx_ssacks = 0; - hctx->ccid2hctx_acks = 0; - hctx->ccid2hctx_sent = 0; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; + hctx->ccid2hctx_packets_acked = 0; /* clear ack ratio state. */ - hctx->ccid2hctx_arsent = 0; - hctx->ccid2hctx_ackloss = 0; hctx->ccid2hctx_rpseq = 0; hctx->ccid2hctx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); @@ -255,23 +228,10 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); struct ccid2_seq *next; - u64 seq; - - ccid2_hc_tx_check_sanity(hctx); - BUG_ON(!hctx->ccid2hctx_sendwait); - hctx->ccid2hctx_sendwait = 0; - ccid2_change_pipe(hctx, hctx->ccid2hctx_pipe + 1); - BUG_ON(hctx->ccid2hctx_pipe < 0); + hctx->ccid2hctx_pipe++; - /* There is an issue. What if another packet is sent between - * packet_send() and packet_sent(). Then the sequence number would be - * wrong. - * -sorbo. - */ - seq = dp->dccps_gss; - - hctx->ccid2hctx_seqh->ccid2s_seq = seq; + hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; hctx->ccid2hctx_seqh->ccid2s_acked = 0; hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; @@ -291,8 +251,26 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, hctx->ccid2hctx_pipe); - hctx->ccid2hctx_sent++; - + /* + * FIXME: The code below is broken and the variables have been removed + * from the socket struct. The `ackloss' variable was always set to 0, + * and with arsent there are several problems: + * (i) it doesn't just count the number of Acks, but all sent packets; + * (ii) it is expressed in # of packets, not # of windows, so the + * comparison below uses the wrong formula: Appendix A of RFC 4341 + * comes up with the number K = cwnd / (R^2 - R) of consecutive windows + * of data with no lost or marked Ack packets. If arsent were the # of + * consecutive Acks received without loss, then Ack Ratio needs to be + * decreased by 1 when + * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) + * where cwnd / R is the number of Acks received per window of data + * (cf. RFC 4341, App. A). The problems are that + * - arsent counts other packets as well; + * - the comparison uses a formula different from RFC 4341; + * - computing a cubic/quadratic equation each time is too complicated. + * Hence a different algorithm is needed. + */ +#if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ hctx->ccid2hctx_arsent++; /* We had an ack loss in this window... */ @@ -320,14 +298,13 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ } } +#endif /* setup RTO timer */ if (!timer_pending(&hctx->ccid2hctx_rtotimer)) ccid2_start_rto_timer(sk); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG - ccid2_pr_debug("pipe=%d\n", hctx->ccid2hctx_pipe); - ccid2_pr_debug("Sent: seq=%llu\n", (unsigned long long)seq); do { struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; @@ -419,31 +396,15 @@ static inline void ccid2_new_ack(struct sock *sk, { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - /* slow start */ if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { - hctx->ccid2hctx_acks = 0; - - /* We can increase cwnd at most maxincr [ack_ratio/2] */ - if (*maxincr) { - /* increase every 2 acks */ - hctx->ccid2hctx_ssacks++; - if (hctx->ccid2hctx_ssacks == 2) { - ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd+1); - hctx->ccid2hctx_ssacks = 0; - *maxincr = *maxincr - 1; - } - } else { - /* increased cwnd enough for this single ack */ - hctx->ccid2hctx_ssacks = 0; - } - } else { - hctx->ccid2hctx_ssacks = 0; - hctx->ccid2hctx_acks++; - - if (hctx->ccid2hctx_acks >= hctx->ccid2hctx_cwnd) { - ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd + 1); - hctx->ccid2hctx_acks = 0; + if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { + hctx->ccid2hctx_cwnd += 1; + *maxincr -= 1; + hctx->ccid2hctx_packets_acked = 0; } + } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_cwnd += 1; + hctx->ccid2hctx_packets_acked = 0; } /* update RTO */ @@ -502,7 +463,6 @@ static inline void ccid2_new_ack(struct sock *sk, ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, hctx->ccid2hctx_rto, HZ, r); - hctx->ccid2hctx_sent = 0; } /* we got a new ack, so re-start RTO timer */ @@ -514,16 +474,19 @@ static void ccid2_hc_tx_dec_pipe(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - ccid2_change_pipe(hctx, hctx->ccid2hctx_pipe-1); - BUG_ON(hctx->ccid2hctx_pipe < 0); + if (hctx->ccid2hctx_pipe == 0) + DCCP_BUG("pipe == 0"); + else + hctx->ccid2hctx_pipe--; if (hctx->ccid2hctx_pipe == 0) ccid2_hc_tx_kill_rto_timer(sk); } -static void ccid2_congestion_event(struct ccid2_hc_tx_sock *hctx, - struct ccid2_seq *seqp) +static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; @@ -531,10 +494,12 @@ static void ccid2_congestion_event(struct ccid2_hc_tx_sock *hctx, hctx->ccid2hctx_last_cong = jiffies; - ccid2_change_cwnd(hctx, hctx->ccid2hctx_cwnd >> 1); - hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd; - if (hctx->ccid2hctx_ssthresh < 2) - hctx->ccid2hctx_ssthresh = 2; + hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; + hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); + + /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) + ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -570,12 +535,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->ccid2hctx_rpdupack++; /* check if we got enough dupacks */ - if (hctx->ccid2hctx_rpdupack >= - hctx->ccid2hctx_numdupack) { + if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ hctx->ccid2hctx_rpseq = 0; - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio << 1); + ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); } } } @@ -606,12 +570,13 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } } - /* If in slow-start, cwnd can increase at most Ack Ratio / 2 packets for - * this single ack. I round up. - * -sorbo. + /* + * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 + * packets per acknowledgement. Rounding up avoids that cwnd is not + * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ - maxincr = dp->dccps_l_ack_ratio >> 1; - maxincr++; + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) + maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ while ((offset = ccid2_ackvector(sk, skb, offset, @@ -619,9 +584,8 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* go through this ack vector */ while (veclen--) { const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; - u64 ackno_end_rl; + u64 ackno_end_rl = SUB48(ackno, rl); - dccp_set_seqno(&ackno_end_rl, ackno - rl); ccid2_pr_debug("ackvec start:%llu end:%llu\n", (unsigned long long)ackno, (unsigned long long)ackno_end_rl); @@ -651,7 +615,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) !seqp->ccid2s_acked) { if (state == DCCP_ACKVEC_STATE_ECN_MARKED) { - ccid2_congestion_event(hctx, + ccid2_congestion_event(sk, seqp); } else ccid2_new_ack(sk, seqp, @@ -666,13 +630,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) done = 1; break; } - seqp = seqp->ccid2s_next; + seqp = seqp->ccid2s_prev; } if (done) break; - - dccp_set_seqno(&ackno, ackno_end_rl - 1); + ackno = SUB48(ackno_end_rl, 1); vector++; } if (done) @@ -694,7 +657,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) while (1) { if (seqp->ccid2s_acked) { done++; - if (done == hctx->ccid2hctx_numdupack) + if (done == NUMDUPACK) break; } if (seqp == hctx->ccid2hctx_seqt) @@ -705,7 +668,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* If there are at least 3 acknowledgements, anything unacknowledged * below the last sequence number is considered lost */ - if (done == hctx->ccid2hctx_numdupack) { + if (done == NUMDUPACK) { struct ccid2_seq *last_acked = seqp; /* check for lost packets */ @@ -717,7 +680,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * order to detect multiple congestion events in * one ack vector. */ - ccid2_congestion_event(hctx, seqp); + ccid2_congestion_event(sk, seqp); ccid2_hc_tx_dec_pipe(sk); } if (seqp == hctx->ccid2hctx_seqt) @@ -742,14 +705,23 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid); + struct dccp_sock *dp = dccp_sk(sk); + u32 max_ratio; + + /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ + hctx->ccid2hctx_ssthresh = ~0U; - ccid2_change_cwnd(hctx, 1); - /* Initialize ssthresh to infinity. This means that we will exit the - * initial slow-start after the first packet loss. This is what we - * want. + /* + * RFC 4341, 5: "The cwnd parameter is initialized to at most four + * packets for new connections, following the rules from [RFC3390]". + * We need to convert the bytes of RFC3390 into the packets of RFC 4341. */ - hctx->ccid2hctx_ssthresh = ~0; - hctx->ccid2hctx_numdupack = 3; + hctx->ccid2hctx_cwnd = min(4U, max(2U, 4380U / dp->dccps_mss_cache)); + + /* Make sure that Ack Ratio is enabled and within bounds. */ + max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); + if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) + dp->dccps_l_ack_ratio = max_ratio; /* XXX init ~ to window size... */ if (ccid2_hc_tx_alloc_seq(hctx)) @@ -760,10 +732,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hctx->ccid2hctx_rttvar = -1; hctx->ccid2hctx_rpdupack = -1; hctx->ccid2hctx_last_cong = jiffies; - - hctx->ccid2hctx_rtotimer.function = &ccid2_hc_tx_rto_expire; - hctx->ccid2hctx_rtotimer.data = (unsigned long)sk; - init_timer(&hctx->ccid2hctx_rtotimer); + setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, + (unsigned long)sk); ccid2_hc_tx_check_sanity(hctx); return 0; @@ -800,7 +770,7 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) static struct ccid_operations ccid2 = { .ccid_id = DCCPC_CCID2, - .ccid_name = "ccid2", + .ccid_name = "TCP-like", .ccid_owner = THIS_MODULE, .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), .ccid_hc_tx_init = ccid2_hc_tx_init, diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index d9daa534c9b..2c94ca02901 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -24,6 +24,8 @@ #include <linux/timer.h> #include <linux/types.h> #include "../ccid.h" +/* NUMDUPACK parameter from RFC 4341, p. 6 */ +#define NUMDUPACK 3 struct sock; @@ -40,22 +42,17 @@ struct ccid2_seq { /** struct ccid2_hc_tx_sock - CCID2 TX half connection * - * @ccid2hctx_ssacks - ACKs recv in slow start - * @ccid2hctx_acks - ACKS recv in AI phase - * @ccid2hctx_sent - packets sent in this window + * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 + * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) * @ccid2hctx_lastrtt -time RTT was last measured - * @ccid2hctx_arsent - packets sent [ack ratio] - * @ccid2hctx_ackloss - ack was lost in this win * @ccid2hctx_rpseq - last consecutive seqno * @ccid2hctx_rpdupack - dupacks since rpseq */ struct ccid2_hc_tx_sock { u32 ccid2hctx_cwnd; - int ccid2hctx_ssacks; - int ccid2hctx_acks; - unsigned int ccid2hctx_ssthresh; - int ccid2hctx_pipe; - int ccid2hctx_numdupack; + u32 ccid2hctx_ssthresh; + u32 ccid2hctx_pipe; + u32 ccid2hctx_packets_acked; struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; int ccid2hctx_seqbufc; struct ccid2_seq *ccid2hctx_seqh; @@ -63,14 +60,10 @@ struct ccid2_hc_tx_sock { long ccid2hctx_rto; long ccid2hctx_srtt; long ccid2hctx_rttvar; - int ccid2hctx_sent; unsigned long ccid2hctx_lastrtt; struct timer_list ccid2hctx_rtotimer; - unsigned long ccid2hctx_arsent; - int ccid2hctx_ackloss; u64 ccid2hctx_rpseq; int ccid2hctx_rpdupack; - int ccid2hctx_sendwait; unsigned long ccid2hctx_last_cong; u64 ccid2hctx_high_ack; }; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index d133416d397..e76f460af0e 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -1,6 +1,7 @@ /* * net/dccp/ccids/ccid3.c * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> * @@ -33,11 +34,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "../ccid.h" #include "../dccp.h" -#include "lib/packet_history.h" -#include "lib/loss_interval.h" -#include "lib/tfrc.h" #include "ccid3.h" #include <asm/unaligned.h> @@ -49,9 +46,6 @@ static int ccid3_debug; #define ccid3_pr_debug(format, a...) #endif -static struct dccp_tx_hist *ccid3_tx_hist; -static struct dccp_rx_hist *ccid3_rx_hist; - /* * Transmitter Half-Connection Routines */ @@ -83,24 +77,27 @@ static void ccid3_hc_tx_set_state(struct sock *sk, } /* - * Compute the initial sending rate X_init according to RFC 3390: - * w_init = min(4 * MSS, max(2 * MSS, 4380 bytes)) - * X_init = w_init / RTT + * Compute the initial sending rate X_init in the manner of RFC 3390: + * + * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT + * + * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis + * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. * For consistency with other parts of the code, X_init is scaled by 2^6. */ static inline u64 rfc3390_initial_rate(struct sock *sk) { - const struct dccp_sock *dp = dccp_sk(sk); - const __u32 w_init = min(4 * dp->dccps_mss_cache, - max(2 * dp->dccps_mss_cache, 4380U)); + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + const __u32 w_init = min_t(__u32, 4 * hctx->ccid3hctx_s, + max_t(__u32, 2 * hctx->ccid3hctx_s, 4380)); - return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->ccid3hctx_rtt); + return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); } /* * Recalculate t_ipi and delta (should be called whenever X changes) */ -static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) +static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, @@ -116,6 +113,13 @@ static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) } +static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) +{ + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); + + return delta / hctx->ccid3hctx_rtt; +} + /** * ccid3_hc_tx_update_x - Update allowed sending rate X * @stamp: most recent time if available - can be left NULL. @@ -127,19 +131,19 @@ static inline void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) * */ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) - { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; const __u64 old_x = hctx->ccid3hctx_x; - ktime_t now = stamp? *stamp : ktime_get_real(); + ktime_t now = stamp ? *stamp : ktime_get_real(); /* * Handle IDLE periods: do not reduce below RFC3390 initial sending rate - * when idling [RFC 4342, 5.1]. See also draft-ietf-dccp-rfc3448bis. + * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: + * a sender is idle if it has not sent anything over a 2-RTT-period. * For consistency with X and X_recv, min_rate is also scaled by 2^6. */ - if (unlikely(hctx->ccid3hctx_idle)) { + if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { min_rate = rfc3390_initial_rate(sk); min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); } @@ -181,7 +185,7 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) { const u16 old_s = hctx->ccid3hctx_s; - hctx->ccid3hctx_s = old_s == 0 ? len : (9 * old_s + len) / 10; + hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); if (hctx->ccid3hctx_s != old_s) ccid3_update_send_interval(hctx); @@ -225,29 +229,27 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state)); - hctx->ccid3hctx_idle = 1; + if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + goto out; - switch (hctx->ccid3hctx_state) { - case TFRC_SSTATE_NO_FBACK: - /* RFC 3448, 4.4: Halve send rate directly */ + /* + * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 + */ + if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ + hctx->ccid3hctx_p == 0) { + + /* halve send rate directly */ hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, (((__u64)hctx->ccid3hctx_s) << 6) / TFRC_T_MBI); - - ccid3_pr_debug("%s(%p, state=%s), updated tx rate to %u " - "bytes/s\n", dccp_role(sk), sk, - ccid3_tx_state_name(hctx->ccid3hctx_state), - (unsigned)(hctx->ccid3hctx_x >> 6)); - /* The value of R is still undefined and so we can not recompute - * the timeout value. Keep initial value as per [RFC 4342, 5]. */ - t_nfb = TFRC_INITIAL_TIMEOUT; ccid3_update_send_interval(hctx); - break; - case TFRC_SSTATE_FBACK: + } else { /* - * Modify the cached value of X_recv [RFC 3448, 4.4] + * Modify the cached value of X_recv * - * If (p == 0 || X_calc > 2 * X_recv) + * If (X_calc > 2 * X_recv) * X_recv = max(X_recv / 2, s / (2 * t_mbi)); * Else * X_recv = X_calc / 4; @@ -256,32 +258,28 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) */ BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - if (hctx->ccid3hctx_p == 0 || - (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5))) { - + if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) hctx->ccid3hctx_x_recv = max(hctx->ccid3hctx_x_recv / 2, (((__u64)hctx->ccid3hctx_s) << 6) / (2 * TFRC_T_MBI)); - } else { + else { hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; hctx->ccid3hctx_x_recv <<= 4; } - /* Now recalculate X [RFC 3448, 4.3, step (4)] */ ccid3_hc_tx_update_x(sk, NULL); - /* - * Schedule no feedback timer to expire in - * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) - * See comments in packet_recv() regarding the value of t_RTO. - */ - t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); - break; - case TFRC_SSTATE_NO_SENT: - DCCP_BUG("%s(%p) - Illegal state NO_SENT", dccp_role(sk), sk); - /* fall through */ - case TFRC_SSTATE_TERM: - goto out; } + ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", + (unsigned long long)hctx->ccid3hctx_x); + + /* + * Set new timeout for the nofeedback timer. + * See comments in packet_recv() regarding the value of t_RTO. + */ + if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ + t_nfb = TFRC_INITIAL_TIMEOUT; + else + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); restart_timer: sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, @@ -336,8 +334,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) hctx->ccid3hctx_x = rfc3390_initial_rate(sk); hctx->ccid3hctx_t_ld = now; } else { - /* Sender does not have RTT sample: X = MSS/second */ - hctx->ccid3hctx_x = dp->dccps_mss_cache; + /* Sender does not have RTT sample: X_pps = 1 pkt/sec */ + hctx->ccid3hctx_x = hctx->ccid3hctx_s; hctx->ccid3hctx_x <<= 6; } ccid3_update_send_interval(hctx); @@ -369,7 +367,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* prepare to send now (add options etc.) */ dp->dccps_hc_tx_insert_options = 1; DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; - hctx->ccid3hctx_idle = 0; /* set the nominal send time for the next following packet */ hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, @@ -381,28 +378,17 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct dccp_tx_hist_entry *packet; ccid3_hc_tx_update_s(hctx, len); - packet = dccp_tx_hist_entry_new(ccid3_tx_hist, GFP_ATOMIC); - if (unlikely(packet == NULL)) { + if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); - return; - } - dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, packet); - - packet->dccphtx_tstamp = ktime_get_real(); - packet->dccphtx_seqno = dccp_sk(sk)->dccps_gss; - packet->dccphtx_rtt = hctx->ccid3hctx_rtt; - packet->dccphtx_sent = 1; } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv; - struct dccp_tx_hist_entry *packet; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -411,131 +397,112 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; + /* ... and only in the established state */ + if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && + hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + return; opt_recv = &hctx->ccid3hctx_options_received; + now = ktime_get_real(); - switch (hctx->ccid3hctx_state) { - case TFRC_SSTATE_NO_FBACK: - case TFRC_SSTATE_FBACK: - /* get packet from history to look up t_recvdata */ - packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, - DCCP_SKB_CB(skb)->dccpd_ack_seq); - if (unlikely(packet == NULL)) { - DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist " - "in history!\n", dccp_role(sk), sk, - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, - dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); - return; - } - - /* Update receive rate in units of 64 * bytes/second */ - hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; - hctx->ccid3hctx_x_recv <<= 6; + /* Estimate RTT from history if ACK number is valid */ + r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, + DCCP_SKB_CB(skb)->dccpd_ack_seq, now); + if (r_sample == 0) { + DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); + return; + } - /* Update loss event rate */ - pinv = opt_recv->ccid3or_loss_event_rate; - if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ - hctx->ccid3hctx_p = 0; - else /* can not exceed 100% */ - hctx->ccid3hctx_p = 1000000 / pinv; + /* Update receive rate in units of 64 * bytes/second */ + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; + hctx->ccid3hctx_x_recv <<= 6; - now = ktime_get_real(); - /* - * Calculate new round trip sample as per [RFC 3448, 4.3] by - * R_sample = (now - t_recvdata) - t_elapsed - */ - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, packet->dccphtx_tstamp)); + /* Update loss event rate (which is scaled by 1e6) */ + pinv = opt_recv->ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + hctx->ccid3hctx_p = 0; + else /* can not exceed 100% */ + hctx->ccid3hctx_p = scaled_div(1, pinv); + /* + * Validate new RTT sample and update moving average + */ + r_sample = dccp_sample_rtt(sk, r_sample); + hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); + /* + * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 + */ + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - /* - * Update RTT estimate by - * If (No feedback recv) - * R = R_sample; - * Else - * R = q * R + (1 - q) * R_sample; - * - * q is a constant, RFC 3448 recomments 0.9 - */ - if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + if (hctx->ccid3hctx_t_rto == 0) { /* - * Larger Initial Windows [RFC 4342, sec. 5] + * Initial feedback packet: Larger Initial Windows (4.2) */ - hctx->ccid3hctx_rtt = r_sample; hctx->ccid3hctx_x = rfc3390_initial_rate(sk); hctx->ccid3hctx_t_ld = now; ccid3_update_send_interval(hctx); - ccid3_pr_debug("%s(%p), s=%u, MSS=%u, " - "R_sample=%uus, X=%u\n", dccp_role(sk), - sk, hctx->ccid3hctx_s, - dccp_sk(sk)->dccps_mss_cache, r_sample, - (unsigned)(hctx->ccid3hctx_x >> 6)); - - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - } else { - hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt + - r_sample) / 10; - - /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hctx->ccid3hctx_p > 0) - hctx->ccid3hctx_x_calc = - tfrc_calc_x(hctx->ccid3hctx_s, - hctx->ccid3hctx_rtt, - hctx->ccid3hctx_p); - ccid3_hc_tx_update_x(sk, &now); - - ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " - "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), - sk, hctx->ccid3hctx_rtt, r_sample, - hctx->ccid3hctx_s, hctx->ccid3hctx_p, - hctx->ccid3hctx_x_calc, - (unsigned)(hctx->ccid3hctx_x_recv >> 6), - (unsigned)(hctx->ccid3hctx_x >> 6)); + goto done_computing_x; + } else if (hctx->ccid3hctx_p == 0) { + /* + * First feedback after nofeedback timer expiry (4.3) + */ + goto done_computing_x; } + } - /* unschedule no feedback timer */ - sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ + if (hctx->ccid3hctx_p > 0) + hctx->ccid3hctx_x_calc = + tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); + ccid3_hc_tx_update_x(sk, &now); + +done_computing_x: + ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " + "p=%u, X_calc=%u, X_recv=%u, X=%u\n", + dccp_role(sk), + sk, hctx->ccid3hctx_rtt, r_sample, + hctx->ccid3hctx_s, hctx->ccid3hctx_p, + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6), + (unsigned)(hctx->ccid3hctx_x >> 6)); - /* remove all packets older than the one acked from history */ - dccp_tx_hist_purge_older(ccid3_tx_hist, - &hctx->ccid3hctx_hist, packet); - /* - * As we have calculated new ipi, delta, t_nom it is possible - * that we now can send a packet, so wake up dccp_wait_for_ccid - */ - sk->sk_write_space(sk); + /* unschedule no feedback timer */ + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); - /* - * Update timeout interval for the nofeedback timer. - * We use a configuration option to increase the lower bound. - * This can help avoid triggering the nofeedback timer too - * often ('spinning') on LANs with small RTTs. - */ - hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, - CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC/1000)); - /* - * Schedule no feedback timer to expire in - * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) - */ - t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); + /* + * As we have calculated new ipi, delta, t_nom it is possible + * that we now can send a packet, so wake up dccp_wait_for_ccid + */ + sk->sk_write_space(sk); - ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " - "expire in %lu jiffies (%luus)\n", - dccp_role(sk), - sk, usecs_to_jiffies(t_nfb), t_nfb); + /* + * Update timeout interval for the nofeedback timer. + * We use a configuration option to increase the lower bound. + * This can help avoid triggering the nofeedback timer too + * often ('spinning') on LANs with small RTTs. + */ + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + (CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC / 1000))); + /* + * Schedule no feedback timer to expire in + * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) + */ + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); + ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " + "expire in %lu jiffies (%luus)\n", + dccp_role(sk), + sk, usecs_to_jiffies(t_nfb), t_nfb); - /* set idle flag */ - hctx->ccid3hctx_idle = 1; - break; - case TFRC_SSTATE_NO_SENT: /* fall through */ - case TFRC_SSTATE_TERM: /* ignore feedback when closing */ - break; - } + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + usecs_to_jiffies(t_nfb)); } static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, @@ -605,12 +572,9 @@ static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; - INIT_LIST_HEAD(&hctx->ccid3hctx_hist); - - hctx->ccid3hctx_no_feedback_timer.function = - ccid3_hc_tx_no_feedback_timer; - hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk; - init_timer(&hctx->ccid3hctx_no_feedback_timer); + hctx->ccid3hctx_hist = NULL; + setup_timer(&hctx->ccid3hctx_no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); return 0; } @@ -622,8 +586,7 @@ static void ccid3_hc_tx_exit(struct sock *sk) ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); - /* Empty packet history */ - dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist); + tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) @@ -670,6 +633,15 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, /* * Receiver Half-Connection Routines */ + +/* CCID3 feedback types */ +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE +}; + #ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) { @@ -696,67 +668,58 @@ static void ccid3_hc_rx_set_state(struct sock *sk, hcrx->ccid3hcrx_state = state; } -static inline void ccid3_hc_rx_update_s(struct ccid3_hc_rx_sock *hcrx, int len) -{ - if (unlikely(len == 0)) /* don't update on empty packets (e.g. ACKs) */ - ccid3_pr_debug("Packet payload length is 0 - not updating\n"); - else - hcrx->ccid3hcrx_s = hcrx->ccid3hcrx_s == 0 ? len : - (9 * hcrx->ccid3hcrx_s + len) / 10; -} - -static void ccid3_hc_rx_send_feedback(struct sock *sk) +static void ccid3_hc_rx_send_feedback(struct sock *sk, + const struct sk_buff *skb, + enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - struct dccp_rx_hist_entry *packet; ktime_t now; - suseconds_t delta; + s64 delta = 0; - ccid3_pr_debug("%s(%p) - entry \n", dccp_role(sk), sk); + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) + return; now = ktime_get_real(); - switch (hcrx->ccid3hcrx_state) { - case TFRC_RSTATE_NO_DATA: + switch (fbtype) { + case CCID3_FBACK_INITIAL: hcrx->ccid3hcrx_x_recv = 0; + hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ break; - case TFRC_RSTATE_DATA: - delta = ktime_us_delta(now, - hcrx->ccid3hcrx_tstamp_last_feedback); - DCCP_BUG_ON(delta < 0); - hcrx->ccid3hcrx_x_recv = - scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + case CCID3_FBACK_PARAM_CHANGE: + /* + * When parameters change (new loss or p > p_prev), we do not + * have a reliable estimate for R_m of [RFC 3448, 6.2] and so + * need to reuse the previous value of X_recv. However, when + * X_recv was 0 (due to early loss), this would kill X down to + * s/t_mbi (i.e. one packet in 64 seconds). + * To avoid such drastic reduction, we approximate X_recv as + * the number of bytes since last feedback. + * This is a safe fallback, since X is bounded above by X_calc. + */ + if (hcrx->ccid3hcrx_x_recv > 0) + break; + /* fall through */ + case CCID3_FBACK_PERIODIC: + delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); + if (delta <= 0) + DCCP_BUG("delta (%ld) <= 0", (long)delta); + else + hcrx->ccid3hcrx_x_recv = + scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); break; - case TFRC_RSTATE_TERM: - DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); + default: return; } - packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist); - if (unlikely(packet == NULL)) { - DCCP_WARN("%s(%p), no data packet in history!\n", - dccp_role(sk), sk); - return; - } + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); hcrx->ccid3hcrx_tstamp_last_feedback = now; - hcrx->ccid3hcrx_ccval_last_counter = packet->dccphrx_ccval; + hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; hcrx->ccid3hcrx_bytes_recv = 0; - /* Elapsed time information [RFC 4340, 13.2] in units of 10 * usecs */ - delta = ktime_us_delta(now, packet->dccphrx_tstamp); - DCCP_BUG_ON(delta < 0); - hcrx->ccid3hcrx_elapsed_time = delta / 10; - - if (hcrx->ccid3hcrx_p == 0) - hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ - else if (hcrx->ccid3hcrx_p > 1000000) { - DCCP_WARN("p (%u) > 100%%\n", hcrx->ccid3hcrx_p); - hcrx->ccid3hcrx_pinv = 1; /* use 100% in this case */ - } else - hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p; - dp->dccps_hc_rx_insert_options = 1; dccp_send_ack(sk); } @@ -770,7 +733,6 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) return 0; hcrx = ccid3_hc_rx_sk(sk); - DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_ccval_last_counter; if (dccp_packet_without_ack(skb)) return 0; @@ -778,11 +740,7 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) x_recv = htonl(hcrx->ccid3hcrx_x_recv); pinv = htonl(hcrx->ccid3hcrx_pinv); - if ((hcrx->ccid3hcrx_elapsed_time != 0 && - dccp_insert_option_elapsed_time(sk, skb, - hcrx->ccid3hcrx_elapsed_time)) || - dccp_insert_option_timestamp(sk, skb) || - dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, + if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)) || dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, &x_recv, sizeof(x_recv))) @@ -791,180 +749,139 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) return 0; } -static int ccid3_hc_rx_detect_loss(struct sock *sk, - struct dccp_rx_hist_entry *packet) +/** ccid3_first_li - Implements [RFC 3448, 6.3.1] + * + * Determine the length of the first loss interval via inverse lookup. + * Assume that X_recv can be computed by the throughput equation + * s + * X_recv = -------- + * R * fval + * Find some p such that f(p) = fval; return 1/p (scaled). + */ +static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - struct dccp_rx_hist_entry *rx_hist = - dccp_rx_hist_head(&hcrx->ccid3hcrx_hist); - u64 seqno = packet->dccphrx_seqno; - u64 tmp_seqno; - int loss = 0; - u8 ccval; - - - tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss; + u32 x_recv, p, delta; + u64 fval; - if (!rx_hist || - follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { - hcrx->ccid3hcrx_seqno_nonloss = seqno; - hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval; - goto detect_out; + if (hcrx->ccid3hcrx_rtt == 0) { + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); + hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; } - - while (dccp_delta_seqno(hcrx->ccid3hcrx_seqno_nonloss, seqno) - > TFRC_RECV_NUM_LATE_LOSS) { - loss = 1; - dccp_li_update_li(sk, - &hcrx->ccid3hcrx_li_hist, - &hcrx->ccid3hcrx_hist, - hcrx->ccid3hcrx_tstamp_last_feedback, - hcrx->ccid3hcrx_s, - hcrx->ccid3hcrx_bytes_recv, - hcrx->ccid3hcrx_x_recv, - hcrx->ccid3hcrx_seqno_nonloss, - hcrx->ccid3hcrx_ccval_nonloss); - tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss; - dccp_inc_seqno(&tmp_seqno); - hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno; - dccp_inc_seqno(&tmp_seqno); - while (dccp_rx_hist_find_entry(&hcrx->ccid3hcrx_hist, - tmp_seqno, &ccval)) { - hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno; - hcrx->ccid3hcrx_ccval_nonloss = ccval; - dccp_inc_seqno(&tmp_seqno); + delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); + x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN("X_recv==0\n"); + if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { + DCCP_BUG("stored value of X_recv is zero"); + return ~0U; } } - /* FIXME - this code could be simplified with above while */ - /* but works at moment */ - if (follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { - hcrx->ccid3hcrx_seqno_nonloss = seqno; - hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval; - } + fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); + fval = scaled_div32(fval, x_recv); + p = tfrc_calc_x_reverse_lookup(fval); -detect_out: - dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, - &hcrx->ccid3hcrx_li_hist, packet, - hcrx->ccid3hcrx_seqno_nonloss); - return loss; + ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " + "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); + + return p == 0 ? ~0U : scaled_div(1, p); } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - const struct dccp_options_received *opt_recv; - struct dccp_rx_hist_entry *packet; - u32 p_prev, r_sample, rtt_prev; - int loss, payload_size; - ktime_t now; - - opt_recv = &dccp_sk(sk)->dccps_options_received; - - switch (DCCP_SKB_CB(skb)->dccpd_type) { - case DCCP_PKT_ACK: - if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) - return; - case DCCP_PKT_DATAACK: - if (opt_recv->dccpor_timestamp_echo == 0) - break; - r_sample = dccp_timestamp() - opt_recv->dccpor_timestamp_echo; - rtt_prev = hcrx->ccid3hcrx_rtt; - r_sample = dccp_sample_rtt(sk, 10 * r_sample); + enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; + const u32 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; + const bool is_data_packet = dccp_data_packet(skb); + + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + do_feedback = CCID3_FBACK_INITIAL; + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + hcrx->ccid3hcrx_s = payload; + /* + * Not necessary to update ccid3hcrx_bytes_recv here, + * since X_recv = 0 for the first feedback packet (cf. + * RFC 3448, 6.3) -- gerrit + */ + } + goto update_records; + } - if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) - hcrx->ccid3hcrx_rtt = r_sample; - else - hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 + - r_sample / 10; + if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) + return; /* done receiving */ - if (rtt_prev != hcrx->ccid3hcrx_rtt) - ccid3_pr_debug("%s(%p), New RTT=%uus, elapsed time=%u\n", - dccp_role(sk), sk, hcrx->ccid3hcrx_rtt, - opt_recv->dccpor_elapsed_time); - break; - case DCCP_PKT_DATA: - break; - default: /* We're not interested in other packet types, move along */ - return; + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + /* + * Update moving-average of s and the sum of received payload bytes + */ + hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); + hcrx->ccid3hcrx_bytes_recv += payload; } - packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp, - skb, GFP_ATOMIC); - if (unlikely(packet == NULL)) { - DCCP_WARN("%s(%p), Not enough mem to add rx packet " - "to history, consider it lost!\n", dccp_role(sk), sk); - return; + /* + * Handle pending losses and otherwise check for new loss + */ + if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist) && + tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, + &hcrx->ccid3hcrx_li_hist, + skb, ndp, ccid3_first_li, sk) ) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; } - loss = ccid3_hc_rx_detect_loss(sk, packet); + if (tfrc_rx_hist_new_loss_indicated(&hcrx->ccid3hcrx_hist, skb, ndp)) + goto update_records; - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) - return; - - payload_size = skb->len - dccp_hdr(skb)->dccph_doff * 4; - ccid3_hc_rx_update_s(hcrx, payload_size); + /* + * Handle data packets: RTT sampling and monitoring p + */ + if (unlikely(!is_data_packet)) + goto update_records; - switch (hcrx->ccid3hcrx_state) { - case TFRC_RSTATE_NO_DATA: - ccid3_pr_debug("%s(%p, state=%s), skb=%p, sending initial " - "feedback\n", dccp_role(sk), sk, - dccp_state_name(sk->sk_state), skb); - ccid3_hc_rx_send_feedback(sk); - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - return; - case TFRC_RSTATE_DATA: - hcrx->ccid3hcrx_bytes_recv += payload_size; - if (loss) - break; + if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { + const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); + /* + * Empty loss history: no loss so far, hence p stays 0. + * Sample RTT values, since an RTT estimate is required for the + * computation of p when the first loss occurs; RFC 3448, 6.3.1. + */ + if (sample != 0) + hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); - now = ktime_get_real(); - if ((ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_ack) - - (s64)hcrx->ccid3hcrx_rtt) >= 0) { - hcrx->ccid3hcrx_tstamp_last_ack = now; - ccid3_hc_rx_send_feedback(sk); - } - return; - case TFRC_RSTATE_TERM: - DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); - return; + } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { + /* + * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean + * has decreased (resp. p has increased), send feedback now. + */ + do_feedback = CCID3_FBACK_PARAM_CHANGE; } - /* Dealing with packet loss */ - ccid3_pr_debug("%s(%p, state=%s), data loss! Reacting...\n", - dccp_role(sk), sk, dccp_state_name(sk->sk_state)); - - p_prev = hcrx->ccid3hcrx_p; - - /* Calculate loss event rate */ - if (!list_empty(&hcrx->ccid3hcrx_li_hist)) { - u32 i_mean = dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist); + /* + * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 + */ + if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) + do_feedback = CCID3_FBACK_PERIODIC; - /* Scaling up by 1000000 as fixed decimal */ - if (i_mean != 0) - hcrx->ccid3hcrx_p = 1000000 / i_mean; - } else - DCCP_BUG("empty loss history"); +update_records: + tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); - if (hcrx->ccid3hcrx_p > p_prev) { - ccid3_hc_rx_send_feedback(sk); - return; - } +done_receiving: + if (do_feedback) + ccid3_hc_rx_send_feedback(sk, skb, do_feedback); } static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - ccid3_pr_debug("entry\n"); - hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; - INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); - INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist); - hcrx->ccid3hcrx_tstamp_last_feedback = - hcrx->ccid3hcrx_tstamp_last_ack = ktime_get_real(); - return 0; + tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); + return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); } static void ccid3_hc_rx_exit(struct sock *sk) @@ -973,11 +890,8 @@ static void ccid3_hc_rx_exit(struct sock *sk) ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); - /* Empty packet history */ - dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist); - - /* Empty loss interval history */ - dccp_li_hist_purge(&hcrx->ccid3hcrx_li_hist); + tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); + tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) @@ -998,6 +912,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { const struct ccid3_hc_rx_sock *hcrx; + struct tfrc_rx_info rx_info; const void *val; /* Listen socks doesn't have a private CCID block */ @@ -1007,10 +922,14 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, hcrx = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: - if (len < sizeof(hcrx->ccid3hcrx_tfrc)) + if (len < sizeof(rx_info)) return -EINVAL; - len = sizeof(hcrx->ccid3hcrx_tfrc); - val = &hcrx->ccid3hcrx_tfrc; + rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; + rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; + rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : + scaled_div(1, hcrx->ccid3hcrx_pinv); + len = sizeof(rx_info); + val = &rx_info; break; default: return -ENOPROTOOPT; @@ -1024,7 +943,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, static struct ccid_operations ccid3 = { .ccid_id = DCCPC_CCID3, - .ccid_name = "ccid3", + .ccid_name = "TCP-Friendly Rate Control", .ccid_owner = THIS_MODULE, .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), .ccid_hc_tx_init = ccid3_hc_tx_init, @@ -1051,44 +970,13 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); static __init int ccid3_module_init(void) { - int rc = -ENOBUFS; - - ccid3_rx_hist = dccp_rx_hist_new("ccid3"); - if (ccid3_rx_hist == NULL) - goto out; - - ccid3_tx_hist = dccp_tx_hist_new("ccid3"); - if (ccid3_tx_hist == NULL) - goto out_free_rx; - - rc = ccid_register(&ccid3); - if (rc != 0) - goto out_free_tx; -out: - return rc; - -out_free_tx: - dccp_tx_hist_delete(ccid3_tx_hist); - ccid3_tx_hist = NULL; -out_free_rx: - dccp_rx_hist_delete(ccid3_rx_hist); - ccid3_rx_hist = NULL; - goto out; + return ccid_register(&ccid3); } module_init(ccid3_module_init); static __exit void ccid3_module_exit(void) { ccid_unregister(&ccid3); - - if (ccid3_tx_hist != NULL) { - dccp_tx_hist_delete(ccid3_tx_hist); - ccid3_tx_hist = NULL; - } - if (ccid3_rx_hist != NULL) { - dccp_rx_hist_delete(ccid3_rx_hist); - ccid3_rx_hist = NULL; - } } module_exit(ccid3_module_exit); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 0cdc982cfe4..49ca32bd7e7 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -1,7 +1,8 @@ /* * net/dccp/ccids/ccid3.h * - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * * An implementation of the DCCP protocol * @@ -40,6 +41,7 @@ #include <linux/list.h> #include <linux/types.h> #include <linux/tfrc.h> +#include "lib/tfrc.h" #include "../ccid.h" /* Two seconds as per RFC 3448 4.2 */ @@ -88,7 +90,6 @@ enum ccid3_hc_tx_states { * @ccid3hctx_t_last_win_count - Timestamp of earliest packet * with last_win_count value sent * @ccid3hctx_no_feedback_timer - Handle to no feedback timer - * @ccid3hctx_idle - Flag indicating that sender is idling * @ccid3hctx_t_ld - Time last doubled during slow start * @ccid3hctx_t_nom - Nominal send time of next packet * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs @@ -107,13 +108,12 @@ struct ccid3_hc_tx_sock { u16 ccid3hctx_s; enum ccid3_hc_tx_states ccid3hctx_state:8; u8 ccid3hctx_last_win_count; - u8 ccid3hctx_idle; ktime_t ccid3hctx_t_last_win_count; struct timer_list ccid3hctx_no_feedback_timer; ktime_t ccid3hctx_t_ld; ktime_t ccid3hctx_t_nom; u32 ccid3hctx_delta; - struct list_head ccid3hctx_hist; + struct tfrc_tx_hist_entry *ccid3hctx_hist; struct ccid3_options_received ccid3hctx_options_received; }; @@ -135,37 +135,30 @@ enum ccid3_hc_rx_states { * * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) - * @ccid3hcrx_p - current loss event rate (RFC 3448 5.4) - * @ccid3hcrx_seqno_nonloss - Last received non-loss sequence number - * @ccid3hcrx_ccval_nonloss - Last received non-loss Window CCVal - * @ccid3hcrx_ccval_last_counter - Tracks window counter (RFC 4342, 8.1) - * @ccid3hcrx_state - receiver state, one of %ccid3_hc_rx_states + * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) + * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) + * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @ccid3hcrx_rtt - Receiver estimate of RTT * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent - * @ccid3hcrx_hist - Packet history - * @ccid3hcrx_li_hist - Loss Interval History + * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) + * @ccid3hcrx_li_hist - Loss Interval database * @ccid3hcrx_s - Received packet size in bytes * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) - * @ccid3hcrx_elapsed_time - Time since packet reception */ struct ccid3_hc_rx_sock { - struct tfrc_rx_info ccid3hcrx_tfrc; -#define ccid3hcrx_x_recv ccid3hcrx_tfrc.tfrcrx_x_recv -#define ccid3hcrx_rtt ccid3hcrx_tfrc.tfrcrx_rtt -#define ccid3hcrx_p ccid3hcrx_tfrc.tfrcrx_p - u64 ccid3hcrx_seqno_nonloss:48, - ccid3hcrx_ccval_nonloss:4, - ccid3hcrx_ccval_last_counter:4; + u8 ccid3hcrx_last_counter:4; enum ccid3_hc_rx_states ccid3hcrx_state:8; u32 ccid3hcrx_bytes_recv; + u32 ccid3hcrx_x_recv; + u32 ccid3hcrx_rtt; ktime_t ccid3hcrx_tstamp_last_feedback; - ktime_t ccid3hcrx_tstamp_last_ack; - struct list_head ccid3hcrx_hist; - struct list_head ccid3hcrx_li_hist; + struct tfrc_rx_hist ccid3hcrx_hist; + struct tfrc_loss_hist ccid3hcrx_li_hist; u16 ccid3hcrx_s; - u32 ccid3hcrx_pinv; - u32 ccid3hcrx_elapsed_time; +#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean }; static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile index 5f940a6cbac..68c93e3d89d 100644 --- a/net/dccp/ccids/lib/Makefile +++ b/net/dccp/ccids/lib/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o -dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o +dccp_tfrc_lib-y := tfrc.o tfrc_equation.o packet_history.o loss_interval.o diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index d26b88dbbb4..849e181e698 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -1,6 +1,7 @@ /* * net/dccp/ccids/lib/loss_interval.c * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> @@ -10,285 +11,176 @@ * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ - -#include <linux/module.h> #include <net/sock.h> -#include "../../dccp.h" -#include "loss_interval.h" -#include "packet_history.h" #include "tfrc.h" -#define DCCP_LI_HIST_IVAL_F_LENGTH 8 - -struct dccp_li_hist_entry { - struct list_head dccplih_node; - u64 dccplih_seqno:48, - dccplih_win_count:4; - u32 dccplih_interval; -}; +static struct kmem_cache *tfrc_lh_slab __read_mostly; +/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ +static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; -static struct kmem_cache *dccp_li_cachep __read_mostly; - -static inline struct dccp_li_hist_entry *dccp_li_hist_entry_new(const gfp_t prio) +/* implements LIFO semantics on the array */ +static inline u8 LIH_INDEX(const u8 ctr) { - return kmem_cache_alloc(dccp_li_cachep, prio); + return (LIH_SIZE - 1 - (ctr % LIH_SIZE)); } -static inline void dccp_li_hist_entry_delete(struct dccp_li_hist_entry *entry) +/* the `counter' index always points at the next entry to be populated */ +static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) { - if (entry != NULL) - kmem_cache_free(dccp_li_cachep, entry); + return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL; } -void dccp_li_hist_purge(struct list_head *list) +/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */ +static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) { - struct dccp_li_hist_entry *entry, *next; - - list_for_each_entry_safe(entry, next, list, dccplih_node) { - list_del_init(&entry->dccplih_node); - kmem_cache_free(dccp_li_cachep, entry); - } + BUG_ON(i >= lh->counter); + return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length; } -EXPORT_SYMBOL_GPL(dccp_li_hist_purge); - -/* Weights used to calculate loss event rate */ /* - * These are integers as per section 8 of RFC3448. We can then divide by 4 * - * when we use it. + * On-demand allocation and de-allocation of entries */ -static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = { - 4, 4, 4, 4, 3, 2, 1, 1, -}; - -u32 dccp_li_hist_calc_i_mean(struct list_head *list) +static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) { - struct dccp_li_hist_entry *li_entry, *li_next; - int i = 0; - u32 i_tot; - u32 i_tot0 = 0; - u32 i_tot1 = 0; - u32 w_tot = 0; - - list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) { - if (li_entry->dccplih_interval != ~0U) { - i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i]; - w_tot += dccp_li_hist_w[i]; - if (i != 0) - i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1]; - } - - - if (++i > DCCP_LI_HIST_IVAL_F_LENGTH) - break; - } - - if (i != DCCP_LI_HIST_IVAL_F_LENGTH) - return 0; - - i_tot = max(i_tot0, i_tot1); - - if (!w_tot) { - DCCP_WARN("w_tot = 0\n"); - return 1; - } - - return i_tot / w_tot; + if (lh->ring[LIH_INDEX(lh->counter)] == NULL) + lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab, + GFP_ATOMIC); + return lh->ring[LIH_INDEX(lh->counter)]; } -EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean); - -static int dccp_li_hist_interval_new(struct list_head *list, - const u64 seq_loss, const u8 win_loss) +void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) { - struct dccp_li_hist_entry *entry; - int i; - - for (i = 0; i < DCCP_LI_HIST_IVAL_F_LENGTH; i++) { - entry = dccp_li_hist_entry_new(GFP_ATOMIC); - if (entry == NULL) { - dccp_li_hist_purge(list); - DCCP_BUG("loss interval list entry is NULL"); - return 0; + if (!tfrc_lh_is_initialised(lh)) + return; + + for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++) + if (lh->ring[LIH_INDEX(lh->counter)] != NULL) { + kmem_cache_free(tfrc_lh_slab, + lh->ring[LIH_INDEX(lh->counter)]); + lh->ring[LIH_INDEX(lh->counter)] = NULL; } - entry->dccplih_interval = ~0; - list_add(&entry->dccplih_node, list); - } - - entry->dccplih_seqno = seq_loss; - entry->dccplih_win_count = win_loss; - return 1; } +EXPORT_SYMBOL_GPL(tfrc_lh_cleanup); -/* calculate first loss interval - * - * returns estimated loss interval in usecs */ -static u32 dccp_li_calc_first_li(struct sock *sk, - struct list_head *hist_list, - ktime_t last_feedback, - u16 s, u32 bytes_recv, - u32 previous_x_recv) +static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) { - struct dccp_rx_hist_entry *entry, *next, *tail = NULL; - u32 x_recv, p; - suseconds_t rtt, delta; - ktime_t tstamp = ktime_set(0, 0); - int interval = 0; - int win_count = 0; - int step = 0; - u64 fval; + u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; + int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ - list_for_each_entry_safe(entry, next, hist_list, dccphrx_node) { - if (dccp_rx_hist_entry_data_packet(entry)) { - tail = entry; + for (i=0; i <= k; i++) { + i_i = tfrc_lh_get_interval(lh, i); - switch (step) { - case 0: - tstamp = entry->dccphrx_tstamp; - win_count = entry->dccphrx_ccval; - step = 1; - break; - case 1: - interval = win_count - entry->dccphrx_ccval; - if (interval < 0) - interval += TFRC_WIN_COUNT_LIMIT; - if (interval > 4) - goto found; - break; - } + if (i < k) { + i_tot0 += i_i * tfrc_lh_weights[i]; + w_tot += tfrc_lh_weights[i]; } + if (i > 0) + i_tot1 += i_i * tfrc_lh_weights[i-1]; } - if (unlikely(step == 0)) { - DCCP_WARN("%s(%p), packet history has no data packets!\n", - dccp_role(sk), sk); - return ~0; - } - - if (unlikely(interval == 0)) { - DCCP_WARN("%s(%p), Could not find a win_count interval > 0. " - "Defaulting to 1\n", dccp_role(sk), sk); - interval = 1; - } -found: - if (!tail) { - DCCP_CRIT("tail is null\n"); - return ~0; - } - - delta = ktime_us_delta(tstamp, tail->dccphrx_tstamp); - DCCP_BUG_ON(delta < 0); + BUG_ON(w_tot == 0); + lh->i_mean = max(i_tot0, i_tot1) / w_tot; +} - rtt = delta * 4 / interval; - dccp_pr_debug("%s(%p), approximated RTT to %dus\n", - dccp_role(sk), sk, (int)rtt); +/** + * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 + * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev + */ +u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) +{ + struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); + u32 old_i_mean = lh->i_mean; + s64 length; - /* - * Determine the length of the first loss interval via inverse lookup. - * Assume that X_recv can be computed by the throughput equation - * s - * X_recv = -------- - * R * fval - * Find some p such that f(p) = fval; return 1/p [RFC 3448, 6.3.1]. - */ - if (rtt == 0) { /* would result in divide-by-zero */ - DCCP_WARN("RTT==0\n"); - return ~0; - } + if (cur == NULL) /* not initialised */ + return 0; - delta = ktime_us_delta(ktime_get_real(), last_feedback); - DCCP_BUG_ON(delta <= 0); + length = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq); - x_recv = scaled_div32(bytes_recv, delta); - if (x_recv == 0) { /* would also trigger divide-by-zero */ - DCCP_WARN("X_recv==0\n"); - if (previous_x_recv == 0) { - DCCP_BUG("stored value of X_recv is zero"); - return ~0; - } - x_recv = previous_x_recv; - } + if (length - cur->li_length <= 0) /* duplicate or reordered */ + return 0; - fval = scaled_div(s, rtt); - fval = scaled_div32(fval, x_recv); - p = tfrc_calc_x_reverse_lookup(fval); + if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) + /* + * Implements RFC 4342, 10.2: + * If a packet S (skb) exists whose seqno comes `after' the one + * starting the current loss interval (cur) and if the modulo-16 + * distance from C(cur) to C(S) is greater than 4, consider all + * subsequent packets as belonging to a new loss interval. This + * test is necessary since CCVal may wrap between intervals. + */ + cur->li_is_closed = 1; + + if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ + return 0; - dccp_pr_debug("%s(%p), receive rate=%u bytes/s, implied " - "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); + cur->li_length = length; + tfrc_lh_calc_i_mean(lh); - if (p == 0) - return ~0; - else - return 1000000 / p; + return (lh->i_mean < old_i_mean); } +EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); -void dccp_li_update_li(struct sock *sk, - struct list_head *li_hist_list, - struct list_head *hist_list, - ktime_t last_feedback, u16 s, u32 bytes_recv, - u32 previous_x_recv, u64 seq_loss, u8 win_loss) +/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ +static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, + struct tfrc_rx_hist_entry *new_loss) { - struct dccp_li_hist_entry *head; - u64 seq_temp; - - if (list_empty(li_hist_list)) { - if (!dccp_li_hist_interval_new(li_hist_list, seq_loss, - win_loss)) - return; - - head = list_entry(li_hist_list->next, struct dccp_li_hist_entry, - dccplih_node); - head->dccplih_interval = dccp_li_calc_first_li(sk, hist_list, - last_feedback, - s, bytes_recv, - previous_x_recv); - } else { - struct dccp_li_hist_entry *entry; - struct list_head *tail; + return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 && + (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4); +} - head = list_entry(li_hist_list->next, struct dccp_li_hist_entry, - dccplih_node); - /* FIXME win count check removed as was wrong */ - /* should make this check with receive history */ - /* and compare there as per section 10.2 of RFC4342 */ +/** tfrc_lh_interval_add - Insert new record into the Loss Interval database + * @lh: Loss Interval database + * @rh: Receive history containing a fresh loss event + * @calc_first_li: Caller-dependent routine to compute length of first interval + * @sk: Used by @calc_first_li in caller-specific way (subtyping) + * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. + */ +int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, + u32 (*calc_first_li)(struct sock *), struct sock *sk) +{ + struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; - /* new loss event detected */ - /* calculate last interval length */ - seq_temp = dccp_delta_seqno(head->dccplih_seqno, seq_loss); - entry = dccp_li_hist_entry_new(GFP_ATOMIC); + if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) + return 0; - if (entry == NULL) { - DCCP_BUG("out of memory - can not allocate entry"); - return; - } + new = tfrc_lh_demand_next(lh); + if (unlikely(new == NULL)) { + DCCP_CRIT("Cannot allocate/add loss record."); + return 0; + } - list_add(&entry->dccplih_node, li_hist_list); + new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; + new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval; + new->li_is_closed = 0; - tail = li_hist_list->prev; - list_del(tail); - kmem_cache_free(dccp_li_cachep, tail); + if (++lh->counter == 1) + lh->i_mean = new->li_length = (*calc_first_li)(sk); + else { + cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno); + new->li_length = dccp_delta_seqno(new->li_seqno, + tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno); + if (lh->counter > (2*LIH_SIZE)) + lh->counter -= LIH_SIZE; - /* Create the newest interval */ - entry->dccplih_seqno = seq_loss; - entry->dccplih_interval = seq_temp; - entry->dccplih_win_count = win_loss; + tfrc_lh_calc_i_mean(lh); } + return 1; } +EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); -EXPORT_SYMBOL_GPL(dccp_li_update_li); - -static __init int dccp_li_init(void) +int __init tfrc_li_init(void) { - dccp_li_cachep = kmem_cache_create("dccp_li_hist", - sizeof(struct dccp_li_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return dccp_li_cachep == NULL ? -ENOBUFS : 0; + tfrc_lh_slab = kmem_cache_create("tfrc_li_hist", + sizeof(struct tfrc_loss_interval), 0, + SLAB_HWCACHE_ALIGN, NULL); + return tfrc_lh_slab == NULL ? -ENOBUFS : 0; } -static __exit void dccp_li_exit(void) +void tfrc_li_exit(void) { - kmem_cache_destroy(dccp_li_cachep); + if (tfrc_lh_slab != NULL) { + kmem_cache_destroy(tfrc_lh_slab); + tfrc_lh_slab = NULL; + } } - -module_init(dccp_li_init); -module_exit(dccp_li_exit); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index 27bee92dae1..246018a3b26 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -3,6 +3,7 @@ /* * net/dccp/ccids/lib/loss_interval.h * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> @@ -12,18 +13,63 @@ * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ - #include <linux/ktime.h> #include <linux/list.h> +#include <linux/slab.h> + +/* + * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than + * NINTERVAL, since the `open' interval I_0 is always stored as the first entry. + */ +#define NINTERVAL 8 +#define LIH_SIZE (NINTERVAL + 1) + +/** + * tfrc_loss_interval - Loss history record for TFRC-based protocols + * @li_seqno: Highest received seqno before the start of loss + * @li_ccval: The CCVal belonging to @li_seqno + * @li_is_closed: Whether @li_seqno is older than 1 RTT + * @li_length: Loss interval sequence length + */ +struct tfrc_loss_interval { + u64 li_seqno:48, + li_ccval:4, + li_is_closed:1; + u32 li_length; +}; + +/** + * tfrc_loss_hist - Loss record database + * @ring: Circular queue managed in LIFO manner + * @counter: Current count of entries (can be more than %LIH_SIZE) + * @i_mean: Current Average Loss Interval [RFC 3448, 5.4] + */ +struct tfrc_loss_hist { + struct tfrc_loss_interval *ring[LIH_SIZE]; + u8 counter; + u32 i_mean; +}; + +static inline void tfrc_lh_init(struct tfrc_loss_hist *lh) +{ + memset(lh, 0, sizeof(struct tfrc_loss_hist)); +} + +static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh) +{ + return lh->counter > 0; +} + +static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) +{ + return min(lh->counter, (u8)LIH_SIZE); +} -extern void dccp_li_hist_purge(struct list_head *list); +struct tfrc_rx_hist; -extern u32 dccp_li_hist_calc_i_mean(struct list_head *list); +extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, + u32 (*first_li)(struct sock *), struct sock *); +extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); +extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); -extern void dccp_li_update_li(struct sock *sk, - struct list_head *li_hist_list, - struct list_head *hist_list, - ktime_t last_feedback, u16 s, - u32 bytes_recv, u32 previous_x_recv, - u64 seq_loss, u8 win_loss); #endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 34c4f604772..20af1a69342 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -1,7 +1,8 @@ /* * net/dccp/packet_history.c * - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. * * An implementation of the DCCP protocol * @@ -34,267 +35,465 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/module.h> #include <linux/string.h> +#include <linux/slab.h> #include "packet_history.h" +#include "../../dccp.h" + +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; /* - * Transmitter History Routines + * Transmitter History Routines */ -struct dccp_tx_hist *dccp_tx_hist_new(const char *name) +static struct kmem_cache *tfrc_tx_hist_slab; + +int __init tfrc_tx_packet_history_init(void) { - struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); - static const char dccp_tx_hist_mask[] = "tx_hist_%s"; - char *slab_name; - - if (hist == NULL) - goto out; - - slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1, - GFP_ATOMIC); - if (slab_name == NULL) - goto out_free_hist; - - sprintf(slab_name, dccp_tx_hist_mask, name); - hist->dccptxh_slab = kmem_cache_create(slab_name, - sizeof(struct dccp_tx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, - NULL); - if (hist->dccptxh_slab == NULL) - goto out_free_slab_name; -out: - return hist; -out_free_slab_name: - kfree(slab_name); -out_free_hist: - kfree(hist); - hist = NULL; - goto out; + tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist", + sizeof(struct tfrc_tx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, NULL); + return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0; } -EXPORT_SYMBOL_GPL(dccp_tx_hist_new); - -void dccp_tx_hist_delete(struct dccp_tx_hist *hist) +void tfrc_tx_packet_history_exit(void) { - const char* name = kmem_cache_name(hist->dccptxh_slab); - - kmem_cache_destroy(hist->dccptxh_slab); - kfree(name); - kfree(hist); + if (tfrc_tx_hist_slab != NULL) { + kmem_cache_destroy(tfrc_tx_hist_slab); + tfrc_tx_hist_slab = NULL; + } } -EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); - -struct dccp_tx_hist_entry * - dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq) +static struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) { - struct dccp_tx_hist_entry *packet = NULL, *entry; - - list_for_each_entry(entry, list, dccphtx_node) - if (entry->dccphtx_seqno == seq) { - packet = entry; - break; - } + while (head != NULL && head->seqno != seqno) + head = head->next; - return packet; + return head; } -EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); +int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) +{ + struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); + + if (entry == NULL) + return -ENOBUFS; + entry->seqno = seqno; + entry->stamp = ktime_get_real(); + entry->next = *headp; + *headp = entry; + return 0; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_add); -void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) +void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) { - struct dccp_tx_hist_entry *entry, *next; + struct tfrc_tx_hist_entry *head = *headp; + + while (head != NULL) { + struct tfrc_tx_hist_entry *next = head->next; - list_for_each_entry_safe(entry, next, list, dccphtx_node) { - list_del_init(&entry->dccphtx_node); - dccp_tx_hist_entry_delete(hist, entry); + kmem_cache_free(tfrc_tx_hist_slab, head); + head = next; } -} -EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); + *headp = NULL; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); -void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, - struct list_head *list, - struct dccp_tx_hist_entry *packet) +u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, + const ktime_t now) { - struct dccp_tx_hist_entry *next; + u32 rtt = 0; + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); - list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) { - list_del_init(&packet->dccphtx_node); - dccp_tx_hist_entry_delete(hist, packet); + if (packet != NULL) { + rtt = ktime_us_delta(now, packet->stamp); + /* + * Garbage-collect older (irrelevant) entries: + */ + tfrc_tx_hist_purge(&packet->next); } + + return rtt; } +EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); -EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older); /* * Receiver History Routines */ -struct dccp_rx_hist *dccp_rx_hist_new(const char *name) +static struct kmem_cache *tfrc_rx_hist_slab; + +int __init tfrc_rx_packet_history_init(void) { - struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); - static const char dccp_rx_hist_mask[] = "rx_hist_%s"; - char *slab_name; - - if (hist == NULL) - goto out; - - slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1, - GFP_ATOMIC); - if (slab_name == NULL) - goto out_free_hist; - - sprintf(slab_name, dccp_rx_hist_mask, name); - hist->dccprxh_slab = kmem_cache_create(slab_name, - sizeof(struct dccp_rx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, - NULL); - if (hist->dccprxh_slab == NULL) - goto out_free_slab_name; -out: - return hist; -out_free_slab_name: - kfree(slab_name); -out_free_hist: - kfree(hist); - hist = NULL; - goto out; + tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", + sizeof(struct tfrc_rx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, NULL); + return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; } -EXPORT_SYMBOL_GPL(dccp_rx_hist_new); +void tfrc_rx_packet_history_exit(void) +{ + if (tfrc_rx_hist_slab != NULL) { + kmem_cache_destroy(tfrc_rx_hist_slab); + tfrc_rx_hist_slab = NULL; + } +} -void dccp_rx_hist_delete(struct dccp_rx_hist *hist) +static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, + const struct sk_buff *skb, + const u32 ndp) { - const char* name = kmem_cache_name(hist->dccprxh_slab); + const struct dccp_hdr *dh = dccp_hdr(skb); - kmem_cache_destroy(hist->dccprxh_slab); - kfree(name); - kfree(hist); + entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->tfrchrx_ccval = dh->dccph_ccval; + entry->tfrchrx_type = dh->dccph_type; + entry->tfrchrx_ndp = ndp; + entry->tfrchrx_tstamp = ktime_get_real(); } -EXPORT_SYMBOL_GPL(dccp_rx_hist_delete); +void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, + const struct sk_buff *skb, + const u32 ndp) +{ + struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); + + tfrc_rx_hist_entry_from_skb(entry, skb, ndp); +} +EXPORT_SYMBOL_GPL(tfrc_rx_hist_add_packet); -int dccp_rx_hist_find_entry(const struct list_head *list, const u64 seq, - u8 *ccval) +/* has the packet contained in skb been seen before? */ +int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) { - struct dccp_rx_hist_entry *packet = NULL, *entry; + const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; + int i; - list_for_each_entry(entry, list, dccphrx_node) - if (entry->dccphrx_seqno == seq) { - packet = entry; - break; - } + if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) + return 1; - if (packet) - *ccval = packet->dccphrx_ccval; + for (i = 1; i <= h->loss_count; i++) + if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) + return 1; - return packet != NULL; + return 0; } +EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); -EXPORT_SYMBOL_GPL(dccp_rx_hist_find_entry); -struct dccp_rx_hist_entry * - dccp_rx_hist_find_data_packet(const struct list_head *list) +static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { - struct dccp_rx_hist_entry *entry, *packet = NULL; - - list_for_each_entry(entry, list, dccphrx_node) - if (entry->dccphrx_type == DCCP_PKT_DATA || - entry->dccphrx_type == DCCP_PKT_DATAACK) { - packet = entry; - break; - } + const u8 idx_a = tfrc_rx_hist_index(h, a), + idx_b = tfrc_rx_hist_index(h, b); + struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; - return packet; + h->ring[idx_a] = h->ring[idx_b]; + h->ring[idx_b] = tmp; } -EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); +/* + * Private helper functions for loss detection. + * + * In the descriptions, `Si' refers to the sequence number of entry number i, + * whose NDP count is `Ni' (lower case is used for variables). + * Note: All __after_loss functions expect that a test against duplicates has + * been performed already: the seqno of the skb must not be less than the + * seqno of loss_prev; and it must not equal that of any valid hist_entry. + */ +static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) +{ + u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, + s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, + s2 = DCCP_SKB_CB(skb)->dccpd_seq; + int n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp, + d12 = dccp_delta_seqno(s1, s2), d2; + + if (d12 > 0) { /* S1 < S2 */ + h->loss_count = 2; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); + return; + } + + /* S0 < S2 < S1 */ + d2 = dccp_delta_seqno(s0, s2); -void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, - struct list_head *rx_list, - struct list_head *li_list, - struct dccp_rx_hist_entry *packet, - u64 nonloss_seqno) + if (d2 == 1 || n2 >= d2) { /* S2 is direct successor of S0 */ + int d21 = -d12; + + if (d21 == 1 || n1 >= d21) { + /* hole is filled: S0, S2, and S1 are consecutive */ + h->loss_count = 0; + h->loss_start = tfrc_rx_hist_index(h, 1); + } else + /* gap between S2 and S1: just update loss_prev */ + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); + + } else { /* hole between S0 and S2 */ + /* + * Reorder history to insert S2 between S0 and s1 + */ + tfrc_rx_hist_swap(h, 0, 3); + h->loss_start = tfrc_rx_hist_index(h, 3); + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); + h->loss_count = 2; + } +} + +/* return 1 if a new loss event has been identified */ +static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) { - struct dccp_rx_hist_entry *entry, *next; - u8 num_later = 0; - - list_add(&packet->dccphrx_node, rx_list); - - num_later = TFRC_RECV_NUM_LATE_LOSS + 1; - - if (!list_empty(li_list)) { - list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { - if (num_later == 0) { - if (after48(nonloss_seqno, - entry->dccphrx_seqno)) { - list_del_init(&entry->dccphrx_node); - dccp_rx_hist_entry_delete(hist, entry); - } - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; - } - } else { - int step = 0; - u8 win_count = 0; /* Not needed, but lets shut up gcc */ - int tmp; + u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, + s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, + s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, + s3 = DCCP_SKB_CB(skb)->dccpd_seq; + int n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp, + d23 = dccp_delta_seqno(s2, s3), d13, d3, d31; + + if (d23 > 0) { /* S2 < S3 */ + h->loss_count = 3; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); + return 1; + } + + /* S3 < S2 */ + d13 = dccp_delta_seqno(s1, s3); + + if (d13 > 0) { /* - * We have no loss interval history so we need at least one - * rtt:s of data packets to approximate rtt. + * The sequence number order is S1, S3, S2 + * Reorder history to insert entry between S1 and S2 */ - list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { - if (num_later == 0) { - switch (step) { - case 0: - step = 1; - /* OK, find next data packet */ - num_later = 1; - break; - case 1: - step = 2; - /* OK, find next data packet */ - num_later = 1; - win_count = entry->dccphrx_ccval; - break; - case 2: - tmp = win_count - entry->dccphrx_ccval; - if (tmp < 0) - tmp += TFRC_WIN_COUNT_LIMIT; - if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { - /* - * We have found a packet older - * than one rtt remove the rest - */ - step = 3; - } else /* OK, find next data packet */ - num_later = 1; - break; - case 3: - list_del_init(&entry->dccphrx_node); - dccp_rx_hist_entry_delete(hist, entry); - break; - } - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; + tfrc_rx_hist_swap(h, 2, 3); + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); + h->loss_count = 3; + return 1; + } + + /* S0 < S3 < S1 */ + d31 = -d13; + d3 = dccp_delta_seqno(s0, s3); + + if (d3 == 1 || n3 >= d3) { /* S3 is a successor of S0 */ + + if (d31 == 1 || n1 >= d31) { + /* hole between S0 and S1 filled by S3 */ + int d2 = dccp_delta_seqno(s1, s2), + n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; + + if (d2 == 1 || n2 >= d2) { + /* entire hole filled by S0, S3, S1, S2 */ + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 0; + } else { + /* gap remains between S1 and S2 */ + h->loss_start = tfrc_rx_hist_index(h, 1); + h->loss_count = 1; + } + + } else /* gap exists between S3 and S1, loss_count stays at 2 */ + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); + + return 0; + } + + /* + * The remaining case: S3 is not a successor of S0. + * Sequence order is S0, S3, S1, S2; reorder to insert between S0 and S1 + */ + tfrc_rx_hist_swap(h, 0, 3); + h->loss_start = tfrc_rx_hist_index(h, 3); + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); + h->loss_count = 3; + + return 1; +} + +/* return the signed modulo-2^48 sequence number distance from entry e1 to e2 */ +static s64 tfrc_rx_hist_delta_seqno(struct tfrc_rx_hist *h, u8 e1, u8 e2) +{ + DCCP_BUG_ON(e1 > h->loss_count || e2 > h->loss_count); + + return dccp_delta_seqno(tfrc_rx_hist_entry(h, e1)->tfrchrx_seqno, + tfrc_rx_hist_entry(h, e2)->tfrchrx_seqno); +} + +/* recycle RX history records to continue loss detection if necessary */ +static void __three_after_loss(struct tfrc_rx_hist *h) +{ + /* + * The distance between S0 and S1 is always greater than 1 and the NDP + * count of S1 is smaller than this distance. Otherwise there would + * have been no loss. Hence it is only necessary to see whether there + * are further missing data packets between S1/S2 and S2/S3. + */ + int d2 = tfrc_rx_hist_delta_seqno(h, 1, 2), + d3 = tfrc_rx_hist_delta_seqno(h, 2, 3), + n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp, + n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp; + + if (d2 == 1 || n2 >= d2) { /* S2 is successor to S1 */ + + if (d3 == 1 || n3 >= d3) { + /* S3 is successor of S2: entire hole is filled */ + h->loss_start = tfrc_rx_hist_index(h, 3); + h->loss_count = 0; + } else { + /* gap between S2 and S3 */ + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 1; } + + } else { /* gap between S1 and S2 */ + h->loss_start = tfrc_rx_hist_index(h, 1); + h->loss_count = 2; } } -EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet); +/** + * tfrc_rx_handle_loss - Loss detection and further processing + * @h: The non-empty RX history object + * @lh: Loss Intervals database to update + * @skb: Currently received packet + * @ndp: The NDP count belonging to @skb + * @calc_first_li: Caller-dependent computation of first loss interval in @lh + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * Chooses action according to pending loss, updates LI database when a new + * loss was detected, and does required post-processing. Returns 1 when caller + * should send feedback, 0 otherwise. + */ +int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, u32 ndp, + u32 (*calc_first_li)(struct sock *), struct sock *sk) +{ + int is_new_loss = 0; -void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) + if (h->loss_count == 1) { + __one_after_loss(h, skb, ndp); + } else if (h->loss_count != 2) { + DCCP_BUG("invalid loss_count %d", h->loss_count); + } else if (__two_after_loss(h, skb, ndp)) { + /* + * Update Loss Interval database and recycle RX records + */ + is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); + __three_after_loss(h); + } + return is_new_loss; +} +EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); + +int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) { - struct dccp_rx_hist_entry *entry, *next; + int i; + + for (i = 0; i <= TFRC_NDUPACK; i++) { + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); + if (h->ring[i] == NULL) + goto out_free; + } + + h->loss_count = h->loss_start = 0; + return 0; - list_for_each_entry_safe(entry, next, list, dccphrx_node) { - list_del_init(&entry->dccphrx_node); - kmem_cache_free(hist->dccprxh_slab, entry); +out_free: + while (i-- != 0) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; } + return -ENOBUFS; } +EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); + +void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) +{ + int i; -EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); + for (i = 0; i <= TFRC_NDUPACK; ++i) + if (h->ring[i] != NULL) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; + } +} +EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); +/** + * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) +{ + return h->ring[0]; +} -MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, " - "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>"); -MODULE_DESCRIPTION("DCCP TFRC library"); -MODULE_LICENSE("GPL"); +/** + * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) +{ + return h->ring[h->rtt_sample_prev]; +} + +/** + * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal + * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able + * to compute a sample with given data - calling function should check this. + */ +u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) +{ + u32 sample = 0, + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + + if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ + if (h->rtt_sample_prev == 2) { /* previous candidate stored */ + sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + if (sample) + sample = 4 / sample * + ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); + else /* + * FIXME: This condition is in principle not + * possible but occurs when CCID is used for + * two-way data traffic. I have tried to trace + * it, but the cause does not seem to be here. + */ + DCCP_BUG("please report to dccp@vger.kernel.org" + " => prev = %u, last = %u", + tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + } else if (delta_v < 1) { + h->rtt_sample_prev = 1; + goto keep_ref_for_next_time; + } + + } else if (delta_v == 4) /* optimal match */ + sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); + else { /* suboptimal match */ + h->rtt_sample_prev = 2; + goto keep_ref_for_next_time; + } + + if (unlikely(sample > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %u too large, using max\n", sample); + sample = DCCP_SANE_RTT_MAX; + } + + h->rtt_sample_prev = 0; /* use current entry as next reference */ +keep_ref_for_next_time: + + return sample; +} +EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 032bb61c6e3..c7eeda49cb2 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -1,10 +1,9 @@ /* - * net/dccp/packet_history.h + * Packet RX/TX history data structures and routines for TFRC-based protocols. * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. * - * An implementation of the DCCP protocol - * * This code has been developed by the University of Waikato WAND * research group. For further information please see http://www.wand.net.nz/ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz @@ -37,165 +36,128 @@ #ifndef _DCCP_PKT_HIST_ #define _DCCP_PKT_HIST_ -#include <linux/ktime.h> #include <linux/list.h> #include <linux/slab.h> +#include "tfrc.h" -#include "../../dccp.h" +struct tfrc_tx_hist_entry; -/* Number of later packets received before one is considered lost */ -#define TFRC_RECV_NUM_LATE_LOSS 3 +extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); +extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); +extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, + const u64 seqno, const ktime_t now); -#define TFRC_WIN_COUNT_PER_RTT 4 -#define TFRC_WIN_COUNT_LIMIT 16 +/* Subtraction a-b modulo-16, respects circular wrap-around */ +#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) -/* - * Transmitter History data structures and declarations +/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */ +#define TFRC_NDUPACK 3 + +/** + * tfrc_rx_hist_entry - Store information about a single received packet + * @tfrchrx_seqno: DCCP packet sequence number + * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1) + * @tfrchrx_ndp: the NDP count (if any) of the packet + * @tfrchrx_tstamp: actual receive time of packet */ -struct dccp_tx_hist_entry { - struct list_head dccphtx_node; - u64 dccphtx_seqno:48, - dccphtx_sent:1; - u32 dccphtx_rtt; - ktime_t dccphtx_tstamp; +struct tfrc_rx_hist_entry { + u64 tfrchrx_seqno:48, + tfrchrx_ccval:4, + tfrchrx_type:4; + u32 tfrchrx_ndp; /* In fact it is from 8 to 24 bits */ + ktime_t tfrchrx_tstamp; }; -struct dccp_tx_hist { - struct kmem_cache *dccptxh_slab; +/** + * tfrc_rx_hist - RX history structure for TFRC-based protocols + * + * @ring: Packet history for RTT sampling and loss detection + * @loss_count: Number of entries in circular history + * @loss_start: Movable index (for loss detection) + * @rtt_sample_prev: Used during RTT sampling, points to candidate entry + */ +struct tfrc_rx_hist { + struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; + u8 loss_count:2, + loss_start:2; +#define rtt_sample_prev loss_start }; -extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name); -extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist); - -static inline struct dccp_tx_hist_entry * - dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, - const gfp_t prio) +/** + * tfrc_rx_hist_index - index to reach n-th entry after loss_start + */ +static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) { - struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, - prio); - - if (entry != NULL) - entry->dccphtx_sent = 0; - - return entry; + return (h->loss_start + n) & TFRC_NDUPACK; } -static inline struct dccp_tx_hist_entry * - dccp_tx_hist_head(struct list_head *list) +/** + * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) { - struct dccp_tx_hist_entry *head = NULL; - - if (!list_empty(list)) - head = list_entry(list->next, struct dccp_tx_hist_entry, - dccphtx_node); - return head; + return h->ring[tfrc_rx_hist_index(h, h->loss_count)]; } -extern struct dccp_tx_hist_entry * - dccp_tx_hist_find_entry(const struct list_head *list, - const u64 seq); - -static inline void dccp_tx_hist_add_entry(struct list_head *list, - struct dccp_tx_hist_entry *entry) +/** + * tfrc_rx_hist_entry - return the n-th history entry after loss_start + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) { - list_add(&entry->dccphtx_node, list); + return h->ring[tfrc_rx_hist_index(h, n)]; } -static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist, - struct dccp_tx_hist_entry *entry) +/** + * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) { - if (entry != NULL) - kmem_cache_free(hist->dccptxh_slab, entry); + return h->ring[h->loss_start]; } -extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, - struct list_head *list); - -extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, - struct list_head *list, - struct dccp_tx_hist_entry *next); - -/* - * Receiver History data structures and declarations - */ -struct dccp_rx_hist_entry { - struct list_head dccphrx_node; - u64 dccphrx_seqno:48, - dccphrx_ccval:4, - dccphrx_type:4; - u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */ - ktime_t dccphrx_tstamp; -}; - -struct dccp_rx_hist { - struct kmem_cache *dccprxh_slab; -}; - -extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name); -extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist); - -static inline struct dccp_rx_hist_entry * - dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, - const u32 ndp, - const struct sk_buff *skb, - const gfp_t prio) +/* initialise loss detection and disable RTT sampling */ +static inline void tfrc_rx_hist_loss_indicated(struct tfrc_rx_hist *h) { - struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, - prio); - - if (entry != NULL) { - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->dccphrx_ccval = dh->dccph_ccval; - entry->dccphrx_type = dh->dccph_type; - entry->dccphrx_ndp = ndp; - entry->dccphrx_tstamp = ktime_get_real(); - } - - return entry; + h->loss_count = 1; } -static inline struct dccp_rx_hist_entry * - dccp_rx_hist_head(struct list_head *list) +/* indicate whether previously a packet was detected missing */ +static inline int tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) { - struct dccp_rx_hist_entry *head = NULL; - - if (!list_empty(list)) - head = list_entry(list->next, struct dccp_rx_hist_entry, - dccphrx_node); - return head; + return h->loss_count; } -extern int dccp_rx_hist_find_entry(const struct list_head *list, const u64 seq, - u8 *ccval); -extern struct dccp_rx_hist_entry * - dccp_rx_hist_find_data_packet(const struct list_head *list); - -extern void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, - struct list_head *rx_list, - struct list_head *li_list, - struct dccp_rx_hist_entry *packet, - u64 nonloss_seqno); - -static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist, - struct dccp_rx_hist_entry *entry) +/* any data packets missing between last reception and skb ? */ +static inline int tfrc_rx_hist_new_loss_indicated(struct tfrc_rx_hist *h, + const struct sk_buff *skb, + u32 ndp) { - if (entry != NULL) - kmem_cache_free(hist->dccprxh_slab, entry); -} + int delta = dccp_delta_seqno(tfrc_rx_hist_last_rcv(h)->tfrchrx_seqno, + DCCP_SKB_CB(skb)->dccpd_seq); -extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, - struct list_head *list); + if (delta > 1 && ndp < delta) + tfrc_rx_hist_loss_indicated(h); -static inline int - dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry) -{ - return entry->dccphrx_type == DCCP_PKT_DATA || - entry->dccphrx_type == DCCP_PKT_DATAACK; + return tfrc_rx_hist_loss_pending(h); } -extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, - struct list_head *li_list, u8 *win_loss); +extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, + const struct sk_buff *skb, const u32 ndp); + +extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); + +struct tfrc_loss_hist; +extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, u32 ndp, + u32 (*first_li)(struct sock *sk), + struct sock *sk); +extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, + const struct sk_buff *skb); +extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); +extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); #endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c new file mode 100644 index 00000000000..d1dfbb8de64 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.c @@ -0,0 +1,63 @@ +/* + * TFRC: main module holding the pieces of the TFRC library together + * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include "tfrc.h" + +#ifdef CONFIG_IP_DCCP_TFRC_DEBUG +int tfrc_debug; +module_param(tfrc_debug, bool, 0444); +MODULE_PARM_DESC(tfrc_debug, "Enable debug messages"); +#endif + +extern int tfrc_tx_packet_history_init(void); +extern void tfrc_tx_packet_history_exit(void); +extern int tfrc_rx_packet_history_init(void); +extern void tfrc_rx_packet_history_exit(void); + +extern int tfrc_li_init(void); +extern void tfrc_li_exit(void); + +static int __init tfrc_module_init(void) +{ + int rc = tfrc_li_init(); + + if (rc) + goto out; + + rc = tfrc_tx_packet_history_init(); + if (rc) + goto out_free_loss_intervals; + + rc = tfrc_rx_packet_history_init(); + if (rc) + goto out_free_tx_history; + return 0; + +out_free_tx_history: + tfrc_tx_packet_history_exit(); +out_free_loss_intervals: + tfrc_li_exit(); +out: + return rc; +} + +static void __exit tfrc_module_exit(void) +{ + tfrc_rx_packet_history_exit(); + tfrc_tx_packet_history_exit(); + tfrc_li_exit(); +} + +module_init(tfrc_module_init); +module_exit(tfrc_module_exit); + +MODULE_AUTHOR("Gerrit Renker <gerrit@erg.abdn.ac.uk>, " + "Ian McDonald <ian.mcdonald@jandi.co.nz>, " + "Arnaldo Carvalho de Melo <acme@redhat.com>"); +MODULE_DESCRIPTION("DCCP TFRC library"); +MODULE_LICENSE("GPL"); diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index faf5f7e219e..1fb1187bbf1 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -3,10 +3,11 @@ /* * net/dccp/ccids/lib/tfrc.h * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz> + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,6 +16,17 @@ */ #include <linux/types.h> #include <asm/div64.h> +#include "../../dccp.h" +/* internal includes that this module exports: */ +#include "loss_interval.h" +#include "packet_history.h" + +#ifdef CONFIG_IP_DCCP_TFRC_DEBUG +extern int tfrc_debug; +#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) +#else +#define tfrc_pr_debug(format, a...) +#endif /* integer-arithmetic divisions of type (a * 1000000)/b */ static inline u64 scaled_div(u64 a, u32 b) @@ -37,6 +49,15 @@ static inline u32 scaled_div32(u64 a, u32 b) return result; } +/** + * tfrc_ewma - Exponentially weighted moving average + * @weight: Weight to be used as damping factor, in units of 1/10 + */ +static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) +{ + return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval; +} + extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index ee97950d77d..ebe59d98721 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -72,11 +72,21 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); /* RFC 1122, 4.2.3.1 initial RTO value */ #define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) -#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ +/* + * The maximum back-off value for retransmissions. This is needed for + * - retransmitting client-Requests (sec. 8.1.1), + * - retransmitting Close/CloseReq when closing (sec. 8.3), + * - feature-negotiation retransmission (sec. 6.6.3), + * - Acks in client-PARTOPEN state (sec. 8.1.5). + */ +#define DCCP_RTO_MAX ((unsigned)(64 * HZ)) -/* bounds for sampled RTT values from packet exchanges (in usec) */ +/* + * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 + */ #define DCCP_SANE_RTT_MIN 100 -#define DCCP_SANE_RTT_MAX (4 * USEC_PER_SEC) +#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) +#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) /* Maximal interval between probes for local resources. */ #define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) @@ -143,12 +153,6 @@ static inline u64 max48(const u64 seq1, const u64 seq2) return after48(seq1, seq2) ? seq1 : seq2; } -/* is seq1 next seqno after seq2 */ -static inline int follows48(const u64 seq1, const u64 seq2) -{ - return dccp_delta_seqno(seq2, seq1) == 1; -} - enum { DCCP_MIB_NUM = 0, DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ @@ -334,6 +338,7 @@ struct dccp_skb_cb { #define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) +/* RFC 4340, sec. 7.7 */ static inline int dccp_non_data_packet(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; @@ -346,6 +351,17 @@ static inline int dccp_non_data_packet(const struct sk_buff *skb) type == DCCP_PKT_SYNCACK; } +/* RFC 4340, sec. 7.7 */ +static inline int dccp_data_packet(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_DATA || + type == DCCP_PKT_DATAACK || + type == DCCP_PKT_REQUEST || + type == DCCP_PKT_RESPONSE; +} + static inline int dccp_packet_without_ack(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; @@ -406,6 +422,7 @@ static inline int dccp_ack_pending(const struct sock *sk) } extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); +extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb, u32 elapsed_time); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 5ebdd86c1b9..4a4f6ce4498 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -4,10 +4,16 @@ * An implementation of the DCCP protocol * Andrea Bittau <a.bittau@cs.ucl.ac.uk> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * ASSUMPTIONS + * ----------- + * o All currently known SP features have 1-byte quantities. If in the future + * extensions of RFCs 4340..42 define features with item lengths larger than + * one byte, a feature-specific extension of the code will be required. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -24,11 +30,7 @@ int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, dccp_feat_debug(type, feature, *val); - if (!dccp_feat_is_valid_type(type)) { - DCCP_WARN("option type %d invalid in negotiation\n", type); - return 1; - } - if (!dccp_feat_is_valid_length(type, feature, len)) { + if (len > 3) { DCCP_WARN("invalid length %d\n", len); return 1; } @@ -99,7 +101,6 @@ static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) return 0; } -/* XXX taking only u8 vals */ static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) { dccp_feat_debug(type, feat, val); @@ -144,7 +145,6 @@ static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, /* FIXME sanity check vals */ /* Are values in any order? XXX Lame "algorithm" here */ - /* XXX assume values are 1 byte */ for (i = 0; i < slen; i++) { for (j = 0; j < rlen; j++) { if (spref[i] == rpref[j]) { @@ -179,7 +179,6 @@ static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, } /* need to put result and our preference list */ - /* XXX assume 1 byte vals */ rlen = 1 + opt->dccpop_len; rpref = kmalloc(rlen, GFP_ATOMIC); if (rpref == NULL) @@ -637,12 +636,12 @@ const char *dccp_feat_name(const u8 feat) [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", }; + if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) + return feature_names[DCCPF_RESERVED]; + if (feat >= DCCPF_MIN_CCID_SPECIFIC) return "CCID-specific"; - if (dccp_feat_is_reserved(feat)) - return feature_names[DCCPF_RESERVED]; - return feature_names[feat]; } diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 177f7dee4d1..e272222c7ac 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -14,32 +14,6 @@ #include <linux/types.h> #include "dccp.h" -static inline int dccp_feat_is_valid_length(u8 type, u8 feature, u8 len) -{ - /* sec. 6.1: Confirm has at least length 3, - * sec. 6.2: Change has at least length 4 */ - if (len < 3) - return 1; - if (len < 4 && (type == DCCPO_CHANGE_L || type == DCCPO_CHANGE_R)) - return 1; - /* XXX: add per-feature length validation (sec. 6.6.8) */ - return 0; -} - -static inline int dccp_feat_is_reserved(const u8 feat) -{ - return (feat > DCCPF_DATA_CHECKSUM && - feat < DCCPF_MIN_CCID_SPECIFIC) || - feat == DCCPF_RESERVED; -} - -/* feature negotiation knows only these four option types (RFC 4340, sec. 6) */ -static inline int dccp_feat_is_valid_type(const u8 optnum) -{ - return optnum >= DCCPO_CHANGE_L && optnum <= DCCPO_CONFIRM_R; - -} - #ifdef CONFIG_IP_DCCP_DEBUG extern const char *dccp_feat_typename(const u8 type); extern const char *dccp_feat_name(const u8 feat); diff --git a/net/dccp/input.c b/net/dccp/input.c index 1ce10106282..08392ed86c2 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -22,26 +22,77 @@ /* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; -static void dccp_fin(struct sock *sk, struct sk_buff *skb) +static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) { - sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(sk, SOCK_DONE); __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); sk->sk_data_ready(sk, 0); } -static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) +static void dccp_fin(struct sock *sk, struct sk_buff *skb) { - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_CLOSED); - sk_wake_async(sk, 1, POLL_HUP); + /* + * On receiving Close/CloseReq, both RD/WR shutdown are performed. + * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after + * receiving the closing segment, but there is no guarantee that such + * data will be processed at all. + */ + sk->sk_shutdown = SHUTDOWN_MASK; + sock_set_flag(sk, SOCK_DONE); + dccp_enqueue_skb(sk, skb); +} + +static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) +{ + int queued = 0; + + switch (sk->sk_state) { + /* + * We ignore Close when received in one of the following states: + * - CLOSED (may be a late or duplicate packet) + * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) + * - RESPOND (already handled by dccp_check_req) + */ + case DCCP_CLOSING: + /* + * Simultaneous-close: receiving a Close after sending one. This + * can happen if both client and server perform active-close and + * will result in an endless ping-pong of crossing and retrans- + * mitted Close packets, which only terminates when one of the + * nodes times out (min. 64 seconds). Quicker convergence can be + * achieved when one of the nodes acts as tie-breaker. + * This is ok as both ends are done with data transfer and each + * end is just waiting for the other to acknowledge termination. + */ + if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) + break; + /* fall through */ + case DCCP_REQUESTING: + case DCCP_ACTIVE_CLOSEREQ: + dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_done(sk); + break; + case DCCP_OPEN: + case DCCP_PARTOPEN: + /* Give waiting application a chance to read pending data */ + queued = 1; + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_PASSIVE_CLOSE); + /* fall through */ + case DCCP_PASSIVE_CLOSE: + /* + * Retransmitted Close: we have already enqueued the first one. + */ + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + } + return queued; } -static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) +static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) { + int queued = 0; + /* * Step 7: Check for unexpected packet types * If (S.is_server and P.type == CloseReq) @@ -50,12 +101,26 @@ static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) */ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); - return; + return queued; } - if (sk->sk_state != DCCP_CLOSING) + /* Step 13: process relevant Client states < CLOSEREQ */ + switch (sk->sk_state) { + case DCCP_REQUESTING: + dccp_send_close(sk, 0); dccp_set_state(sk, DCCP_CLOSING); - dccp_send_close(sk, 0); + break; + case DCCP_OPEN: + case DCCP_PARTOPEN: + /* Give waiting application a chance to read pending data */ + queued = 1; + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); + /* fall through */ + case DCCP_PASSIVE_CLOSEREQ: + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + } + return queued; } static u8 dccp_reset_code_convert(const u8 code) @@ -90,7 +155,7 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) dccp_fin(sk, skb); if (err && !sock_flag(sk, SOCK_DEAD)) - sk_wake_async(sk, 0, POLL_ERR); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } @@ -103,6 +168,21 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq); } +static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + + /* Don't deliver to RX CCID when node has shut down read end. */ + if (!(sk->sk_shutdown & RCV_SHUTDOWN)) + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + /* + * Until the TX queue has been drained, we can not honour SHUT_WR, since + * we need received feedback as input to adjust congestion control. + */ + if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); +} + static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); @@ -209,13 +289,11 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: /* - * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED - * option if it is. + * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when + * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" + * - sk_receive_queue is full, use Code 2, "Receive Buffer" */ - __skb_pull(skb, dh->dccph_doff * 4); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk, 0); + dccp_enqueue_skb(sk, skb); return 0; case DCCP_PKT_ACK: goto discard; @@ -231,11 +309,13 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, dccp_rcv_reset(sk, skb); return 0; case DCCP_PKT_CLOSEREQ: - dccp_rcv_closereq(sk, skb); + if (dccp_rcv_closereq(sk, skb)) + return 0; goto discard; case DCCP_PKT_CLOSE: - dccp_rcv_close(sk, skb); - return 0; + if (dccp_rcv_close(sk, skb)) + return 0; + goto discard; case DCCP_PKT_REQUEST: /* Step 7 * or (S.is_server and P.type == Response) @@ -289,7 +369,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dccp_check_seqno(sk, skb)) goto discard; - if (dccp_parse_options(sk, skb)) + if (dccp_parse_options(sk, NULL, skb)) goto discard; if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) @@ -300,9 +380,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) goto discard; - - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); discard: @@ -349,7 +427,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - if (dccp_parse_options(sk, skb)) + if (dccp_parse_options(sk, NULL, skb)) goto out_invalid_packet; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ @@ -402,7 +480,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); - sk_wake_async(sk, 0, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (sk->sk_write_pending || icsk->icsk_ack.pingpong || @@ -531,7 +609,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* * Step 8: Process options and mark acknowledgeable */ - if (dccp_parse_options(sk, skb)) + if (dccp_parse_options(sk, NULL, skb)) goto discard; if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) @@ -543,8 +621,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, DCCP_ACKVEC_STATE_RECEIVED)) goto discard; - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + dccp_deliver_input_to_ccids(sk, skb); } /* @@ -560,16 +637,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; /* * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == CloseReq) - * or (S.is_server and P.type == Response) + * If (S.is_server and P.type == Response) * or (S.is_client and P.type == Request) * or (S.state == RESPOND and P.type == Data), * Send Sync packet acknowledging P.seqno * Drop packet and return */ } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && - (dh->dccph_type == DCCP_PKT_RESPONSE || - dh->dccph_type == DCCP_PKT_CLOSEREQ)) || + dh->dccph_type == DCCP_PKT_RESPONSE) || (dp->dccps_role == DCCP_ROLE_CLIENT && dh->dccph_type == DCCP_PKT_REQUEST) || (sk->sk_state == DCCP_RESPOND && @@ -577,11 +652,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); goto discard; } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { - dccp_rcv_closereq(sk, skb); + if (dccp_rcv_closereq(sk, skb)) + return 0; goto discard; } else if (dh->dccph_type == DCCP_PKT_CLOSE) { - dccp_rcv_close(sk, skb); - return 0; + if (dccp_rcv_close(sk, skb)) + return 0; + goto discard; } switch (sk->sk_state) { @@ -611,7 +688,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, switch (old_state) { case DCCP_PARTOPEN: sk->sk_state_change(sk); - sk_wake_async(sk, 0, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); break; } } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index db17b83e8d3..9e38b0d6195 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -408,7 +408,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, dccp_sync_mss(newsk, dst_mtu(dst)); - __inet_hash(&dccp_hashinfo, newsk, 0); + __inet_hash_nolisten(&dccp_hashinfo, newsk); __inet_inherit_port(&dccp_hashinfo, sk, newsk); return newsk; @@ -469,7 +469,7 @@ static struct dst_entry* dccp_v4_route_skb(struct sock *sk, }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) { + if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -600,11 +600,12 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_parse_options(sk, skb)) - goto drop_and_free; - dccp_reqsk_init(req, skb); + dreq = dccp_rsk(req); + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -621,7 +622,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * In fact we defer setting S.GSR, S.SWL, S.SWH to * dccp_create_openreq_child. */ - dreq = dccp_rsk(req); dreq->dreq_isr = dcb->dccpd_seq; dreq->dreq_iss = dccp_v4_init_sequence(skb); dreq->dreq_service = service; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 87c98fb86fa..f42b75ce7f5 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -415,11 +415,12 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_parse_options(sk, skb)) - goto drop_and_free; - dccp_reqsk_init(req, skb); + dreq = dccp_rsk(req); + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -449,7 +450,6 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) * In fact we defer setting S.GSR, S.SWL, S.SWH to * dccp_create_openreq_child. */ - dreq = dccp_rsk(req); dreq->dreq_isr = dcb->dccpd_seq; dreq->dreq_iss = dccp_v6_init_sequence(skb); dreq->dreq_service = service; @@ -994,7 +994,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - err = __xfrm_lookup(&dst, &fl, sk, 1); + err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT); if (err < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 831b76e08d0..027d1814e1a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -117,11 +117,13 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct dccp_sock *newdp = dccp_sk(newsk); struct dccp_minisock *newdmsk = dccp_msk(newsk); - newdp->dccps_role = DCCP_ROLE_SERVER; - newdp->dccps_hc_rx_ackvec = NULL; - newdp->dccps_service_list = NULL; - newdp->dccps_service = dreq->dreq_service; - newicsk->icsk_rto = DCCP_TIMEOUT_INIT; + newdp->dccps_role = DCCP_ROLE_SERVER; + newdp->dccps_hc_rx_ackvec = NULL; + newdp->dccps_service_list = NULL; + newdp->dccps_service = dreq->dreq_service; + newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; + newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; + newicsk->icsk_rto = DCCP_TIMEOUT_INIT; if (dccp_feat_clone(sk, newsk)) goto out_free; @@ -200,10 +202,10 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock **prev) { struct sock *child = NULL; + struct dccp_request_sock *dreq = dccp_rsk(req); /* Check for retransmitted REQUEST */ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { - struct dccp_request_sock *dreq = dccp_rsk(req); if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) { dccp_pr_debug("Retransmitted REQUEST\n"); @@ -227,22 +229,22 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, goto drop; /* Invalid ACK */ - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) { + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dreq->dreq_iss) { dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " "dreq_iss=%llu\n", (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long) - dccp_rsk(req)->dreq_iss); + (unsigned long long) dreq->dreq_iss); goto drop; } + if (dccp_parse_options(sk, dreq, skb)) + goto drop; + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; - /* FIXME: deal with options */ - inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); inet_csk_reqsk_queue_add(sk, req, child); @@ -303,9 +305,12 @@ EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) { - inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; - inet_rsk(req)->acked = 0; - req->rcv_wnd = sysctl_dccp_feat_sequence_window; + struct dccp_request_sock *dreq = dccp_rsk(req); + + inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; + inet_rsk(req)->acked = 0; + req->rcv_wnd = sysctl_dccp_feat_sequence_window; + dreq->dreq_timestamp_echo = 0; } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c index d286cffe2c4..d2a84a2fece 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -46,7 +46,13 @@ static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) return value; } -int dccp_parse_options(struct sock *sk, struct sk_buff *skb) +/** + * dccp_parse_options - Parse DCCP options present in @skb + * @sk: client|server|listening dccp socket (when @dreq != NULL) + * @dreq: request socket to use during connection setup, or NULL + */ +int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, + struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); @@ -92,6 +98,20 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) goto out_invalid_option; } + /* + * CCID-Specific Options (from RFC 4340, sec. 10.3): + * + * Option numbers 128 through 191 are for options sent from the + * HC-Sender to the HC-Receiver; option numbers 192 through 255 + * are for options sent from the HC-Receiver to the HC-Sender. + * + * CCID-specific options are ignored during connection setup, as + * negotiation may still be in progress (see RFC 4340, 10.3). + * + */ + if (dreq != NULL && opt >= 128) + goto ignore_option; + switch (opt) { case DCCPO_PADDING: break; @@ -112,6 +132,8 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) case DCCPO_CHANGE_L: /* fall through */ case DCCPO_CHANGE_R: + if (pkt_type == DCCP_PKT_DATA) + break; if (len < 2) goto out_invalid_option; rc = dccp_feat_change_recv(sk, opt, *value, value + 1, @@ -128,7 +150,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) case DCCPO_CONFIRM_L: /* fall through */ case DCCPO_CONFIRM_R: - if (len < 2) + if (pkt_type == DCCP_PKT_DATA) + break; + if (len < 2) /* FIXME this disallows empty confirm */ goto out_invalid_option; if (dccp_feat_confirm_recv(sk, opt, *value, value + 1, len - 1)) @@ -136,7 +160,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) break; case DCCPO_ACK_VECTOR_0: case DCCPO_ACK_VECTOR_1: - if (pkt_type == DCCP_PKT_DATA) + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; if (dccp_msk(sk)->dccpms_send_ack_vector && @@ -146,15 +170,27 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) case DCCPO_TIMESTAMP: if (len != 4) goto out_invalid_option; - + /* + * RFC 4340 13.1: "The precise time corresponding to + * Timestamp Value zero is not specified". We use + * zero to indicate absence of a meaningful timestamp. + */ opt_val = get_unaligned((__be32 *)value); - opt_recv->dccpor_timestamp = ntohl(opt_val); - - dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp; - dp->dccps_timestamp_time = ktime_get_real(); + if (unlikely(opt_val == 0)) { + DCCP_WARN("Timestamp with zero value\n"); + break; + } + if (dreq != NULL) { + dreq->dreq_timestamp_echo = ntohl(opt_val); + dreq->dreq_timestamp_time = dccp_timestamp(); + } else { + opt_recv->dccpor_timestamp = + dp->dccps_timestamp_echo = ntohl(opt_val); + dp->dccps_timestamp_time = dccp_timestamp(); + } dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", - dccp_role(sk), opt_recv->dccpor_timestamp, + dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); break; @@ -194,18 +230,17 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) opt_recv->dccpor_elapsed_time = elapsed_time; break; case DCCPO_ELAPSED_TIME: - if (len != 2 && len != 4) - goto out_invalid_option; - - if (pkt_type == DCCP_PKT_DATA) - continue; + if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ + break; if (len == 2) { __be16 opt_val2 = get_unaligned((__be16 *)value); elapsed_time = ntohs(opt_val2); - } else { + } else if (len == 4) { opt_val = get_unaligned((__be32 *)value); elapsed_time = ntohl(opt_val); + } else { + goto out_invalid_option; } if (elapsed_time > opt_recv->dccpor_elapsed_time) @@ -214,15 +249,6 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - /* - * From RFC 4340, sec. 10.3: - * - * Option numbers 128 through 191 are for - * options sent from the HC-Sender to the - * HC-Receiver; option numbers 192 through 255 - * are for options sent from the HC-Receiver to - * the HC-Sender. - */ case 128 ... 191: { const u16 idx = value - options; @@ -246,7 +272,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) "implemented, ignoring", sk, opt, len); break; } - +ignore_option: if (opt != DCCPO_MANDATORY) mandatory = 0; } @@ -382,16 +408,24 @@ int dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp); -static int dccp_insert_option_timestamp_echo(struct sock *sk, +static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, + struct dccp_request_sock *dreq, struct sk_buff *skb) { - struct dccp_sock *dp = dccp_sk(sk); __be32 tstamp_echo; - int len, elapsed_time_len; unsigned char *to; - const suseconds_t delta = ktime_us_delta(ktime_get_real(), - dp->dccps_timestamp_time); - u32 elapsed_time = delta / 10; + u32 elapsed_time, elapsed_time_len, len; + + if (dreq != NULL) { + elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time; + tstamp_echo = htonl(dreq->dreq_timestamp_echo); + dreq->dreq_timestamp_echo = 0; + } else { + elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time; + tstamp_echo = htonl(dp->dccps_timestamp_echo); + dp->dccps_timestamp_echo = 0; + } + elapsed_time_len = dccp_elapsed_time_len(elapsed_time); len = 6 + elapsed_time_len; @@ -404,7 +438,6 @@ static int dccp_insert_option_timestamp_echo(struct sock *sk, *to++ = DCCPO_TIMESTAMP_ECHO; *to++ = len; - tstamp_echo = htonl(dp->dccps_timestamp_echo); memcpy(to, &tstamp_echo, 4); to += 4; @@ -416,8 +449,6 @@ static int dccp_insert_option_timestamp_echo(struct sock *sk, memcpy(to, &var32, 4); } - dp->dccps_timestamp_echo = 0; - dp->dccps_timestamp_time = ktime_set(0, 0); return 0; } @@ -510,6 +541,18 @@ static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) return 0; } +/* The length of all options needs to be a multiple of 4 (5.8) */ +static void dccp_insert_option_padding(struct sk_buff *skb) +{ + int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; + + if (padding != 0) { + padding = 4 - padding; + memset(skb_push(skb, padding), 0, padding); + DCCP_SKB_CB(skb)->dccpd_opt_len += padding; + } +} + int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); @@ -526,10 +569,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && dccp_insert_option_ackvec(sk, skb)) return -1; - - if (dp->dccps_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(sk, skb)) - return -1; } if (dp->dccps_hc_rx_insert_options) { @@ -553,18 +592,22 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dccp_insert_option_timestamp(sk, skb)) return -1; - /* XXX: insert other options when appropriate */ + if (dp->dccps_timestamp_echo != 0 && + dccp_insert_option_timestamp_echo(dp, NULL, skb)) + return -1; + + dccp_insert_option_padding(skb); + return 0; +} - if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) { - /* The length of all options has to be a multiple of 4 */ - int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; +int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) +{ + DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (padding != 0) { - padding = 4 - padding; - memset(skb_push(skb, padding), 0, padding); - DCCP_SKB_CB(skb)->dccpd_opt_len += padding; - } - } + if (dreq->dreq_timestamp_echo != 0 && + dccp_insert_option_timestamp_echo(NULL, dreq, skb)) + return -1; + dccp_insert_option_padding(skb); return 0; } diff --git a/net/dccp/output.c b/net/dccp/output.c index f49544618f2..3b763db3d86 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -133,15 +133,31 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) return -ENOBUFS; } +/** + * dccp_determine_ccmps - Find out about CCID-specfic packet-size limits + * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), + * since the RX CCID is restricted to feedback packets (Acks), which are small + * in comparison with the data traffic. A value of 0 means "no current CCMPS". + */ +static u32 dccp_determine_ccmps(const struct dccp_sock *dp) +{ + const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; + + if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) + return 0; + return tx_ccid->ccid_ops->ccid_ccmps; +} + unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); - int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); + u32 ccmps = dccp_determine_ccmps(dp); + int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; - /* Now subtract optional transport overhead */ - mss_now -= icsk->icsk_ext_hdr_len; + /* Account for header lengths and IPv4/v6 option overhead */ + cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* * FIXME: this should come from the CCID infrastructure, where, say, @@ -151,13 +167,13 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) * make it a multiple of 4 */ - mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; + cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; - dp->dccps_mss_cache = mss_now; + dp->dccps_mss_cache = cur_mps; - return mss_now; + return cur_mps; } EXPORT_SYMBOL_GPL(dccp_sync_mss); @@ -170,7 +186,7 @@ void dccp_write_space(struct sock *sk) wake_up_interruptible(sk->sk_sleep); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, 2, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); read_unlock(&sk->sk_callback_lock); } @@ -303,7 +319,7 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; - if (dccp_insert_options(sk, skb)) { + if (dccp_insert_options_rsk(dreq, skb)) { kfree_skb(skb); return NULL; } @@ -391,7 +407,7 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) * FIXME: what if rebuild_header fails? * Should we be doing a rebuild_header here? */ - int err = inet_sk_rebuild_header(sk); + int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); if (err != 0) return err; @@ -567,14 +583,27 @@ void dccp_send_close(struct sock *sk, const int active) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? - DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; + if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; + else + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { dccp_write_xmit(sk, 1); dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, prio)); - /* FIXME do we need a retransmit timer here? */ + /* + * Retransmission timer for active-close: RFC 4340, 8.3 requires + * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ + * state can be left. The initial timeout is 2 RTTs. + * Since RTT measurement is done by the CCIDs, there is no easy + * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 + * is too low (200ms); we use a high value to avoid unnecessary + * retransmissions when the link RTT is > 0.2 seconds. + * FIXME: Let main module sample RTTs and use that instead. + */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); } else dccp_transmit_skb(sk, skb); } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7a3bea9c28c..0bed4a6095b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -60,8 +60,7 @@ void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; - dccp_pr_debug("%s(%p) %-10.10s -> %s\n", - dccp_role(sk), sk, + dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, dccp_state_name(oldstate), dccp_state_name(state)); WARN_ON(state == oldstate); @@ -72,7 +71,8 @@ void dccp_set_state(struct sock *sk, const int state) break; case DCCP_CLOSED: - if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN) + if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || + oldstate == DCCP_CLOSING) DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); @@ -93,6 +93,24 @@ void dccp_set_state(struct sock *sk, const int state) EXPORT_SYMBOL_GPL(dccp_set_state); +static void dccp_finish_passive_close(struct sock *sk) +{ + switch (sk->sk_state) { + case DCCP_PASSIVE_CLOSE: + /* Node (client or server) has received Close packet. */ + dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_set_state(sk, DCCP_CLOSED); + break; + case DCCP_PASSIVE_CLOSEREQ: + /* + * Client received CloseReq. We set the `active' flag so that + * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. + */ + dccp_send_close(sk, 1); + dccp_set_state(sk, DCCP_CLOSING); + } +} + void dccp_done(struct sock *sk) { dccp_set_state(sk, DCCP_CLOSED); @@ -134,14 +152,17 @@ EXPORT_SYMBOL_GPL(dccp_packet_name); const char *dccp_state_name(const int state) { static char *dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", + [DCCP_OPEN] = "OPEN", + [DCCP_REQUESTING] = "REQUESTING", + [DCCP_PARTOPEN] = "PARTOPEN", + [DCCP_LISTEN] = "LISTEN", + [DCCP_RESPOND] = "RESPOND", + [DCCP_CLOSING] = "CLOSING", + [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", + [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", + [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", + [DCCP_TIME_WAIT] = "TIME_WAIT", + [DCCP_CLOSED] = "CLOSED", }; if (state >= DCCP_MAX_STATES) @@ -174,6 +195,19 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dccp_minisock_init(&dp->dccps_minisock); + icsk->icsk_rto = DCCP_TIMEOUT_INIT; + icsk->icsk_syn_retries = sysctl_dccp_request_retries; + sk->sk_state = DCCP_CLOSED; + sk->sk_write_space = dccp_write_space; + icsk->icsk_sync_mss = dccp_sync_mss; + dp->dccps_mss_cache = 536; + dp->dccps_rate_last = jiffies; + dp->dccps_role = DCCP_ROLE_UNDEFINED; + dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; + dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; + + dccp_init_xmit_timers(sk); + /* * FIXME: We're hardcoding the CCID, and doing this at this point makes * the listening (master) sock get CCID control blocks, which is not @@ -213,18 +247,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) INIT_LIST_HEAD(&dmsk->dccpms_conf); } - dccp_init_xmit_timers(sk); - icsk->icsk_rto = DCCP_TIMEOUT_INIT; - icsk->icsk_syn_retries = sysctl_dccp_request_retries; - sk->sk_state = DCCP_CLOSED; - sk->sk_write_space = dccp_write_space; - icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = 536; - dp->dccps_rate_last = jiffies; - dp->dccps_role = DCCP_ROLE_UNDEFINED; - dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; - return 0; } @@ -275,6 +297,12 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) return inet_csk_listen_start(sk, backlog); } +static inline int dccp_need_reset(int state) +{ + return state != DCCP_CLOSED && state != DCCP_LISTEN && + state != DCCP_REQUESTING; +} + int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -285,10 +313,15 @@ int dccp_disconnect(struct sock *sk, int flags) if (old_state != DCCP_CLOSED) dccp_set_state(sk, DCCP_CLOSED); - /* ABORT function of RFC793 */ + /* + * This corresponds to the ABORT function of RFC793, sec. 3.8 + * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". + */ if (old_state == DCCP_LISTEN) { inet_csk_listen_stop(sk); - /* FIXME: do the active reset thing */ + } else if (dccp_need_reset(old_state)) { + dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); + sk->sk_err = ECONNRESET; } else if (old_state == DCCP_REQUESTING) sk->sk_err = ECONNRESET; @@ -518,6 +551,12 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, (struct dccp_so_feat __user *) optval); break; + case DCCP_SOCKOPT_SERVER_TIMEWAIT: + if (dp->dccps_role != DCCP_ROLE_SERVER) + err = -EOPNOTSUPP; + else + dp->dccps_server_timewait = (val != 0); + break; case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ if (val < 0 || val > 15) err = -EINVAL; @@ -618,15 +657,15 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; - len = sizeof(val); + break; + case DCCP_SOCKOPT_SERVER_TIMEWAIT: + val = dp->dccps_server_timewait; break; case DCCP_SOCKOPT_SEND_CSCOV: val = dp->dccps_pcslen; - len = sizeof(val); break; case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; - len = sizeof(val); break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, @@ -638,6 +677,7 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } + len = sizeof(val); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; @@ -748,19 +788,26 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, dh = dccp_hdr(skb); - if (dh->dccph_type == DCCP_PKT_DATA || - dh->dccph_type == DCCP_PKT_DATAACK) + switch (dh->dccph_type) { + case DCCP_PKT_DATA: + case DCCP_PKT_DATAACK: goto found_ok_skb; - if (dh->dccph_type == DCCP_PKT_RESET || - dh->dccph_type == DCCP_PKT_CLOSE) { - dccp_pr_debug("found fin ok!\n"); + case DCCP_PKT_CLOSE: + case DCCP_PKT_CLOSEREQ: + if (!(flags & MSG_PEEK)) + dccp_finish_passive_close(sk); + /* fall through */ + case DCCP_PKT_RESET: + dccp_pr_debug("found fin (%s) ok!\n", + dccp_packet_name(dh->dccph_type)); len = 0; goto found_fin_ok; + default: + dccp_pr_debug("packet_type=%s\n", + dccp_packet_name(dh->dccph_type)); + sk_eat_skb(sk, skb, 0); } - dccp_pr_debug("packet_type=%s\n", - dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -862,34 +909,38 @@ out: EXPORT_SYMBOL_GPL(inet_dccp_listen); -static const unsigned char dccp_new_state[] = { - /* current state: new state: action: */ - [0] = DCCP_CLOSED, - [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, - [DCCP_REQUESTING] = DCCP_CLOSED, - [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, - [DCCP_LISTEN] = DCCP_CLOSED, - [DCCP_RESPOND] = DCCP_CLOSED, - [DCCP_CLOSING] = DCCP_CLOSED, - [DCCP_TIME_WAIT] = DCCP_CLOSED, - [DCCP_CLOSED] = DCCP_CLOSED, -}; - -static int dccp_close_state(struct sock *sk) +static void dccp_terminate_connection(struct sock *sk) { - const int next = dccp_new_state[sk->sk_state]; - const int ns = next & DCCP_STATE_MASK; + u8 next_state = DCCP_CLOSED; - if (ns != sk->sk_state) - dccp_set_state(sk, ns); + switch (sk->sk_state) { + case DCCP_PASSIVE_CLOSE: + case DCCP_PASSIVE_CLOSEREQ: + dccp_finish_passive_close(sk); + break; + case DCCP_PARTOPEN: + dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + /* fall through */ + case DCCP_OPEN: + dccp_send_close(sk, 1); - return next & DCCP_ACTION_FIN; + if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && + !dccp_sk(sk)->dccps_server_timewait) + next_state = DCCP_ACTIVE_CLOSEREQ; + else + next_state = DCCP_CLOSING; + /* fall through */ + default: + dccp_set_state(sk, next_state); + } } void dccp_close(struct sock *sk, long timeout) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; + u32 data_was_unread = 0; int state; lock_sock(sk); @@ -912,16 +963,21 @@ void dccp_close(struct sock *sk, long timeout) * descriptor close, not protocol-sourced closes, because the *reader process may not have drained the data yet! */ - /* FIXME: check for unread data */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + data_was_unread += skb->len; __kfree_skb(skb); } - if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + if (data_was_unread) { + /* Unread data was tossed, send an appropriate Reset Code */ + DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread); + dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); + dccp_set_state(sk, DCCP_CLOSED); + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - } else if (dccp_close_state(sk)) { - dccp_send_close(sk, 1); + } else if (sk->sk_state != DCCP_CLOSED) { + dccp_terminate_connection(sk); } sk_stream_wait_close(sk, timeout); @@ -948,24 +1004,6 @@ adjudge_to_death: if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) goto out; - /* - * The last release_sock may have processed the CLOSE or RESET - * packet moving sock to CLOSED state, if not we have to fire - * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination" - * in draft-ietf-dccp-spec-11. -acme - */ - if (sk->sk_state == DCCP_CLOSING) { - /* FIXME: should start at 2 * RTT */ - /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); -#if 0 - /* Yeah, we should use sk->sk_prot->orphan_count, etc */ - dccp_set_state(sk, DCCP_CLOSED); -#endif - } - if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); @@ -981,7 +1019,7 @@ EXPORT_SYMBOL_GPL(dccp_close); void dccp_shutdown(struct sock *sk, int how) { - dccp_pr_debug("entry\n"); + dccp_pr_debug("called shutdown(%x)\n", how); } EXPORT_SYMBOL_GPL(dccp_shutdown); diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index c62c05039f6..21295993fdb 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -100,41 +100,19 @@ static struct ctl_table dccp_default_table[] = { { .ctl_name = 0, } }; -static struct ctl_table dccp_table[] = { - { - .ctl_name = NET_DCCP_DEFAULT, - .procname = "default", - .mode = 0555, - .child = dccp_default_table, - }, - { .ctl_name = 0, }, -}; - -static struct ctl_table dccp_dir_table[] = { - { - .ctl_name = NET_DCCP, - .procname = "dccp", - .mode = 0555, - .child = dccp_table, - }, - { .ctl_name = 0, }, -}; - -static struct ctl_table dccp_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = dccp_dir_table, - }, - { .ctl_name = 0, }, +static struct ctl_path dccp_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "dccp", .ctl_name = NET_DCCP, }, + { .procname = "default", .ctl_name = NET_DCCP_DEFAULT, }, + { } }; static struct ctl_table_header *dccp_table_header; int __init dccp_sysctl_init(void) { - dccp_table_header = register_sysctl_table(dccp_root_table); + dccp_table_header = register_sysctl_paths(dccp_path, + dccp_default_table); return dccp_table_header != NULL ? 0 : -ENOMEM; } diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 3af067354bd..8703a792b56 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -280,9 +280,8 @@ static void dccp_init_write_xmit_timer(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - init_timer(&dp->dccps_xmit_timer); - dp->dccps_xmit_timer.data = (unsigned long)sk; - dp->dccps_xmit_timer.function = dccp_write_xmit_timer; + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); } void dccp_init_xmit_timers(struct sock *sk) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 57d57495183..acd48ee522d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1904,7 +1904,7 @@ static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk, struct sk_buff *skb = sock_alloc_send_skb(sk, datalen, noblock, errcode); if (skb) { - skb->protocol = __constant_htons(ETH_P_DNA_RT); + skb->protocol = htons(ETH_P_DNA_RT); skb->pkt_type = PACKET_OUTGOING; } return skb; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 3bc82dc83b3..1bbfce5f7a2 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -173,10 +173,6 @@ static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen, static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; ctl_table dn_dev_vars[5]; - ctl_table dn_dev_dev[2]; - ctl_table dn_dev_conf_dir[2]; - ctl_table dn_dev_proto_dir[2]; - ctl_table dn_dev_root_dir[2]; } dn_dev_sysctl = { NULL, { @@ -224,30 +220,6 @@ static struct dn_dev_sysctl_table { }, {0} }, - {{ - .ctl_name = 0, - .procname = "", - .mode = 0555, - .child = dn_dev_sysctl.dn_dev_vars - }, {0}}, - {{ - .ctl_name = NET_DECNET_CONF, - .procname = "conf", - .mode = 0555, - .child = dn_dev_sysctl.dn_dev_dev - }, {0}}, - {{ - .ctl_name = NET_DECNET, - .procname = "decnet", - .mode = 0555, - .child = dn_dev_sysctl.dn_dev_conf_dir - }, {0}}, - {{ - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = dn_dev_sysctl.dn_dev_proto_dir - }, {0}} }; static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) @@ -255,6 +227,16 @@ static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms * struct dn_dev_sysctl_table *t; int i; +#define DN_CTL_PATH_DEV 3 + + struct ctl_path dn_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "decnet", .ctl_name = NET_DECNET, }, + { .procname = "conf", .ctl_name = NET_DECNET_CONF, }, + { /* to be set */ }, + { }, + }; + t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) return; @@ -265,20 +247,16 @@ static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms * } if (dev) { - t->dn_dev_dev[0].procname = dev->name; - t->dn_dev_dev[0].ctl_name = dev->ifindex; + dn_ctl_path[DN_CTL_PATH_DEV].procname = dev->name; + dn_ctl_path[DN_CTL_PATH_DEV].ctl_name = dev->ifindex; } else { - t->dn_dev_dev[0].procname = parms->name; - t->dn_dev_dev[0].ctl_name = parms->ctl_name; + dn_ctl_path[DN_CTL_PATH_DEV].procname = parms->name; + dn_ctl_path[DN_CTL_PATH_DEV].ctl_name = parms->ctl_name; } - t->dn_dev_dev[0].child = t->dn_dev_vars; - t->dn_dev_conf_dir[0].child = t->dn_dev_dev; - t->dn_dev_proto_dir[0].child = t->dn_dev_conf_dir; - t->dn_dev_root_dir[0].child = t->dn_dev_proto_dir; t->dn_dev_vars[0].extra1 = (void *)dev; - t->sysctl_header = register_sysctl_table(t->dn_dev_root_dir); + t->sysctl_header = register_sysctl_paths(dn_ctl_path, t->dn_dev_vars); if (t->sysctl_header == NULL) kfree(t); else @@ -647,11 +625,15 @@ static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = { static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct nlattr *tb[IFA_MAX+1]; struct dn_dev *dn_db; struct ifaddrmsg *ifm; struct dn_ifaddr *ifa, **ifap; - int err; + int err = -EINVAL; + + if (net != &init_net) + goto errout; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy); if (err < 0) @@ -681,6 +663,7 @@ errout: static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct nlattr *tb[IFA_MAX+1]; struct net_device *dev; struct dn_dev *dn_db; @@ -688,6 +671,9 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct dn_ifaddr *ifa; int err; + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy); if (err < 0) return err; @@ -785,19 +771,23 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); + err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_DECnet_IFADDR, err); + rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); } static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int idx, dn_idx = 0, skip_ndevs, skip_naddr; struct net_device *dev; struct dn_dev *dn_db; struct dn_ifaddr *ifa; + if (net != &init_net) + return 0; + skip_ndevs = cb->args[0]; skip_naddr = cb->args[1]; diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 3760a20d10d..4aa9a423e60 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -203,8 +203,6 @@ static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct struct flowi fl; struct dn_fib_res res; - memset(&fl, 0, sizeof(fl)); - if (nh->nh_flags&RTNH_F_ONLINK) { struct net_device *dev; @@ -506,10 +504,14 @@ static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta) static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct dn_fib_table *tb; struct rtattr **rta = arg; struct rtmsg *r = NLMSG_DATA(nlh); + if (net != &init_net) + return -EINVAL; + if (dn_fib_check_attr(r, rta)) return -EINVAL; @@ -522,10 +524,14 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void * static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct dn_fib_table *tb; struct rtattr **rta = arg; struct rtmsg *r = NLMSG_DATA(nlh); + if (net != &init_net) + return -EINVAL; + if (dn_fib_check_attr(r, rta)) return -EINVAL; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index e851b143cca..1ca13b17974 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -580,8 +580,8 @@ static const struct seq_operations dn_neigh_seq_ops = { static int dn_neigh_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &dn_neigh_seq_ops, - sizeof(struct neigh_seq_state)); + return seq_open_net(inode, file, &dn_neigh_seq_ops, + sizeof(struct neigh_seq_state)); } static const struct file_operations dn_neigh_seq_fops = { @@ -589,7 +589,7 @@ static const struct file_operations dn_neigh_seq_fops = { .open = dn_neigh_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 7404653880b..1964faf203e 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -124,7 +124,7 @@ struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri) if ((skb = alloc_skb(size + hdr, pri)) == NULL) return NULL; - skb->protocol = __constant_htons(ETH_P_DNA_RT); + skb->protocol = htons(ETH_P_DNA_RT); skb->pkt_type = PACKET_OUTGOING; if (sk) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 0e10ff21e29..31be29b8b5a 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -107,7 +107,7 @@ static const int dn_rt_mtu_expires = 10 * 60 * HZ; static unsigned long dn_rt_deadline; -static int dn_dst_gc(void); +static int dn_dst_gc(struct dst_ops *ops); static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); @@ -185,7 +185,7 @@ static void dn_dst_check_expire(unsigned long dummy) mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ); } -static int dn_dst_gc(void) +static int dn_dst_gc(struct dst_ops *ops) { struct dn_route *rt, **rtp; int i; @@ -765,17 +765,6 @@ drop: } /* - * Drop packet. This is used for endnodes and for - * when we should not be forwarding packets from - * this dest. - */ -static int dn_blackhole(struct sk_buff *skb) -{ - kfree_skb(skb); - return NET_RX_DROP; -} - -/* * Used to catch bugs. This should never normally get * called. */ @@ -995,7 +984,7 @@ source_ok: * here */ if (!try_hard) { - neigh = neigh_lookup_nodev(&dn_neigh_table, &fl.fld_dst); + neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fl.fld_dst); if (neigh) { if ((oldflp->oif && (neigh->dev->ifindex != oldflp->oif)) || @@ -1207,7 +1196,8 @@ int dn_route_output_sock(struct dst_entry **pprt, struct flowi *fl, struct sock err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD); if (err == 0 && fl->proto) { - err = xfrm_lookup(pprt, fl, sk, !(flags & MSG_DONTWAIT)); + err = xfrm_lookup(pprt, fl, sk, (flags & MSG_DONTWAIT) ? + 0 : XFRM_LOOKUP_WAIT); } return err; } @@ -1396,7 +1386,7 @@ make_route: default: case RTN_UNREACHABLE: case RTN_BLACKHOLE: - rt->u.dst.input = dn_blackhole; + rt->u.dst.input = dst_discard; } rt->rt_flags = flags; if (rt->u.dst.dev) @@ -1522,6 +1512,7 @@ rtattr_failure: */ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = in_skb->sk->sk_net; struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct dn_route *rt = NULL; @@ -1530,6 +1521,9 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void struct sk_buff *skb; struct flowi fl; + if (net != &init_net) + return -EINVAL; + memset(&fl, 0, sizeof(fl)); fl.proto = DNPROTO_NSP; @@ -1557,7 +1551,7 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void kfree_skb(skb); return -ENODEV; } - skb->protocol = __constant_htons(ETH_P_DNA_RT); + skb->protocol = htons(ETH_P_DNA_RT); skb->dev = dev; cb->src = fl.fld_src; cb->dst = fl.fld_dst; @@ -1594,7 +1588,7 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void goto out_free; } - return rtnl_unicast(skb, NETLINK_CB(in_skb).pid); + return rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); out_free: kfree_skb(skb); @@ -1607,10 +1601,14 @@ out_free: */ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; struct dn_route *rt; int h, s_h; int idx, s_idx; + if (net != &init_net) + return 0; + if (NLMSG_PAYLOAD(cb->nlh, 0) < sizeof(struct rtmsg)) return -EINVAL; if (!(((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)) @@ -1752,8 +1750,7 @@ void __init dn_route_init(void) dn_dst_ops.kmem_cachep = kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - init_timer(&dn_route_timer); - dn_route_timer.function = dn_dst_check_expire; + setup_timer(&dn_route_timer, dn_dst_check_expire, 0); dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index ffebea04cc9..5b7539b7fe0 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -212,7 +212,7 @@ nla_put_failure: return -ENOBUFS; } -static u32 dn_fib_rule_default_pref(void) +static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -249,6 +249,7 @@ static struct fib_rules_ops dn_fib_rules_ops = { .policy = dn_fib_rule_policy, .rules_list = LIST_HEAD_INIT(dn_fib_rules_ops.rules_list), .owner = THIS_MODULE, + .fro_net = &init_net, }; void __init dn_fib_rules_init(void) diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index fda0772fa21..e09d915dbd7 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -375,10 +375,10 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); + err = rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_DECnet_ROUTE, err); + rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); } static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb, @@ -463,12 +463,16 @@ static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb, int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; unsigned int h, s_h; unsigned int e = 0, s_e; struct dn_fib_table *tb; struct hlist_node *node; int dumped = 0; + if (net != &init_net) + return 0; + if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && ((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) return dn_cache_dump(skb, cb); diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig index ecdb3f9f14c..2f81de5e752 100644 --- a/net/decnet/netfilter/Kconfig +++ b/net/decnet/netfilter/Kconfig @@ -4,6 +4,7 @@ menu "DECnet: Netfilter Configuration" depends on DECNET && NETFILTER && EXPERIMENTAL + depends on NETFILTER_ADVANCED config DECNET_NF_GRABULATOR tristate "Routing message grabulator (for userland routing daemon)" diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 43fcd29046d..6d2bd320204 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -115,7 +115,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) RCV_SKB_FAIL(-EINVAL); } -static struct nf_hook_ops dnrmg_ops = { +static struct nf_hook_ops dnrmg_ops __read_mostly = { .hook = dnrmg_hook, .pf = PF_DECnet, .hooknum = NF_DN_ROUTE, @@ -137,7 +137,7 @@ static int __init dn_rtmsg_init(void) rv = nf_register_hook(&dnrmg_ops); if (rv) { - sock_release(dnrmg->sk_socket); + netlink_kernel_release(dnrmg); } return rv; @@ -146,7 +146,7 @@ static int __init dn_rtmsg_init(void) static void __exit dn_rtmsg_fini(void) { nf_unregister_hook(&dnrmg_ops); - sock_release(dnrmg->sk_socket); + netlink_kernel_release(dnrmg); } diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index ae354a43fb9..228067c571b 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -470,28 +470,15 @@ static ctl_table dn_table[] = { {0} }; -static ctl_table dn_dir_table[] = { - { - .ctl_name = NET_DECNET, - .procname = "decnet", - .mode = 0555, - .child = dn_table}, - {0} -}; - -static ctl_table dn_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = dn_dir_table - }, - {0} +static struct ctl_path dn_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "decnet", .ctl_name = NET_DECNET, }, + { } }; void dn_register_sysctl(void) { - dn_table_header = register_sysctl_table(dn_root_table); + dn_table_header = register_sysctl_paths(dn_path, dn_table); } void dn_unregister_sysctl(void) diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index f70df073c58..bc0f6252613 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -1014,9 +1014,8 @@ static int __init aun_udp_initialise(void) skb_queue_head_init(&aun_queue); spin_lock_init(&aun_queue_lock); - init_timer(&ab_cleanup_timer); + setup_timer(&ab_cleanup_timer, ab_cleanup, 0); ab_cleanup_timer.expires = jiffies + (HZ*2); - ab_cleanup_timer.function = ab_cleanup; add_timer(&ab_cleanup_timer); memset(&sin, 0, sizeof(sin)); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 6b2e454ae31..a7b417523e9 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -359,10 +359,34 @@ struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count) } EXPORT_SYMBOL(alloc_etherdev_mq); -char *print_mac(char *buf, const u8 *addr) +static size_t _format_mac_addr(char *buf, int buflen, + const unsigned char *addr, int len) { - sprintf(buf, MAC_FMT, - addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + int i; + char *cp = buf; + + for (i = 0; i < len; i++) { + cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]); + if (i == len - 1) + break; + cp += strlcpy(cp, ":", buflen - (cp - buf)); + } + return cp - buf; +} + +ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) +{ + size_t l; + + l = _format_mac_addr(buf, PAGE_SIZE, addr, len); + l += strlcpy(buf + l, "\n", PAGE_SIZE - l); + return ((ssize_t) l); +} +EXPORT_SYMBOL(sysfs_format_mac); + +char *print_mac(char *buf, const unsigned char *addr) +{ + _format_mac_addr(buf, MAC_BUF_SIZE, addr, ETH_ALEN); return buf; } EXPORT_SYMBOL(print_mac); diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig index 1438adedbc8..bd501046c9c 100644 --- a/net/ieee80211/Kconfig +++ b/net/ieee80211/Kconfig @@ -1,8 +1,9 @@ config IEEE80211 - tristate "Generic IEEE 802.11 Networking Stack" + tristate "Generic IEEE 802.11 Networking Stack (DEPRECATED)" ---help--- This option enables the hardware independent IEEE 802.11 - networking stack. + networking stack. This component is deprecated in favor of the + mac80211 component. config IEEE80211_DEBUG bool "Enable full debugging output" diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c index 8e146949fc6..bba0152e2d7 100644 --- a/net/ieee80211/ieee80211_crypt_tkip.c +++ b/net/ieee80211/ieee80211_crypt_tkip.c @@ -189,7 +189,7 @@ static inline u16 Mk16(u8 hi, u8 lo) return lo | (((u16) hi) << 8); } -static inline u16 Mk16_le(u16 * v) +static inline u16 Mk16_le(__le16 * v) { return le16_to_cpu(*v); } @@ -275,15 +275,15 @@ static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, PPK[5] = TTAK[4] + IV16; /* Step 2 - 96-bit bijective mixing using S-box */ - PPK[0] += _S_(PPK[5] ^ Mk16_le((u16 *) & TK[0])); - PPK[1] += _S_(PPK[0] ^ Mk16_le((u16 *) & TK[2])); - PPK[2] += _S_(PPK[1] ^ Mk16_le((u16 *) & TK[4])); - PPK[3] += _S_(PPK[2] ^ Mk16_le((u16 *) & TK[6])); - PPK[4] += _S_(PPK[3] ^ Mk16_le((u16 *) & TK[8])); - PPK[5] += _S_(PPK[4] ^ Mk16_le((u16 *) & TK[10])); - - PPK[0] += RotR1(PPK[5] ^ Mk16_le((u16 *) & TK[12])); - PPK[1] += RotR1(PPK[0] ^ Mk16_le((u16 *) & TK[14])); + PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0])); + PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2])); + PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4])); + PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6])); + PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8])); + PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10])); + + PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12])); + PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14])); PPK[2] += RotR1(PPK[1]); PPK[3] += RotR1(PPK[2]); PPK[4] += RotR1(PPK[3]); @@ -294,7 +294,7 @@ static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, WEPSeed[0] = Hi8(IV16); WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F; WEPSeed[2] = Lo8(IV16); - WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((u16 *) & TK[0])) >> 1); + WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1); #ifdef __BIG_ENDIAN { diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c index 69cb6aad25b..3bca97f55d4 100644 --- a/net/ieee80211/ieee80211_module.c +++ b/net/ieee80211/ieee80211_module.c @@ -181,9 +181,8 @@ struct net_device *alloc_ieee80211(int sizeof_priv) ieee->ieee802_1x = 1; /* Default to supporting 802.1x */ INIT_LIST_HEAD(&ieee->crypt_deinit_list); - init_timer(&ieee->crypt_deinit_timer); - ieee->crypt_deinit_timer.data = (unsigned long)ieee; - ieee->crypt_deinit_timer.function = ieee80211_crypt_deinit_handler; + setup_timer(&ieee->crypt_deinit_timer, ieee80211_crypt_deinit_handler, + (unsigned long)ieee); ieee->crypt_quiesced = 0; spin_lock_init(&ieee->lock); diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index 21c0fadde03..1e3f87c8c01 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -45,7 +45,7 @@ static void ieee80211_monitor_rx(struct ieee80211_device *ieee, skb_reset_mac_header(skb); skb_pull(skb, ieee80211_get_hdrlen(fc)); skb->pkt_type = PACKET_OTHERHOST; - skb->protocol = __constant_htons(ETH_P_80211_RAW); + skb->protocol = htons(ETH_P_80211_RAW); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } @@ -754,7 +754,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN); memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN); } else { - u16 len; + __be16 len; /* Leave Ethernet header part of hdr and full payload */ skb_pull(skb, hdrlen); len = htons(skb->len); @@ -800,7 +800,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, if (skb2 != NULL) { /* send to wireless media */ skb2->dev = dev; - skb2->protocol = __constant_htons(ETH_P_802_3); + skb2->protocol = htons(ETH_P_802_3); skb_reset_mac_header(skb2); skb_reset_network_header(skb2); /* skb2->network_header += ETH_HLEN; */ @@ -1032,16 +1032,16 @@ static int ieee80211_qos_convert_ac_to_parameters(struct qos_param->aifs[i] -= (qos_param->aifs[i] < 2) ? 0 : 2; cw_min = ac_params->ecw_min_max & 0x0F; - qos_param->cw_min[i] = (u16) ((1 << cw_min) - 1); + qos_param->cw_min[i] = cpu_to_le16((1 << cw_min) - 1); cw_max = (ac_params->ecw_min_max & 0xF0) >> 4; - qos_param->cw_max[i] = (u16) ((1 << cw_max) - 1); + qos_param->cw_max[i] = cpu_to_le16((1 << cw_max) - 1); qos_param->flag[i] = (ac_params->aci_aifsn & 0x10) ? 0x01 : 0x00; txop = le16_to_cpu(ac_params->tx_op_limit) * 32; - qos_param->tx_op_limit[i] = (u16) txop; + qos_param->tx_op_limit[i] = cpu_to_le16(txop); } return rc; } @@ -1585,26 +1585,25 @@ static void ieee80211_process_probe_response(struct ieee80211_device DECLARE_MAC_BUF(mac); IEEE80211_DEBUG_SCAN("'%s' (%s" - "): %c%c%c%c %c%c%c%c-%c%c%c%c %c%c%c%c\n", - escape_essid(info_element->data, - info_element->len), - print_mac(mac, beacon->header.addr3), - (beacon->capability & (1 << 0xf)) ? '1' : '0', - (beacon->capability & (1 << 0xe)) ? '1' : '0', - (beacon->capability & (1 << 0xd)) ? '1' : '0', - (beacon->capability & (1 << 0xc)) ? '1' : '0', - (beacon->capability & (1 << 0xb)) ? '1' : '0', - (beacon->capability & (1 << 0xa)) ? '1' : '0', - (beacon->capability & (1 << 0x9)) ? '1' : '0', - (beacon->capability & (1 << 0x8)) ? '1' : '0', - (beacon->capability & (1 << 0x7)) ? '1' : '0', - (beacon->capability & (1 << 0x6)) ? '1' : '0', - (beacon->capability & (1 << 0x5)) ? '1' : '0', - (beacon->capability & (1 << 0x4)) ? '1' : '0', - (beacon->capability & (1 << 0x3)) ? '1' : '0', - (beacon->capability & (1 << 0x2)) ? '1' : '0', - (beacon->capability & (1 << 0x1)) ? '1' : '0', - (beacon->capability & (1 << 0x0)) ? '1' : '0'); + "): %c%c%c%c %c%c%c%c-%c%c%c%c %c%c%c%c\n", + escape_essid(info_element->data, info_element->len), + print_mac(mac, beacon->header.addr3), + (beacon->capability & cpu_to_le16(1 << 0xf)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0xe)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0xd)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0xc)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0xb)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0xa)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x9)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x8)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x7)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x6)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x5)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x4)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x3)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x2)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x1)) ? '1' : '0', + (beacon->capability & cpu_to_le16(1 << 0x0)) ? '1' : '0'); if (ieee80211_network_init(ieee, beacon, &network, stats)) { IEEE80211_DEBUG_SCAN("Dropped '%s' (%s) via %s.\n", diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c index 6d06f1385e2..d8b02603cbe 100644 --- a/net/ieee80211/ieee80211_tx.c +++ b/net/ieee80211/ieee80211_tx.c @@ -126,7 +126,7 @@ payload of each frame is reduced to 492 bytes. static u8 P802_1H_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0xf8 }; static u8 RFC1042_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0x00 }; -static int ieee80211_copy_snap(u8 * data, u16 h_proto) +static int ieee80211_copy_snap(u8 * data, __be16 h_proto) { struct ieee80211_snap_hdr *snap; u8 *oui; @@ -136,7 +136,7 @@ static int ieee80211_copy_snap(u8 * data, u16 h_proto) snap->ssap = 0xaa; snap->ctrl = 0x03; - if (h_proto == 0x8137 || h_proto == 0x80f3) + if (h_proto == htons(ETH_P_AARP) || h_proto == htons(ETH_P_IPX)) oui = P802_1H_OUI; else oui = RFC1042_OUI; @@ -144,7 +144,6 @@ static int ieee80211_copy_snap(u8 * data, u16 h_proto) snap->oui[1] = oui[1]; snap->oui[2] = oui[2]; - h_proto = htons(h_proto); memcpy(data + SNAP_SIZE, &h_proto, sizeof(u16)); return SNAP_SIZE + sizeof(u16); @@ -261,7 +260,8 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) rts_required; unsigned long flags; struct net_device_stats *stats = &ieee->stats; - int ether_type, encrypt, host_encrypt, host_encrypt_msdu, host_build_iv; + int encrypt, host_encrypt, host_encrypt_msdu, host_build_iv; + __be16 ether_type; int bytes, fc, hdr_len; struct sk_buff *skb_frag; struct ieee80211_hdr_3addrqos header = {/* Ensure zero initialized */ @@ -292,11 +292,11 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) goto success; } - ether_type = ntohs(((struct ethhdr *)skb->data)->h_proto); + ether_type = ((struct ethhdr *)skb->data)->h_proto; crypt = ieee->crypt[ieee->tx_keyidx]; - encrypt = !(ether_type == ETH_P_PAE && ieee->ieee802_1x) && + encrypt = !(ether_type == htons(ETH_P_PAE) && ieee->ieee802_1x) && ieee->sec.encrypt; host_encrypt = ieee->host_encrypt && encrypt && crypt; @@ -304,7 +304,7 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) host_build_iv = ieee->host_build_iv && encrypt && crypt; if (!encrypt && ieee->ieee802_1x && - ieee->drop_unencrypted && ether_type != ETH_P_PAE) { + ieee->drop_unencrypted && ether_type != htons(ETH_P_PAE)) { stats->tx_dropped++; goto success; } diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index d309e8f1999..623489afa62 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c @@ -709,7 +709,7 @@ int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee, } else idx = ieee->tx_keyidx; - if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY && + if (!(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY) && ext->alg != IW_ENCODE_ALG_WEP) if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA) return -EINVAL; diff --git a/net/ieee80211/softmac/ieee80211softmac_auth.c b/net/ieee80211/softmac/ieee80211softmac_auth.c index a53a751d070..1a96c257257 100644 --- a/net/ieee80211/softmac/ieee80211softmac_auth.c +++ b/net/ieee80211/softmac/ieee80211softmac_auth.c @@ -178,11 +178,11 @@ ieee80211softmac_auth_resp(struct net_device *dev, struct ieee80211_auth *auth) } /* Parse the auth packet */ - switch(auth->algorithm) { + switch(le16_to_cpu(auth->algorithm)) { case WLAN_AUTH_OPEN: /* Check the status code of the response */ - switch(auth->status) { + switch(le16_to_cpu(auth->status)) { case WLAN_STATUS_SUCCESS: /* Update the status to Authenticated */ spin_lock_irqsave(&mac->lock, flags); @@ -210,7 +210,7 @@ ieee80211softmac_auth_resp(struct net_device *dev, struct ieee80211_auth *auth) break; case WLAN_AUTH_SHARED_KEY: /* Figure out where we are in the process */ - switch(auth->transaction) { + switch(le16_to_cpu(auth->transaction)) { case IEEE80211SOFTMAC_AUTH_SHARED_CHALLENGE: /* Check to make sure we have a challenge IE */ data = (u8 *)auth->info_element; diff --git a/net/ieee80211/softmac/ieee80211softmac_io.c b/net/ieee80211/softmac/ieee80211softmac_io.c index 26c35253be3..73b4b13fbd8 100644 --- a/net/ieee80211/softmac/ieee80211softmac_io.c +++ b/net/ieee80211/softmac/ieee80211softmac_io.c @@ -148,11 +148,11 @@ ieee80211softmac_hdr_3addr(struct ieee80211softmac_device *mac, * shouldn't the sequence number be in ieee80211? */ } -static u16 +static __le16 ieee80211softmac_capabilities(struct ieee80211softmac_device *mac, struct ieee80211softmac_network *net) { - u16 capability = 0; + __le16 capability = 0; /* ESS and IBSS bits are set according to the current mode */ switch (mac->ieee->iw_mode) { @@ -163,8 +163,8 @@ ieee80211softmac_capabilities(struct ieee80211softmac_device *mac, capability = cpu_to_le16(WLAN_CAPABILITY_IBSS); break; case IW_MODE_AUTO: - capability = net->capabilities & - (WLAN_CAPABILITY_ESS|WLAN_CAPABILITY_IBSS); + capability = cpu_to_le16(net->capabilities & + (WLAN_CAPABILITY_ESS|WLAN_CAPABILITY_IBSS)); break; default: /* bleh. we don't ever go to these modes */ @@ -182,7 +182,7 @@ ieee80211softmac_capabilities(struct ieee80211softmac_device *mac, /* Short Preamble */ /* Always supported: we probably won't ever be powering devices which * dont support this... */ - capability |= WLAN_CAPABILITY_SHORT_PREAMBLE; + capability |= cpu_to_le16(WLAN_CAPABILITY_SHORT_PREAMBLE); /* PBCC */ /* Not widely used */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9f9fd2c6f6e..24e2b7294bf 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -85,6 +85,13 @@ endchoice config IP_FIB_HASH def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER +config IP_FIB_TRIE_STATS + bool "FIB TRIE statistics" + depends on IP_FIB_TRIE + ---help--- + Keep track of statistics on structure of FIB TRIE table. + Useful for testing and measuring TRIE performance. + config IP_MULTIPLE_TABLES bool "IP: policy routing" depends on IP_ADVANCED_ROUTER diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 93fe3966805..ad40ef3f9eb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,9 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o \ + fib_frontend.o fib_semantics.o \ inet_fragment.o +obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d2f22e74b26..09ca5293d08 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -126,6 +126,10 @@ extern void ip_mc_drop_socket(struct sock *sk); static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); +struct ipv4_config ipv4_config; + +EXPORT_SYMBOL(ipv4_config); + /* New destruction routine */ void inet_sock_destruct(struct sock *sk) @@ -135,6 +139,8 @@ void inet_sock_destruct(struct sock *sk) __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_error_queue); + sk_mem_reclaim(sk); + if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { printk("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); @@ -440,7 +446,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(&init_net, addr->sin_addr.s_addr); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since @@ -789,12 +795,12 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCADDRT: case SIOCDELRT: case SIOCRTMSG: - err = ip_rt_ioctl(cmd, (void __user *)arg); + err = ip_rt_ioctl(sk->sk_net, cmd, (void __user *)arg); break; case SIOCDARP: case SIOCGARP: case SIOCSARP: - err = arp_ioctl(cmd, (void __user *)arg); + err = arp_ioctl(sk->sk_net, cmd, (void __user *)arg); break; case SIOCGIFADDR: case SIOCSIFADDR: @@ -838,6 +844,7 @@ const struct proto_ops inet_stream_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = tcp_sendpage, + .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, @@ -1106,7 +1113,7 @@ int inet_sk_rebuild_header(struct sock *sk) }; security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&rt, &fl, sk, 0); + err = ip_route_output_flow(&init_net, &rt, &fl, sk, 0); } if (!err) sk_setup_caps(sk, &rt->u.dst); @@ -1237,7 +1244,7 @@ unsigned long snmp_fold_field(void *mib[], int offt) } EXPORT_SYMBOL_GPL(snmp_fold_field); -int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) +int snmp_mib_init(void *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); ptr[0] = __alloc_percpu(mibsize); @@ -1286,37 +1293,31 @@ static struct net_protocol udp_protocol = { static struct net_protocol icmp_protocol = { .handler = icmp_rcv, + .no_policy = 1, }; static int __init init_ipv4_mibs(void) { if (snmp_mib_init((void **)net_statistics, - sizeof(struct linux_mib), - __alignof__(struct linux_mib)) < 0) + sizeof(struct linux_mib)) < 0) goto err_net_mib; if (snmp_mib_init((void **)ip_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + sizeof(struct ipstats_mib)) < 0) goto err_ip_mib; if (snmp_mib_init((void **)icmp_statistics, - sizeof(struct icmp_mib), - __alignof__(struct icmp_mib)) < 0) + sizeof(struct icmp_mib)) < 0) goto err_icmp_mib; if (snmp_mib_init((void **)icmpmsg_statistics, - sizeof(struct icmpmsg_mib), - __alignof__(struct icmpmsg_mib)) < 0) + sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; if (snmp_mib_init((void **)tcp_statistics, - sizeof(struct tcp_mib), - __alignof__(struct tcp_mib)) < 0) + sizeof(struct tcp_mib)) < 0) goto err_tcp_mib; if (snmp_mib_init((void **)udp_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + sizeof(struct udp_mib)) < 0) goto err_udp_mib; if (snmp_mib_init((void **)udplite_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + sizeof(struct udp_mib)) < 0) goto err_udplite_mib; tcp_mib_init(); @@ -1418,6 +1419,9 @@ static int __init inet_init(void) /* Setup TCP slab cache for open requests. */ tcp_init(); + /* Setup UDP memory threshold */ + udp_init(); + /* Add UDP-Lite (RFC 3828) */ udplite4_register(); @@ -1471,15 +1475,11 @@ static int __init ipv4_proc_init(void) goto out_tcp; if (udp4_proc_init()) goto out_udp; - if (fib_proc_init()) - goto out_fib; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: - fib_proc_exit(); -out_fib: udp4_proc_exit(); out_udp: tcp4_proc_exit(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 5fc346d8b56..d76803a3dca 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -169,6 +169,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) if (ip_clear_mutable_options(iph, &dummy)) goto out; } + + spin_lock(&x->lock); { u8 auth_data[MAX_AH_AUTH_LEN]; @@ -176,13 +178,16 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); err = ah_mac_digest(ahp, skb, ah->auth_data); if (err) - goto out; - err = -EINVAL; - if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) { - x->stats.integrity_failed++; - goto out; - } + goto unlock; + if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) + err = -EBADMSG; } +unlock: + spin_unlock(&x->lock); + + if (err) + goto out; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_buf, ihl); skb->transport_header = skb->network_header; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 54a76b8b803..5976c598cc4 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -235,8 +235,6 @@ static int arp_constructor(struct neighbour *neigh) struct in_device *in_dev; struct neigh_parms *parms; - neigh->type = inet_addr_type(addr); - rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev == NULL) { @@ -244,6 +242,8 @@ static int arp_constructor(struct neighbour *neigh) return -EINVAL; } + neigh->type = inet_addr_type(&init_net, addr); + parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); @@ -341,14 +341,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(ip_hdr(skb)->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(&init_net, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; - if (inet_addr_type(saddr) == RTN_LOCAL) { + if (inet_addr_type(&init_net, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -382,8 +382,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) read_unlock_bh(&neigh->lock); } -static int arp_ignore(struct in_device *in_dev, struct net_device *dev, - __be32 sip, __be32 tip) +static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { int scope; @@ -403,7 +402,6 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev, case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; - dev = NULL; break; case 4: /* Reserved */ case 5: @@ -415,7 +413,7 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev, default: return 0; } - return !inet_confirm_addr(dev, sip, tip, scope); + return !inet_confirm_addr(in_dev, sip, tip, scope); } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) @@ -426,7 +424,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) int flag = 0; /*unsigned long now; */ - if (ip_route_output_key(&rt, &fl) < 0) + if (ip_route_output_key(&init_net, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); @@ -479,7 +477,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = ((struct rtable*)skb->dst)->rt_gateway; - if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(&init_net, paddr), haddr, paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); @@ -777,7 +775,7 @@ static int arp_process(struct sk_buff *skb) * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ - if (LOOPBACK(tip) || MULTICAST(tip)) + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) goto out; /* @@ -806,8 +804,8 @@ static int arp_process(struct sk_buff *skb) /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && - inet_addr_type(tip) == RTN_LOCAL && - !arp_ignore(in_dev,dev,sip,tip)) + inet_addr_type(&init_net, tip) == RTN_LOCAL && + !arp_ignore(in_dev, sip, tip)) arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha); goto out; @@ -825,7 +823,7 @@ static int arp_process(struct sk_buff *skb) int dont_send = 0; if (!dont_send) - dont_send |= arp_ignore(in_dev,dev,sip,tip); + dont_send |= arp_ignore(in_dev,sip,tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send |= arp_filter(sip,tip,dev); if (!dont_send) @@ -835,9 +833,8 @@ static int arp_process(struct sk_buff *skb) } goto out; } else if (IN_DEV_FORWARD(in_dev)) { - if ((rt->rt_flags&RTCF_DNAT) || - (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { + if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &init_net, &tip, dev, 0))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -860,14 +857,14 @@ static int arp_process(struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (IPV4_DEVCONF_ALL(ARP_ACCEPT)) { + if (IPV4_DEVCONF_ALL(dev->nd_net, ARP_ACCEPT)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (n == NULL && arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(sip) == RTN_UNICAST) + inet_addr_type(&init_net, sip) == RTN_UNICAST) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -952,44 +949,60 @@ out_of_mem: * Set (create) an ARP cache entry. */ -static int arp_req_set(struct arpreq *r, struct net_device * dev) +static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { - __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + if (dev == NULL) { + IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; + return 0; + } + if (__in_dev_get_rtnl(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); + return 0; + } + return -ENXIO; +} + +static int arp_req_set_public(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + + if (mask && mask != htonl(0xFFFFFFFF)) + return -EINVAL; + if (!dev && (r->arp_flags & ATF_COM)) { + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, + r->arp_ha.sa_data); + if (!dev) + return -ENODEV; + } + if (mask) { + if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL) + return -ENOBUFS; + return 0; + } + + return arp_req_set_proxy(net, dev, 1); +} + +static int arp_req_set(struct net *net, struct arpreq *r, + struct net_device * dev) +{ + __be32 ip; struct neighbour *neigh; int err; - if (r->arp_flags&ATF_PUBL) { - __be32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; - if (mask && mask != htonl(0xFFFFFFFF)) - return -EINVAL; - if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data); - if (!dev) - return -ENODEV; - } - if (mask) { - if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL) - return -ENOBUFS; - return 0; - } - if (dev == NULL) { - IPV4_DEVCONF_ALL(PROXY_ARP) = 1; - return 0; - } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, 1); - return 0; - } - return -ENXIO; - } + if (r->arp_flags & ATF_PUBL) + return arp_req_set_public(net, r, dev); + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output_key(&rt, &fl)) != 0) + if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1066,37 +1079,37 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -static int arp_req_delete(struct arpreq *r, struct net_device * dev) +static int arp_req_delete_public(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + + if (mask == htonl(0xFFFFFFFF)) + return pneigh_delete(&arp_tbl, net, &ip, dev); + + if (mask) + return -EINVAL; + + return arp_req_set_proxy(net, dev, 0); +} + +static int arp_req_delete(struct net *net, struct arpreq *r, + struct net_device * dev) { int err; - __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + __be32 ip; struct neighbour *neigh; - if (r->arp_flags & ATF_PUBL) { - __be32 mask = - ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask == htonl(0xFFFFFFFF)) - return pneigh_delete(&arp_tbl, &ip, dev); - if (mask == 0) { - if (dev == NULL) { - IPV4_DEVCONF_ALL(PROXY_ARP) = 0; - return 0; - } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), - PROXY_ARP, 0); - return 0; - } - return -ENXIO; - } - return -EINVAL; - } + if (r->arp_flags & ATF_PUBL) + return arp_req_delete_public(net, r, dev); + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output_key(&rt, &fl)) != 0) + if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1119,7 +1132,7 @@ static int arp_req_delete(struct arpreq *r, struct net_device * dev) * Handle an ARP layer I/O control request. */ -int arp_ioctl(unsigned int cmd, void __user *arg) +int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { int err; struct arpreq r; @@ -1151,7 +1164,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL) + if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) goto out; /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ @@ -1167,10 +1180,10 @@ int arp_ioctl(unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: - err = arp_req_delete(&r, dev); + err = arp_req_delete(net, &r, dev); break; case SIOCSARP: - err = arp_req_set(&r, dev); + err = arp_req_set(net, &r, dev); break; case SIOCGARP: err = arp_req_get(&r, dev); @@ -1359,8 +1372,8 @@ static const struct seq_operations arp_seq_ops = { static int arp_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &arp_seq_ops, - sizeof(struct neigh_seq_state)); + return seq_open_net(inode, file, &arp_seq_ops, + sizeof(struct neigh_seq_state)); } static const struct file_operations arp_seq_fops = { @@ -1368,7 +1381,7 @@ static const struct file_operations arp_seq_fops = { .open = arp_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static int __init arp_proc_init(void) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index f18e88bc86e..d4dc4eb48d9 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -63,7 +63,7 @@ struct cipso_v4_domhsh_entry { * probably be turned into a hash table or something similar so we * can do quick lookups. */ static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); -static struct list_head cipso_v4_doi_list = LIST_HEAD_INIT(cipso_v4_doi_list); +static LIST_HEAD(cipso_v4_doi_list); /* Label mapping cache */ int cipso_v4_cache_enabled = 1; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 0301dd468cf..0c0c73f368c 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -40,7 +40,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) oif = sk->sk_bound_dev_if; saddr = inet->saddr; - if (MULTICAST(usin->sin_addr.s_addr)) { + if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif) oif = inet->mc_index; if (!saddr) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index b42f74617ba..21f71bf912d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -62,6 +62,7 @@ #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> +#include <net/net_namespace.h> struct ipv4_devconf ipv4_devconf = { .data = { @@ -82,7 +83,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = { }, }; -#define IPV4_DEVCONF_DFLT(attr) IPV4_DEVCONF(ipv4_devconf_dflt, attr) +#define IPV4_DEVCONF_DFLT(net, attr) \ + IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, @@ -98,9 +100,15 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); #ifdef CONFIG_SYSCTL -static void devinet_sysctl_register(struct in_device *in_dev, - struct ipv4_devconf *p); -static void devinet_sysctl_unregister(struct ipv4_devconf *p); +static void devinet_sysctl_register(struct in_device *idev); +static void devinet_sysctl_unregister(struct in_device *idev); +#else +static inline void devinet_sysctl_register(struct in_device *idev) +{ +} +static inline void devinet_sysctl_unregister(struct in_device *idev) +{ +} #endif /* Locks all the inet devices. */ @@ -157,24 +165,18 @@ static struct in_device *inetdev_init(struct net_device *dev) if (!in_dev) goto out; INIT_RCU_HEAD(&in_dev->rcu_head); - memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf)); + memcpy(&in_dev->cnf, dev->nd_net->ipv4.devconf_dflt, + sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) goto out_kfree; /* Reference in_dev->dev */ dev_hold(dev); -#ifdef CONFIG_SYSCTL - neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); -#endif - /* Account for reference dev->ip_ptr (below) */ in_dev_hold(in_dev); -#ifdef CONFIG_SYSCTL - devinet_sysctl_register(in_dev, &in_dev->cnf); -#endif + devinet_sysctl_register(in_dev); ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); @@ -213,15 +215,9 @@ static void inetdev_destroy(struct in_device *in_dev) inet_free_ifa(ifa); } -#ifdef CONFIG_SYSCTL - devinet_sysctl_unregister(&in_dev->cnf); -#endif - dev->ip_ptr = NULL; -#ifdef CONFIG_SYSCTL - neigh_sysctl_unregister(in_dev->arp_parms); -#endif + devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); @@ -408,17 +404,17 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) in_dev_hold(in_dev); ifa->ifa_dev = in_dev; } - if (LOOPBACK(ifa->ifa_local)) + if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } -struct in_device *inetdev_by_index(int ifindex) +struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; read_lock(&dev_base_lock); - dev = __dev_get_by_index(&init_net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); read_unlock(&dev_base_lock); @@ -441,6 +437,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; @@ -449,12 +446,15 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg ASSERT_RTNL(); + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); if (err < 0) goto errout; ifm = nlmsg_data(nlh); - in_dev = inetdev_by_index(ifm->ifa_index); + in_dev = inetdev_by_index(net, ifm->ifa_index); if (in_dev == NULL) { err = -ENODEV; goto errout; @@ -560,10 +560,14 @@ errout: static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct in_ifaddr *ifa; ASSERT_RTNL(); + if (net != &init_net) + return -EINVAL; + ifa = rtm_to_ifaddr(nlh); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -579,7 +583,7 @@ static __inline__ int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ - if (ZERONET(addr)) + if (ipv4_is_zeronet(addr)) rc = 0; else { __u32 haddr = ntohl(addr); @@ -964,28 +968,25 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, /* * Confirm that local IP address exists using wildcards: - * - dev: only on this interface, 0=any interface + * - in_dev: only on this interface, 0=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ -__be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, int scope) +__be32 inet_confirm_addr(struct in_device *in_dev, + __be32 dst, __be32 local, int scope) { __be32 addr = 0; - struct in_device *in_dev; - - if (dev) { - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev))) - addr = confirm_addr_indev(in_dev, dst, local, scope); - rcu_read_unlock(); + struct net_device *dev; + struct net *net; - return addr; - } + if (scope != RT_SCOPE_LINK) + return confirm_addr_indev(in_dev, dst, local, scope); + net = in_dev->dev->nd_net; read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { if ((in_dev = __in_dev_get_rcu(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) @@ -1106,13 +1107,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, */ inetdev_changename(dev, in_dev); -#ifdef CONFIG_SYSCTL - devinet_sysctl_unregister(&in_dev->cnf); - neigh_sysctl_unregister(in_dev->arp_parms); - neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); - devinet_sysctl_register(in_dev, &in_dev->cnf); -#endif + devinet_sysctl_unregister(in_dev); + devinet_sysctl_register(in_dev); break; } out: @@ -1174,12 +1170,16 @@ nla_put_failure: static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int idx, ip_idx; struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; int s_ip_idx, s_idx = cb->args[0]; + if (net != &init_net) + return 0; + s_ip_idx = ip_idx = cb->args[1]; idx = 0; for_each_netdev(&init_net, dev) { @@ -1228,28 +1228,50 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa, struct nlmsghdr *nlh, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + err = rtnl_notify(skb, &init_net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err); + rtnl_set_sk_err(&init_net, RTNLGRP_IPV4_IFADDR, err); } #ifdef CONFIG_SYSCTL -static void devinet_copy_dflt_conf(int i) +static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) - in_dev->cnf.data[i] = ipv4_devconf_dflt.data[i]; + in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; + rcu_read_unlock(); + } + read_unlock(&dev_base_lock); +} + +static void inet_forward_change(struct net *net) +{ + struct net_device *dev; + int on = IPV4_DEVCONF_ALL(net, FORWARDING); + + IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; + IPV4_DEVCONF_DFLT(net, FORWARDING) = on; + + read_lock(&dev_base_lock); + for_each_netdev(net, dev) { + struct in_device *in_dev; + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + IN_DEV_CONF_SET(in_dev, FORWARDING, on); rcu_read_unlock(); } read_unlock(&dev_base_lock); + + rt_cache_flush(0); } static int devinet_conf_proc(ctl_table *ctl, int write, @@ -1260,12 +1282,13 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (write) { struct ipv4_devconf *cnf = ctl->extra1; + struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; set_bit(i, cnf->state); - if (cnf == &ipv4_devconf_dflt) - devinet_copy_dflt_conf(i); + if (cnf == net->ipv4.devconf_dflt) + devinet_copy_dflt_conf(net, i); } return ret; @@ -1276,6 +1299,7 @@ static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, void __user *newval, size_t newlen) { struct ipv4_devconf *cnf; + struct net *net; int *valp = table->data; int new; int i; @@ -1311,38 +1335,17 @@ static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, *valp = new; cnf = table->extra1; + net = table->extra2; i = (int *)table->data - cnf->data; set_bit(i, cnf->state); - if (cnf == &ipv4_devconf_dflt) - devinet_copy_dflt_conf(i); + if (cnf == net->ipv4.devconf_dflt) + devinet_copy_dflt_conf(net, i); return 1; } -void inet_forward_change(void) -{ - struct net_device *dev; - int on = IPV4_DEVCONF_ALL(FORWARDING); - - IPV4_DEVCONF_ALL(ACCEPT_REDIRECTS) = !on; - IPV4_DEVCONF_DFLT(FORWARDING) = on; - - read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - IN_DEV_CONF_SET(in_dev, FORWARDING, on); - rcu_read_unlock(); - } - read_unlock(&dev_base_lock); - - rt_cache_flush(0); -} - static int devinet_sysctl_forward(ctl_table *ctl, int write, struct file* filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1352,9 +1355,11 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && *valp != val) { - if (valp == &IPV4_DEVCONF_ALL(FORWARDING)) - inet_forward_change(); - else if (valp != &IPV4_DEVCONF_DFLT(FORWARDING)) + struct net *net = ctl->extra2; + + if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) + inet_forward_change(net); + else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) rt_cache_flush(0); } @@ -1419,11 +1424,8 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; - ctl_table devinet_dev[2]; - ctl_table devinet_conf_dir[2]; - ctl_table devinet_proto_dir[2]; - ctl_table devinet_root_dir[2]; + struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; + char *dev_name; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", @@ -1455,62 +1457,32 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), }, - .devinet_dev = { - { - .ctl_name = NET_PROTO_CONF_ALL, - .procname = "all", - .mode = 0555, - .child = devinet_sysctl.devinet_vars, - }, - }, - .devinet_conf_dir = { - { - .ctl_name = NET_IPV4_CONF, - .procname = "conf", - .mode = 0555, - .child = devinet_sysctl.devinet_dev, - }, - }, - .devinet_proto_dir = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = devinet_sysctl.devinet_conf_dir, - }, - }, - .devinet_root_dir = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = devinet_sysctl.devinet_proto_dir, - }, - }, }; -static void devinet_sysctl_register(struct in_device *in_dev, - struct ipv4_devconf *p) +static int __devinet_sysctl_register(struct net *net, char *dev_name, + int ctl_name, struct ipv4_devconf *p) { int i; - struct net_device *dev = in_dev ? in_dev->dev : NULL; - struct devinet_sysctl_table *t = kmemdup(&devinet_sysctl, sizeof(*t), - GFP_KERNEL); - char *dev_name = NULL; + struct devinet_sysctl_table *t; +#define DEVINET_CTL_PATH_DEV 3 + + struct ctl_path devinet_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "conf", .ctl_name = NET_IPV4_CONF, }, + { /* to be set */ }, + { }, + }; + + t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); if (!t) - return; + goto out; + for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; - } - - if (dev) { - dev_name = dev->name; - t->devinet_dev[0].ctl_name = dev->ifindex; - } else { - dev_name = "default"; - t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + t->devinet_vars[i].extra2 = net; } /* @@ -1518,56 +1490,183 @@ static void devinet_sysctl_register(struct in_device *in_dev, * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). */ - dev_name = kstrdup(dev_name, GFP_KERNEL); - if (!dev_name) - goto free; + t->dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!t->dev_name) + goto free; - t->devinet_dev[0].procname = dev_name; - t->devinet_dev[0].child = t->devinet_vars; - t->devinet_conf_dir[0].child = t->devinet_dev; - t->devinet_proto_dir[0].child = t->devinet_conf_dir; - t->devinet_root_dir[0].child = t->devinet_proto_dir; + devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; + devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name; - t->sysctl_header = register_sysctl_table(t->devinet_root_dir); + t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, + t->devinet_vars); if (!t->sysctl_header) - goto free_procname; + goto free_procname; p->sysctl = t; - return; + return 0; - /* error path */ - free_procname: - kfree(dev_name); - free: +free_procname: + kfree(t->dev_name); +free: kfree(t); - return; +out: + return -ENOBUFS; } -static void devinet_sysctl_unregister(struct ipv4_devconf *p) +static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) +{ + struct devinet_sysctl_table *t = cnf->sysctl; + + if (t == NULL) + return; + + cnf->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); +} + +static void devinet_sysctl_register(struct in_device *idev) +{ + neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); + __devinet_sysctl_register(idev->dev->nd_net, idev->dev->name, + idev->dev->ifindex, &idev->cnf); +} + +static void devinet_sysctl_unregister(struct in_device *idev) +{ + __devinet_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->arp_parms); +} + +static struct ctl_table ctl_forward_entry[] = { + { + .ctl_name = NET_IPV4_FORWARD, + .procname = "ip_forward", + .data = &ipv4_devconf.data[ + NET_IPV4_CONF_FORWARDING - 1], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = devinet_sysctl_forward, + .strategy = devinet_conf_sysctl, + .extra1 = &ipv4_devconf, + .extra2 = &init_net, + }, + { }, +}; + +static __net_initdata struct ctl_path net_ipv4_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { }, +}; +#endif + +static __net_init int devinet_init_net(struct net *net) { - if (p->sysctl) { - struct devinet_sysctl_table *t = p->sysctl; - p->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); - kfree(t->devinet_dev[0].procname); - kfree(t); + int err; + struct ipv4_devconf *all, *dflt; +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl = ctl_forward_entry; + struct ctl_table_header *forw_hdr; +#endif + + err = -ENOMEM; + all = &ipv4_devconf; + dflt = &ipv4_devconf_dflt; + + if (net != &init_net) { + all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; + + dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; + +#ifdef CONFIG_SYSCTL + tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); + if (tbl == NULL) + goto err_alloc_ctl; + + tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1]; + tbl[0].extra1 = all; + tbl[0].extra2 = net; +#endif } + +#ifdef CONFIG_SYSCTL + err = __devinet_sysctl_register(net, "all", + NET_PROTO_CONF_ALL, all); + if (err < 0) + goto err_reg_all; + + err = __devinet_sysctl_register(net, "default", + NET_PROTO_CONF_DEFAULT, dflt); + if (err < 0) + goto err_reg_dflt; + + err = -ENOMEM; + forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl); + if (forw_hdr == NULL) + goto err_reg_ctl; + net->ipv4.forw_hdr = forw_hdr; +#endif + + net->ipv4.devconf_all = all; + net->ipv4.devconf_dflt = dflt; + return 0; + +#ifdef CONFIG_SYSCTL +err_reg_ctl: + __devinet_sysctl_unregister(dflt); +err_reg_dflt: + __devinet_sysctl_unregister(all); +err_reg_all: + if (tbl != ctl_forward_entry) + kfree(tbl); +err_alloc_ctl: +#endif + if (dflt != &ipv4_devconf_dflt) + kfree(dflt); +err_alloc_dflt: + if (all != &ipv4_devconf) + kfree(all); +err_alloc_all: + return err; } + +static __net_exit void devinet_exit_net(struct net *net) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + tbl = net->ipv4.forw_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.forw_hdr); + __devinet_sysctl_unregister(net->ipv4.devconf_dflt); + __devinet_sysctl_unregister(net->ipv4.devconf_all); + kfree(tbl); #endif + kfree(net->ipv4.devconf_dflt); + kfree(net->ipv4.devconf_all); +} + +static __net_initdata struct pernet_operations devinet_ops = { + .init = devinet_init_net, + .exit = devinet_exit_net, +}; void __init devinet_init(void) { + register_pernet_subsys(&devinet_ops); + register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); -#ifdef CONFIG_SYSCTL - devinet_sysctl.sysctl_header = - register_sysctl_table(devinet_sysctl.devinet_root_dir); - devinet_sysctl_register(NULL, &ipv4_devconf_dflt); -#endif } EXPORT_SYMBOL(in_dev_finish_destroy); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1738113268b..28ea5c77ca2 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -163,7 +163,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) u8 nexthdr[2]; struct scatterlist *sg; int padlen; - int err; + int err = -EINVAL; if (!pskb_may_pull(skb, sizeof(*esph))) goto out; @@ -171,28 +171,31 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0 || (elen & (blksize-1))) goto out; + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + skb->ip_summed = CHECKSUM_NONE; + + spin_lock(&x->lock); + /* If integrity check is required, do this. */ if (esp->auth.icv_full_len) { u8 sum[alen]; err = esp_mac_digest(esp, skb, 0, skb->len - alen); if (err) - goto out; + goto unlock; if (skb_copy_bits(skb, skb->len - alen, sum, alen)) BUG(); if (unlikely(memcmp(esp->auth.work_icv, sum, alen))) { - x->stats.integrity_failed++; - goto out; + err = -EBADMSG; + goto unlock; } } - if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) - goto out; - - skb->ip_summed = CHECKSUM_NONE; - esph = (struct ip_esp_hdr *)skb->data; /* Get ivec. This can be wrong, check against another impls. */ @@ -202,9 +205,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) sg = &esp->sgbuf[0]; if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + err = -ENOMEM; sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); if (!sg) - goto out; + goto unlock; } sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, @@ -213,12 +217,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) err = crypto_blkcipher_decrypt(&desc, sg, sg, elen); if (unlikely(sg != &esp->sgbuf[0])) kfree(sg); + +unlock: + spin_unlock(&x->lock); + if (unlikely(err)) - return err; + goto out; if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) BUG(); + err = -EINVAL; padlen = nexthdr[0]; if (padlen+2 >= elen) goto out; @@ -276,7 +285,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) return nexthdr[1]; out: - return -EINVAL; + return err; } static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 97abf934d18..d28261826bc 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -47,59 +47,65 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> -#define FFprint(a...) printk(KERN_DEBUG a) +#ifndef CONFIG_IP_MULTIPLE_TABLES -static struct sock *fibnl; +static int __net_init fib4_rules_init(struct net *net) +{ + struct fib_table *local_table, *main_table; -#ifndef CONFIG_IP_MULTIPLE_TABLES + local_table = fib_hash_table(RT_TABLE_LOCAL); + if (local_table == NULL) + return -ENOMEM; -struct fib_table *ip_fib_local_table; -struct fib_table *ip_fib_main_table; + main_table = fib_hash_table(RT_TABLE_MAIN); + if (main_table == NULL) + goto fail; -#define FIB_TABLE_HASHSZ 1 -static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; + hlist_add_head_rcu(&local_table->tb_hlist, + &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); + hlist_add_head_rcu(&main_table->tb_hlist, + &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); + return 0; -static void __init fib4_rules_init(void) -{ - ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); - hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); - ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); - hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); +fail: + kfree(local_table); + return -ENOMEM; } #else -#define FIB_TABLE_HASHSZ 256 -static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; - -struct fib_table *fib_new_table(u32 id) +struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; - tb = fib_get_table(id); + tb = fib_get_table(net, id); if (tb) return tb; - tb = fib_hash_init(id); + + tb = fib_hash_table(id); if (!tb) return NULL; h = id & (FIB_TABLE_HASHSZ - 1); - hlist_add_head_rcu(&tb->tb_hlist, &fib_table_hash[h]); + hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } -struct fib_table *fib_get_table(u32 id) +struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_node *node; + struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); + rcu_read_lock(); - hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb_hlist) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { if (tb->tb_id == id) { rcu_read_unlock(); return tb; @@ -110,15 +116,32 @@ struct fib_table *fib_get_table(u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ -static void fib_flush(void) +void fib_select_default(struct net *net, + const struct flowi *flp, struct fib_result *res) +{ + struct fib_table *tb; + int table = RT_TABLE_MAIN; +#ifdef CONFIG_IP_MULTIPLE_TABLES + if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) + return; + table = res->r->table; +#endif + tb = fib_get_table(net, table); + if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + tb->tb_select_default(tb, flp, res); +} + +static void fib_flush(struct net *net) { int flushed = 0; struct fib_table *tb; struct hlist_node *node; + struct hlist_head *head; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, node, head, tb_hlist) flushed += tb->tb_flush(tb); } @@ -130,7 +153,7 @@ static void fib_flush(void) * Find the first device with a given source address. */ -struct net_device * ip_dev_find(__be32 addr) +struct net_device * ip_dev_find(struct net *net, __be32 addr) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; struct fib_result res; @@ -141,7 +164,7 @@ struct net_device * ip_dev_find(__be32 addr) res.r = NULL; #endif - local_table = fib_get_table(RT_TABLE_LOCAL); + local_table = fib_get_table(net, RT_TABLE_LOCAL); if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) @@ -155,33 +178,51 @@ out: return dev; } -unsigned inet_addr_type(__be32 addr) +/* + * Find address type as if only "dev" was present in the system. If + * on_dev is NULL then all interfaces are taken into consideration. + */ +static inline unsigned __inet_dev_addr_type(struct net *net, + const struct net_device *dev, + __be32 addr) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; struct fib_result res; unsigned ret = RTN_BROADCAST; struct fib_table *local_table; - if (ZERONET(addr) || BADCLASS(addr)) + if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; - if (MULTICAST(addr)) + if (ipv4_is_multicast(addr)) return RTN_MULTICAST; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - local_table = fib_get_table(RT_TABLE_LOCAL); + local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; if (!local_table->tb_lookup(local_table, &fl, &res)) { - ret = res.type; + if (!dev || dev == res.fi->fib_dev) + ret = res.type; fib_res_put(&res); } } return ret; } +unsigned int inet_addr_type(struct net *net, __be32 addr) +{ + return __inet_dev_addr_type(net, NULL, addr); +} + +unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, + __be32 addr) +{ + return __inet_dev_addr_type(net, dev, addr); +} + /* Given (packet source, input interface) and optional (dst, oif, tos): - (main) check, that source is valid i.e. not broadcast or our local address. @@ -202,6 +243,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct fib_result res; int no_addr, rpf; int ret; + struct net *net; no_addr = rpf = 0; rcu_read_lock(); @@ -215,7 +257,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (in_dev == NULL) goto e_inval; - if (fib_lookup(&fl, &res)) + net = dev->nd_net; + if (fib_lookup(net, &fl, &res)) goto last_resort; if (res.type != RTN_UNICAST) goto e_inval_res; @@ -239,7 +282,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, fl.oif = dev->ifindex; ret = 0; - if (fib_lookup(&fl, &res) == 0) { + if (fib_lookup(net, &fl, &res) == 0) { if (res.type == RTN_UNICAST) { *spec_dst = FIB_RES_PREFSRC(res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; @@ -278,13 +321,14 @@ static int put_rtax(struct nlattr *mx, int len, int type, u32 value) return len + nla_total_size(4); } -static int rtentry_to_fib_config(int cmd, struct rtentry *rt, +static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); + cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; @@ -345,7 +389,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, colon = strchr(devname, ':'); if (colon) *colon = 0; - dev = __dev_get_by_name(&init_net, devname); + dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; @@ -368,7 +412,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, if (rt->rt_gateway.sa_family == AF_INET && addr) { cfg->fc_gw = addr; if (rt->rt_flags & RTF_GATEWAY && - inet_addr_type(addr) == RTN_UNICAST) + inet_addr_type(net, addr) == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } @@ -409,7 +453,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, * Handle IP routing ioctl calls. These are used to manipulate the routing tables */ -int ip_rt_ioctl(unsigned int cmd, void __user *arg) +int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct fib_config cfg; struct rtentry rt; @@ -425,18 +469,18 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg) return -EFAULT; rtnl_lock(); - err = rtentry_to_fib_config(cmd, &rt, &cfg); + err = rtentry_to_fib_config(net, cmd, &rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { - tb = fib_get_table(cfg.fc_table); + tb = fib_get_table(net, cfg.fc_table); if (tb) err = tb->tb_delete(tb, &cfg); else err = -ESRCH; } else { - tb = fib_new_table(cfg.fc_table); + tb = fib_new_table(net, cfg.fc_table); if (tb) err = tb->tb_insert(tb, &cfg); else @@ -466,8 +510,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { [RTA_FLOW] = { .type = NLA_U32 }, }; -static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh, - struct fib_config *cfg) +static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, + struct nlmsghdr *nlh, struct fib_config *cfg) { struct nlattr *attr; int err, remaining; @@ -491,6 +535,7 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; cfg->fc_nlinfo.nlh = nlh; + cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { err = -EINVAL; @@ -538,15 +583,16 @@ errout: static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib_config cfg; struct fib_table *tb; int err; - err = rtm_to_fib_config(skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg); if (err < 0) goto errout; - tb = fib_get_table(cfg.fc_table); + tb = fib_get_table(net, cfg.fc_table); if (tb == NULL) { err = -ESRCH; goto errout; @@ -559,15 +605,16 @@ errout: static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib_config cfg; struct fib_table *tb; int err; - err = rtm_to_fib_config(skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg); if (err < 0) goto errout; - tb = fib_new_table(cfg.fc_table); + tb = fib_new_table(net, cfg.fc_table); if (tb == NULL) { err = -ENOBUFS; goto errout; @@ -580,10 +627,12 @@ errout: static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_node *node; + struct hlist_head *head; int dumped = 0; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && @@ -595,7 +644,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; - hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, node, head, tb_hlist) { if (e < s_e) goto next; if (dumped) @@ -624,6 +674,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { + struct net *net = ifa->ifa_dev->dev->nd_net; struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -633,12 +684,15 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, + .fc_nlinfo = { + .nl_net = net, + }, }; if (type == RTN_UNICAST) - tb = fib_new_table(RT_TABLE_MAIN); + tb = fib_new_table(net, RT_TABLE_MAIN); else - tb = fib_new_table(RT_TABLE_LOCAL); + tb = fib_new_table(net, RT_TABLE_LOCAL); if (tb == NULL) return; @@ -668,7 +722,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_flags&IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (prim == NULL) { - printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n"); + printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); return; } } @@ -682,7 +736,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); @@ -715,7 +769,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) else { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { - printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n"); + printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); return; } } @@ -747,7 +801,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + if (inet_addr_type(dev->nd_net, ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. We must flush stray FIB entries. @@ -755,7 +809,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down(ifa->ifa_local, NULL, 0)) - fib_flush(); + fib_flush(dev->nd_net); } } #undef LOCAL_OK @@ -797,11 +851,13 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) static void nl_fib_input(struct sk_buff *skb) { + struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; struct fib_table *tb; u32 pid; + net = skb->sk->sk_net; nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) @@ -813,26 +869,37 @@ static void nl_fib_input(struct sk_buff *skb) nlh = nlmsg_hdr(skb); frn = (struct fib_result_nl *) NLMSG_DATA(nlh); - tb = fib_get_table(frn->tb_id_in); + tb = fib_get_table(net, frn->tb_id_in); nl_fib_lookup(frn, tb); pid = NETLINK_CB(skb).pid; /* pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ - netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT); + netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); +} + +static int nl_fib_lookup_init(struct net *net) +{ + struct sock *sk; + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, + nl_fib_input, NULL, THIS_MODULE); + if (sk == NULL) + return -EAFNOSUPPORT; + net->ipv4.fibnl = sk; + return 0; } -static void nl_fib_lookup_init(void) +static void nl_fib_lookup_exit(struct net *net) { - fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, - nl_fib_input, NULL, THIS_MODULE); + netlink_kernel_release(net->ipv4.fibnl); + net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down(0, dev, force)) - fib_flush(); + fib_flush(dev->nd_net); rt_cache_flush(0); arp_ifdown(dev); } @@ -869,9 +936,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2); return NOTIFY_DONE; @@ -909,23 +973,92 @@ static struct notifier_block fib_netdev_notifier = { .notifier_call =fib_netdev_event, }; -void __init ip_fib_init(void) +static int __net_init ip_fib_net_init(struct net *net) { unsigned int i; + net->ipv4.fib_table_hash = kzalloc( + sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL); + if (net->ipv4.fib_table_hash == NULL) + return -ENOMEM; + for (i = 0; i < FIB_TABLE_HASHSZ; i++) - INIT_HLIST_HEAD(&fib_table_hash[i]); + INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]); - fib4_rules_init(); + return fib4_rules_init(net); +} - register_netdevice_notifier(&fib_netdev_notifier); - register_inetaddr_notifier(&fib_inetaddr_notifier); - nl_fib_lookup_init(); +static void __net_exit ip_fib_net_exit(struct net *net) +{ + unsigned int i; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + fib4_rules_exit(net); +#endif + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct fib_table *tb; + struct hlist_head *head; + struct hlist_node *node, *tmp; + + head = &net->ipv4.fib_table_hash[i]; + hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { + hlist_del(node); + tb->tb_flush(tb); + kfree(tb); + } + } + kfree(net->ipv4.fib_table_hash); +} + +static int __net_init fib_net_init(struct net *net) +{ + int error; + + error = ip_fib_net_init(net); + if (error < 0) + goto out; + error = nl_fib_lookup_init(net); + if (error < 0) + goto out_nlfl; + error = fib_proc_init(net); + if (error < 0) + goto out_proc; +out: + return error; + +out_proc: + nl_fib_lookup_exit(net); +out_nlfl: + ip_fib_net_exit(net); + goto out; +} + +static void __net_exit fib_net_exit(struct net *net) +{ + fib_proc_exit(net); + nl_fib_lookup_exit(net); + ip_fib_net_exit(net); +} + +static struct pernet_operations fib_net_ops = { + .init = fib_net_init, + .exit = fib_net_exit, +}; + +void __init ip_fib_init(void) +{ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); + + register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); + register_inetaddr_notifier(&fib_inetaddr_notifier); + + fib_hash_init(); } EXPORT_SYMBOL(inet_addr_type); +EXPORT_SYMBOL(inet_dev_addr_type); EXPORT_SYMBOL(ip_dev_find); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 0dfee27cfbc..a15b2f1b272 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -52,6 +52,7 @@ struct fib_node { struct hlist_node fn_hash; struct list_head fn_alias; __be32 fn_key; + struct fib_alias fn_embedded_alias; }; struct fn_zone { @@ -102,10 +103,10 @@ static struct hlist_head *fz_hash_alloc(int divisor) unsigned long size = divisor * sizeof(struct hlist_head); if (size <= PAGE_SIZE) { - return kmalloc(size, GFP_KERNEL); + return kzalloc(size, GFP_KERNEL); } else { return (struct hlist_head *) - __get_free_pages(GFP_KERNEL, get_order(size)); + __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); } } @@ -168,14 +169,13 @@ static void fn_rehash_zone(struct fn_zone *fz) new_hashmask = (new_divisor - 1); #if RT_CACHE_DEBUG >= 2 - printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor); + printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", + fz->fz_order, old_divisor); #endif ht = fz_hash_alloc(new_divisor); if (ht) { - memset(ht, 0, new_divisor * sizeof(struct hlist_head)); - write_lock_bh(&fib_hash_lock); old_ht = fz->fz_hash; fz->fz_hash = ht; @@ -194,10 +194,13 @@ static inline void fn_free_node(struct fib_node * f) kmem_cache_free(fn_hash_kmem, f); } -static inline void fn_free_alias(struct fib_alias *fa) +static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) { fib_release_info(fa->fa_info); - kmem_cache_free(fn_alias_kmem, fa); + if (fa == &f->fn_embedded_alias) + fa->fa_info = NULL; + else + kmem_cache_free(fn_alias_kmem, fa); } static struct fn_zone * @@ -219,7 +222,6 @@ fn_new_zone(struct fn_hash *table, int z) kfree(fz); return NULL; } - memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *)); fz->fz_order = z; fz->fz_mask = inet_make_mask(z); @@ -275,8 +277,6 @@ out: return err; } -static int fn_hash_last_dflt=-1; - static void fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) { @@ -317,12 +317,9 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib if (next_fi != res->fi) break; } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, &fn_hash_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - fn_hash_last_dflt = order; + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } fi = next_fi; @@ -331,27 +328,20 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib } if (order <= 0 || fi == NULL) { - fn_hash_last_dflt = -1; + tb->tb_default = -1; goto out; } - if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - fn_hash_last_dflt = order; + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } - if (last_idx >= 0) { - if (res->fi) - fib_info_put(res->fi); - res->fi = last_resort; - if (last_resort) - atomic_inc(&last_resort->fib_clntref); - } - fn_hash_last_dflt = last_idx; + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; out: read_unlock(&fib_hash_lock); } @@ -490,15 +480,12 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) goto out; err = -ENOBUFS; - new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) - goto out; new_f = NULL; if (!f) { - new_f = kmem_cache_alloc(fn_hash_kmem, GFP_KERNEL); + new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); if (new_f == NULL) - goto out_free_new_fa; + goto out; INIT_HLIST_NODE(&new_f->fn_hash); INIT_LIST_HEAD(&new_f->fn_alias); @@ -506,6 +493,12 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) f = new_f; } + new_fa = &f->fn_embedded_alias; + if (new_fa->fa_info != NULL) { + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + if (new_fa == NULL) + goto out_free_new_f; + } new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; @@ -532,8 +525,8 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) &cfg->fc_nlinfo, 0); return 0; -out_free_new_fa: - kmem_cache_free(fn_alias_kmem, new_fa); +out_free_new_f: + kmem_cache_free(fn_hash_kmem, new_f); out: fib_release_info(fi); return err; @@ -609,7 +602,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(-1); - fn_free_alias(fa); + fn_free_alias(fa, f); if (kill_fn) { fn_free_node(f); fz->fz_nent--; @@ -645,7 +638,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx) fib_hash_genid++; write_unlock_bh(&fib_hash_lock); - fn_free_alias(fa); + fn_free_alias(fa, f); found++; } } @@ -761,25 +754,19 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin return skb->len; } -#ifdef CONFIG_IP_MULTIPLE_TABLES -struct fib_table * fib_hash_init(u32 id) -#else -struct fib_table * __init fib_hash_init(u32 id) -#endif +void __init fib_hash_init(void) { - struct fib_table *tb; + fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), + 0, SLAB_PANIC, NULL); - if (fn_hash_kmem == NULL) - fn_hash_kmem = kmem_cache_create("ip_fib_hash", - sizeof(struct fib_node), - 0, SLAB_HWCACHE_ALIGN, - NULL); + fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), + 0, SLAB_PANIC, NULL); - if (fn_alias_kmem == NULL) - fn_alias_kmem = kmem_cache_create("ip_fib_alias", - sizeof(struct fib_alias), - 0, SLAB_HWCACHE_ALIGN, - NULL); +} + +struct fib_table *fib_hash_table(u32 id) +{ + struct fib_table *tb; tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL); @@ -787,6 +774,7 @@ struct fib_table * __init fib_hash_init(u32 id) return NULL; tb->tb_id = id; + tb->tb_default = -1; tb->tb_lookup = fn_hash_lookup; tb->tb_insert = fn_hash_insert; tb->tb_delete = fn_hash_delete; @@ -801,6 +789,7 @@ struct fib_table * __init fib_hash_init(u32 id) #ifdef CONFIG_PROC_FS struct fib_iter_state { + struct seq_net_private p; struct fn_zone *zone; int bucket; struct hlist_head *hash_head; @@ -814,7 +803,11 @@ struct fib_iter_state { static struct fib_alias *fib_get_first(struct seq_file *seq) { struct fib_iter_state *iter = seq->private; - struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data; + struct fib_table *main_table; + struct fn_hash *table; + + main_table = fib_get_table(iter->p.net, RT_TABLE_MAIN); + table = (struct fn_hash *)main_table->tb_data; iter->bucket = 0; iter->hash_head = NULL; @@ -949,11 +942,13 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) } static void *fib_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(fib_hash_lock) { + struct fib_iter_state *iter = seq->private; void *v = NULL; read_lock(&fib_hash_lock); - if (ip_fib_main_table) + if (fib_get_table(iter->p.net, RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; return v; } @@ -965,6 +960,7 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void fib_seq_stop(struct seq_file *seq, void *v) + __releases(fib_hash_lock) { read_unlock(&fib_hash_lock); } @@ -1040,8 +1036,8 @@ static const struct seq_operations fib_seq_ops = { static int fib_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &fib_seq_ops, - sizeof(struct fib_iter_state)); + return seq_open_net(inode, file, &fib_seq_ops, + sizeof(struct fib_iter_state)); } static const struct file_operations fib_seq_fops = { @@ -1049,18 +1045,18 @@ static const struct file_operations fib_seq_fops = { .open = fib_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init fib_proc_init(void) +int __net_init fib_proc_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops)) + if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) return -ENOMEM; return 0; } -void __init fib_proc_exit(void) +void __net_exit fib_proc_exit(struct net *net) { - proc_net_remove(&init_net, "route"); + proc_net_remove(net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index eef9eec17e0..2c1623d2768 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -7,12 +7,14 @@ struct fib_alias { struct list_head fa_list; - struct rcu_head rcu; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; u8 fa_scope; u8 fa_state; +#ifdef CONFIG_IP_FIB_TRIE + struct rcu_head rcu; +#endif }; #define FA_S_ACCESSED 0x01 @@ -36,6 +38,16 @@ extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); extern int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, - int *last_idx, int *dflt); + int *last_idx, int dflt); + +static inline void fib_result_assign(struct fib_result *res, + struct fib_info *fi) +{ + if (res->fi != NULL) + fib_info_put(res->fi); + res->fi = fi; + if (fi != NULL) + atomic_inc(&fi->fib_clntref); +} #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a0ada3a8d8d..19274d01afa 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -32,8 +32,6 @@ #include <net/ip_fib.h> #include <net/fib_rules.h> -static struct fib_rules_ops fib4_rules_ops; - struct fib4_rule { struct fib_rule common; @@ -56,14 +54,14 @@ u32 fib_rules_tclass(struct fib_result *res) } #endif -int fib_lookup(struct flowi *flp, struct fib_result *res) +int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, }; int err; - err = fib_rules_lookup(&fib4_rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); res->r = arg.rule; return err; @@ -93,7 +91,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, goto errout; } - if ((tbl = fib_get_table(rule->table)) == NULL) + if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) goto errout; err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); @@ -104,16 +102,6 @@ errout: } -void fib_select_default(const struct flowi *flp, struct fib_result *res) -{ - if (res->r && res->r->action == FR_ACT_TO_TBL && - FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { - struct fib_table *tb; - if ((tb = fib_get_table(res->r->table)) != NULL) - tb->tb_select_default(tb, flp, res); - } -} - static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; @@ -130,13 +118,13 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) return 1; } -static struct fib_table *fib_empty_table(void) +static struct fib_table *fib_empty_table(struct net *net) { u32 id; for (id = 1; id <= RT_TABLE_MAX; id++) - if (fib_get_table(id) == NULL) - return fib_new_table(id); + if (fib_get_table(net, id) == NULL) + return fib_new_table(net, id); return NULL; } @@ -149,6 +137,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_rule_hdr *frh, struct nlattr **tb) { + struct net *net = skb->sk->sk_net; int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; @@ -159,7 +148,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; - table = fib_empty_table(); + table = fib_empty_table(net); if (table == NULL) { err = -ENOBUFS; goto errout; @@ -245,14 +234,14 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib4_rule_default_pref(void) +static u32 fib4_rule_default_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; - if (!list_empty(&fib4_rules_ops.rules_list)) { - pos = fib4_rules_ops.rules_list.next; - if (pos->next != &fib4_rules_ops.rules_list) { + if (!list_empty(&ops->rules_list)) { + pos = ops->rules_list.next; + if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; @@ -274,7 +263,7 @@ static void fib4_rule_flush_cache(void) rt_cache_flush(-1); } -static struct fib_rules_ops fib4_rules_ops = { +static struct fib_rules_ops fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), @@ -288,31 +277,53 @@ static struct fib_rules_ops fib4_rules_ops = { .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .policy = fib4_rule_policy, - .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list), .owner = THIS_MODULE, }; -static int __init fib_default_rules_init(void) +static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(&fib4_rules_ops, 0, - RT_TABLE_LOCAL, FIB_RULE_PERMANENT); + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); if (err < 0) return err; - err = fib_default_rule_add(&fib4_rules_ops, 0x7FFE, - RT_TABLE_MAIN, 0); + err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); if (err < 0) return err; - err = fib_default_rule_add(&fib4_rules_ops, 0x7FFF, - RT_TABLE_DEFAULT, 0); + err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0); if (err < 0) return err; return 0; } -void __init fib4_rules_init(void) +int __net_init fib4_rules_init(struct net *net) +{ + int err; + struct fib_rules_ops *ops; + + ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL); + if (ops == NULL) + return -ENOMEM; + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + fib_rules_register(ops); + + err = fib_default_rules_init(ops); + if (err < 0) + goto fail; + net->ipv4.rules_ops = ops; + return 0; + +fail: + /* also cleans all rules already added */ + fib_rules_unregister(ops); + kfree(ops); + return err; +} + +void __net_exit fib4_rules_exit(struct net *net) { - BUG_ON(fib_default_rules_init()); - fib_rules_register(&fib4_rules_ops); + fib_rules_unregister(net->ipv4.rules_ops); + kfree(net->ipv4.rules_ops); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1351a2617dc..c7912866d98 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -47,8 +47,6 @@ #include "fib_lookup.h" -#define FSprintk(a...) - static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; @@ -145,7 +143,7 @@ static const struct void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { - printk("Freeing alive fib_info %p\n", fi); + printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); return; } change_nexthops(fi) { @@ -196,6 +194,15 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * return 0; } +static inline unsigned int fib_devindex_hashfn(unsigned int val) +{ + unsigned int mask = DEVINDEX_HASHSIZE - 1; + + return (val ^ + (val >> DEVINDEX_HASHBITS) ^ + (val >> (DEVINDEX_HASHBITS * 2))) & mask; +} + static inline unsigned int fib_info_hashfn(const struct fib_info *fi) { unsigned int mask = (fib_hash_size - 1); @@ -204,6 +211,9 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) val ^= fi->fib_protocol; val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; + for_nexthops(fi) { + val ^= fib_devindex_hashfn(nh->nh_oif); + } endfor_nexthops(fi) return (val ^ (val >> 7) ^ (val >> 12)) & mask; } @@ -234,15 +244,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) return NULL; } -static inline unsigned int fib_devindex_hashfn(unsigned int val) -{ - unsigned int mask = DEVINDEX_HASHSIZE - 1; - - return (val ^ - (val >> DEVINDEX_HASHBITS) ^ - (val >> (DEVINDEX_HASHBITS * 2))) & mask; -} - /* Check, that the gateway is already configured. Used only by redirect accept routine. */ @@ -320,11 +321,11 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE, + err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err); + rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } /* Return the first fib alias matching TOS with @@ -346,7 +347,7 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) } int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int *dflt) + struct fib_info **last_resort, int *last_idx, int dflt) { struct neighbour *n; int state = NUD_NONE; @@ -358,10 +359,10 @@ int fib_detect_death(struct fib_info *fi, int order, } if (state==NUD_REACHABLE) return 0; - if ((state&NUD_VALID) && order != *dflt) + if ((state&NUD_VALID) && order != dflt) return 0; if ((state&NUD_VALID) || - (*last_idx<0 && order > *dflt)) { + (*last_idx<0 && order > dflt)) { *last_resort = fi; *last_idx = order; } @@ -518,7 +519,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { int err; + struct net *net; + net = cfg->fc_nlinfo.nl_net; if (nh->nh_gw) { struct fib_result res; @@ -531,9 +534,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; - if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) + if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) return -EINVAL; - if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) + if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) return -ENODEV; if (!(dev->flags&IFF_UP)) return -ENETDOWN; @@ -556,7 +559,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl.fl4_scope < RT_SCOPE_LINK) fl.fl4_scope = RT_SCOPE_LINK; - if ((err = fib_lookup(&fl, &res)) != 0) + if ((err = fib_lookup(net, &fl, &res)) != 0) return err; } err = -EINVAL; @@ -580,7 +583,7 @@ out: if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) return -EINVAL; - in_dev = inetdev_by_index(nh->nh_oif); + in_dev = inetdev_by_index(net, nh->nh_oif); if (in_dev == NULL) return -ENODEV; if (!(in_dev->dev->flags&IFF_UP)) { @@ -605,10 +608,10 @@ static inline unsigned int fib_laddr_hashfn(__be32 val) static struct hlist_head *fib_hash_alloc(int bytes) { if (bytes <= PAGE_SIZE) - return kmalloc(bytes, GFP_KERNEL); + return kzalloc(bytes, GFP_KERNEL); else return (struct hlist_head *) - __get_free_pages(GFP_KERNEL, get_order(bytes)); + __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); } static void fib_hash_free(struct hlist_head *hash, int bytes) @@ -712,12 +715,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!new_info_hash || !new_laddrhash) { fib_hash_free(new_info_hash, bytes); fib_hash_free(new_laddrhash, bytes); - } else { - memset(new_info_hash, 0, bytes); - memset(new_laddrhash, 0, bytes); - + } else fib_hash_move(new_info_hash, new_laddrhash, new_size); - } if (!fib_hash_size) goto failure; @@ -799,7 +798,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (nhs != 1 || nh->nh_gw) goto err_inval; nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); + nh->nh_dev = dev_get_by_index(cfg->fc_nlinfo.nl_net, + fi->fib_nh->nh_oif); err = -ENODEV; if (nh->nh_dev == NULL) goto failure; @@ -813,7 +813,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (fi->fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fi->fib_prefsrc != cfg->fc_dst) - if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + if (inet_addr_type(cfg->fc_nlinfo.nl_net, + fi->fib_prefsrc) != RTN_LOCAL) goto err_inval; } @@ -914,7 +915,8 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, continue; default: - printk(KERN_DEBUG "impossible 102\n"); + printk(KERN_WARNING "fib_semantic_match bad type %#x\n", + fa->fa_type); return -EINVAL; } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1010b469d7d..f2f47033f31 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -82,7 +82,6 @@ #include <net/ip_fib.h> #include "fib_lookup.h" -#undef CONFIG_IP_FIB_TRIE_STATS #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) @@ -98,13 +97,13 @@ typedef unsigned int t_key; #define IS_LEAF(n) (n->parent & T_LEAF) struct node { - t_key key; unsigned long parent; + t_key key; }; struct leaf { - t_key key; unsigned long parent; + t_key key; struct hlist_head list; struct rcu_head rcu; }; @@ -117,12 +116,12 @@ struct leaf_info { }; struct tnode { - t_key key; unsigned long parent; - unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short full_children; /* KEYLENGTH bits needed */ - unsigned short empty_children; /* KEYLENGTH bits needed */ + t_key key; + unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ + unsigned int full_children; /* KEYLENGTH bits needed */ + unsigned int empty_children; /* KEYLENGTH bits needed */ struct rcu_head rcu; struct node *child[0]; }; @@ -144,6 +143,7 @@ struct trie_stat { unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; + unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; @@ -152,25 +152,28 @@ struct trie { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif - int size; - unsigned int revision; }; static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, + int wasfull); static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static struct kmem_cache *fn_alias_kmem __read_mostly; -static struct trie *trie_local = NULL, *trie_main = NULL; +static struct kmem_cache *trie_leaf_kmem __read_mostly; static inline struct tnode *node_parent(struct node *node) { - struct tnode *ret; + return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); +} + +static inline struct tnode *node_parent_rcu(struct node *node) +{ + struct tnode *ret = node_parent(node); - ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK); return rcu_dereference(ret); } @@ -180,13 +183,18 @@ static inline void node_set_parent(struct node *node, struct tnode *ptr) (unsigned long)ptr | NODE_TYPE(node)); } -/* rcu_read_lock needs to be hold by caller from readside */ +static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) +{ + BUG_ON(i >= 1U << tn->bits); + + return tn->child[i]; +} -static inline struct node *tnode_get_child(struct tnode *tn, int i) +static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { - BUG_ON(i >= 1 << tn->bits); + struct node *ret = tnode_get_child(tn, i); - return rcu_dereference(tn->child[i]); + return rcu_dereference(ret); } static inline int tnode_child_length(const struct tnode *tn) @@ -300,10 +308,10 @@ static inline void check_tnode(const struct tnode *tn) WARN_ON(tn && tn->pos+tn->bits > 32); } -static int halve_threshold = 25; -static int inflate_threshold = 50; -static int halve_threshold_root = 8; -static int inflate_threshold_root = 15; +static const int halve_threshold = 25; +static const int inflate_threshold = 50; +static const int halve_threshold_root = 8; +static const int inflate_threshold_root = 15; static void __alias_free_mem(struct rcu_head *head) @@ -319,7 +327,8 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) static void __leaf_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct leaf, rcu)); + struct leaf *l = container_of(head, struct leaf, rcu); + kmem_cache_free(trie_leaf_kmem, l); } static void __leaf_info_free_rcu(struct rcu_head *head) @@ -332,12 +341,12 @@ static inline void free_leaf_info(struct leaf_info *leaf) call_rcu(&leaf->rcu, __leaf_info_free_rcu); } -static struct tnode *tnode_alloc(unsigned int size) +static struct tnode *tnode_alloc(size_t size) { struct page *pages; if (size <= PAGE_SIZE) - return kcalloc(size, 1, GFP_KERNEL); + return kzalloc(size, GFP_KERNEL); pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); if (!pages) @@ -349,8 +358,8 @@ static struct tnode *tnode_alloc(unsigned int size) static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); - unsigned int size = sizeof(struct tnode) + - (1 << tn->bits) * sizeof(struct node *); + size_t size = sizeof(struct tnode) + + (sizeof(struct node *) << tn->bits); if (size <= PAGE_SIZE) kfree(tn); @@ -369,7 +378,7 @@ static inline void tnode_free(struct tnode *tn) static struct leaf *leaf_new(void) { - struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); + struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (l) { l->parent = T_LEAF; INIT_HLIST_HEAD(&l->list); @@ -387,14 +396,12 @@ static struct leaf_info *leaf_info_new(int plen) return li; } -static struct tnode* tnode_new(t_key key, int pos, int bits) +static struct tnode *tnode_new(t_key key, int pos, int bits) { - int nchildren = 1<<bits; - int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); + size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); struct tnode *tn = tnode_alloc(sz); if (tn) { - memset(tn, 0, sz); tn->parent = T_TNODE; tn->pos = pos; tn->bits = bits; @@ -403,8 +410,8 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<<bits; } - pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), - (unsigned int) (sizeof(struct node) * 1<<bits)); + pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), + (unsigned long) (sizeof(struct node) << bits)); return tn; } @@ -421,7 +428,8 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n) return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) +static inline void put_child(struct trie *t, struct tnode *tn, int i, + struct node *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -431,14 +439,14 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, + int wasfull) { struct node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<<tn->bits); - /* update emptyChildren */ if (n == NULL && chi != NULL) tn->empty_children++; @@ -571,11 +579,13 @@ static struct node *resize(struct trie *t, struct tnode *tn) err = 0; max_resize = 10; while ((tn->full_children > 0 && max_resize-- && - 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= - inflate_threshold_use * tnode_child_length(tn))) { + 50 * (tn->full_children + tnode_child_length(tn) + - tn->empty_children) + >= inflate_threshold_use * tnode_child_length(tn))) { old_tn = tn; tn = inflate(t, tn); + if (IS_ERR(tn)) { tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -587,11 +597,13 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (max_resize < 0) { if (!tn->parent) - printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n", - inflate_threshold_root, tn->bits); + pr_warning("Fix inflate_threshold_root." + " Now=%d size=%d bits\n", + inflate_threshold_root, tn->bits); else - printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n", - inflate_threshold, tn->bits); + pr_warning("Fix inflate_threshold." + " Now=%d size=%d bits\n", + inflate_threshold, tn->bits); } check_tnode(tn); @@ -628,11 +640,13 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (max_resize < 0) { if (!tn->parent) - printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n", - halve_threshold_root, tn->bits); + pr_warning("Fix halve_threshold_root." + " Now=%d size=%d bits\n", + halve_threshold_root, tn->bits); else - printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n", - halve_threshold, tn->bits); + pr_warning("Fix halve_threshold." + " Now=%d size=%d bits\n", + halve_threshold, tn->bits); } /* Only one child remains */ @@ -656,7 +670,6 @@ static struct node *resize(struct trie *t, struct tnode *tn) static struct tnode *inflate(struct trie *t, struct tnode *tn) { - struct tnode *inode; struct tnode *oldtnode = tn; int olen = tnode_child_length(tn); int i; @@ -676,8 +689,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) */ for (i = 0; i < olen; i++) { - struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + struct tnode *inode; + inode = (struct tnode *) tnode_get_child(oldtnode, i); if (inode && IS_TNODE(inode) && inode->pos == oldtnode->pos + oldtnode->bits && @@ -704,6 +718,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) } for (i = 0; i < olen; i++) { + struct tnode *inode; struct node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; int size, j; @@ -716,8 +731,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if (IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, - 1) == 0) + if (tkey_extract_bits(node->key, + oldtnode->pos + oldtnode->bits, + 1) == 0) put_child(t, tn, 2*i, node); else put_child(t, tn, 2*i+1, node); @@ -877,19 +893,6 @@ nomem: } } -static void trie_init(struct trie *t) -{ - if (!t) - return; - - t->size = 0; - rcu_assign_pointer(t->trie, NULL); - t->revision = 0; -#ifdef CONFIG_IP_FIB_TRIE_STATS - memset(&t->stats, 0, sizeof(struct trie_use_stats)); -#endif -} - /* readside must use rcu_read_lock currently dump routines via get_fa_head and dump */ @@ -906,7 +909,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen) return NULL; } -static inline struct list_head * get_fa_head(struct leaf *l, int plen) +static inline struct list_head *get_fa_head(struct leaf *l, int plen) { struct leaf_info *li = find_leaf_info(l, plen); @@ -956,7 +959,10 @@ fib_find_node(struct trie *t, u32 key) if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { pos = tn->pos + tn->bits; - n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + n = tnode_get_child_rcu(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); } else break; } @@ -977,8 +983,10 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); - tn = (struct tnode *) resize (t, (struct tnode *)tn); - tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); + tn = (struct tnode *) resize(t, (struct tnode *)tn); + + tnode_put_child_reorg((struct tnode *)tp, cindex, + (struct node *)tn, wasfull); tp = node_parent((struct node *) tn); if (!tp) @@ -988,15 +996,14 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) /* Handle last (top) tnode */ if (IS_TNODE(tn)) - tn = (struct tnode*) resize(t, (struct tnode *)tn); + tn = (struct tnode *)resize(t, (struct tnode *)tn); - return (struct node*) tn; + return (struct node *)tn; } /* only used from updater-side */ -static struct list_head * -fib_insert_node(struct trie *t, int *err, u32 key, int plen) +static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; @@ -1036,7 +1043,10 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; pos = tn->pos + tn->bits; - n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + n = tnode_get_child(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); BUG_ON(n && node_parent(n) != tn); } else @@ -1054,34 +1064,27 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) /* Case 1: n is a leaf. Compare prefixes */ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = (struct leaf *) n; - + l = (struct leaf *) n; li = leaf_info_new(plen); - if (!li) { - *err = -ENOMEM; - goto err; - } + if (!li) + return NULL; fa_head = &li->falh; insert_leaf_info(&l->list, li); goto done; } - t->size++; l = leaf_new(); - if (!l) { - *err = -ENOMEM; - goto err; - } + if (!l) + return NULL; l->key = key; li = leaf_info_new(plen); if (!li) { tnode_free((struct tnode *) l); - *err = -ENOMEM; - goto err; + return NULL; } fa_head = &li->falh; @@ -1117,8 +1120,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (!tn) { free_leaf_info(li); tnode_free((struct tnode *) l); - *err = -ENOMEM; - goto err; + return NULL; } node_set_parent((struct node *)tn, tp); @@ -1129,23 +1131,23 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); + put_child(t, (struct tnode *)tp, cindex, + (struct node *)tn); } else { - rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ + rcu_assign_pointer(t->trie, (struct node *)tn); tp = tn; } } if (tp && tp->pos + tp->bits > 32) - printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", - tp, tp->pos, tp->bits, key, plen); + pr_warning("fib_trie" + " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", + tp, tp->pos, tp->bits, key, plen); /* Rebalance the trie */ rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); done: - t->revision++; -err: return fa_head; } @@ -1253,10 +1255,10 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) break; if (fa->fa_type == cfg->fc_type && fa->fa_scope == cfg->fc_scope && - fa->fa_info == fi) { + fa->fa_info == fi) goto out; - } } + if (!(cfg->fc_nlflags & NLM_F_APPEND)) fa = fa_orig; } @@ -1279,10 +1281,11 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) */ if (!fa_head) { - err = 0; - fa_head = fib_insert_node(t, &err, key, plen); - if (err) + fa_head = fib_insert_node(t, key, plen); + if (unlikely(!fa_head)) { + err = -ENOMEM; goto out_free_new_fa; + } } list_add_tail_rcu(&new_fa->fa_list, @@ -1302,40 +1305,41 @@ err: return err; } - /* should be called with rcu_read_lock */ -static inline int check_leaf(struct trie *t, struct leaf *l, - t_key key, int *plen, const struct flowi *flp, - struct fib_result *res) +static int check_leaf(struct trie *t, struct leaf *l, + t_key key, const struct flowi *flp, + struct fib_result *res) { - int err, i; - __be32 mask; struct leaf_info *li; struct hlist_head *hhead = &l->list; struct hlist_node *node; hlist_for_each_entry_rcu(li, node, hhead, hlist) { - i = li->plen; - mask = inet_make_mask(i); + int err; + int plen = li->plen; + __be32 mask = inet_make_mask(plen); + if (l->key != (key & ntohl(mask))) continue; - if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) { - *plen = i; + err = fib_semantic_match(&li->falh, flp, res, + htonl(l->key), mask, plen); + #ifdef CONFIG_IP_FIB_TRIE_STATS + if (err <= 0) t->stats.semantic_match_passed++; + else + t->stats.semantic_match_miss++; #endif - return err; - } -#ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_miss++; -#endif + if (err <= 0) + return plen; } - return 1; + + return -1; } -static int -fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int plen, ret = 0; @@ -1362,10 +1366,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result /* Just a leaf? */ if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - goto failed; + plen = check_leaf(t, (struct leaf *)n, key, flp, res); + if (plen < 0) + goto failed; + ret = 0; + goto found; } + pn = (struct tnode *) n; chopped_off = 0; @@ -1387,14 +1394,14 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result } if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - else + plen = check_leaf(t, (struct leaf *)n, key, flp, res); + if (plen < 0) goto backtrace; + + ret = 0; + goto found; } -#define HL_OPTIMIZE -#ifdef HL_OPTIMIZE cn = (struct tnode *)n; /* @@ -1423,12 +1430,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result * *are* zero. */ - /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ + /* NOTA BENE: Checking only skipped bits + for the new node here */ if (current_prefix_length < pos+bits) { if (tkey_extract_bits(cn->key, current_prefix_length, - cn->pos - current_prefix_length) != 0 || - !(cn->child[0])) + cn->pos - current_prefix_length) + || !(cn->child[0])) goto backtrace; } @@ -1451,14 +1459,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result * new tnode's key. */ - /* Note: We aren't very concerned about the piece of the key - * that precede pn->pos+pn->bits, since these have already been - * checked. The bits after cn->pos aren't checked since these are - * by definition "unknown" at this point. Thus, what we want to - * see is if we are about to enter the "prefix matching" state, - * and in that case verify that the skipped bits that will prevail - * throughout this subtree are zero, as they have to be if we are - * to find a matching prefix. + /* + * Note: We aren't very concerned about the piece of + * the key that precede pn->pos+pn->bits, since these + * have already been checked. The bits after cn->pos + * aren't checked since these are by definition + * "unknown" at this point. Thus, what we want to see + * is if we are about to enter the "prefix matching" + * state, and in that case verify that the skipped + * bits that will prevail throughout this subtree are + * zero, as they have to be if we are to find a + * matching prefix. */ node_prefix = mask_pfx(cn->key, cn->pos); @@ -1466,13 +1477,15 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result pref_mismatch = key_prefix^node_prefix; mp = 0; - /* In short: If skipped bits in this node do not match the search - * key, enter the "prefix matching" state.directly. + /* + * In short: If skipped bits in this node do not match + * the search key, enter the "prefix matching" + * state.directly. */ if (pref_mismatch) { while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { mp++; - pref_mismatch = pref_mismatch <<1; + pref_mismatch = pref_mismatch << 1; } key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); @@ -1482,7 +1495,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result if (current_prefix_length >= cn->pos) current_prefix_length = mp; } -#endif + pn = (struct tnode *)n; /* Descend */ chopped_off = 0; continue; @@ -1491,12 +1504,14 @@ backtrace: chopped_off++; /* As zero don't change the child key (cindex) */ - while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) + while ((chopped_off <= pn->bits) + && !(cindex & (1<<(chopped_off-1)))) chopped_off++; /* Decrease current_... with bits chopped off */ if (current_prefix_length > pn->pos + pn->bits - chopped_off) - current_prefix_length = pn->pos + pn->bits - chopped_off; + current_prefix_length = pn->pos + pn->bits + - chopped_off; /* * Either we do the actual chop off according or if we have @@ -1528,52 +1543,23 @@ found: return ret; } -/* only called from updater side */ -static int trie_leaf_remove(struct trie *t, t_key key) +/* + * Remove the leaf and return parent. + */ +static void trie_leaf_remove(struct trie *t, struct leaf *l) { - t_key cindex; - struct tnode *tp = NULL; - struct node *n = t->trie; - struct leaf *l; - - pr_debug("entering trie_leaf_remove(%p)\n", n); - - /* Note that in the case skipped bits, those bits are *not* checked! - * When we finish this, we will have NULL or a T_LEAF, and the - * T_LEAF may or may not match our key. - */ - - while (n != NULL && IS_TNODE(n)) { - struct tnode *tn = (struct tnode *) n; - check_tnode(tn); - n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - - BUG_ON(n && node_parent(n) != tn); - } - l = (struct leaf *) n; - - if (!n || !tkey_equals(l->key, key)) - return 0; - - /* - * Key found. - * Remove the leaf and rebalance the tree - */ - - t->revision++; - t->size--; + struct tnode *tp = node_parent((struct node *) l); - tp = node_parent(n); - tnode_free((struct tnode *) n); + pr_debug("entering trie_leaf_remove(%p)\n", l); if (tp) { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); + t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); } else rcu_assign_pointer(t->trie, NULL); - return 1; + tnode_free((struct tnode *) l); } /* @@ -1651,7 +1637,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) } if (hlist_empty(&l->list)) - trie_leaf_remove(t, key); + trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(-1); @@ -1697,64 +1683,64 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) return found; } -/* rcu_read_lock needs to be hold by caller from readside */ - -static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) +/* + * Scan for the next right leaf starting at node p->child[idx] + * Since we have back pointer, no recursion necessary. + */ +static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) { - struct node *c = (struct node *) thisleaf; - struct tnode *p; - int idx; - struct node *trie = rcu_dereference(t->trie); - - if (c == NULL) { - if (trie == NULL) - return NULL; - - if (IS_LEAF(trie)) /* trie w. just a leaf */ - return (struct leaf *) trie; - - p = (struct tnode*) trie; /* Start */ - } else - p = node_parent(c); - - while (p) { - int pos, last; + do { + t_key idx; - /* Find the next child of the parent */ if (c) - pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); + idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1; else - pos = 0; - - last = 1 << p->bits; - for (idx = pos; idx < last ; idx++) { - c = rcu_dereference(p->child[idx]); + idx = 0; + while (idx < 1u << p->bits) { + c = tnode_get_child_rcu(p, idx++); if (!c) continue; - /* Decend if tnode */ - while (IS_TNODE(c)) { - p = (struct tnode *) c; - idx = 0; - - /* Rightmost non-NULL branch */ - if (p && IS_TNODE(p)) - while (!(c = rcu_dereference(p->child[idx])) - && idx < (1<<p->bits)) idx++; - - /* Done with this tnode? */ - if (idx >= (1 << p->bits) || !c) - goto up; + if (IS_LEAF(c)) { + prefetch(p->child[idx]); + return (struct leaf *) c; } - return (struct leaf *) c; + + /* Rescan start scanning in new node */ + p = (struct tnode *) c; + idx = 0; } -up: - /* No more children go up one step */ + + /* Node empty, walk back up to parent */ c = (struct node *) p; - p = node_parent(c); - } - return NULL; /* Ready. Root of trie */ + } while ( (p = node_parent_rcu(c)) != NULL); + + return NULL; /* Root of trie */ +} + +static struct leaf *trie_firstleaf(struct trie *t) +{ + struct tnode *n = (struct tnode *) rcu_dereference(t->trie); + + if (!n) + return NULL; + + if (IS_LEAF(n)) /* trie is just a leaf */ + return (struct leaf *) n; + + return leaf_walk_rcu(n, NULL); +} + +static struct leaf *trie_nextleaf(struct leaf *l) +{ + struct node *c = (struct node *) l; + struct tnode *p = node_parent(c); + + if (!p) + return NULL; /* trie with just one leaf */ + + return leaf_walk_rcu(p, c); } /* @@ -1763,30 +1749,27 @@ up: static int fn_trie_flush(struct fib_table *tb) { struct trie *t = (struct trie *) tb->tb_data; - struct leaf *ll = NULL, *l = NULL; - int found = 0, h; - - t->revision++; + struct leaf *l, *ll = NULL; + int found = 0; - for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { + for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { found += trie_flush_leaf(t, l); if (ll && hlist_empty(&ll->list)) - trie_leaf_remove(t, ll->key); + trie_leaf_remove(t, ll); ll = l; } if (ll && hlist_empty(&ll->list)) - trie_leaf_remove(t, ll->key); + trie_leaf_remove(t, ll); pr_debug("trie_flush found=%d\n", found); return found; } -static int trie_last_dflt = -1; - -static void -fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +static void fn_trie_select_default(struct fib_table *tb, + const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int order, last_idx; @@ -1831,48 +1814,38 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib if (next_fi != res->fi) break; } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, &trie_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - trie_last_dflt = order; + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } fi = next_fi; order++; } if (order <= 0 || fi == NULL) { - trie_last_dflt = -1; + tb->tb_default = -1; goto out; } - if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - trie_last_dflt = order; + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } - if (last_idx >= 0) { - if (res->fi) - fib_info_put(res->fi); - res->fi = last_resort; - if (last_resort) - atomic_inc(&last_resort->fib_clntref); - } - trie_last_dflt = last_idx; - out:; + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; +out: rcu_read_unlock(); } -static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, +static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, + struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { int i, s_i; struct fib_alias *fa; - __be32 xkey = htonl(key); s_i = cb->args[4]; @@ -1885,7 +1858,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi i++; continue; } - BUG_ON(!fa->fa_info); if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, @@ -1896,7 +1868,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi xkey, plen, fa->fa_tos, - fa->fa_info, 0) < 0) { + fa->fa_info, NLM_F_MULTI) < 0) { cb->args[4] = i; return -1; } @@ -1906,109 +1878,118 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi return skb->len; } -static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) +static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) { - int h, s_h; - struct list_head *fa_head; - struct leaf *l = NULL; + struct leaf_info *li; + struct hlist_node *node; + int i, s_i; - s_h = cb->args[3]; + s_i = cb->args[3]; + i = 0; - for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { - if (h < s_h) + /* rcu_read_lock is hold by caller */ + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + if (i < s_i) { + i++; continue; - if (h > s_h) - memset(&cb->args[4], 0, - sizeof(cb->args) - 4*sizeof(cb->args[0])); - - fa_head = get_fa_head(l, plen); + } - if (!fa_head) - continue; + if (i > s_i) + cb->args[4] = 0; - if (list_empty(fa_head)) + if (list_empty(&li->falh)) continue; - if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { - cb->args[3] = h; + if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) { + cb->args[3] = i; return -1; } + i++; } - cb->args[3] = h; + + cb->args[3] = i; return skb->len; } -static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) { - int m, s_m; + struct leaf *l; struct trie *t = (struct trie *) tb->tb_data; - - s_m = cb->args[2]; + t_key key = cb->args[2]; rcu_read_lock(); - for (m = 0; m <= 32; m++) { - if (m < s_m) - continue; - if (m > s_m) - memset(&cb->args[3], 0, - sizeof(cb->args) - 3*sizeof(cb->args[0])); + /* Dump starting at last key. + * Note: 0.0.0.0/0 (ie default) is first key. + */ + if (!key) + l = trie_firstleaf(t); + else { + l = fib_find_node(t, key); + if (!l) { + /* The table changed during the dump, rather than + * giving partial data, just make application retry. + */ + rcu_read_unlock(); + return -EBUSY; + } + } - if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { - cb->args[2] = m; - goto out; + while (l) { + cb->args[2] = l->key; + if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + rcu_read_unlock(); + return -1; } + + l = trie_nextleaf(l); + memset(&cb->args[3], 0, + sizeof(cb->args) - 3*sizeof(cb->args[0])); } rcu_read_unlock(); - cb->args[2] = m; + return skb->len; -out: - rcu_read_unlock(); - return -1; } -/* Fix more generic FIB names for init later */ +void __init fib_hash_init(void) +{ + fn_alias_kmem = kmem_cache_create("ip_fib_alias", + sizeof(struct fib_alias), + 0, SLAB_PANIC, NULL); -#ifdef CONFIG_IP_MULTIPLE_TABLES -struct fib_table * fib_hash_init(u32 id) -#else -struct fib_table * __init fib_hash_init(u32 id) -#endif + trie_leaf_kmem = kmem_cache_create("ip_fib_trie", + max(sizeof(struct leaf), + sizeof(struct leaf_info)), + 0, SLAB_PANIC, NULL); +} + + +/* Fix more generic FIB names for init later */ +struct fib_table *fib_hash_table(u32 id) { struct fib_table *tb; struct trie *t; - if (fn_alias_kmem == NULL) - fn_alias_kmem = kmem_cache_create("ip_fib_alias", - sizeof(struct fib_alias), - 0, SLAB_HWCACHE_ALIGN, - NULL); - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), GFP_KERNEL); if (tb == NULL) return NULL; tb->tb_id = id; + tb->tb_default = -1; tb->tb_lookup = fn_trie_lookup; tb->tb_insert = fn_trie_insert; tb->tb_delete = fn_trie_delete; tb->tb_flush = fn_trie_flush; tb->tb_select_default = fn_trie_select_default; tb->tb_dump = fn_trie_dump; - memset(tb->tb_data, 0, sizeof(struct trie)); t = (struct trie *) tb->tb_data; - - trie_init(t); - - if (id == RT_TABLE_LOCAL) - trie_local = t; - else if (id == RT_TABLE_MAIN) - trie_main = t; + memset(t, 0, sizeof(*t)); if (id == RT_TABLE_LOCAL) - printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION); + pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION); return tb; } @@ -2016,6 +1997,8 @@ struct fib_table * __init fib_hash_init(u32 id) #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { + struct seq_net_private p; + struct trie *trie_local, *trie_main; struct tnode *tnode; struct trie *trie; unsigned index; @@ -2036,7 +2019,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter) iter->tnode, iter->index, iter->depth); rescan: while (cindex < (1<<tn->bits)) { - struct node *n = tnode_get_child(tn, cindex); + struct node *n = tnode_get_child_rcu(tn, cindex); if (n) { if (IS_LEAF(n)) { @@ -2055,7 +2038,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = node_parent((struct node *)tn); + p = node_parent_rcu((struct node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2108,10 +2091,17 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { + struct leaf *l = (struct leaf *)n; + struct leaf_info *li; + struct hlist_node *tmp; + s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; + + hlist_for_each_entry_rcu(li, tmp, &l->list, hlist) + ++s->prefixes; } else { const struct tnode *tn = (const struct tnode *) n; int i; @@ -2140,13 +2130,17 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) else avdepth = 0; - seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); + seq_printf(seq, "\tAver depth: %u.%02d\n", + avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - bytes = sizeof(struct leaf) * stat->leaves; - seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes); + + seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); + bytes += sizeof(struct leaf_info) * stat->prefixes; + + seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += sizeof(struct tnode) * stat->tnodes; max = MAX_STAT_DEPTH; @@ -2156,60 +2150,89 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) pointers = 0; for (i = 1; i <= max; i++) if (stat->nodesizes[i] != 0) { - seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); + seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); - seq_printf(seq, "\tPointers: %d\n", pointers); + seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct node *) * pointers; - seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers); - seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024); + seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); + seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); +} #ifdef CONFIG_IP_FIB_TRIE_STATS - seq_printf(seq, "Counters:\n---------\n"); - seq_printf(seq,"gets = %d\n", t->stats.gets); - seq_printf(seq,"backtracks = %d\n", t->stats.backtrack); - seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); - seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); - seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); - seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); -#ifdef CLEAR_STATS - memset(&(t->stats), 0, sizeof(t->stats)); -#endif +static void trie_show_usage(struct seq_file *seq, + const struct trie_use_stats *stats) +{ + seq_printf(seq, "\nCounters:\n---------\n"); + seq_printf(seq, "gets = %u\n", stats->gets); + seq_printf(seq, "backtracks = %u\n", stats->backtrack); + seq_printf(seq, "semantic match passed = %u\n", + stats->semantic_match_passed); + seq_printf(seq, "semantic match miss = %u\n", + stats->semantic_match_miss); + seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); + seq_printf(seq, "skipped node resize = %u\n\n", + stats->resize_node_skipped); +} #endif /* CONFIG_IP_FIB_TRIE_STATS */ + +static void fib_trie_show(struct seq_file *seq, const char *name, + struct trie *trie) +{ + struct trie_stat stat; + + trie_collect_stats(trie, &stat); + seq_printf(seq, "%s:\n", name); + trie_show_stats(seq, &stat); +#ifdef CONFIG_IP_FIB_TRIE_STATS + trie_show_usage(seq, &trie->stats); +#endif } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { - struct trie_stat *stat; - - stat = kmalloc(sizeof(*stat), GFP_KERNEL); - if (!stat) - return -ENOMEM; + struct net *net = (struct net *)seq->private; + struct fib_table *tb; - seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", + seq_printf(seq, + "Basic info: size of leaf:" + " %Zd bytes, size of tnode: %Zd bytes.\n", sizeof(struct leaf), sizeof(struct tnode)); - if (trie_local) { - seq_printf(seq, "Local:\n"); - trie_collect_stats(trie_local, stat); - trie_show_stats(seq, stat); - } + tb = fib_get_table(net, RT_TABLE_LOCAL); + if (tb) + fib_trie_show(seq, "Local", (struct trie *) tb->tb_data); - if (trie_main) { - seq_printf(seq, "Main:\n"); - trie_collect_stats(trie_main, stat); - trie_show_stats(seq, stat); - } - kfree(stat); + tb = fib_get_table(net, RT_TABLE_MAIN); + if (tb) + fib_trie_show(seq, "Main", (struct trie *) tb->tb_data); return 0; } static int fib_triestat_seq_open(struct inode *inode, struct file *file) { - return single_open(file, fib_triestat_seq_show, NULL); + int err; + struct net *net; + + net = get_proc_net(inode); + if (net == NULL) + return -ENXIO; + err = single_open(file, fib_triestat_seq_show, net); + if (err < 0) { + put_net(net); + return err; + } + return 0; +} + +static int fib_triestat_seq_release(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); } static const struct file_operations fib_triestat_fops = { @@ -2217,7 +2240,7 @@ static const struct file_operations fib_triestat_fops = { .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = fib_triestat_seq_release, }; static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, @@ -2226,13 +2249,13 @@ static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, loff_t idx = 0; struct node *n; - for (n = fib_trie_get_first(iter, trie_local); + for (n = fib_trie_get_first(iter, iter->trie_local); n; ++idx, n = fib_trie_get_next(iter)) { if (pos == idx) return n; } - for (n = fib_trie_get_first(iter, trie_main); + for (n = fib_trie_get_first(iter, iter->trie_main); n; ++idx, n = fib_trie_get_next(iter)) { if (pos == idx) return n; @@ -2241,11 +2264,25 @@ static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { + struct fib_trie_iter *iter = seq->private; + struct fib_table *tb; + + if (!iter->trie_local) { + tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL); + if (tb) + iter->trie_local = (struct trie *) tb->tb_data; + } + if (!iter->trie_main) { + tb = fib_get_table(iter->p.net, RT_TABLE_MAIN); + if (tb) + iter->trie_main = (struct trie *) tb->tb_data; + } rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; - return fib_trie_get_idx(seq->private, *pos - 1); + return fib_trie_get_idx(iter, *pos - 1); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -2263,13 +2300,14 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) return v; /* continue scan in next trie */ - if (iter->trie == trie_local) - return fib_trie_get_first(iter, trie_main); + if (iter->trie == iter->trie_local) + return fib_trie_get_first(iter, iter->trie_main); return NULL; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { rcu_read_unlock(); } @@ -2279,10 +2317,8 @@ static void seq_indent(struct seq_file *seq, int n) while (n-- > 0) seq_puts(seq, " "); } -static inline const char *rtn_scope(enum rt_scope_t s) +static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) { - static char buf[32]; - switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; @@ -2290,7 +2326,7 @@ static inline const char *rtn_scope(enum rt_scope_t s) case RT_SCOPE_HOST: return "host"; case RT_SCOPE_NOWHERE: return "nowhere"; default: - snprintf(buf, sizeof(buf), "scope=%d", s); + snprintf(buf, len, "scope=%d", s); return buf; } } @@ -2310,13 +2346,11 @@ static const char *rtn_type_names[__RTN_MAX] = { [RTN_XRESOLVE] = "XRESOLVE", }; -static inline const char *rtn_type(unsigned t) +static inline const char *rtn_type(char *buf, size_t len, unsigned t) { - static char buf[32]; - if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; - snprintf(buf, sizeof(buf), "type %d", t); + snprintf(buf, len, "type %u", t); return buf; } @@ -2329,8 +2363,8 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) return 0; - if (!node_parent(n)) { - if (iter->trie == trie_local) + if (!node_parent_rcu(n)) { + if (iter->trie == iter->trie_local) seq_puts(seq, "<local>:\n"); else seq_puts(seq, "<main>:\n"); @@ -2347,25 +2381,29 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) } else { struct leaf *l = (struct leaf *) n; - int i; + struct leaf_info *li; + struct hlist_node *node; __be32 val = htonl(l->key); seq_indent(seq, iter->depth); seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val)); - for (i = 32; i >= 0; i--) { - struct leaf_info *li = find_leaf_info(l, i); - if (li) { - struct fib_alias *fa; - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - seq_indent(seq, iter->depth+1); - seq_printf(seq, " /%d %s %s", i, - rtn_scope(fa->fa_scope), - rtn_type(fa->fa_type)); - if (fa->fa_tos) - seq_printf(seq, "tos =%d\n", - fa->fa_tos); - seq_putc(seq, '\n'); - } + + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + char buf1[32], buf2[32]; + + seq_indent(seq, iter->depth+1); + seq_printf(seq, " /%d %s %s", li->plen, + rtn_scope(buf1, sizeof(buf1), + fa->fa_scope), + rtn_type(buf2, sizeof(buf2), + fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, "tos =%d\n", + fa->fa_tos); + seq_putc(seq, '\n'); } } } @@ -2382,8 +2420,8 @@ static const struct seq_operations fib_trie_seq_ops = { static int fib_trie_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &fib_trie_seq_ops, - sizeof(struct fib_trie_iter)); + return seq_open_net(inode, file, &fib_trie_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_trie_fops = { @@ -2391,7 +2429,7 @@ static const struct file_operations fib_trie_fops = { .open = fib_trie_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) @@ -2419,8 +2457,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; struct leaf *l = v; - int i; - char bf[128]; + struct leaf_info *li; + struct hlist_node *node; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2429,25 +2467,23 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } - if (iter->trie == trie_local) + if (iter->trie == iter->trie_local) return 0; + if (IS_TNODE(l)) return 0; - for (i=32; i>=0; i--) { - struct leaf_info *li = find_leaf_info(l, i); + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { struct fib_alias *fa; __be32 mask, prefix; - if (!li) - continue; - mask = inet_make_mask(li->plen); prefix = htonl(l->key); list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); + char bf[128]; if (fa->fa_type == RTN_BROADCAST || fa->fa_type == RTN_MULTICAST) @@ -2461,7 +2497,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, mask, - (fi->fib_advmss ? fi->fib_advmss + 40 : 0), + (fi->fib_advmss ? + fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); else @@ -2486,8 +2523,8 @@ static const struct seq_operations fib_route_seq_ops = { static int fib_route_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &fib_route_seq_ops, - sizeof(struct fib_trie_iter)); + return seq_open_net(inode, file, &fib_route_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_route_fops = { @@ -2495,35 +2532,36 @@ static const struct file_operations fib_route_fops = { .open = fib_route_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init fib_proc_init(void) +int __net_init fib_proc_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops)) + if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops)) goto out1; - if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops)) + if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO, + &fib_triestat_fops)) goto out2; - if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops)) + if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops)) goto out3; return 0; out3: - proc_net_remove(&init_net, "fib_triestat"); + proc_net_remove(net, "fib_triestat"); out2: - proc_net_remove(&init_net, "fib_trie"); + proc_net_remove(net, "fib_trie"); out1: return -ENOMEM; } -void __init fib_proc_exit(void) +void __net_exit fib_proc_exit(struct net *net) { - proc_net_remove(&init_net, "fib_trie"); - proc_net_remove(&init_net, "fib_triestat"); - proc_net_remove(&init_net, "route"); + proc_net_remove(net, "fib_trie"); + proc_net_remove(net, "fib_triestat"); + proc_net_remove(net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 82baea02648..a7321a82df6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -92,6 +92,7 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <net/checksum.h> +#include <net/xfrm.h> /* * Build xmit assembly blocks @@ -231,7 +232,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL; #define icmp_socket __get_cpu_var(__icmp_socket) -static __inline__ int icmp_xmit_lock(void) +static inline int icmp_xmit_lock(void) { local_bh_disable(); @@ -245,7 +246,7 @@ static __inline__ int icmp_xmit_lock(void) return 0; } -static void icmp_xmit_unlock(void) +static inline void icmp_xmit_unlock(void) { spin_unlock_bh(&icmp_socket->sk->sk_lock.slock); } @@ -274,18 +275,19 @@ static void icmp_xmit_unlock(void) #define XRLIM_BURST_FACTOR 6 int xrlim_allow(struct dst_entry *dst, int timeout) { - unsigned long now; + unsigned long now, token = dst->rate_tokens; int rc = 0; now = jiffies; - dst->rate_tokens += now - dst->rate_last; + token += now - dst->rate_last; dst->rate_last = now; - if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout) - dst->rate_tokens = XRLIM_BURST_FACTOR * timeout; - if (dst->rate_tokens >= timeout) { - dst->rate_tokens -= timeout; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + if (token >= timeout) { + token -= timeout; rc = 1; } + dst->rate_tokens = token; return rc; } @@ -403,7 +405,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) .tos = RT_TOS(ip_hdr(skb)->tos) } }, .proto = IPPROTO_ICMP }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(rt->u.dst.dev->nd_net, &rt, &fl)) goto out_unlock; } if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, @@ -435,9 +437,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct ipcm_cookie ipc; __be32 saddr; u8 tos; + struct net *net; if (!rt) goto out; + net = rt->u.dst.dev->nd_net; /* * Find the original header. It is expected to be valid, of course. @@ -513,7 +517,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct net_device *dev = NULL; if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(&init_net, rt->fl.iif); + dev = dev_get_by_index(net, rt->fl.iif); if (dev) { saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -563,11 +567,71 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } }; + int err; + struct rtable *rt2; + security_skb_classify_flow(skb_in, &fl); - if (ip_route_output_key(&rt, &fl)) + if (__ip_route_output_key(net, &rt, &fl)) + goto out_unlock; + + /* No need to clone since we're just using its address. */ + rt2 = rt; + + err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); + switch (err) { + case 0: + if (rt != rt2) + goto route_done; + break; + case -EPERM: + rt = NULL; + break; + default: + goto out_unlock; + } + + if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) + goto out_unlock; + + if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) + err = __ip_route_output_key(net, &rt2, &fl); + else { + struct flowi fl2 = {}; + struct dst_entry *odst; + + fl2.fl4_dst = fl.fl4_src; + if (ip_route_output_key(net, &rt2, &fl2)) + goto out_unlock; + + /* Ugh! */ + odst = skb_in->dst; + err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, + RT_TOS(tos), rt2->u.dst.dev); + + dst_release(&rt2->u.dst); + rt2 = (struct rtable *)skb_in->dst; + skb_in->dst = odst; + } + + if (err) + goto out_unlock; + + err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL, + XFRM_LOOKUP_ICMP); + if (err == -ENOENT) { + if (!rt) + goto out_unlock; + goto route_done; + } + + dst_release(&rt->u.dst); + rt = rt2; + + if (err) goto out_unlock; } +route_done: if (!icmpv4_xrlim_allow(rt, type, code)) goto ende; @@ -603,8 +667,10 @@ static void icmp_unreach(struct sk_buff *skb) struct icmphdr *icmph; int hash, protocol; struct net_protocol *ipprot; - struct sock *raw_sk; u32 info = 0; + struct net *net; + + net = skb->dst->dev->nd_net; /* * Incomplete header ? @@ -635,7 +701,7 @@ static void icmp_unreach(struct sk_buff *skb) "and DF set.\n", NIPQUAD(iph->daddr)); } else { - info = ip_rt_frag_needed(iph, + info = ip_rt_frag_needed(net, iph, ntohs(icmph->un.frag.mtu)); if (!info) goto out; @@ -673,7 +739,7 @@ static void icmp_unreach(struct sk_buff *skb) */ if (!sysctl_icmp_ignore_bogus_error_responses && - inet_addr_type(iph->daddr) == RTN_BROADCAST) { + inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " "type %u, code %u " @@ -697,21 +763,9 @@ static void icmp_unreach(struct sk_buff *skb) /* * Deliver ICMP message to raw sockets. Pretty useless feature? */ + raw_icmp_error(skb, protocol, info); - /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ hash = protocol & (MAX_INET_PROTOS - 1); - read_lock(&raw_v4_lock); - if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) { - while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, - iph->saddr, - skb->dev->ifindex)) != NULL) { - raw_err(raw_sk, skb, info); - raw_sk = sk_next(raw_sk); - iph = (struct iphdr *)skb->data; - } - } - read_unlock(&raw_v4_lock); - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[hash]); if (ipprot && ipprot->err_handler) @@ -929,6 +983,25 @@ int icmp_rcv(struct sk_buff *skb) struct icmphdr *icmph; struct rtable *rt = (struct rtable *)skb->dst; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + int nh; + + if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags & + XFRM_STATE_ICMP)) + goto drop; + + if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr))) + goto drop; + + nh = skb_network_offset(skb); + skb_set_network_header(skb, sizeof(*icmph)); + + if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + skb_set_network_header(skb, nh); + } + ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); switch (skb->ip_summed) { @@ -942,8 +1015,7 @@ int icmp_rcv(struct sk_buff *skb) goto error; } - if (!pskb_pull(skb, sizeof(struct icmphdr))) - goto error; + __skb_pull(skb, sizeof(*icmph)); icmph = icmp_hdr(skb); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7dbc282d4f9..994648be80a 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -130,12 +130,12 @@ */ #define IGMP_V1_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 1 || \ + (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 2 || \ + (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) @@ -301,7 +301,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) .nl_u = { .ip4_u = { .daddr = IGMPV3_ALL_MCR } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { kfree_skb(skb); return NULL; } @@ -349,17 +349,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) static int igmpv3_sendpack(struct sk_buff *skb) { - struct iphdr *pip = ip_hdr(skb); struct igmphdr *pig = igmp_hdr(skb); - const int iplen = skb->tail - skb->network_header; const int igmplen = skb->tail - skb->transport_header; - pip->tot_len = htons(iplen); - ip_send_check(pip); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, - dst_output); + return ip_local_out(skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -650,7 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, struct flowi fl = { .oif = dev->ifindex, .nl_u = { .ip4_u = { .daddr = dst } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) return -1; } if (rt->rt_src == 0) { @@ -680,13 +675,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = rt->rt_src; iph->protocol = IPPROTO_IGMP; - iph->tot_len = htons(IGMP_SIZE); ip_select_ident(iph, &rt->u.dst, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; ((u8*)&iph[1])[1] = 4; ((u8*)&iph[1])[2] = 0; ((u8*)&iph[1])[3] = 0; - ip_send_check(iph); ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); ih->type=type; @@ -695,8 +688,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group=group; ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); } static void igmp_gq_timer_expire(unsigned long data) @@ -1234,9 +1226,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST im->tm_running=0; - init_timer(&im->timer); - im->timer.data=(unsigned long)im; - im->timer.function=&igmp_timer_expire; + setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); im->unsolicit_count = IGMP_Unsolicited_Report_Count; im->reporter = 0; im->gsquery = 0; @@ -1338,13 +1328,11 @@ void ip_mc_init_dev(struct in_device *in_dev) in_dev->mc_tomb = NULL; #ifdef CONFIG_IP_MULTICAST in_dev->mr_gq_running = 0; - init_timer(&in_dev->mr_gq_timer); - in_dev->mr_gq_timer.data=(unsigned long) in_dev; - in_dev->mr_gq_timer.function=&igmp_gq_timer_expire; + setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, + (unsigned long)in_dev); in_dev->mr_ifc_count = 0; - init_timer(&in_dev->mr_ifc_timer); - in_dev->mr_ifc_timer.data=(unsigned long) in_dev; - in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire; + setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, + (unsigned long)in_dev); in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; #endif @@ -1401,19 +1389,19 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) struct in_device *idev = NULL; if (imr->imr_ifindex) { - idev = inetdev_by_index(imr->imr_ifindex); + idev = inetdev_by_index(&init_net, imr->imr_ifindex); if (idev) __in_dev_put(idev); return idev; } if (imr->imr_address.s_addr) { - dev = ip_dev_find(imr->imr_address.s_addr); + dev = ip_dev_find(&init_net, imr->imr_address.s_addr); if (!dev) return NULL; dev_put(dev); } - if (!dev && !ip_route_output_key(&rt, &fl)) { + if (!dev && !ip_route_output_key(&init_net, &rt, &fl)) { dev = rt->u.dst.dev; ip_rt_put(rt); } @@ -1754,7 +1742,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) int ifindex; int count = 0; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -1867,7 +1855,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct int leavegroup = 0; int i, j, rv; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -1997,7 +1985,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) struct ip_sf_socklist *newpsl, *psl; int leavegroup = 0; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; if (msf->imsf_fmode != MCAST_INCLUDE && msf->imsf_fmode != MCAST_EXCLUDE) @@ -2080,7 +2068,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -2142,7 +2130,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (psin->sin_family != AF_INET) return -EINVAL; addr = psin->sin_addr.s_addr; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -2192,7 +2180,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) struct ip_sf_socklist *psl; int i; - if (!MULTICAST(loc_addr)) + if (!ipv4_is_multicast(loc_addr)) return 1; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { @@ -2234,7 +2222,7 @@ void ip_mc_drop_socket(struct sock *sk) struct in_device *in_dev; inet->mc_list = iml->next; - in_dev = inetdev_by_index(iml->multi.imr_ifindex); + in_dev = inetdev_by_index(&init_net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev != NULL) { ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); @@ -2341,6 +2329,7 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { read_lock(&dev_base_lock); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -2358,6 +2347,7 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); if (likely(state->in_dev != NULL)) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8fb6ca23700..7801cceb2d1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -277,18 +277,11 @@ void inet_csk_init_xmit_timers(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); - init_timer(&icsk->icsk_retransmit_timer); - init_timer(&icsk->icsk_delack_timer); - init_timer(&sk->sk_timer); - - icsk->icsk_retransmit_timer.function = retransmit_handler; - icsk->icsk_delack_timer.function = delack_handler; - sk->sk_timer.function = keepalive_handler; - - icsk->icsk_retransmit_timer.data = - icsk->icsk_delack_timer.data = - sk->sk_timer.data = (unsigned long)sk; - + setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, + (unsigned long)sk); + setup_timer(&icsk->icsk_delack_timer, delack_handler, + (unsigned long)sk); + setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } @@ -340,7 +333,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .dport = ireq->rmt_port } } }; security_req_classify_flow(req, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) { + if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e468e7a7aac..605ed2cd797 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -935,7 +935,7 @@ out_free_table: static void __exit inet_diag_exit(void) { - sock_release(idiagnl->sk_socket); + netlink_kernel_release(idiagnl); kfree(inet_diag_table); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e15e04fc666..724d69aed03 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -47,7 +47,7 @@ static void inet_frag_secret_rebuild(unsigned long dummy) } write_unlock(&f->lock); - mod_timer(&f->secret_timer, now + f->ctl->secret_interval); + mod_timer(&f->secret_timer, now + f->secret_interval); } void inet_frags_init(struct inet_frags *f) @@ -57,35 +57,45 @@ void inet_frags_init(struct inet_frags *f) for (i = 0; i < INETFRAGS_HASHSZ; i++) INIT_HLIST_HEAD(&f->hash[i]); - INIT_LIST_HEAD(&f->lru_list); rwlock_init(&f->lock); f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ (jiffies ^ (jiffies >> 6))); - f->nqueues = 0; - atomic_set(&f->mem, 0); - - init_timer(&f->secret_timer); - f->secret_timer.function = inet_frag_secret_rebuild; - f->secret_timer.data = (unsigned long)f; - f->secret_timer.expires = jiffies + f->ctl->secret_interval; + setup_timer(&f->secret_timer, inet_frag_secret_rebuild, + (unsigned long)f); + f->secret_timer.expires = jiffies + f->secret_interval; add_timer(&f->secret_timer); } EXPORT_SYMBOL(inet_frags_init); +void inet_frags_init_net(struct netns_frags *nf) +{ + nf->nqueues = 0; + atomic_set(&nf->mem, 0); + INIT_LIST_HEAD(&nf->lru_list); +} +EXPORT_SYMBOL(inet_frags_init_net); + void inet_frags_fini(struct inet_frags *f) { del_timer(&f->secret_timer); } EXPORT_SYMBOL(inet_frags_fini); +void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +{ + nf->low_thresh = 0; + inet_frag_evictor(nf, f); +} +EXPORT_SYMBOL(inet_frags_exit_net); + static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) { write_lock(&f->lock); hlist_del(&fq->list); list_del(&fq->lru_list); - f->nqueues--; + fq->net->nqueues--; write_unlock(&f->lock); } @@ -103,13 +113,13 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) EXPORT_SYMBOL(inet_frag_kill); -static inline void frag_kfree_skb(struct inet_frags *f, struct sk_buff *skb, - int *work) +static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, + struct sk_buff *skb, int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &f->mem); + atomic_sub(skb->truesize, &nf->mem); if (f->skb_free) f->skb_free(skb); kfree_skb(skb); @@ -119,22 +129,24 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, int *work) { struct sk_buff *fp; + struct netns_frags *nf; BUG_TRAP(q->last_in & COMPLETE); BUG_TRAP(del_timer(&q->timer) == 0); /* Release all fragment data. */ fp = q->fragments; + nf = q->net; while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(f, fp, work); + frag_kfree_skb(nf, f, fp, work); fp = xp; } if (work) *work -= f->qsize; - atomic_sub(f->qsize, &f->mem); + atomic_sub(f->qsize, &nf->mem); if (f->destructor) f->destructor(q); @@ -143,20 +155,20 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct inet_frags *f) +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) { struct inet_frag_queue *q; int work, evicted = 0; - work = atomic_read(&f->mem) - f->ctl->low_thresh; + work = atomic_read(&nf->mem) - nf->low_thresh; while (work > 0) { read_lock(&f->lock); - if (list_empty(&f->lru_list)) { + if (list_empty(&nf->lru_list)) { read_unlock(&f->lock); break; } - q = list_first_entry(&f->lru_list, + q = list_first_entry(&nf->lru_list, struct inet_frag_queue, lru_list); atomic_inc(&q->refcnt); read_unlock(&f->lock); @@ -175,8 +187,9 @@ int inet_frag_evictor(struct inet_frags *f) } EXPORT_SYMBOL(inet_frag_evictor); -static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in, - struct inet_frags *f, unsigned int hash, void *arg) +static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, + struct inet_frag_queue *qp_in, struct inet_frags *f, + unsigned int hash, void *arg) { struct inet_frag_queue *qp; #ifdef CONFIG_SMP @@ -190,7 +203,7 @@ static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in, * promoted read lock to write lock. */ hlist_for_each_entry(qp, n, &f->hash[hash], list) { - if (f->match(qp, arg)) { + if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); write_unlock(&f->lock); qp_in->last_in |= COMPLETE; @@ -200,18 +213,19 @@ static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in, } #endif qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + f->ctl->timeout)) + if (!mod_timer(&qp->timer, jiffies + nf->timeout)) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &f->hash[hash]); - list_add_tail(&qp->lru_list, &f->lru_list); - f->nqueues++; + list_add_tail(&qp->lru_list, &nf->lru_list); + nf->nqueues++; write_unlock(&f->lock); return qp; } -static struct inet_frag_queue *inet_frag_alloc(struct inet_frags *f, void *arg) +static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, + struct inet_frags *f, void *arg) { struct inet_frag_queue *q; @@ -220,35 +234,36 @@ static struct inet_frag_queue *inet_frag_alloc(struct inet_frags *f, void *arg) return NULL; f->constructor(q, arg); - atomic_add(f->qsize, &f->mem); + atomic_add(f->qsize, &nf->mem); setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); + q->net = nf; return q; } -static struct inet_frag_queue *inet_frag_create(struct inet_frags *f, - void *arg, unsigned int hash) +static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, + struct inet_frags *f, void *arg, unsigned int hash) { struct inet_frag_queue *q; - q = inet_frag_alloc(f, arg); + q = inet_frag_alloc(nf, f, arg); if (q == NULL) return NULL; - return inet_frag_intern(q, f, hash, arg); + return inet_frag_intern(nf, q, f, hash, arg); } -struct inet_frag_queue *inet_frag_find(struct inet_frags *f, void *key, - unsigned int hash) +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, + struct inet_frags *f, void *key, unsigned int hash) { struct inet_frag_queue *q; struct hlist_node *n; read_lock(&f->lock); hlist_for_each_entry(q, n, &f->hash[hash], list) { - if (f->match(q, key)) { + if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); read_unlock(&f->lock); return q; @@ -256,6 +271,6 @@ struct inet_frag_queue *inet_frag_find(struct inet_frags *f, void *key, } read_unlock(&f->lock); - return inet_frag_create(f, key, hash); + return inet_frag_create(nf, f, key, hash); } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 67704da04fc..619c63c6948 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -96,6 +96,7 @@ EXPORT_SYMBOL(inet_put_port); * exclusive lock release). It should be ifdefed really. */ void inet_listen_wlock(struct inet_hashinfo *hashinfo) + __acquires(hashinfo->lhash_lock) { write_lock(&hashinfo->lhash_lock); @@ -190,6 +191,44 @@ sherry_cache: } EXPORT_SYMBOL_GPL(__inet_lookup_listener); +struct sock * __inet_lookup_established(struct inet_hashinfo *hashinfo, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif) +{ + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + struct sock *sk; + const struct hlist_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); + + prefetch(head->chain.first); + read_lock(lock); + sk_for_each(sk, node, &head->chain) { + if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &head->twchain) { + if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + goto hit; + } + sk = NULL; +out: + read_unlock(lock); + return sk; +hit: + sock_hold(sk); + goto out; +} +EXPORT_SYMBOL_GPL(__inet_lookup_established); + /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, @@ -239,7 +278,7 @@ unique: sk->sk_hash = hash; BUG_TRAP(sk_unhashed(sk)); __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, 1); write_unlock(lock); if (twp) { @@ -267,6 +306,48 @@ static inline u32 inet_sk_port_offset(const struct sock *sk) inet->dport); } +void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + struct inet_ehash_bucket *head; + + BUG_TRAP(sk_unhashed(sk)); + + sk->sk_hash = inet_sk_ehashfn(sk); + head = inet_ehash_bucket(hashinfo, sk->sk_hash); + list = &head->chain; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + + write_lock(lock); + __sk_add_node(sk, list); + sock_prot_inuse_add(sk->sk_prot, 1); + write_unlock(lock); +} +EXPORT_SYMBOL_GPL(__inet_hash_nolisten); + +void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + + if (sk->sk_state != TCP_LISTEN) { + __inet_hash_nolisten(hashinfo, sk); + return; + } + + BUG_TRAP(sk_unhashed(sk)); + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + + inet_listen_wlock(hashinfo); + __sk_add_node(sk, list); + sock_prot_inuse_add(sk->sk_prot, 1); + write_unlock(lock); + wake_up(&hashinfo->lhash_wait); +} +EXPORT_SYMBOL_GPL(__inet_hash); + /* * Bind a port for a connect operation and hash it. */ @@ -334,7 +415,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __inet_hash(hinfo, sk, 0); + __inet_hash_nolisten(hinfo, sk); } spin_unlock(&head->lock); @@ -351,7 +432,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash(hinfo, sk, 0); + __inet_hash_nolisten(hinfo, sk); spin_unlock_bh(&head->lock); return 0; } else { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a60b99e0ebd..876169f3a52 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -48,6 +48,21 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, inet_twsk_put(tw); } +void inet_twsk_put(struct inet_timewait_sock *tw) +{ + if (atomic_dec_and_test(&tw->tw_refcnt)) { + struct module *owner = tw->tw_prot->owner; + twsk_destructor((struct sock *)tw); +#ifdef SOCK_REFCNT_DEBUG + printk(KERN_DEBUG "%s timewait_sock %p released\n", + tw->tw_prot->name, tw); +#endif + kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); + module_put(owner); + } +} +EXPORT_SYMBOL_GPL(inet_twsk_put); + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -76,7 +91,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* Step 2: Remove SK from established hash. */ if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, -1); /* Step 3: Hash TW into TIMEWAIT chain. */ inet_twsk_add_node(tw, &ehead->twchain); @@ -194,16 +209,14 @@ out: EXPORT_SYMBOL_GPL(inet_twdr_hangman); -extern void twkill_slots_invalid(void); - void inet_twdr_twkill_work(struct work_struct *work) { struct inet_timewait_death_row *twdr = container_of(work, struct inet_timewait_death_row, twkill_work); int i; - if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) - twkill_slots_invalid(); + BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) > + (sizeof(twdr->thread_slots) * 8)); while (twdr->thread_slots) { spin_lock_bh(&twdr->death_lock); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 877da3ed52e..0b3b328d82d 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -110,7 +110,7 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev, + return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, ip_forward_finish); sr_failed: diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2143bf30597..a2e92f9709d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -50,7 +50,7 @@ * as well. Or notify me, at least. --ANK */ -int sysctl_ipfrag_max_dist __read_mostly = 64; +static int sysctl_ipfrag_max_dist __read_mostly = 64; struct ipfrag_skb_cb { @@ -74,35 +74,16 @@ struct ipq { struct inet_peer *peer; }; -struct inet_frags_ctl ip4_frags_ctl __read_mostly = { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. - */ - .high_thresh = 256 * 1024, - .low_thresh = 192 * 1024, - - /* - * Important NOTE! Fragment queue must be destroyed before MSL expires. - * RFC791 is wrong proposing to prolongate timer each fragment arrival - * by TTL. - */ - .timeout = IP_FRAG_TIME, - .secret_interval = 10 * 60 * HZ, -}; - static struct inet_frags ip4_frags; -int ip_frag_nqueues(void) +int ip_frag_nqueues(struct net *net) { - return ip4_frags.nqueues; + return net->ipv4.frags.nqueues; } -int ip_frag_mem(void) +int ip_frag_mem(struct net *net) { - return atomic_read(&ip4_frags.mem); + return atomic_read(&net->ipv4.frags.mem); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -142,11 +123,12 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a) } /* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) +static __inline__ void frag_kfree_skb(struct netns_frags *nf, + struct sk_buff *skb, int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &ip4_frags.mem); + atomic_sub(skb->truesize, &nf->mem); kfree_skb(skb); } @@ -192,11 +174,11 @@ static void ipq_kill(struct ipq *ipq) /* Memory limiting on fragments. Evictor trashes the oldest * fragment queue until we are back under the threshold. */ -static void ip_evictor(void) +static void ip_evictor(struct net *net) { int evicted; - evicted = inet_frag_evictor(&ip4_frags); + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); if (evicted) IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); } @@ -236,7 +218,7 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static inline struct ipq *ip_find(struct iphdr *iph, u32 user) +static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -246,7 +228,7 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) arg.user = user; hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - q = inet_frag_find(&ip4_frags, &arg, hash); + q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); if (q == NULL) goto out_nomem; @@ -286,7 +268,7 @@ static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; - if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); return -ETIMEDOUT; } @@ -294,7 +276,7 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp, NULL); + frag_kfree_skb(qp->q.net, fp, NULL); fp = xp; } while (fp); @@ -431,7 +413,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); + frag_kfree_skb(qp->q.net, free_it, NULL); } } @@ -451,7 +433,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; - atomic_add(skb->truesize, &ip4_frags.mem); + atomic_add(skb->truesize, &qp->q.net->mem); if (offset == 0) qp->q.last_in |= FIRST_IN; @@ -459,7 +441,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) return ip_frag_reasm(qp, prev, dev); write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &ip4_frags.lru_list); + list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); write_unlock(&ip4_frags.lock); return -EINPROGRESS; @@ -534,12 +516,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &ip4_frags.mem); + atomic_add(clone->truesize, &qp->q.net->mem); } skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &ip4_frags.mem); + atomic_sub(head->truesize, &qp->q.net->mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -549,7 +531,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &ip4_frags.mem); + atomic_sub(fp->truesize, &qp->q.net->mem); } head->next = NULL; @@ -582,15 +564,17 @@ out_fail: int ip_defrag(struct sk_buff *skb, u32 user) { struct ipq *qp; + struct net *net; IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + net = skb->dev->nd_net; /* Start by cleaning up the memory. */ - if (atomic_read(&ip4_frags.mem) > ip4_frags_ctl.high_thresh) - ip_evictor(); + if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) + ip_evictor(net); /* Lookup (or create) queue header */ - if ((qp = ip_find(ip_hdr(skb), user)) != NULL) { + if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; spin_lock(&qp->q.lock); @@ -607,9 +591,142 @@ int ip_defrag(struct sk_buff *skb, u32 user) return -ENOMEM; } +#ifdef CONFIG_SYSCTL +static int zero; + +static struct ctl_table ip4_frags_ctl_table[] = { + { + .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, + .procname = "ipfrag_high_thresh", + .data = &init_net.ipv4.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, + .procname = "ipfrag_low_thresh", + .data = &init_net.ipv4.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_TIME, + .procname = "ipfrag_time", + .data = &init_net.ipv4.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, + .procname = "ipfrag_secret_interval", + .data = &ip4_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; + +static int ip4_frags_ctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip4_frags_ctl_table; + if (net != &init_net) { + table = kmemdup(table, sizeof(ip4_frags_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv4.frags.high_thresh; + table[1].data = &net->ipv4.frags.low_thresh; + table[2].data = &net->ipv4.frags.timeout; + table[3].mode &= ~0222; + table[4].mode &= ~0222; + } + + hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv4.frags_hdr = hdr; + return 0; + +err_reg: + if (net != &init_net) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void ip4_frags_ctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv4.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.frags_hdr); + kfree(table); +} +#else +static inline int ip4_frags_ctl_register(struct net *net) +{ + return 0; +} + +static inline void ip4_frags_ctl_unregister(struct net *net) +{ +} +#endif + +static int ipv4_frags_init_net(struct net *net) +{ + /* + * Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to + * measurably harm machine performance. + */ + net->ipv4.frags.high_thresh = 256 * 1024; + net->ipv4.frags.low_thresh = 192 * 1024; + /* + * Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival + * by TTL. + */ + net->ipv4.frags.timeout = IP_FRAG_TIME; + + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ctl_register(net); +} + +static void ipv4_frags_exit_net(struct net *net) +{ + ip4_frags_ctl_unregister(net); + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); +} + +static struct pernet_operations ip4_frags_ops = { + .init = ipv4_frags_init_net, + .exit = ipv4_frags_exit_net, +}; + void __init ipfrag_init(void) { - ip4_frags.ctl = &ip4_frags_ctl; + register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; @@ -617,6 +734,7 @@ void __init ipfrag_init(void) ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; + ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4b93f32de10..63f69171935 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -176,7 +176,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be3 } for (t = tunnels_l[h1]; t; t = t->next) { if (local == t->parms.iph.saddr || - (local == t->parms.iph.daddr && MULTICAST(local))) { + (local == t->parms.iph.daddr && + ipv4_is_multicast(local))) { if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) return t; } @@ -201,7 +202,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms) if (local) prio |= 1; - if (remote && !MULTICAST(remote)) { + if (remote && !ipv4_is_multicast(remote)) { prio |= 2; h ^= HASH(remote); } @@ -367,7 +368,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info) read_lock(&ipgre_lock); t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0); - if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr)) + if (t == NULL || t->parms.iph.daddr == 0 || + ipv4_is_multicast(t->parms.iph.daddr)) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) @@ -478,7 +480,7 @@ out: fl.fl4_dst = eiph->saddr; fl.fl4_tos = RT_TOS(eiph->tos); fl.proto = IPPROTO_GRE; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { kfree_skb(skb2); return; } @@ -491,7 +493,7 @@ out: fl.fl4_dst = eiph->daddr; fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&rt, &fl) || + if (ip_route_output_key(&init_net, &rt, &fl) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); kfree_skb(skb2); @@ -619,7 +621,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST - if (MULTICAST(iph->daddr)) { + if (ipv4_is_multicast(iph->daddr)) { /* Looped back packet, drop it! */ if (((struct rtable*)skb->dst)->fl.iif == 0) goto drop; @@ -746,7 +748,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .saddr = tiph->saddr, .tos = RT_TOS(tos) } }, .proto = IPPROTO_GRE }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error; } @@ -783,7 +785,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct rt6_info *rt6 = (struct rt6_info*)skb->dst; if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || + if ((tunnel->parms.iph.daddr && + !ipv4_is_multicast(tunnel->parms.iph.daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; skb->dst->metrics[RTAX_MTU-1] = mtu; @@ -896,6 +899,59 @@ tx_error: return 0; } +static void ipgre_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = ETH_DATA_LEN; + int addend = sizeof(struct iphdr) + 4; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + /* Guess output device to choose reasonable mtu and hard_header_len */ + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_GRE }; + struct rtable *rt; + if (!ip_route_output_key(&init_net, &rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + /* Precalculate GRE options length */ + if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (tunnel->parms.o_flags&GRE_CSUM) + addend += 4; + if (tunnel->parms.o_flags&GRE_KEY) + addend += 4; + if (tunnel->parms.o_flags&GRE_SEQ) + addend += 4; + } + dev->hard_header_len = hlen + addend; + dev->mtu = mtu - addend; + tunnel->hlen = addend; + +} + static int ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -956,7 +1012,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t = netdev_priv(dev); - if (MULTICAST(p.iph.daddr)) + if (ipv4_is_multicast(p.iph.daddr)) nflags = IFF_BROADCAST; else if (p.iph.daddr) nflags = IFF_POINTOPOINT; @@ -983,6 +1039,11 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t->parms.iph.ttl = p.iph.ttl; t->parms.iph.tos = p.iph.tos; t->parms.iph.frag_off = p.iph.frag_off; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipgre_tunnel_bind_dev(dev); + netdev_state_change(dev); + } } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; @@ -1085,7 +1146,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, memcpy(&iph->daddr, daddr, 4); return t->hlen; } - if (iph->daddr && !MULTICAST(iph->daddr)) + if (iph->daddr && !ipv4_is_multicast(iph->daddr)) return t->hlen; return -t->hlen; @@ -1108,7 +1169,7 @@ static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - if (MULTICAST(t->parms.iph.daddr)) { + if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi fl = { .oif = t->parms.link, .nl_u = { .ip4_u = { .daddr = t->parms.iph.daddr, @@ -1116,7 +1177,7 @@ static int ipgre_open(struct net_device *dev) .tos = RT_TOS(t->parms.iph.tos) } }, .proto = IPPROTO_GRE }; struct rtable *rt; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) return -EADDRNOTAVAIL; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1131,8 +1192,9 @@ static int ipgre_open(struct net_device *dev) static int ipgre_close(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - if (MULTICAST(t->parms.iph.daddr) && t->mlink) { - struct in_device *in_dev = inetdev_by_index(t->mlink); + if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { + struct in_device *in_dev; + in_dev = inetdev_by_index(dev->nd_net, t->mlink); if (in_dev) { ip_mc_dec_group(in_dev, t->parms.iph.daddr); in_dev_put(in_dev); @@ -1162,12 +1224,8 @@ static void ipgre_tunnel_setup(struct net_device *dev) static int ipgre_tunnel_init(struct net_device *dev) { - struct net_device *tdev = NULL; struct ip_tunnel *tunnel; struct iphdr *iph; - int hlen = LL_MAX_HEADER; - int mtu = ETH_DATA_LEN; - int addend = sizeof(struct iphdr) + 4; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; @@ -1178,25 +1236,11 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - /* Guess output device to choose reasonable mtu and hard_header_len */ + ipgre_tunnel_bind_dev(dev); if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_GRE }; - struct rtable *rt; - if (!ip_route_output_key(&rt, &fl)) { - tdev = rt->u.dst.dev; - ip_rt_put(rt); - } - - dev->flags |= IFF_POINTOPOINT; - #ifdef CONFIG_NET_IPGRE_BROADCAST - if (MULTICAST(iph->daddr)) { + if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; @@ -1205,31 +1249,9 @@ static int ipgre_tunnel_init(struct net_device *dev) dev->stop = ipgre_close; } #endif - } else { + } else dev->header_ops = &ipgre_header_ops; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); - - if (tdev) { - hlen = tdev->hard_header_len; - mtu = tdev->mtu; - } - dev->iflink = tunnel->parms.link; - /* Precalculate GRE options length */ - if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { - if (tunnel->parms.o_flags&GRE_CSUM) - addend += 4; - if (tunnel->parms.o_flags&GRE_KEY) - addend += 4; - if (tunnel->parms.o_flags&GRE_SEQ) - addend += 4; - } - dev->hard_header_len = hlen + addend; - dev->mtu = mtu - addend; - tunnel->hlen = addend; return 0; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 168c871fcd7..65631391d47 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -204,22 +204,14 @@ static int ip_local_deliver_finish(struct sk_buff *skb) rcu_read_lock(); { - /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ int protocol = ip_hdr(skb)->protocol; - int hash; - struct sock *raw_sk; + int hash, raw; struct net_protocol *ipprot; resubmit: - hash = protocol & (MAX_INET_PROTOS - 1); - raw_sk = sk_head(&raw_v4_htable[hash]); - - /* If there maybe a raw socket we must check - if not we - * don't care less - */ - if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) - raw_sk = NULL; + raw = raw_local_deliver(skb, protocol); + hash = protocol & (MAX_INET_PROTOS - 1); if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; @@ -237,7 +229,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb) } IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); } else { - if (!raw_sk) { + if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, @@ -268,7 +260,7 @@ int ip_local_deliver(struct sk_buff *skb) return 0; } - return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish); } @@ -347,7 +339,7 @@ static int ip_rcv_finish(struct sk_buff *skb) #ifdef CONFIG_NET_CLS_ROUTE if (unlikely(skb->dst->tclassid)) { - struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); + struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); u32 idx = skb->dst->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes+=skb->len; @@ -442,7 +434,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, + return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); inhdr_error: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 2f14745a9e1..4d315158fd3 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -151,7 +151,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) __be32 addr; memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(addr) != RTN_LOCAL) { + if (inet_addr_type(&init_net, addr) != RTN_LOCAL) { dopt->ts_needtime = 1; soffset += 8; } @@ -400,7 +400,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); - if (inet_addr_type(addr) == RTN_UNICAST) + if (inet_addr_type(&init_net, addr) == RTN_UNICAST) break; if (skb) timeptr = (__be32*)&optptr[optptr[2]+3]; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bc9e57550e8..18070ca6577 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -91,6 +91,28 @@ __inline__ void ip_send_check(struct iphdr *iph) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +int __ip_local_out(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->tot_len = htons(skb->len); + ip_send_check(iph); + return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + dst_output); +} + +int ip_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip_local_out); + /* dev_loopback_xmit for use with netfilter. */ static int ip_dev_loopback_xmit(struct sk_buff *newskb) { @@ -138,20 +160,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->sk_protocol; - iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, sk); if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, daddr, rt, 0); } - ip_send_check(iph); skb->priority = sk->sk_priority; /* Send it out. */ - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); @@ -251,8 +270,8 @@ int ip_mc_output(struct sk_buff *skb) ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, - newskb->dev, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, ip_dev_loopback_xmit); } @@ -267,11 +286,11 @@ int ip_mc_output(struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -285,7 +304,7 @@ int ip_output(struct sk_buff *skb) skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -331,7 +350,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) * itself out. */ security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) + if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) goto no_route; } sk_setup_caps(sk, &rt->u.dst); @@ -347,7 +366,6 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) iph->frag_off = htons(IP_DF); else @@ -366,13 +384,9 @@ packet_routed: ip_select_ident_more(iph, &rt->u.dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); - /* Add an IP checksum. */ - ip_send_check(iph); - skb->priority = sk->sk_priority; - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); no_route: IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); @@ -1262,14 +1276,12 @@ int ip_push_pending_frames(struct sock *sk) ip_options_build(skb, opt, inet->cork.addr, rt, 0); } iph->tos = inet->tos; - iph->tot_len = htons(skb->len); iph->frag_off = df; ip_select_ident(iph, &rt->u.dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; - ip_send_check(iph); skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); @@ -1279,8 +1291,7 @@ int ip_push_pending_frames(struct sock *sk) skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + err = ip_local_out(skb); if (err) { if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; @@ -1330,8 +1341,6 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * * Should run single threaded per socket because it uses the sock * structure to pass arguments. - * - * LATER: switch from ip_build_xmit to ip_append_* */ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len) @@ -1370,7 +1379,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar .dport = tcp_hdr(skb)->source } }, .proto = sk->sk_protocol }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(sk->sk_net, &rt, &fl)) return; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 82817e55436..754b0a5bbfe 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -594,7 +594,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = 0; break; } - dev = ip_dev_find(mreq.imr_address.s_addr); + dev = ip_dev_find(&init_net, mreq.imr_address.s_addr); if (dev) { mreq.imr_ifindex = dev->ifindex; dev_put(dev); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2c44a94c213..f4af99ad8fd 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -182,7 +182,6 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct xfrm_state *t; - u8 mode = XFRM_MODE_TUNNEL; t = xfrm_state_alloc(); if (t == NULL) @@ -193,9 +192,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->id.daddr.a4 = x->id.daddr.a4; memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET; - if (x->props.mode == XFRM_MODE_BEET) - mode = x->props.mode; - t->props.mode = mode; + t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; @@ -389,15 +386,22 @@ static int ipcomp_init_state(struct xfrm_state *x) if (x->encap) goto out; + x->props.header_len = 0; + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct iphdr); + break; + default: + goto out; + } + err = -ENOMEM; ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) goto out; - x->props.header_len = 0; - if (x->props.mode == XFRM_MODE_TUNNEL) - x->props.header_len += sizeof(struct iphdr); - mutex_lock(&ipcomp_resource_mutex); if (!ipcomp_alloc_scratches()) goto error; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b8f7763b226..a52b5853aaa 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -140,6 +140,9 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ +/* vendor class identifier */ +static char vendor_class_identifier[253] __initdata; + /* Persistent data: */ static int ic_proto_used; /* Protocol used, if any */ @@ -299,7 +302,7 @@ static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) mm_segment_t oldfs = get_fs(); set_fs(get_ds()); - res = ip_rt_ioctl(cmd, (void __user *) arg); + res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg); set_fs(oldfs); return res; } @@ -588,6 +591,7 @@ ic_dhcp_init_options(u8 *options) u8 mt = ((ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST); u8 *e = options; + int len; #ifdef IPCONFIG_DEBUG printk("DHCP: Sending message type %d\n", mt); @@ -628,6 +632,16 @@ ic_dhcp_init_options(u8 *options) *e++ = sizeof(ic_req_params); memcpy(e, ic_req_params, sizeof(ic_req_params)); e += sizeof(ic_req_params); + + if (*vendor_class_identifier) { + printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", + vendor_class_identifier); + *e++ = 60; /* Class-identifier */ + len = strlen(vendor_class_identifier); + *e++ = len; + memcpy(e, vendor_class_identifier, len); + e += len; + } } *e++ = 255; /* End of the list */ @@ -1513,5 +1527,16 @@ static int __init nfsaddrs_config_setup(char *addrs) return ip_auto_config_setup(addrs); } +static int __init vendor_class_identifier_setup(char *addrs) +{ + if (strlcpy(vendor_class_identifier, addrs, + sizeof(vendor_class_identifier)) + >= sizeof(vendor_class_identifier)) + printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"", + vendor_class_identifier); + return 1; +} + __setup("ip=", ip_auto_config_setup); __setup("nfsaddrs=", nfsaddrs_config_setup); +__setup("dhcpclass=", vendor_class_identifier_setup); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 8c2b2b0741d..da281581692 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -405,7 +405,7 @@ out: fl.fl4_daddr = eiph->saddr; fl.fl4_tos = RT_TOS(eiph->tos); fl.proto = IPPROTO_IPIP; - if (ip_route_output_key(&rt, &key)) { + if (ip_route_output_key(&init_net, &rt, &key)) { kfree_skb(skb2); return 0; } @@ -418,7 +418,7 @@ out: fl.fl4_daddr = eiph->daddr; fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&rt, &fl) || + if (ip_route_output_key(&init_net, &rt, &fl) || rt->u.dst.dev->type != ARPHRD_TUNNEL) { ip_rt_put(rt); kfree_skb(skb2); @@ -547,7 +547,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .saddr = tiph->saddr, .tos = RT_TOS(tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } @@ -651,6 +651,40 @@ tx_error: return 0; } +static void ipip_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + struct rtable *rt; + if (!ip_route_output_key(&init_net, &rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; +} + static int ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -723,6 +757,11 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t->parms.iph.ttl = p.iph.ttl; t->parms.iph.tos = p.iph.tos; t->parms.iph.frag_off = p.iph.frag_off; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipip_tunnel_bind_dev(dev); + netdev_state_change(dev); + } } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; @@ -791,12 +830,9 @@ static void ipip_tunnel_setup(struct net_device *dev) static int ipip_tunnel_init(struct net_device *dev) { - struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -804,29 +840,7 @@ static int ipip_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; - struct rtable *rt; - if (!ip_route_output_key(&rt, &fl)) { - tdev = rt->u.dst.dev; - ip_rt_put(rt); - } - dev->flags |= IFF_POINTOPOINT; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); - - if (tdev) { - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); - } - dev->iflink = tunnel->parms.link; + ipip_tunnel_bind_dev(dev); return 0; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 37bb497d92a..a94f52c207a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -141,7 +141,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPIP; sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (void*)&p; + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; oldfs = get_fs(); set_fs(KERNEL_DS); err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); @@ -321,7 +321,7 @@ static void ipmr_destroy_unres(struct mfc_cache *c) e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); - rtnl_unicast(skb, NETLINK_CB(skb).pid); + rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); } else kfree_skb(skb); } @@ -423,7 +423,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) return -ENOBUFS; break; case 0: - dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr); + dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); @@ -533,7 +533,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); } - rtnl_unicast(skb, NETLINK_CB(skb).pid); + rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); } else ip_mr_forward(skb, c, 0); } @@ -749,7 +749,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) return 0; } - if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c=ipmr_cache_alloc(); @@ -849,7 +849,7 @@ static void mrtsock_destruct(struct sock *sk) { rtnl_lock(); if (sk == mroute_socket) { - IPV4_DEVCONF_ALL(MC_FORWARDING)--; + IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)--; write_lock_bh(&mrt_lock); mroute_socket=NULL; @@ -898,7 +898,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt mroute_socket=sk; write_unlock_bh(&mrt_lock); - IPV4_DEVCONF_ALL(MC_FORWARDING)++; + IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)++; } rtnl_unlock(); return ret; @@ -954,10 +954,12 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt #ifdef CONFIG_IP_PIMSM case MRT_PIM: { - int v, ret; + int v; + if (get_user(v,(int __user *)optval)) return -EFAULT; - v = (v)?1:0; + v = (v) ? 1 : 0; + rtnl_lock(); ret = 0; if (v != mroute_do_pim) { @@ -1183,7 +1185,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) .saddr = vif->local, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { @@ -1192,7 +1194,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { .daddr = iph->daddr, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) goto out_free; } @@ -1245,7 +1247,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, + NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev, ipmr_forward_finish); return; @@ -1461,7 +1463,7 @@ int pim_rcv_v1(struct sk_buff * skb) b. packet is not a NULL-REGISTER c. packet is not truncated */ - if (!MULTICAST(encap->daddr) || + if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + sizeof(*pim) > skb->len) goto drop; @@ -1517,7 +1519,7 @@ static int pim_rcv(struct sk_buff * skb) /* check if the inner packet is destined to mcast group */ encap = (struct iphdr *)(skb_transport_header(skb) + sizeof(struct pimreghdr)); - if (!MULTICAST(encap->daddr) || + if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + sizeof(*pim) > skb->len) goto drop; @@ -1659,6 +1661,7 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, } static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(mrt_lock) { read_lock(&mrt_lock); return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) @@ -1682,6 +1685,7 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) + __releases(mrt_lock) { read_unlock(&mrt_lock); } @@ -1889,8 +1893,7 @@ void __init ip_mr_init(void) sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - init_timer(&ipmr_expire_timer); - ipmr_expire_timer.function=ipmr_expire_process; + setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); register_netdevice_notifier(&ip_mr_notifier); #ifdef CONFIG_PROC_FS proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops); diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 664cb8e97c1..535abe0c45e 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -51,18 +51,13 @@ static DEFINE_MUTEX(__ip_vs_app_mutex); */ static inline int ip_vs_app_get(struct ip_vs_app *app) { - /* test and get the module atomically */ - if (app->module) - return try_module_get(app->module); - else - return 1; + return try_module_get(app->module); } static inline void ip_vs_app_put(struct ip_vs_app *app) { - if (app->module) - module_put(app->module); + module_put(app->module); } diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 0a9f3c37e18..65f1ba11275 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -393,7 +393,15 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) atomic_inc(&dest->refcnt); /* Bind with the destination and its corresponding transmitter */ - cp->flags |= atomic_read(&dest->conn_flags); + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + cp->flags |= atomic_read(&dest->conn_flags) & + (~IP_VS_CONN_F_INACTIVE); + else + cp->flags |= atomic_read(&dest->conn_flags); cp->dest = dest; IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " @@ -412,7 +420,11 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) /* It is a normal connection, so increase the inactive connection counter because it is in TCP SYNRECV state (inactive) or other protocol inacive state */ - atomic_inc(&dest->inactconns); + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); } else { /* It is a persistent connection/template, so increase the peristent connection counter */ @@ -629,9 +641,7 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport } INIT_LIST_HEAD(&cp->c_list); - init_timer(&cp->timer); - cp->timer.data = (unsigned long)cp; - cp->timer.function = ip_vs_conn_expire; + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); cp->protocol = proto; cp->caddr = caddr; cp->cport = cport; @@ -783,6 +793,57 @@ static const struct file_operations ip_vs_conn_fops = { .llseek = seq_lseek, .release = seq_release, }; + +static const char *ip_vs_origin_name(unsigned flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + + seq_printf(seq, + "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr), ntohs(cp->cport), + ntohl(cp->vaddr), ntohs(cp->vport), + ntohl(cp->daddr), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_sync_seq_ops); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + #endif @@ -942,6 +1003,7 @@ int ip_vs_conn_init(void) } proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); @@ -958,5 +1020,6 @@ void ip_vs_conn_cleanup(void) /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); proc_net_remove(&init_net, "ip_vs_conn"); + proc_net_remove(&init_net, "ip_vs_conn_sync"); vfree(ip_vs_conn_tab); } diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 8fba20256f5..963981a9d50 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -423,7 +423,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, and the destination is RTN_UNICAST (and not local), then create a cache_bypass connection entry */ if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(iph->daddr) == RTN_UNICAST)) { + && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) { int ret, cs; struct ip_vs_conn *cp; @@ -481,7 +481,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING * chain, and is used for VS/NAT. * It detects packets for VS/NAT connections and sends the packets * immediately. This can avoid that iptable_nat mangles the packets @@ -679,7 +679,7 @@ static inline int is_tcp_reset(const struct sk_buff *skb) } /* - * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT. + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. * Check if outgoing packet belongs to the established ip_vs_conn, * rewrite addresses of the packet and send it on its way... */ @@ -814,7 +814,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ? + if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) return NF_STOLEN; } @@ -1003,12 +1003,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, /* - * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. * When fwmark-based virtual service is used, such as transparent * cache cluster, TCP packets can be marked and routed to ip_vs_in, * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and - * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain * and send them to ip_vs_in_icmp. */ static unsigned int @@ -1025,43 +1025,42 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, } -/* After packet filtering, forward packet through VS/DR, VS/TUN, - or VS/NAT(change destination), so that filtering rules can be - applied to IPVS. */ -static struct nf_hook_ops ip_vs_in_ops = { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = 100, -}; - -/* After packet filtering, change source only for VS/NAT */ -static struct nf_hook_ops ip_vs_out_ops = { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_FORWARD, - .priority = 100, -}; - -/* After packet filtering (but before ip_vs_out_icmp), catch icmp - destined for 0.0.0.0/0, which is for incoming IPVS connections */ -static struct nf_hook_ops ip_vs_forward_icmp_ops = { - .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_FORWARD, - .priority = 99, -}; - -/* Before the netfilter connection tracking, exit from POST_ROUTING */ -static struct nf_hook_ops ip_vs_post_routing_ops = { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, +static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, }; @@ -1092,37 +1091,15 @@ static int __init ip_vs_init(void) goto cleanup_app; } - ret = nf_register_hook(&ip_vs_in_ops); + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { - IP_VS_ERR("can't register in hook.\n"); + IP_VS_ERR("can't register hooks.\n"); goto cleanup_conn; } - ret = nf_register_hook(&ip_vs_out_ops); - if (ret < 0) { - IP_VS_ERR("can't register out hook.\n"); - goto cleanup_inops; - } - ret = nf_register_hook(&ip_vs_post_routing_ops); - if (ret < 0) { - IP_VS_ERR("can't register post_routing hook.\n"); - goto cleanup_outops; - } - ret = nf_register_hook(&ip_vs_forward_icmp_ops); - if (ret < 0) { - IP_VS_ERR("can't register forward_icmp hook.\n"); - goto cleanup_postroutingops; - } - IP_VS_INFO("ipvs loaded.\n"); return ret; - cleanup_postroutingops: - nf_unregister_hook(&ip_vs_post_routing_ops); - cleanup_outops: - nf_unregister_hook(&ip_vs_out_ops); - cleanup_inops: - nf_unregister_hook(&ip_vs_in_ops); cleanup_conn: ip_vs_conn_cleanup(); cleanup_app: @@ -1136,10 +1113,7 @@ static int __init ip_vs_init(void) static void __exit ip_vs_cleanup(void) { - nf_unregister_hook(&ip_vs_forward_icmp_ops); - nf_unregister_hook(&ip_vs_post_routing_ops); - nf_unregister_hook(&ip_vs_out_ops); - nf_unregister_hook(&ip_vs_in_ops); + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 693d92490c1..94c5767c8e0 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -704,7 +704,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; /* check if local node and update the flags */ - if (inet_addr_type(udest->addr) == RTN_LOCAL) { + if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) | IP_VS_CONN_F_LOCALNODE; } @@ -756,7 +756,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, EnterFunction(2); - atype = inet_addr_type(udest->addr); + atype = inet_addr_type(&init_net, udest->addr); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; @@ -1591,34 +1591,13 @@ static struct ctl_table vs_vars[] = { { .ctl_name = 0 } }; -static ctl_table vs_table[] = { - { - .procname = "vs", - .mode = 0555, - .child = vs_vars - }, - { .ctl_name = 0 } -}; - -static ctl_table ipvs_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = vs_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table vs_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipvs_ipv4_table, - }, - { .ctl_name = 0 } +struct ctl_path net_vs_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "vs", }, + { } }; +EXPORT_SYMBOL_GPL(net_vs_ctl_path); static struct ctl_table_header * sysctl_header; @@ -2345,7 +2324,7 @@ int ip_vs_control_init(void) proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - sysctl_header = register_sysctl_table(vs_root_table); + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 7d68b80c4c1..dfa0d713c80 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/interrupt.h> +#include <linux/sysctl.h> #include <net/ip_vs.h> @@ -146,9 +147,8 @@ int ip_vs_new_estimator(struct ip_vs_stats *stats) write_lock_bh(&est_lock); est->next = est_list; if (est->next == NULL) { - init_timer(&est_timer); + setup_timer(&est_timer, estimation_timer, 0); est_timer.expires = jiffies + 2*HZ; - est_timer.function = estimation_timer; add_timer(&est_timer); } est_list = est; diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index ad89644ef5d..3888642706a 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -123,35 +123,6 @@ static ctl_table vs_vars_table[] = { { .ctl_name = 0 } }; -static ctl_table vs_table[] = { - { - .procname = "vs", - .mode = 0555, - .child = vs_vars_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipvs_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = vs_table - }, - { .ctl_name = 0 } -}; - -static ctl_table lblc_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipvs_ipv4_table - }, - { .ctl_name = 0 } -}; - static struct ctl_table_header * sysctl_header; /* @@ -391,9 +362,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Hook periodic timer for garbage collection */ - init_timer(&tbl->periodic_timer); - tbl->periodic_timer.data = (unsigned long)tbl; - tbl->periodic_timer.function = ip_vs_lblc_check_expire; + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)tbl); tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; add_timer(&tbl->periodic_timer); @@ -583,7 +553,7 @@ static int __init ip_vs_lblc_init(void) int ret; INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); - sysctl_header = register_sysctl_table(lblc_root_table); + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); if (ret) unregister_sysctl_table(sysctl_header); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 2a5ed85a335..daa260eb21c 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -311,35 +311,6 @@ static ctl_table vs_vars_table[] = { { .ctl_name = 0 } }; -static ctl_table vs_table[] = { - { - .procname = "vs", - .mode = 0555, - .child = vs_vars_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipvs_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = vs_table - }, - { .ctl_name = 0 } -}; - -static ctl_table lblcr_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipvs_ipv4_table - }, - { .ctl_name = 0 } -}; - static struct ctl_table_header * sysctl_header; /* @@ -575,9 +546,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Hook periodic timer for garbage collection */ - init_timer(&tbl->periodic_timer); - tbl->periodic_timer.data = (unsigned long)tbl; - tbl->periodic_timer.function = ip_vs_lblcr_check_expire; + setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, + (unsigned long)tbl); tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; add_timer(&tbl->periodic_timer); @@ -772,7 +742,7 @@ static int __init ip_vs_lblcr_init(void) int ret; INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); - sysctl_header = register_sysctl_table(lblcr_root_table); + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); if (ret) unregister_sysctl_table(sysctl_header); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index c0e11ec8f0f..dde28a250d9 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -165,7 +165,7 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->frag_off & __constant_htons(IP_OFFSET)) + else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", pp->name, NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index c36ccf057a1..aef0d3ee8e4 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -52,15 +52,15 @@ esp_conn_in_get(const struct sk_buff *skb, if (likely(!inverse)) { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { @@ -89,15 +89,15 @@ esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c index 43223586190..121a32b1b75 100644 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -24,6 +24,7 @@ #include <linux/interrupt.h> #include <asm/string.h> #include <linux/kmod.h> +#include <linux/sysctl.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index bd930efc18d..948378d0a75 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -305,10 +305,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); for (i=0; i<m->nr_conns; i++) { - unsigned flags; + unsigned flags, state; s = (struct ip_vs_sync_conn *)p; - flags = ntohs(s->flags); + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + state = ntohs(s->state); if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(s->protocol, s->caddr, s->cport, @@ -326,6 +327,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) dest = ip_vs_find_dest(s->daddr, s->dport, s->vaddr, s->vport, s->protocol); + /* Set the approprite ativity flag */ + if (s->protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } cp = ip_vs_conn_new(s->protocol, s->caddr, s->cport, s->vaddr, s->vport, @@ -337,7 +345,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) IP_VS_ERR("ip_vs_conn_new failed\n"); return; } - cp->state = ntohs(s->state); + cp->state = state; } else if (!cp->dest) { dest = ip_vs_try_bind_dest(cp); if (!dest) { @@ -346,8 +354,22 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) cp->flags = flags | IP_VS_CONN_F_HASHED; } else atomic_dec(&dest->refcnt); - } /* Note that we don't touch its state and flags - if it is a normal entry. */ + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } if (flags & IP_VS_CONN_F_SEQ_MASK) { opt = (struct ip_vs_sync_conn_options *)&s[1]; @@ -357,7 +379,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p += SIMPLE_CONN_SIZE; atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = ntohs(s->state); + cp->state = state; pp = ip_vs_proto_get(s->protocol); cp->timeout = pp->timeout_table[cp->state]; ip_vs_conn_put(cp); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 7c074e386c1..f63006caea0 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -16,8 +16,8 @@ */ #include <linux/kernel.h> -#include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ +#include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ @@ -59,7 +59,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) return dst; } -static inline struct rtable * +static struct rtable * __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) { struct rtable *rt; /* Route to the other host */ @@ -78,7 +78,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { spin_unlock(&dest->dst_lock); IP_VS_DBG_RL("ip_route_output error, " "dest: %u.%u.%u.%u\n", @@ -101,7 +101,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: " "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); return NULL; @@ -129,7 +129,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) do { \ (skb)->ipvs_property = 1; \ skb_forward_csum(skb); \ - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ + NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ } while (0) @@ -170,7 +170,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); goto tx_error_icmp; @@ -406,14 +406,12 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->ttl = old_iph->ttl; - iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, NULL); - ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + ip_local_out(skb); LeaveFunction(10); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 5539debf497..9a904c6c0dc 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -7,6 +7,7 @@ #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) @@ -18,12 +19,12 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) unsigned int hh_len; unsigned int type; - type = inet_addr_type(iph->saddr); + type = inet_addr_type(&init_net, iph->saddr); if (addr_type == RTN_UNSPEC) addr_type = type; /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause - * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ if (addr_type == RTN_LOCAL) { fl.nl_u.ip4_u.daddr = iph->daddr; @@ -32,7 +33,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl.mark = skb->mark; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(&init_net, &rt, &fl) != 0) return -1; /* Drop old route. */ @@ -42,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(&init_net, &rt, &fl) != 0) return -1; odst = skb->dst; @@ -122,11 +123,12 @@ struct ip_rt_info { u_int8_t tos; }; -static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info) +static void nf_ip_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) { - struct ip_rt_info *rt_info = nf_info_reroute(info); + struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (info->hook == NF_IP_LOCAL_OUT) { + if (entry->hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); rt_info->tos = iph->tos; @@ -135,11 +137,12 @@ static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info) } } -static int nf_ip_reroute(struct sk_buff *skb, const struct nf_info *info) +static int nf_ip_reroute(struct sk_buff *skb, + const struct nf_queue_entry *entry) { - const struct ip_rt_info *rt_info = nf_info_reroute(info); + const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (info->hook == NF_IP_LOCAL_OUT) { + if (entry->hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); if (!(iph->tos == rt_info->tos @@ -158,7 +161,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (hook != NF_IP_PRE_ROUTING && hook != NF_IP_LOCAL_IN) + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if ((protocol == 0 && !csum_fold(skb->csum)) || !csum_tcpudp_magic(iph->saddr, iph->daddr, @@ -182,9 +185,15 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, EXPORT_SYMBOL(nf_ip_checksum); -static struct nf_afinfo nf_ip_afinfo = { +static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) +{ + return ip_route_output_key(&init_net, (struct rtable **)dst, fl); +} + +static const struct nf_afinfo nf_ip_afinfo = { .family = AF_INET, .checksum = nf_ip_checksum, + .route = nf_ip_route, .saveroute = nf_ip_saveroute, .reroute = nf_ip_reroute, .route_key_size = sizeof(struct ip_rt_info), @@ -202,3 +211,13 @@ static void ipv4_netfilter_fini(void) module_init(ipv4_netfilter_init); module_exit(ipv4_netfilter_fini); + +#ifdef CONFIG_SYSCTL +struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, }, + { } +}; +EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); +#endif /* CONFIG_SYSCTL */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9aca9c55687..9a077cb2479 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -8,6 +8,7 @@ menu "IP: Netfilter Configuration" config NF_CONNTRACK_IPV4 tristate "IPv4 connection tracking support (required for NAT)" depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -32,6 +33,7 @@ config NF_CONNTRACK_PROC_COMPAT config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" + depends on NETFILTER_ADVANCED help Netfilter has the ability to queue packets to user space: the netlink device can be used to access them using this driver. @@ -44,6 +46,7 @@ config IP_NF_QUEUE config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" + default m if NETFILTER_ADVANCED=n select NETFILTER_XTABLES help iptables is a general, extensible packet identification framework. @@ -54,27 +57,10 @@ config IP_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. # The matches. -config IP_NF_MATCH_IPRANGE - tristate "IP range match support" - depends on IP_NF_IPTABLES - help - This option makes possible to match IP addresses against IP address - ranges. - - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_MATCH_TOS - tristate "TOS match support" - depends on IP_NF_IPTABLES - help - TOS matching allows you to match packets based on the Type Of - Service fields of the IP packet. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_RECENT - tristate "recent match support" + tristate '"recent" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This match is used for creating one or many lists of recently used addresses and then matching against that/those list(s). @@ -85,8 +71,9 @@ config IP_NF_MATCH_RECENT To compile it as a module, choose M here. If unsure, say N. config IP_NF_MATCH_ECN - tristate "ECN match support" + tristate '"ecn" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This option adds a `ECN' match, which allows you to match against the IPv4 and TCP header ECN fields. @@ -94,8 +81,9 @@ config IP_NF_MATCH_ECN To compile it as a module, choose M here. If unsure, say N. config IP_NF_MATCH_AH - tristate "AH match support" + tristate '"ah" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This match extension allows you to match a range of SPIs inside AH header of IPSec packets. @@ -103,30 +91,23 @@ config IP_NF_MATCH_AH To compile it as a module, choose M here. If unsure, say N. config IP_NF_MATCH_TTL - tristate "TTL match support" + tristate '"ttl" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user to match packets by their TTL value. To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_OWNER - tristate "Owner match support" - depends on IP_NF_IPTABLES - help - Packet owner matching allows you to match locally-generated packets - based on who created them: the user, group, process or session. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_ADDRTYPE - tristate 'address type match support' + tristate '"addrtype" address type match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This option allows you to match what routing thinks of an address, eg. UNICAST, LOCAL, BROADCAST, ... - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. @@ -134,6 +115,7 @@ config IP_NF_MATCH_ADDRTYPE config IP_NF_FILTER tristate "Packet filtering" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -144,6 +126,7 @@ config IP_NF_FILTER config IP_NF_TARGET_REJECT tristate "REJECT target support" depends on IP_NF_FILTER + default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMP error should be issued in response to an incoming packet, rather @@ -154,6 +137,7 @@ config IP_NF_TARGET_REJECT config IP_NF_TARGET_LOG tristate "LOG target support" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in any iptables table which records the packet header to the syslog. @@ -163,6 +147,7 @@ config IP_NF_TARGET_LOG config IP_NF_TARGET_ULOG tristate "ULOG target support" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n ---help--- This option enables the old IPv4-only "ipt_ULOG" implementation @@ -183,6 +168,7 @@ config IP_NF_TARGET_ULOG config NF_NAT tristate "Full NAT" depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4 + default m if NETFILTER_ADVANCED=n help The Full NAT option allows masquerading, port forwarding and other forms of full Network Address Port Translation. It is controlled by @@ -198,6 +184,7 @@ config NF_NAT_NEEDED config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" depends on NF_NAT + default m if NETFILTER_ADVANCED=n help Masquerading is a special case of NAT: all outgoing connections are changed to seem to come from a particular interface's address, and @@ -210,6 +197,7 @@ config IP_NF_TARGET_MASQUERADE config IP_NF_TARGET_REDIRECT tristate "REDIRECT target support" depends on NF_NAT + depends on NETFILTER_ADVANCED help REDIRECT is a special case of NAT: all incoming connections are mapped onto the incoming interface's address, causing the packets to @@ -221,6 +209,7 @@ config IP_NF_TARGET_REDIRECT config IP_NF_TARGET_NETMAP tristate "NETMAP target support" depends on NF_NAT + depends on NETFILTER_ADVANCED help NETMAP is an implementation of static 1:1 NAT mapping of network addresses. It maps the network address part, while keeping the host @@ -229,18 +218,10 @@ config IP_NF_TARGET_NETMAP To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_SAME - tristate "SAME target support (OBSOLETE)" - depends on NF_NAT - help - This option adds a `SAME' target, which works like the standard SNAT - target, but attempts to give clients the same IP for all connections. - - To compile it as a module, choose M here. If unsure, say N. - config NF_NAT_SNMP_BASIC - tristate "Basic SNMP-ALG support (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_NAT + tristate "Basic SNMP-ALG support" + depends on NF_NAT + depends on NETFILTER_ADVANCED ---help--- This module implements an Application Layer Gateway (ALG) for @@ -304,6 +285,7 @@ config NF_NAT_SIP config IP_NF_MANGLE tristate "Packet mangling" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -311,19 +293,10 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_TOS - tristate "TOS target support" - depends on IP_NF_MANGLE - help - This option adds a `TOS' target, which allows you to create rules in - the `mangle' table which alter the Type Of Service field of an IP - packet prior to routing. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED ---help--- This option adds a `ECN' target, which can be used in the iptables mangle table. @@ -338,6 +311,7 @@ config IP_NF_TARGET_ECN config IP_NF_TARGET_TTL tristate 'TTL target support' depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED help This option adds a `TTL' target, which enables the user to modify the TTL value of the IP header. @@ -353,6 +327,7 @@ config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" depends on IP_NF_MANGLE && EXPERIMENTAL depends on NF_CONNTRACK_IPV4 + depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK help The CLUSTERIP target allows you to build load-balancing clusters of @@ -365,6 +340,7 @@ config IP_NF_TARGET_CLUSTERIP config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -377,6 +353,7 @@ config IP_NF_RAW config IP_NF_ARPTABLES tristate "ARP tables support" select NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help arptables is a general, extensible packet identification framework. The ARP packet filtering and mangling (manipulation)subsystems diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7456833d6ad..0c7dc78a62e 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -44,10 +44,7 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o -obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o -obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o -obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o # targets @@ -58,8 +55,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o -obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o -obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2909c92ecd9..b4a810c28ac 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -19,9 +19,10 @@ #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/init.h> - -#include <asm/uaccess.h> #include <linux/mutex.h> +#include <linux/err.h> +#include <net/compat.h> +#include <asm/uaccess.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> @@ -83,7 +84,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, __be32 src_ipaddr, tgt_ipaddr; int i, ret; -#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg)) +#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, ARPT_INV_ARPOP)) { @@ -179,6 +180,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, } return 1; +#undef FWINV } static inline int arp_checkentry(const struct arpt_arp *arp) @@ -435,29 +437,9 @@ static int mark_source_chains(struct xt_table_info *newinfo, return 1; } -static inline int standard_check(const struct arpt_entry_target *t, - unsigned int max_offset) -{ - /* Check standard info. */ - if (t->u.target_size - != ARPT_ALIGN(sizeof(struct arpt_standard_target))) { - duprintf("arpt_standard_check: target size %u != %Zu\n", - t->u.target_size, - ARPT_ALIGN(sizeof(struct arpt_standard_target))); - return 0; - } - - return 1; -} - -static struct arpt_target arpt_standard_target; - -static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size, - unsigned int *i) +static inline int check_entry(struct arpt_entry *e, const char *name) { struct arpt_entry_target *t; - struct arpt_target *target; - int ret; if (!arp_checkentry(&e->arp)) { duprintf("arp_tables: arp check failed %p %s.\n", e, name); @@ -471,35 +453,57 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; + return 0; +} + +static inline int check_target(struct arpt_entry *e, const char *name) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + t = arpt_get_target(e); + target = t->u.kernel.target; + + ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), + name, e->comefrom, 0, 0); + if (!ret && t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, target, t->data, + e->comefrom)) { + duprintf("arp_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + } + return ret; +} + +static inline int +find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + ret = check_entry(e, name); + if (ret) + return ret; + + t = arpt_get_target(e); target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, t->u.user.revision), "arpt_%s", t->u.user.name); if (IS_ERR(target) || !target) { - duprintf("check_entry: `%s' not found\n", t->u.user.name); + duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = target ? PTR_ERR(target) : -ENOENT; goto out; } t->u.kernel.target = target; - ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), - name, e->comefrom, 0, 0); + ret = check_target(e, name); if (ret) goto err; - if (t->u.kernel.target == &arpt_standard_target) { - if (!standard_check(t, size)) { - ret = -EINVAL; - goto err; - } - } else if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { - duprintf("arp_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - ret = -EINVAL; - goto err; - } - (*i)++; return 0; err: @@ -633,7 +637,7 @@ static int translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, - check_entry, name, size, &i); + find_check_entry, name, size, &i); if (ret != 0) { ARPT_ENTRY_ITERATE(entry0, newinfo->size, @@ -704,16 +708,11 @@ static void get_counters(const struct xt_table_info *t, } } -static int copy_entries_to_user(unsigned int total_size, - struct arpt_table *table, - void __user *userptr) +static inline struct xt_counters *alloc_counters(struct arpt_table *table) { - unsigned int off, num, countersize; - struct arpt_entry *e; + unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - int ret = 0; - void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -723,13 +722,31 @@ static int copy_entries_to_user(unsigned int total_size, counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* First, sum counters... */ write_lock_bh(&table->lock); get_counters(private, counters); write_unlock_bh(&table->lock); + return counters; +} + +static int copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void __user *userptr) +{ + unsigned int off, num; + struct arpt_entry *e; + struct xt_counters *counters; + struct xt_table_info *private = table->private; + int ret = 0; + void *loc_cpu_entry; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + loc_cpu_entry = private->entries[raw_smp_processor_id()]; /* ... then copy entire thing ... */ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { @@ -767,23 +784,159 @@ static int copy_entries_to_user(unsigned int total_size, return ret; } -static int get_entries(const struct arpt_get_entries *entries, - struct arpt_get_entries __user *uptr) +#ifdef CONFIG_COMPAT +static void compat_standard_from_user(void *dst, void *src) +{ + int v = *(compat_int_t *)src; + + if (v > 0) + v += xt_compat_calc_jump(NF_ARP, v); + memcpy(dst, &v, sizeof(v)); +} + +static int compat_standard_to_user(void __user *dst, void *src) { + compat_int_t cv = *(int *)src; + + if (cv > 0) + cv -= xt_compat_calc_jump(NF_ARP, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; +} + +static int compat_calc_entry(struct arpt_entry *e, + const struct xt_table_info *info, + void *base, struct xt_table_info *newinfo) +{ + struct arpt_entry_target *t; + unsigned int entry_offset; + int off, i, ret; + + off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + entry_offset = (void *)e - base; + + t = arpt_get_target(e); + off += xt_compat_target_offset(t->u.kernel.target); + newinfo->size -= off; + ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct arpt_entry *)(base + info->hook_entry[i]))) + newinfo->hook_entry[i] -= off; + if (info->underflow[i] && + (e < (struct arpt_entry *)(base + info->underflow[i]))) + newinfo->underflow[i] -= off; + } + return 0; +} + +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) +{ + void *loc_cpu_entry; + + if (!newinfo || !info) + return -EINVAL; + + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; + loc_cpu_entry = info->entries[raw_smp_processor_id()]; + return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size, + compat_calc_entry, info, loc_cpu_entry, + newinfo); +} +#endif + +static int get_info(void __user *user, int *len, int compat) +{ + char name[ARPT_TABLE_MAXNAMELEN]; + struct arpt_table *t; int ret; + + if (*len != sizeof(struct arpt_getinfo)) { + duprintf("length %u != %Zu\n", *len, + sizeof(struct arpt_getinfo)); + return -EINVAL; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) + return -EFAULT; + + name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_lock(NF_ARP); +#endif + t = try_then_request_module(xt_find_table_lock(NF_ARP, name), + "arptable_%s", name); + if (t && !IS_ERR(t)) { + struct arpt_getinfo info; + struct xt_table_info *private = t->private; + +#ifdef CONFIG_COMPAT + if (compat) { + struct xt_table_info tmp; + ret = compat_table_info(private, &tmp); + xt_compat_flush_offsets(NF_ARP); + private = &tmp; + } +#endif + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, private->underflow, + sizeof(info.underflow)); + info.num_entries = private->number; + info.size = private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + xt_table_unlock(t); + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_unlock(NF_ARP); +#endif + return ret; +} + +static int get_entries(struct arpt_get_entries __user *uptr, int *len) +{ + int ret; + struct arpt_get_entries get; struct arpt_table *t; - t = xt_find_table_lock(NF_ARP, entries->name); + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct arpt_get_entries) + get.size) { + duprintf("get_entries: %u != %Zu\n", *len, + sizeof(struct arpt_get_entries) + get.size); + return -EINVAL; + } + + t = xt_find_table_lock(NF_ARP, get.name); if (t && !IS_ERR(t)) { struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); - if (entries->size == private->size) + if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - private->size, entries->size); + private->size, get.size); ret = -EINVAL; } module_put(t->me); @@ -794,71 +947,41 @@ static int get_entries(const struct arpt_get_entries *entries, return ret; } -static int do_replace(void __user *user, unsigned int len) +static int __do_replace(const char *name, unsigned int valid_hooks, + struct xt_table_info *newinfo, + unsigned int num_counters, + void __user *counters_ptr) { int ret; - struct arpt_replace tmp; struct arpt_table *t; - struct xt_table_info *newinfo, *oldinfo; + struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_entry, *loc_cpu_old_entry; - - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) - return -EFAULT; + void *loc_cpu_old_entry; - /* Hack: Causes ipchains to give correct error msg --RR */ - if (len != sizeof(tmp) + tmp.size) - return -ENOPROTOOPT; - - /* overflow check */ - if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - - SMP_CACHE_BYTES) - return -ENOMEM; - if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) - return -ENOMEM; - - newinfo = xt_alloc_table_info(tmp.size); - if (!newinfo) - return -ENOMEM; - - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { - ret = -EFAULT; - goto free_newinfo; - } - - counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); + ret = 0; + counters = vmalloc_node(num_counters * sizeof(struct xt_counters), + numa_node_id()); if (!counters) { ret = -ENOMEM; - goto free_newinfo; + goto out; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); - if (ret != 0) - goto free_newinfo_counters; - - duprintf("arp_tables: Translated table\n"); - - t = try_then_request_module(xt_find_table_lock(NF_ARP, tmp.name), - "arptable_%s", tmp.name); + t = try_then_request_module(xt_find_table_lock(NF_ARP, name), + "arptable_%s", name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } /* You lied! */ - if (tmp.valid_hooks != t->valid_hooks) { + if (valid_hooks != t->valid_hooks) { duprintf("Valid hook crap: %08X vs %08X\n", - tmp.valid_hooks, t->valid_hooks); + valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } - oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); + oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; @@ -876,11 +999,12 @@ static int do_replace(void __user *user, unsigned int len) get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, + NULL); xt_free_table_info(oldinfo); - if (copy_to_user(tmp.counters, counters, - sizeof(struct xt_counters) * tmp.num_counters) != 0) + if (copy_to_user(counters_ptr, counters, + sizeof(struct xt_counters) * num_counters) != 0) ret = -EFAULT; vfree(counters); xt_table_unlock(t); @@ -890,9 +1014,53 @@ static int do_replace(void __user *user, unsigned int len) module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: - ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); - free_newinfo_counters: vfree(counters); + out: + return ret; +} + +static int do_replace(void __user *user, unsigned int len) +{ + int ret; + struct arpt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("arp_tables: Translated table\n"); + + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -912,31 +1080,59 @@ static inline int add_counter_to_entry(struct arpt_entry *e, return 0; } -static int do_add_counters(void __user *user, unsigned int len) +static int do_add_counters(void __user *user, unsigned int len, int compat) { unsigned int i; - struct xt_counters_info tmp, *paddc; + struct xt_counters_info tmp; + struct xt_counters *paddc; + unsigned int num_counters; + char *name; + int size; + void *ptmp; struct arpt_table *t; struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; +#ifdef CONFIG_COMPAT + struct compat_xt_counters_info compat_tmp; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (compat) { + ptmp = &compat_tmp; + size = sizeof(struct compat_xt_counters_info); + } else +#endif + { + ptmp = &tmp; + size = sizeof(struct xt_counters_info); + } + + if (copy_from_user(ptmp, user, size) != 0) return -EFAULT; - if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) +#ifdef CONFIG_COMPAT + if (compat) { + num_counters = compat_tmp.num_counters; + name = compat_tmp.name; + } else +#endif + { + num_counters = tmp.num_counters; + name = tmp.name; + } + + if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len - size, numa_node_id()); if (!paddc) return -ENOMEM; - if (copy_from_user(paddc, user, len) != 0) { + if (copy_from_user(paddc, user + size, len - size) != 0) { ret = -EFAULT; goto free; } - t = xt_find_table_lock(NF_ARP, tmp.name); + t = xt_find_table_lock(NF_ARP, name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; @@ -944,7 +1140,7 @@ static int do_add_counters(void __user *user, unsigned int len) write_lock_bh(&t->lock); private = t->private; - if (private->number != tmp.num_counters) { + if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } @@ -955,7 +1151,7 @@ static int do_add_counters(void __user *user, unsigned int len) ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, - paddc->counters, + paddc, &i); unlock_up_free: write_unlock_bh(&t->lock); @@ -967,7 +1163,329 @@ static int do_add_counters(void __user *user, unsigned int len) return ret; } -static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +#ifdef CONFIG_COMPAT +static inline int +compat_release_entry(struct compat_arpt_entry *e, unsigned int *i) +{ + struct arpt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + t = compat_arpt_get_target(e); + module_put(t->u.kernel.target->me); + return 0; +} + +static inline int +check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + unsigned char *base, + unsigned char *limit, + unsigned int *hook_entries, + unsigned int *underflows, + unsigned int *i, + const char *name) +{ + struct arpt_entry_target *t; + struct xt_target *target; + unsigned int entry_offset; + int ret, off, h; + + duprintf("check_compat_entry_size_and_hooks %p\n", e); + if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 + || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + duprintf("Bad offset %p, limit = %p\n", e, limit); + return -EINVAL; + } + + if (e->next_offset < sizeof(struct compat_arpt_entry) + + sizeof(struct compat_xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct arpt_entry *)e, name); + if (ret) + return ret; + + off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + entry_offset = (void *)e - (void *)base; + + t = compat_arpt_get_target(e); + target = try_then_request_module(xt_find_target(NF_ARP, + t->u.user.name, + t->u.user.revision), + "arpt_%s", t->u.user.name); + if (IS_ERR(target) || !target) { + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); + ret = target ? PTR_ERR(target) : -ENOENT; + goto out; + } + t->u.kernel.target = target; + + off += xt_compat_target_offset(target); + *size += off; + ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + if (ret) + goto release_target; + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* Clear counters and comefrom */ + memset(&e->counters, 0, sizeof(e->counters)); + e->comefrom = 0; + + (*i)++; + return 0; + +release_target: + module_put(t->u.kernel.target->me); +out: + return ret; +} + +static int +compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) +{ + struct arpt_entry_target *t; + struct xt_target *target; + struct arpt_entry *de; + unsigned int origsize; + int ret, h; + + ret = 0; + origsize = *size; + de = (struct arpt_entry *)*dstptr; + memcpy(de, e, sizeof(struct arpt_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); + + *dstptr += sizeof(struct arpt_entry); + *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + + de->target_offset = e->target_offset - (origsize - *size); + t = compat_arpt_get_target(e); + target = t->u.kernel.target; + xt_compat_target_from_user(t, dstptr, size); + + de->next_offset = e->next_offset - (origsize - *size); + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)de - base < newinfo->hook_entry[h]) + newinfo->hook_entry[h] -= origsize - *size; + if ((unsigned char *)de - base < newinfo->underflow[h]) + newinfo->underflow[h] -= origsize - *size; + } + return ret; +} + +static inline int compat_check_entry(struct arpt_entry *e, const char *name, + unsigned int *i) +{ + int ret; + + ret = check_target(e, name); + if (ret) + return ret; + + (*i)++; + return 0; +} + +static int translate_compat_table(const char *name, + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) +{ + unsigned int i, j; + struct xt_table_info *newinfo, *info; + void *pos, *entry0, *entry1; + unsigned int size; + int ret; + + info = *pinfo; + entry0 = *pentry0; + size = total_size; + info->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + info->hook_entry[i] = 0xFFFFFFFF; + info->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_compat_table: size %u\n", info->size); + j = 0; + xt_compat_lock(NF_ARP); + /* Walk through entries, checking offsets. */ + ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, + check_compat_entry_size_and_hooks, + info, &size, entry0, + entry0 + total_size, + hook_entries, underflows, &j, name); + if (ret != 0) + goto out_unlock; + + ret = -EINVAL; + if (j != number) { + duprintf("translate_compat_table: %u not %u entries\n", + j, number); + goto out_unlock; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (info->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + goto out_unlock; + } + if (info->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + goto out_unlock; + } + } + + ret = -ENOMEM; + newinfo = xt_alloc_table_info(size); + if (!newinfo) + goto out_unlock; + + newinfo->number = number; + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = info->hook_entry[i]; + newinfo->underflow[i] = info->underflow[i]; + } + entry1 = newinfo->entries[raw_smp_processor_id()]; + pos = entry1; + size = total_size; + ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, + compat_copy_entry_from_user, + &pos, &size, name, newinfo, entry1); + xt_compat_flush_offsets(NF_ARP); + xt_compat_unlock(NF_ARP); + if (ret) + goto free_newinfo; + + ret = -ELOOP; + if (!mark_source_chains(newinfo, valid_hooks, entry1)) + goto free_newinfo; + + i = 0; + ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, + name, &i); + if (ret) { + j -= i; + COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, + compat_release_entry, &j); + ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_free_table_info(newinfo); + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) + if (newinfo->entries[i] && newinfo->entries[i] != entry1) + memcpy(newinfo->entries[i], entry1, newinfo->size); + + *pinfo = newinfo; + *pentry0 = entry1; + xt_free_table_info(info); + return 0; + +free_newinfo: + xt_free_table_info(newinfo); +out: + COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + return ret; +out_unlock: + xt_compat_flush_offsets(NF_ARP); + xt_compat_unlock(NF_ARP); + goto out; +} + +struct compat_arpt_replace { + char name[ARPT_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_ARP_NUMHOOKS]; + u32 underflow[NF_ARP_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; + struct compat_arpt_entry entries[0]; +}; + +static int compat_do_replace(void __user *user, unsigned int len) +{ + int ret; + struct compat_arpt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.size >= INT_MAX / num_possible_cpus()) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_compat_table(tmp.name, tmp.valid_hooks, + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("compat_do_replace: Translated table\n"); + + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) { int ret; @@ -976,11 +1494,11 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned switch (cmd) { case ARPT_SO_SET_REPLACE: - ret = do_replace(user, len); + ret = compat_do_replace(user, len); break; case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(user, len); + ret = do_add_counters(user, len, 1); break; default: @@ -991,74 +1509,190 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned return ret; } -static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, + compat_uint_t *size, + struct xt_counters *counters, + unsigned int *i) { + struct arpt_entry_target *t; + struct compat_arpt_entry __user *ce; + u_int16_t target_offset, next_offset; + compat_uint_t origsize; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; + ret = -EFAULT; + origsize = *size; + ce = (struct compat_arpt_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(struct arpt_entry))) + goto out; - switch (cmd) { - case ARPT_SO_GET_INFO: { - char name[ARPT_TABLE_MAXNAMELEN]; - struct arpt_table *t; + if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) + goto out; + + *dstptr += sizeof(struct compat_arpt_entry); + *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); - if (*len != sizeof(struct arpt_getinfo)) { - duprintf("length %u != %Zu\n", *len, - sizeof(struct arpt_getinfo)); + target_offset = e->target_offset - (origsize - *size); + + t = arpt_get_target(e); + ret = xt_compat_target_to_user(t, dstptr, size); + if (ret) + goto out; + ret = -EFAULT; + next_offset = e->next_offset - (origsize - *size); + if (put_user(target_offset, &ce->target_offset)) + goto out; + if (put_user(next_offset, &ce->next_offset)) + goto out; + + (*i)++; + return 0; +out: + return ret; +} + +static int compat_copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void __user *userptr) +{ + struct xt_counters *counters; + struct xt_table_info *private = table->private; + void __user *pos; + unsigned int size; + int ret = 0; + void *loc_cpu_entry; + unsigned int i = 0; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + pos = userptr; + size = total_size; + ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size, + compat_copy_entry_to_user, + &pos, &size, counters, &i); + vfree(counters); + return ret; +} + +struct compat_arpt_get_entries { + char name[ARPT_TABLE_MAXNAMELEN]; + compat_uint_t size; + struct compat_arpt_entry entrytable[0]; +}; + +static int compat_get_entries(struct compat_arpt_get_entries __user *uptr, + int *len) +{ + int ret; + struct compat_arpt_get_entries get; + struct arpt_table *t; + + if (*len < sizeof(get)) { + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct compat_arpt_get_entries) + get.size) { + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + xt_compat_lock(NF_ARP); + t = xt_find_table_lock(NF_ARP, get.name); + if (t && !IS_ERR(t)) { + struct xt_table_info *private = t->private; + struct xt_table_info info; + + duprintf("t->private->number = %u\n", private->number); + ret = compat_table_info(private, &info); + if (!ret && get.size == info.size) { + ret = compat_copy_entries_to_user(private->size, + t, uptr->entrytable); + } else if (!ret) { + duprintf("compat_get_entries: I've got %u not %u!\n", + private->size, get.size); ret = -EINVAL; - break; } + xt_compat_flush_offsets(NF_ARP); + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; - if (copy_from_user(name, user, sizeof(name)) != 0) { - ret = -EFAULT; - break; - } - name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; - - t = try_then_request_module(xt_find_table_lock(NF_ARP, name), - "arptable_%s", name); - if (t && !IS_ERR(t)) { - struct arpt_getinfo info; - struct xt_table_info *private = t->private; - - info.valid_hooks = t->valid_hooks; - memcpy(info.hook_entry, private->hook_entry, - sizeof(info.hook_entry)); - memcpy(info.underflow, private->underflow, - sizeof(info.underflow)); - info.num_entries = private->number; - info.size = private->size; - strcpy(info.name, name); - - if (copy_to_user(user, &info, *len) != 0) - ret = -EFAULT; - else - ret = 0; - xt_table_unlock(t); - module_put(t->me); - } else - ret = t ? PTR_ERR(t) : -ENOENT; + xt_compat_unlock(NF_ARP); + return ret; +} + +static int do_arpt_get_ctl(struct sock *, int, void __user *, int *); + +static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, + int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: + ret = get_info(user, len, 1); + break; + case ARPT_SO_GET_ENTRIES: + ret = compat_get_entries(user, len); + break; + default: + ret = do_arpt_get_ctl(sk, cmd, user, len); } - break; + return ret; +} +#endif - case ARPT_SO_GET_ENTRIES: { - struct arpt_get_entries get; +static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); - ret = -EINVAL; - } else if (copy_from_user(&get, user, sizeof(get)) != 0) { - ret = -EFAULT; - } else if (*len != sizeof(struct arpt_get_entries) + get.size) { - duprintf("get_entries: %u != %Zu\n", *len, - sizeof(struct arpt_get_entries) + get.size); - ret = -EINVAL; - } else - ret = get_entries(&get, user); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len, 0); break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; } + return ret; +} + +static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: + ret = get_info(user, len, 0); + break; + + case ARPT_SO_GET_ENTRIES: + ret = get_entries(user, len); + break; + case ARPT_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; @@ -1090,7 +1724,7 @@ int arpt_register_table(struct arpt_table *table, { int ret; struct xt_table_info *newinfo; - static struct xt_table_info bootstrap + struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; @@ -1144,6 +1778,11 @@ static struct arpt_target arpt_standard_target __read_mostly = { .name = ARPT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NF_ARP, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, +#endif }; static struct arpt_target arpt_error_target __read_mostly = { @@ -1158,9 +1797,15 @@ static struct nf_sockopt_ops arpt_sockopts = { .set_optmin = ARPT_BASE_CTL, .set_optmax = ARPT_SO_SET_MAX+1, .set = do_arpt_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_arpt_set_ctl, +#endif .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_arpt_get_ctl, +#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 302d3da5f69..7201511d54d 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -64,7 +64,7 @@ static unsigned int arpt_hook(unsigned int hook, return arpt_do_table(skb, hook, in, out, &packet_filter); } -static struct nf_hook_ops arpt_ops[] = { +static struct nf_hook_ops arpt_ops[] __read_mostly = { { .hook = arpt_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 14d64a383db..5109839da22 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -28,19 +28,15 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/route.h> +#include <net/netfilter/nf_queue.h> +#include <net/ip.h> #define IPQ_QMAX_DEFAULT 1024 #define IPQ_PROC_FS_NAME "ip_queue" #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" -struct ipq_queue_entry { - struct list_head list; - struct nf_info *info; - struct sk_buff *skb; -}; - -typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); +typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; @@ -54,76 +50,13 @@ static struct sock *ipqnl __read_mostly; static LIST_HEAD(queue_list); static DEFINE_MUTEX(ipqnl_mutex); -static void -ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) -{ - /* TCP input path (and probably other bits) assume to be called - * from softirq context, not from syscall, like ipq_issue_verdict is - * called. TCP input path deadlocks with locks taken from timer - * softirq, e.g. We therefore emulate this by local_bh_disable() */ - - local_bh_disable(); - nf_reinject(entry->skb, entry->info, verdict); - local_bh_enable(); - - kfree(entry); -} - static inline void -__ipq_enqueue_entry(struct ipq_queue_entry *entry) +__ipq_enqueue_entry(struct nf_queue_entry *entry) { - list_add(&entry->list, &queue_list); + list_add_tail(&entry->list, &queue_list); queue_total++; } -/* - * Find and return a queued entry matched by cmpfn, or return the last - * entry if cmpfn is NULL. - */ -static inline struct ipq_queue_entry * -__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) -{ - struct list_head *p; - - list_for_each_prev(p, &queue_list) { - struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; - - if (!cmpfn || cmpfn(entry, data)) - return entry; - } - return NULL; -} - -static inline void -__ipq_dequeue_entry(struct ipq_queue_entry *entry) -{ - list_del(&entry->list); - queue_total--; -} - -static inline struct ipq_queue_entry * -__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) -{ - struct ipq_queue_entry *entry; - - entry = __ipq_find_entry(cmpfn, data); - if (entry == NULL) - return NULL; - - __ipq_dequeue_entry(entry); - return entry; -} - - -static inline void -__ipq_flush(int verdict) -{ - struct ipq_queue_entry *entry; - - while ((entry = __ipq_find_dequeue_entry(NULL, 0))) - ipq_issue_verdict(entry, verdict); -} - static inline int __ipq_set_mode(unsigned char mode, unsigned int range) { @@ -150,36 +83,64 @@ __ipq_set_mode(unsigned char mode, unsigned int range) return status; } +static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); + static inline void __ipq_reset(void) { peer_pid = 0; net_disable_timestamp(); __ipq_set_mode(IPQ_COPY_NONE, 0); - __ipq_flush(NF_DROP); + __ipq_flush(NULL, 0); } -static struct ipq_queue_entry * -ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +static struct nf_queue_entry * +ipq_find_dequeue_entry(unsigned long id) { - struct ipq_queue_entry *entry; + struct nf_queue_entry *entry = NULL, *i; write_lock_bh(&queue_lock); - entry = __ipq_find_dequeue_entry(cmpfn, data); + + list_for_each_entry(i, &queue_list, list) { + if ((unsigned long)i == id) { + entry = i; + break; + } + } + + if (entry) { + list_del(&entry->list); + queue_total--; + } + write_unlock_bh(&queue_lock); return entry; } static void -ipq_flush(int verdict) +__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue_total--; + nf_reinject(entry, NF_DROP); + } + } +} + +static void +ipq_flush(ipq_cmpfn cmpfn, unsigned long data) { write_lock_bh(&queue_lock); - __ipq_flush(verdict); + __ipq_flush(cmpfn, data); write_unlock_bh(&queue_lock); } static struct sk_buff * -ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) { sk_buff_data_t old_tail; size_t size = 0; @@ -236,20 +197,20 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->timestamp_sec = tv.tv_sec; pmsg->timestamp_usec = tv.tv_usec; pmsg->mark = entry->skb->mark; - pmsg->hook = entry->info->hook; + pmsg->hook = entry->hook; pmsg->hw_protocol = entry->skb->protocol; - if (entry->info->indev) - strcpy(pmsg->indev_name, entry->info->indev->name); + if (entry->indev) + strcpy(pmsg->indev_name, entry->indev->name); else pmsg->indev_name[0] = '\0'; - if (entry->info->outdev) - strcpy(pmsg->outdev_name, entry->info->outdev->name); + if (entry->outdev) + strcpy(pmsg->outdev_name, entry->outdev->name); else pmsg->outdev_name[0] = '\0'; - if (entry->info->indev && entry->skb->dev) { + if (entry->indev && entry->skb->dev) { pmsg->hw_type = entry->skb->dev->type; pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); @@ -271,28 +232,17 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, - unsigned int queuenum, void *data) +ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { int status = -EINVAL; struct sk_buff *nskb; - struct ipq_queue_entry *entry; if (copy_mode == IPQ_COPY_NONE) return -EAGAIN; - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) { - printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n"); - return -ENOMEM; - } - - entry->info = info; - entry->skb = skb; - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) - goto err_out_free; + return status; write_lock_bh(&queue_lock); @@ -326,14 +276,11 @@ err_out_free_nskb: err_out_unlock: write_unlock_bh(&queue_lock); - -err_out_free: - kfree(entry); return status; } static int -ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e) { int diff; int err; @@ -368,21 +315,15 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) return 0; } -static inline int -id_cmp(struct ipq_queue_entry *e, unsigned long id) -{ - return (id == (unsigned long )e); -} - static int ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) { - struct ipq_queue_entry *entry; + struct nf_queue_entry *entry; if (vmsg->value > NF_MAX_VERDICT) return -EINVAL; - entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + entry = ipq_find_dequeue_entry(vmsg->id); if (entry == NULL) return -ENOENT; else { @@ -392,7 +333,7 @@ ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) if (ipq_mangle_ipv4(vmsg, entry) < 0) verdict = NF_DROP; - ipq_issue_verdict(entry, verdict); + nf_reinject(entry, verdict); return 0; } } @@ -437,13 +378,13 @@ ipq_receive_peer(struct ipq_peer_msg *pmsg, } static int -dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { - if (entry->info->indev) - if (entry->info->indev->ifindex == ifindex) + if (entry->indev) + if (entry->indev->ifindex == ifindex) return 1; - if (entry->info->outdev) - if (entry->info->outdev->ifindex == ifindex) + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) return 1; #ifdef CONFIG_BRIDGE_NETFILTER if (entry->skb->nf_bridge) { @@ -461,10 +402,7 @@ dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) static void ipq_dev_drop(int ifindex) { - struct ipq_queue_entry *entry; - - while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) - ipq_issue_verdict(entry, NF_DROP); + ipq_flush(dev_cmp, ifindex); } #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) @@ -588,26 +526,6 @@ static ctl_table ipq_table[] = { { .ctl_name = 0 } }; -static ctl_table ipq_dir_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = ipq_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipq_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipq_dir_table - }, - { .ctl_name = 0 } -}; - static int ip_queue_show(struct seq_file *m, void *v) { read_lock_bh(&queue_lock); @@ -645,7 +563,7 @@ static const struct file_operations ip_queue_proc_fops = { .owner = THIS_MODULE, }; -static struct nf_queue_handler nfqh = { +static const struct nf_queue_handler nfqh = { .name = "ip_queue", .outfn = &ipq_enqueue_packet, }; @@ -673,7 +591,7 @@ static int __init ip_queue_init(void) } register_netdevice_notifier(&ipq_dev_notifier); - ipq_sysctl_header = register_sysctl_table(ipq_root_table); + ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table); status = nf_register_queue_handler(PF_INET, &nfqh); if (status < 0) { @@ -687,7 +605,7 @@ cleanup_sysctl: unregister_netdevice_notifier(&ipq_dev_notifier); proc_net_remove(&init_net, IPQ_PROC_FS_NAME); cleanup_ipqnl: - sock_release(ipqnl->sk_socket); + netlink_kernel_release(ipqnl); mutex_lock(&ipqnl_mutex); mutex_unlock(&ipqnl_mutex); @@ -700,13 +618,13 @@ static void __exit ip_queue_fini(void) { nf_unregister_queue_handlers(&nfqh); synchronize_net(); - ipq_flush(NF_DROP); + ipq_flush(NULL, 0); unregister_sysctl_table(ipq_sysctl_header); unregister_netdevice_notifier(&ipq_dev_notifier); proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - sock_release(ipqnl->sk_socket); + netlink_kernel_release(ipqnl); mutex_lock(&ipqnl_mutex); mutex_unlock(&ipqnl_mutex); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b9b189c2620..982b7f98629 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -26,6 +26,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <net/netfilter/nf_log.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -74,7 +75,8 @@ do { \ Hence the start of any table is given by get_table() below. */ /* Returns whether matches rule or not. */ -static inline int +/* Performance critical - called for every packet */ +static inline bool ip_packet_match(const struct iphdr *ip, const char *indev, const char *outdev, @@ -84,7 +86,7 @@ ip_packet_match(const struct iphdr *ip, size_t i; unsigned long ret; -#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg)) +#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, IPT_INV_SRCIP) @@ -102,7 +104,7 @@ ip_packet_match(const struct iphdr *ip, NIPQUAD(ipinfo->dmsk.s_addr), NIPQUAD(ipinfo->dst.s_addr), ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); - return 0; + return false; } /* Look for ifname matches; this should unroll nicely. */ @@ -116,7 +118,7 @@ ip_packet_match(const struct iphdr *ip, dprintf("VIA in mismatch (%s vs %s).%s\n", indev, ipinfo->iniface, ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); - return 0; + return false; } for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { @@ -129,7 +131,7 @@ ip_packet_match(const struct iphdr *ip, dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, ipinfo->outiface, ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); - return 0; + return false; } /* Check specific protocol */ @@ -138,7 +140,7 @@ ip_packet_match(const struct iphdr *ip, dprintf("Packet protocol %hi does not match %hi.%s\n", ip->protocol, ipinfo->proto, ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); - return 0; + return false; } /* If we have a fragment rule but the packet is not a fragment @@ -146,13 +148,13 @@ ip_packet_match(const struct iphdr *ip, if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { dprintf("Fragment rule but not fragment.%s\n", ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : ""); - return 0; + return false; } - return 1; + return true; } -static inline bool +static bool ip_checkentry(const struct ipt_ip *ip) { if (ip->flags & ~IPT_F_MASK) { @@ -182,8 +184,9 @@ ipt_error(struct sk_buff *skb, return NF_DROP; } -static inline -bool do_match(struct ipt_entry_match *m, +/* Performance critical - called for every packet */ +static inline bool +do_match(struct ipt_entry_match *m, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -198,6 +201,7 @@ bool do_match(struct ipt_entry_match *m, return false; } +/* Performance critical */ static inline struct ipt_entry * get_entry(void *base, unsigned int offset) { @@ -205,6 +209,7 @@ get_entry(void *base, unsigned int offset) } /* All zeroes == unconditional rule. */ +/* Mildly perf critical (only if packet tracing is on) */ static inline int unconditional(const struct ipt_ip *ip) { @@ -215,16 +220,17 @@ unconditional(const struct ipt_ip *ip) return 0; return 1; +#undef FWINV } #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) -static const char *hooknames[] = { - [NF_IP_PRE_ROUTING] = "PREROUTING", - [NF_IP_LOCAL_IN] = "INPUT", - [NF_IP_FORWARD] = "FORWARD", - [NF_IP_LOCAL_OUT] = "OUTPUT", - [NF_IP_POST_ROUTING] = "POSTROUTING", +static const char *const hooknames[] = { + [NF_INET_PRE_ROUTING] = "PREROUTING", + [NF_INET_LOCAL_IN] = "INPUT", + [NF_INET_FORWARD] = "FORWARD", + [NF_INET_LOCAL_OUT] = "OUTPUT", + [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { @@ -233,7 +239,7 @@ enum nf_ip_trace_comments { NF_IP_TRACE_COMMENT_POLICY, }; -static const char *comments[] = { +static const char *const comments[] = { [NF_IP_TRACE_COMMENT_RULE] = "rule", [NF_IP_TRACE_COMMENT_RETURN] = "return", [NF_IP_TRACE_COMMENT_POLICY] = "policy", @@ -249,6 +255,7 @@ static struct nf_loginfo trace_loginfo = { }, }; +/* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, char *hookname, char **chainname, @@ -465,10 +472,9 @@ mark_source_chains(struct xt_table_info *newinfo, /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ - for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { + for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct ipt_entry *e - = (struct ipt_entry *)(entry0 + pos); + struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -481,13 +487,12 @@ mark_source_chains(struct xt_table_info *newinfo, = (void *)ipt_get_target(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { printk("iptables: loop hook %u pos %u %08X.\n", hook, pos, e->comefrom); return 0; } - e->comefrom - |= ((1 << hook) | (1 << NF_IP_NUMHOOKS)); + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct ipt_entry) @@ -507,10 +512,10 @@ mark_source_chains(struct xt_table_info *newinfo, /* Return: backtrack through the last big jump. */ do { - e->comefrom ^= (1<<NF_IP_NUMHOOKS); + e->comefrom ^= (1<<NF_INET_NUMHOOKS); #ifdef DEBUG_IP_FIREWALL_USER if (e->comefrom - & (1 << NF_IP_NUMHOOKS)) { + & (1 << NF_INET_NUMHOOKS)) { duprintf("Back unset " "on hook %u " "rule %u\n", @@ -567,7 +572,7 @@ mark_source_chains(struct xt_table_info *newinfo, return 1; } -static inline int +static int cleanup_match(struct ipt_entry_match *m, unsigned int *i) { if (i && (*i)-- == 0) @@ -579,7 +584,7 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i) return 0; } -static inline int +static int check_entry(struct ipt_entry *e, const char *name) { struct ipt_entry_target *t; @@ -589,7 +594,8 @@ check_entry(struct ipt_entry *e, const char *name) return -EINVAL; } - if (e->target_offset + sizeof(struct ipt_entry_target) > e->next_offset) + if (e->target_offset + sizeof(struct ipt_entry_target) > + e->next_offset) return -EINVAL; t = ipt_get_target(e); @@ -599,9 +605,10 @@ check_entry(struct ipt_entry *e, const char *name) return 0; } -static inline int check_match(struct ipt_entry_match *m, const char *name, - const struct ipt_ip *ip, unsigned int hookmask, - unsigned int *i) +static int +check_match(struct ipt_entry_match *m, const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, unsigned int *i) { struct xt_match *match; int ret; @@ -622,18 +629,18 @@ static inline int check_match(struct ipt_entry_match *m, const char *name, return ret; } -static inline int +static int find_check_match(struct ipt_entry_match *m, - const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, - unsigned int *i) + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + unsigned int *i) { struct xt_match *match; int ret; match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, - m->u.user.revision), + m->u.user.revision), "ipt_%s", m->u.user.name); if (IS_ERR(match) || !match) { duprintf("find_check_match: `%s' not found\n", m->u.user.name); @@ -651,7 +658,7 @@ err: return ret; } -static inline int check_target(struct ipt_entry *e, const char *name) +static int check_target(struct ipt_entry *e, const char *name) { struct ipt_entry_target *t; struct xt_target *target; @@ -663,8 +670,8 @@ static inline int check_target(struct ipt_entry *e, const char *name) name, e->comefrom, e->ip.proto, e->ip.invflags & IPT_INV_PROTO); if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, - t->data, e->comefrom)) { + && !t->u.kernel.target->checkentry(name, e, target, t->data, + e->comefrom)) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); ret = -EINVAL; @@ -672,9 +679,9 @@ static inline int check_target(struct ipt_entry *e, const char *name) return ret; } -static inline int +static int find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, - unsigned int *i) + unsigned int *i) { struct ipt_entry_target *t; struct xt_target *target; @@ -687,14 +694,14 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, j = 0; ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip, - e->comefrom, &j); + e->comefrom, &j); if (ret != 0) goto cleanup_matches; t = ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, - t->u.user.name, - t->u.user.revision), + t->u.user.name, + t->u.user.revision), "ipt_%s", t->u.user.name); if (IS_ERR(target) || !target) { duprintf("find_check_entry: `%s' not found\n", t->u.user.name); @@ -716,7 +723,7 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, return ret; } -static inline int +static int check_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, unsigned char *base, @@ -741,7 +748,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, } /* Check hooks & underflows */ - for (h = 0; h < NF_IP_NUMHOOKS; h++) { + for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) @@ -759,7 +766,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, return 0; } -static inline int +static int cleanup_entry(struct ipt_entry *e, unsigned int *i) { struct ipt_entry_target *t; @@ -795,7 +802,7 @@ translate_table(const char *name, newinfo->number = number; /* Init all hooks to impossible value. */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } @@ -819,7 +826,7 @@ translate_table(const char *name, } /* Check hooks all assigned */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(valid_hooks & (1 << i))) continue; @@ -915,7 +922,7 @@ get_counters(const struct xt_table_info *t, } } -static inline struct xt_counters * alloc_counters(struct xt_table *table) +static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; @@ -959,7 +966,6 @@ copy_entries_to_user(unsigned int total_size, * allowed to migrate to another cpu) */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - /* ... then copy entire thing ... */ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1014,63 +1020,12 @@ copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -struct compat_delta { - struct compat_delta *next; - unsigned int offset; - short delta; -}; - -static struct compat_delta *compat_offsets = NULL; - -static int compat_add_offset(unsigned int offset, short delta) -{ - struct compat_delta *tmp; - - tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); - if (!tmp) - return -ENOMEM; - tmp->offset = offset; - tmp->delta = delta; - if (compat_offsets) { - tmp->next = compat_offsets->next; - compat_offsets->next = tmp; - } else { - compat_offsets = tmp; - tmp->next = NULL; - } - return 0; -} - -static void compat_flush_offsets(void) -{ - struct compat_delta *tmp, *next; - - if (compat_offsets) { - for(tmp = compat_offsets; tmp; tmp = next) { - next = tmp->next; - kfree(tmp); - } - compat_offsets = NULL; - } -} - -static short compat_calc_jump(unsigned int offset) -{ - struct compat_delta *tmp; - short delta; - - for(tmp = compat_offsets, delta = 0; tmp; tmp = tmp->next) - if (tmp->offset < offset) - delta += tmp->delta; - return delta; -} - static void compat_standard_from_user(void *dst, void *src) { int v = *(compat_int_t *)src; if (v > 0) - v += compat_calc_jump(v); + v += xt_compat_calc_jump(AF_INET, v); memcpy(dst, &v, sizeof(v)); } @@ -1079,64 +1034,61 @@ static int compat_standard_to_user(void __user *dst, void *src) compat_int_t cv = *(int *)src; if (cv > 0) - cv -= compat_calc_jump(cv); + cv -= xt_compat_calc_jump(AF_INET, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static inline int -compat_calc_match(struct ipt_entry_match *m, int * size) +compat_calc_match(struct ipt_entry_match *m, int *size) { *size += xt_compat_match_offset(m->u.kernel.match); return 0; } -static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) +static int compat_calc_entry(struct ipt_entry *e, + const struct xt_table_info *info, + void *base, struct xt_table_info *newinfo) { struct ipt_entry_target *t; unsigned int entry_offset; int off, i, ret; - off = 0; + off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - base; IPT_MATCH_ITERATE(e, compat_calc_match, &off); t = ipt_get_target(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; - ret = compat_add_offset(entry_offset, off); + ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) return ret; - for (i = 0; i< NF_IP_NUMHOOKS; i++) { - if (info->hook_entry[i] && (e < (struct ipt_entry *) - (base + info->hook_entry[i]))) + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct ipt_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; - if (info->underflow[i] && (e < (struct ipt_entry *) - (base + info->underflow[i]))) + if (info->underflow[i] && + (e < (struct ipt_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } -static int compat_table_info(struct xt_table_info *info, - struct xt_table_info *newinfo) +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) { void *loc_cpu_entry; - int i; if (!newinfo || !info) return -EINVAL; - memset(newinfo, 0, sizeof(struct xt_table_info)); - newinfo->size = info->size; - newinfo->number = info->number; - for (i = 0; i < NF_IP_NUMHOOKS; i++) { - newinfo->hook_entry[i] = info->hook_entry[i]; - newinfo->underflow[i] = info->underflow[i]; - } + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, newinfo); + compat_calc_entry, info, loc_cpu_entry, + newinfo); } #endif @@ -1147,8 +1099,8 @@ static int get_info(void __user *user, int *len, int compat) int ret; if (*len != sizeof(struct ipt_getinfo)) { - duprintf("length %u != %u\n", *len, - (unsigned int)sizeof(struct ipt_getinfo)); + duprintf("length %u != %zu\n", *len, + sizeof(struct ipt_getinfo)); return -EINVAL; } @@ -1161,7 +1113,7 @@ static int get_info(void __user *user, int *len, int compat) xt_compat_lock(AF_INET); #endif t = try_then_request_module(xt_find_table_lock(AF_INET, name), - "iptable_%s", name); + "iptable_%s", name); if (t && !IS_ERR(t)) { struct ipt_getinfo info; struct xt_table_info *private = t->private; @@ -1170,15 +1122,15 @@ static int get_info(void __user *user, int *len, int compat) if (compat) { struct xt_table_info tmp; ret = compat_table_info(private, &tmp); - compat_flush_offsets(); - private = &tmp; + xt_compat_flush_offsets(AF_INET); + private = &tmp; } #endif info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, - sizeof(info.hook_entry)); + sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, - sizeof(info.underflow)); + sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); @@ -1207,31 +1159,27 @@ get_entries(struct ipt_get_entries __user *uptr, int *len) struct xt_table *t; if (*len < sizeof(get)) { - duprintf("get_entries: %u < %d\n", *len, - (unsigned int)sizeof(get)); + duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); return -EINVAL; } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ipt_get_entries) + get.size) { - duprintf("get_entries: %u != %u\n", *len, - (unsigned int)(sizeof(struct ipt_get_entries) + - get.size)); + duprintf("get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); return -EINVAL; } t = xt_find_table_lock(AF_INET, get.name); if (t && !IS_ERR(t)) { struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", - private->number); + duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - private->size, - get.size); + private->size, get.size); ret = -EINVAL; } module_put(t->me); @@ -1244,8 +1192,8 @@ get_entries(struct ipt_get_entries __user *uptr, int *len) static int __do_replace(const char *name, unsigned int valid_hooks, - struct xt_table_info *newinfo, unsigned int num_counters, - void __user *counters_ptr) + struct xt_table_info *newinfo, unsigned int num_counters, + void __user *counters_ptr) { int ret; struct xt_table *t; @@ -1293,7 +1241,8 @@ __do_replace(const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, + NULL); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1322,14 +1271,7 @@ do_replace(void __user *user, unsigned int len) if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - /* Hack: Causes ipchains to give correct error msg --RR */ - if (len != sizeof(tmp) + tmp.size) - return -ENOPROTOOPT; - /* overflow check */ - if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - - SMP_CACHE_BYTES) - return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; @@ -1337,7 +1279,7 @@ do_replace(void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is our node/cpu */ + /* choose the copy that is on our node/cpu */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { @@ -1353,15 +1295,14 @@ do_replace(void __user *user, unsigned int len) duprintf("ip_tables: Translated table\n"); - ret = __do_replace(tmp.name, tmp.valid_hooks, - newinfo, tmp.num_counters, - tmp.counters); + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1369,7 +1310,7 @@ do_replace(void __user *user, unsigned int len) /* We're lazy, and add to the first CPU; overflow works its fey magic * and everything is OK. */ -static inline int +static int add_counter_to_entry(struct ipt_entry *e, const struct xt_counters addme[], unsigned int *i) @@ -1479,19 +1420,13 @@ struct compat_ipt_replace { u32 valid_hooks; u32 num_entries; u32 size; - u32 hook_entry[NF_IP_NUMHOOKS]; - u32 underflow[NF_IP_NUMHOOKS]; + u32 hook_entry[NF_INET_NUMHOOKS]; + u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct ipt_counters * */ struct compat_ipt_entry entries[0]; }; -static inline int compat_copy_match_to_user(struct ipt_entry_match *m, - void __user **dstptr, compat_uint_t *size) -{ - return xt_compat_match_to_user(m, dstptr, size); -} - static int compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, @@ -1513,7 +1448,9 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, goto out; *dstptr += sizeof(struct compat_ipt_entry); - ret = IPT_MATCH_ITERATE(e, compat_copy_match_to_user, dstptr, size); + *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + + ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); target_offset = e->target_offset - (origsize - *size); if (ret) goto out; @@ -1534,21 +1471,21 @@ out: return ret; } -static inline int +static int compat_find_calc_match(struct ipt_entry_match *m, - const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, - int *size, int *i) + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + int *size, int *i) { struct xt_match *match; match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, - m->u.user.revision), + m->u.user.revision), "ipt_%s", m->u.user.name); if (IS_ERR(match) || !match) { duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + m->u.user.name); return match ? PTR_ERR(match) : -ENOENT; } m->u.kernel.match = match; @@ -1558,7 +1495,7 @@ compat_find_calc_match(struct ipt_entry_match *m, return 0; } -static inline int +static int compat_release_match(struct ipt_entry_match *m, unsigned int *i) { if (i && (*i)-- == 0) @@ -1568,8 +1505,8 @@ compat_release_match(struct ipt_entry_match *m, unsigned int *i) return 0; } -static inline int -compat_release_entry(struct ipt_entry *e, unsigned int *i) +static int +compat_release_entry(struct compat_ipt_entry *e, unsigned int *i) { struct ipt_entry_target *t; @@ -1577,22 +1514,22 @@ compat_release_entry(struct ipt_entry *e, unsigned int *i) return 1; /* Cleanup all matches */ - IPT_MATCH_ITERATE(e, compat_release_match, NULL); - t = ipt_get_target(e); + COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL); + t = compat_ipt_get_target(e); module_put(t->u.kernel.target->me); return 0; } -static inline int -check_compat_entry_size_and_hooks(struct ipt_entry *e, - struct xt_table_info *newinfo, - unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, - const char *name) +static int +check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + unsigned char *base, + unsigned char *limit, + unsigned int *hook_entries, + unsigned int *underflows, + unsigned int *i, + const char *name) { struct ipt_entry_target *t; struct xt_target *target; @@ -1607,32 +1544,33 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, } if (e->next_offset < sizeof(struct compat_ipt_entry) + - sizeof(struct compat_xt_entry_target)) { + sizeof(struct compat_xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; } - ret = check_entry(e, name); + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct ipt_entry *)e, name); if (ret) return ret; - off = 0; + off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - (void *)base; j = 0; - ret = IPT_MATCH_ITERATE(e, compat_find_calc_match, name, &e->ip, - e->comefrom, &off, &j); + ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name, + &e->ip, e->comefrom, &off, &j); if (ret != 0) goto release_matches; - t = ipt_get_target(e); + t = compat_ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, - t->u.user.name, - t->u.user.revision), + t->u.user.name, + t->u.user.revision), "ipt_%s", t->u.user.name); if (IS_ERR(target) || !target) { duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); + t->u.user.name); ret = target ? PTR_ERR(target) : -ENOENT; goto release_matches; } @@ -1640,12 +1578,12 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, off += xt_compat_target_offset(target); *size += off; - ret = compat_add_offset(entry_offset, off); + ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) goto out; /* Check hooks & underflows */ - for (h = 0; h < NF_IP_NUMHOOKS; h++) { + for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) @@ -1653,7 +1591,7 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, } /* Clear counters and comefrom */ - e->counters = ((struct ipt_counters) { 0, 0 }); + memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; (*i)++; @@ -1666,17 +1604,10 @@ release_matches: return ret; } -static inline int compat_copy_match_from_user(struct ipt_entry_match *m, - void **dstptr, compat_uint_t *size, const char *name, - const struct ipt_ip *ip, unsigned int hookmask) -{ - xt_compat_match_from_user(m, dstptr, size); - return 0; -} - -static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, - unsigned int *size, const char *name, - struct xt_table_info *newinfo, unsigned char *base) +static int +compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) { struct ipt_entry_target *t; struct xt_target *target; @@ -1688,19 +1619,22 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, origsize = *size; de = (struct ipt_entry *)*dstptr; memcpy(de, e, sizeof(struct ipt_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); - *dstptr += sizeof(struct compat_ipt_entry); - ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size, - name, &de->ip, de->comefrom); + *dstptr += sizeof(struct ipt_entry); + *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + + ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user, + dstptr, size); if (ret) return ret; de->target_offset = e->target_offset - (origsize - *size); - t = ipt_get_target(e); + t = compat_ipt_get_target(e); target = t->u.kernel.target; xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); - for (h = 0; h < NF_IP_NUMHOOKS; h++) { + for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) @@ -1709,13 +1643,15 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, return ret; } -static inline int compat_check_entry(struct ipt_entry *e, const char *name, - unsigned int *i) +static int +compat_check_entry(struct ipt_entry *e, const char *name, + unsigned int *i) { int j, ret; j = 0; - ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, + e->comefrom, &j); if (ret) goto cleanup_matches; @@ -1733,13 +1669,13 @@ static inline int compat_check_entry(struct ipt_entry *e, const char *name, static int translate_compat_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info **pinfo, - void **pentry0, - unsigned int total_size, - unsigned int number, - unsigned int *hook_entries, - unsigned int *underflows) + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) { unsigned int i, j; struct xt_table_info *newinfo, *info; @@ -1753,7 +1689,7 @@ translate_compat_table(const char *name, info->number = number; /* Init all hooks to impossible value. */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { info->hook_entry[i] = 0xFFFFFFFF; info->underflow[i] = 0xFFFFFFFF; } @@ -1762,11 +1698,11 @@ translate_compat_table(const char *name, j = 0; xt_compat_lock(AF_INET); /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); + ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, + check_compat_entry_size_and_hooks, + info, &size, entry0, + entry0 + total_size, + hook_entries, underflows, &j, name); if (ret != 0) goto out_unlock; @@ -1778,7 +1714,7 @@ translate_compat_table(const char *name, } /* Check hooks all assigned */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(valid_hooks & (1 << i))) continue; @@ -1800,17 +1736,17 @@ translate_compat_table(const char *name, goto out_unlock; newinfo->number = number; - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; - size = total_size; - ret = IPT_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, &pos, &size, - name, newinfo, entry1); - compat_flush_offsets(); + size = total_size; + ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, + compat_copy_entry_from_user, + &pos, &size, name, newinfo, entry1); + xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); if (ret) goto free_newinfo; @@ -1821,11 +1757,11 @@ translate_compat_table(const char *name, i = 0; ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + name, &i); if (ret) { j -= i; - IPT_ENTRY_ITERATE_CONTINUE(entry1, newinfo->size, i, - compat_release_entry, &j); + COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, + compat_release_entry, &j); IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); xt_free_table_info(newinfo); return ret; @@ -1844,10 +1780,10 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); return ret; out_unlock: - compat_flush_offsets(); + xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); goto out; } @@ -1863,13 +1799,8 @@ compat_do_replace(void __user *user, unsigned int len) if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - /* Hack: Causes ipchains to give correct error msg --RR */ - if (len != sizeof(tmp) + tmp.size) - return -ENOPROTOOPT; - /* overflow check */ - if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - - SMP_CACHE_BYTES) + if (tmp.size >= INT_MAX / num_possible_cpus()) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; @@ -1878,7 +1809,7 @@ compat_do_replace(void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is our node/cpu */ + /* choose the copy that is on our node/cpu */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { @@ -1887,22 +1818,22 @@ compat_do_replace(void __user *user, unsigned int len) } ret = translate_compat_table(tmp.name, tmp.valid_hooks, - &newinfo, &loc_cpu_entry, tmp.size, - tmp.num_entries, tmp.hook_entry, tmp.underflow); + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); if (ret != 0) goto free_newinfo; duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(tmp.name, tmp.valid_hooks, - newinfo, tmp.num_counters, - compat_ptr(tmp.counters)); + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1910,7 +1841,7 @@ compat_do_replace(void __user *user, unsigned int len) static int compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) + unsigned int len) { int ret; @@ -1934,15 +1865,15 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, return ret; } -struct compat_ipt_get_entries -{ +struct compat_ipt_get_entries { char name[IPT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ipt_entry entrytable[0]; }; -static int compat_copy_entries_to_user(unsigned int total_size, - struct xt_table *table, void __user *userptr) +static int +compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, + void __user *userptr) { struct xt_counters *counters; struct xt_table_info *private = table->private; @@ -1978,10 +1909,8 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) struct compat_ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %u\n", - *len, (unsigned int)sizeof(get)); + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); return -EINVAL; } @@ -1989,9 +1918,8 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) return -EFAULT; if (*len != sizeof(struct compat_ipt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %u\n", *len, - (unsigned int)(sizeof(struct compat_ipt_get_entries) + - get.size)); + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); return -EINVAL; } @@ -2000,19 +1928,17 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) if (t && !IS_ERR(t)) { struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", - private->number); + duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, - t, uptr->entrytable); + t, uptr->entrytable); } else if (!ret) { duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, - get.size); + private->size, get.size); ret = -EINVAL; } - compat_flush_offsets(); + xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); } else @@ -2047,7 +1973,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) #endif static int -do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; @@ -2126,7 +2052,7 @@ int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) { int ret; struct xt_table_info *newinfo; - static struct xt_table_info bootstrap + struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; @@ -2134,9 +2060,7 @@ int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) if (!newinfo) return -ENOMEM; - /* choose the copy on our node/cpu - * but dont care of preemption - */ + /* choose the copy on our node/cpu, but dont care about preemption */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2178,7 +2102,8 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, u_int8_t type, u_int8_t code, bool invert) { - return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code)) + return ((test_type == 0xFF) || + (type == test_type && code >= min_code && code <= max_code)) ^ invert; } @@ -2219,7 +2144,7 @@ icmp_match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool icmp_checkentry(const char *tablename, - const void *info, + const void *entry, const struct xt_match *match, void *matchinfo, unsigned int hook_mask) @@ -2270,9 +2195,9 @@ static struct xt_match icmp_matchstruct __read_mostly = { .name = "icmp", .match = icmp_match, .matchsize = sizeof(struct ipt_icmp), + .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = AF_INET, - .checkentry = icmp_checkentry, }; static int __init ip_tables_init(void) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2f544dac72d..1b31f7d14d4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -32,7 +32,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables target for CLUSTERIP"); +MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); struct clusterip_config { struct list_head list; /* list of all configs */ @@ -109,11 +109,9 @@ clusterip_config_entry_put(struct clusterip_config *c) static struct clusterip_config * __clusterip_config_find(__be32 clusterip) { - struct list_head *pos; + struct clusterip_config *c; - list_for_each(pos, &clusterip_configs) { - struct clusterip_config *c = list_entry(pos, - struct clusterip_config, list); + list_for_each_entry(c, &clusterip_configs, list) { if (c->clusterip == clusterip) return c; } @@ -275,7 +273,7 @@ clusterip_hashfn(const struct sk_buff *skb, } /* node numbers are 1..n, not 0..n */ - return (hashval % config->num_total_nodes) + 1; + return (((u64)hashval * config->num_total_nodes) >> 32) + 1; } static inline int @@ -289,12 +287,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) ***********************************************************************/ static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +clusterip_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; struct nf_conn *ct; @@ -361,11 +356,9 @@ target(struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +clusterip_tg_check(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { struct ipt_clusterip_tgt_info *cipinfo = targinfo; const struct ipt_entry *e = e_void; @@ -421,7 +414,7 @@ checkentry(const char *tablename, if (nf_ct_l3proto_try_module_get(target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", target->family); + "proto=%u\n", target->family); return false; } @@ -429,7 +422,7 @@ checkentry(const char *tablename, } /* drop reference count of cluster config when rule is deleted */ -static void destroy(const struct xt_target *target, void *targinfo) +static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) { struct ipt_clusterip_tgt_info *cipinfo = targinfo; @@ -456,12 +449,12 @@ struct compat_ipt_clusterip_tgt_info }; #endif /* CONFIG_COMPAT */ -static struct xt_target clusterip_tgt __read_mostly = { +static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", .family = AF_INET, - .target = target, - .checkentry = checkentry, - .destroy = destroy, + .target = clusterip_tg, + .checkentry = clusterip_tg_check, + .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), @@ -558,7 +551,7 @@ arp_mangle(unsigned int hook, return NF_ACCEPT; } -static struct nf_hook_ops cip_arp_ops = { +static struct nf_hook_ops cip_arp_ops __read_mostly = { .hook = arp_mangle, .pf = NF_ARP, .hooknum = NF_ARP_OUT, @@ -714,11 +707,11 @@ static const struct file_operations clusterip_proc_fops = { #endif /* CONFIG_PROC_FS */ -static int __init ipt_clusterip_init(void) +static int __init clusterip_tg_init(void) { int ret; - ret = xt_register_target(&clusterip_tgt); + ret = xt_register_target(&clusterip_tg_reg); if (ret < 0) return ret; @@ -744,11 +737,11 @@ cleanup_hook: nf_unregister_hook(&cip_arp_ops); #endif /* CONFIG_PROC_FS */ cleanup_target: - xt_unregister_target(&clusterip_tgt); + xt_unregister_target(&clusterip_tg_reg); return ret; } -static void __exit ipt_clusterip_fini(void) +static void __exit clusterip_tg_exit(void) { printk(KERN_NOTICE "ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); @@ -756,8 +749,8 @@ static void __exit ipt_clusterip_fini(void) remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); #endif nf_unregister_hook(&cip_arp_ops); - xt_unregister_target(&clusterip_tgt); + xt_unregister_target(&clusterip_tg_reg); } -module_init(ipt_clusterip_init); -module_exit(ipt_clusterip_fini); +module_init(clusterip_tg_init); +module_exit(clusterip_tg_exit); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index add110060a2..21395bc2b27 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -21,7 +21,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables ECN modification module"); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification"); /* set ECT codepoint from IP header. * return false if there was an error. */ @@ -38,7 +38,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo) oldtos = iph->tos; iph->tos &= ~IPT_ECN_IP_MASK; iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); - nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); + csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); } return true; } @@ -71,18 +71,15 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) if (einfo->operation & IPT_ECN_OP_SET_CWR) tcph->cwr = einfo->proto.tcp.cwr; - nf_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + inet_proto_csum_replace2(&tcph->check, skb, + oldval, ((__be16 *)tcph)[6], 0); return true; } static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +ecn_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_ECN_info *einfo = targinfo; @@ -99,11 +96,9 @@ target(struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +ecn_tg_check(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo; const struct ipt_entry *e = e_void; @@ -127,25 +122,25 @@ checkentry(const char *tablename, return true; } -static struct xt_target ipt_ecn_reg __read_mostly = { +static struct xt_target ecn_tg_reg __read_mostly = { .name = "ECN", .family = AF_INET, - .target = target, + .target = ecn_tg, .targetsize = sizeof(struct ipt_ECN_info), .table = "mangle", - .checkentry = checkentry, + .checkentry = ecn_tg_check, .me = THIS_MODULE, }; -static int __init ipt_ecn_init(void) +static int __init ecn_tg_init(void) { - return xt_register_target(&ipt_ecn_reg); + return xt_register_target(&ecn_tg_reg); } -static void __exit ipt_ecn_fini(void) +static void __exit ecn_tg_exit(void) { - xt_unregister_target(&ipt_ecn_reg); + xt_unregister_target(&ecn_tg_reg); } -module_init(ipt_ecn_init); -module_exit(ipt_ecn_fini); +module_init(ecn_tg_init); +module_exit(ecn_tg_exit); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 4b5e8216a4e..b38d7850f50 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -22,10 +22,11 @@ #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_LOG.h> +#include <net/netfilter/nf_log.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables syslog logging module"); +MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); /* Use lock to serialize, so printks don't overlap */ static DEFINE_SPINLOCK(log_lock); @@ -337,7 +338,9 @@ static void dump_packet(const struct nf_loginfo *info, if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + printk("UID=%u GID=%u", + skb->sk->sk_socket->file->f_uid, + skb->sk->sk_socket->file->f_gid); read_unlock_bh(&skb->sk->sk_callback_lock); } @@ -418,12 +421,9 @@ ipt_log_packet(unsigned int pf, } static unsigned int -ipt_log_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +log_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_log_info *loginfo = targinfo; struct nf_loginfo li; @@ -437,11 +437,10 @@ ipt_log_target(struct sk_buff *skb, return XT_CONTINUE; } -static bool ipt_log_checkentry(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +log_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_log_info *loginfo = targinfo; @@ -457,37 +456,37 @@ static bool ipt_log_checkentry(const char *tablename, return true; } -static struct xt_target ipt_log_reg __read_mostly = { +static struct xt_target log_tg_reg __read_mostly = { .name = "LOG", .family = AF_INET, - .target = ipt_log_target, + .target = log_tg, .targetsize = sizeof(struct ipt_log_info), - .checkentry = ipt_log_checkentry, + .checkentry = log_tg_check, .me = THIS_MODULE, }; -static struct nf_logger ipt_log_logger ={ +static const struct nf_logger ipt_log_logger ={ .name = "ipt_LOG", .logfn = &ipt_log_packet, .me = THIS_MODULE, }; -static int __init ipt_log_init(void) +static int __init log_tg_init(void) { int ret; - ret = xt_register_target(&ipt_log_reg); + ret = xt_register_target(&log_tg_reg); if (ret < 0) return ret; nf_log_register(PF_INET, &ipt_log_logger); return 0; } -static void __exit ipt_log_fini(void) +static void __exit log_tg_exit(void) { nf_log_unregister(&ipt_log_logger); - xt_unregister_target(&ipt_log_reg); + xt_unregister_target(&log_tg_reg); } -module_init(ipt_log_init); -module_exit(ipt_log_fini); +module_init(log_tg_init); +module_exit(log_tg_exit); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 44b516e7cb7..d80fee8327e 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -25,18 +25,16 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables MASQUERADE target module"); +MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); /* Lock protects masq region inside conntrack */ static DEFINE_RWLOCK(masq_lock); /* FIXME: Multiple targets. --RR */ static bool -masquerade_check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +masquerade_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct nf_nat_multi_range_compat *mr = targinfo; @@ -52,12 +50,9 @@ masquerade_check(const char *tablename, } static unsigned int -masquerade_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +masquerade_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct nf_conn *ct; struct nf_conn_nat *nat; @@ -67,7 +62,7 @@ masquerade_target(struct sk_buff *skb, const struct rtable *rt; __be32 newsrc; - NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING); + NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); nat = nfct_nat(ct); @@ -100,7 +95,7 @@ masquerade_target(struct sk_buff *skb, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); } static int @@ -166,22 +161,22 @@ static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; -static struct xt_target masquerade __read_mostly = { +static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", .family = AF_INET, - .target = masquerade_target, + .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = 1 << NF_IP_POST_ROUTING, - .checkentry = masquerade_check, + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg_check, .me = THIS_MODULE, }; -static int __init ipt_masquerade_init(void) +static int __init masquerade_tg_init(void) { int ret; - ret = xt_register_target(&masquerade); + ret = xt_register_target(&masquerade_tg_reg); if (ret == 0) { /* Register for device down reports */ @@ -193,12 +188,12 @@ static int __init ipt_masquerade_init(void) return ret; } -static void __exit ipt_masquerade_fini(void) +static void __exit masquerade_tg_exit(void) { - xt_unregister_target(&masquerade); + xt_unregister_target(&masquerade_tg_reg); unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); } -module_init(ipt_masquerade_init); -module_exit(ipt_masquerade_fini); +module_init(masquerade_tg_init); +module_exit(masquerade_tg_exit); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index f8699291e33..6739abfd152 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -20,14 +20,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); -MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target"); +MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); static bool -check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +netmap_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct nf_nat_multi_range_compat *mr = targinfo; @@ -43,12 +41,9 @@ check(const char *tablename, } static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +netmap_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -56,14 +51,14 @@ target(struct sk_buff *skb, const struct nf_nat_multi_range_compat *mr = targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_POST_ROUTING - || hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING + || hooknum == NF_INET_POST_ROUTING + || hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) + if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT) new_ip = ip_hdr(skb)->daddr & ~netmask; else new_ip = ip_hdr(skb)->saddr & ~netmask; @@ -75,30 +70,31 @@ target(struct sk_buff *skb, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum)); } -static struct xt_target target_module __read_mostly = { +static struct xt_target netmap_tg_reg __read_mostly = { .name = "NETMAP", .family = AF_INET, - .target = target, + .target = netmap_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | - (1 << NF_IP_LOCAL_OUT), - .checkentry = check, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = netmap_tg_check, .me = THIS_MODULE }; -static int __init ipt_netmap_init(void) +static int __init netmap_tg_init(void) { - return xt_register_target(&target_module); + return xt_register_target(&netmap_tg_reg); } -static void __exit ipt_netmap_fini(void) +static void __exit netmap_tg_exit(void) { - xt_unregister_target(&target_module); + xt_unregister_target(&netmap_tg_reg); } -module_init(ipt_netmap_init); -module_exit(ipt_netmap_fini); +module_init(netmap_tg_init); +module_exit(netmap_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index f7cf7d61a2d..5c6292449d1 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -23,15 +23,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables REDIRECT target module"); +MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); /* FIXME: Take multiple ranges --RR */ static bool -redirect_check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +redirect_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct nf_nat_multi_range_compat *mr = targinfo; @@ -47,12 +45,9 @@ redirect_check(const char *tablename, } static unsigned int -redirect_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +redirect_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -60,14 +55,14 @@ redirect_target(struct sk_buff *skb, const struct nf_nat_multi_range_compat *mr = targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING + || hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); /* Local packets: make them go to loopback */ - if (hooknum == NF_IP_LOCAL_OUT) + if (hooknum == NF_INET_LOCAL_OUT) newdst = htonl(0x7F000001); else { struct in_device *indev; @@ -92,29 +87,29 @@ redirect_target(struct sk_buff *skb, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST); } -static struct xt_target redirect_reg __read_mostly = { +static struct xt_target redirect_tg_reg __read_mostly = { .name = "REDIRECT", .family = AF_INET, - .target = redirect_target, + .target = redirect_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), - .checkentry = redirect_check, + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), + .checkentry = redirect_tg_check, .me = THIS_MODULE, }; -static int __init ipt_redirect_init(void) +static int __init redirect_tg_init(void) { - return xt_register_target(&redirect_reg); + return xt_register_target(&redirect_tg_reg); } -static void __exit ipt_redirect_fini(void) +static void __exit redirect_tg_exit(void) { - xt_unregister_target(&redirect_reg); + xt_unregister_target(&redirect_tg_reg); } -module_init(ipt_redirect_init); -module_exit(ipt_redirect_fini); +module_init(redirect_tg_init); +module_exit(redirect_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index dcf4d21d511..22606e2baa1 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -29,17 +29,14 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables REJECT target module"); +MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); /* Send RST reply */ static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - struct iphdr *niph; + struct iphdr *oiph, *niph; struct tcphdr _otcph, *oth, *tcph; - __be16 tmp_port; - __be32 tmp_addr; - int needs_ack; unsigned int addr_type; /* IP header checks: fragment. */ @@ -58,99 +55,73 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* Check checksum */ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) return; + oiph = ip_hdr(oldskb); - /* We need a linear, writeable skb. We also need to expand - headroom in case hh_len of incoming interface < hh_len of - outgoing interface */ - nskb = skb_copy_expand(oldskb, LL_MAX_HEADER, skb_tailroom(oldskb), - GFP_ATOMIC); + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); if (!nskb) return; - /* This packet will not be the same as the other: clear nf fields */ - nf_reset(nskb); - nskb->mark = 0; - skb_init_secmark(nskb); - - skb_shinfo(nskb)->gso_size = 0; - skb_shinfo(nskb)->gso_segs = 0; - skb_shinfo(nskb)->gso_type = 0; - - tcph = (struct tcphdr *)(skb_network_header(nskb) + ip_hdrlen(nskb)); - - /* Swap source and dest */ - niph = ip_hdr(nskb); - tmp_addr = niph->saddr; - niph->saddr = niph->daddr; - niph->daddr = tmp_addr; - tmp_port = tcph->source; - tcph->source = tcph->dest; - tcph->dest = tmp_port; - - /* Truncate to length (no data) */ - tcph->doff = sizeof(struct tcphdr)/4; - skb_trim(nskb, ip_hdrlen(nskb) + sizeof(struct tcphdr)); - niph->tot_len = htons(nskb->len); - - if (tcph->ack) { - needs_ack = 0; + skb_reserve(nskb, LL_MAX_HEADER); + + skb_reset_network_header(nskb); + niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + niph->version = 4; + niph->ihl = sizeof(struct iphdr) / 4; + niph->tos = 0; + niph->id = 0; + niph->frag_off = htons(IP_DF); + niph->protocol = IPPROTO_TCP; + niph->check = 0; + niph->saddr = oiph->daddr; + niph->daddr = oiph->saddr; + + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + memset(tcph, 0, sizeof(*tcph)); + tcph->source = oth->dest; + tcph->dest = oth->source; + tcph->doff = sizeof(struct tcphdr) / 4; + + if (oth->ack) tcph->seq = oth->ack_seq; - tcph->ack_seq = 0; - } else { - needs_ack = 1; + else { tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + oldskb->len - ip_hdrlen(oldskb) - (oth->doff << 2)); - tcph->seq = 0; + tcph->ack = 1; } - /* Reset flags */ - ((u_int8_t *)tcph)[13] = 0; - tcph->rst = 1; - tcph->ack = needs_ack; - - tcph->window = 0; - tcph->urg_ptr = 0; - - /* Adjust TCP checksum */ - tcph->check = 0; - tcph->check = tcp_v4_check(sizeof(struct tcphdr), - niph->saddr, niph->daddr, - csum_partial(tcph, - sizeof(struct tcphdr), 0)); - - /* Set DF, id = 0 */ - niph->frag_off = htons(IP_DF); - niph->id = 0; + tcph->rst = 1; + tcph->check = tcp_v4_check(sizeof(struct tcphdr), + niph->saddr, niph->daddr, + csum_partial(tcph, + sizeof(struct tcphdr), 0)); addr_type = RTN_UNSPEC; - if (hook != NF_IP_FORWARD + if (hook != NF_INET_FORWARD #ifdef CONFIG_BRIDGE_NETFILTER || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) #endif ) addr_type = RTN_LOCAL; + /* ip_route_me_harder expects skb->dst to be set */ + dst_hold(oldskb->dst); + nskb->dst = oldskb->dst; + if (ip_route_me_harder(nskb, addr_type)) goto free_nskb; + niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); nskb->ip_summed = CHECKSUM_NONE; - /* Adjust IP TTL */ - niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); - - /* Adjust IP checksum */ - niph->check = 0; - niph->check = ip_fast_csum(skb_network_header(nskb), niph->ihl); - /* "Never happens" */ if (nskb->len > dst_mtu(nskb->dst)) goto free_nskb; nf_ct_attach(nskb, oldskb); - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, - dst_output); + ip_local_out(nskb); return; free_nskb: @@ -162,20 +133,13 @@ static inline void send_unreach(struct sk_buff *skb_in, int code) icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); } -static unsigned int reject(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +reject_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_reject_info *reject = targinfo; - /* Our naive response construction doesn't deal with IP - options, and probably shouldn't try. */ - if (ip_hdrlen(skb) != sizeof(struct iphdr)) - return NF_DROP; - /* WARNING: This code causes reentry within iptables. This means that the iptables jump stack is now crap. We must return an absolute verdict. --RR */ @@ -211,11 +175,10 @@ static unsigned int reject(struct sk_buff *skb, return NF_DROP; } -static bool check(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +reject_tg_check(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_reject_info *rejinfo = targinfo; const struct ipt_entry *e = e_void; @@ -234,27 +197,27 @@ static bool check(const char *tablename, return true; } -static struct xt_target ipt_reject_reg __read_mostly = { +static struct xt_target reject_tg_reg __read_mostly = { .name = "REJECT", .family = AF_INET, - .target = reject, + .target = reject_tg, .targetsize = sizeof(struct ipt_reject_info), .table = "filter", - .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT), - .checkentry = check, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = reject_tg_check, .me = THIS_MODULE, }; -static int __init ipt_reject_init(void) +static int __init reject_tg_init(void) { - return xt_register_target(&ipt_reject_reg); + return xt_register_target(&reject_tg_reg); } -static void __exit ipt_reject_fini(void) +static void __exit reject_tg_exit(void) { - xt_unregister_target(&ipt_reject_reg); + xt_unregister_target(&reject_tg_reg); } -module_init(ipt_reject_init); -module_exit(ipt_reject_fini); +module_init(reject_tg_init); +module_exit(reject_tg_exit); diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c deleted file mode 100644 index 8988571436b..00000000000 --- a/net/ipv4/netfilter/ipt_SAME.c +++ /dev/null @@ -1,179 +0,0 @@ -/* Same. Just like SNAT, only try to make the connections - * between client A and server B always have the same source ip. - * - * (C) 2000 Paul `Rusty' Russell - * (C) 2001 Martin Josefsson - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/timer.h> -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/netdevice.h> -#include <linux/if.h> -#include <linux/inetdevice.h> -#include <net/protocol.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat_rule.h> -#include <linux/netfilter_ipv4/ipt_SAME.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>"); -MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip"); - -static bool -same_check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - unsigned int count, countess, rangeip, index = 0; - struct ipt_same_info *mr = targinfo; - - mr->ipnum = 0; - - if (mr->rangesize < 1) { - pr_debug("same_check: need at least one dest range.\n"); - return false; - } - if (mr->rangesize > IPT_SAME_MAX_RANGE) { - pr_debug("same_check: too many ranges specified, maximum " - "is %u ranges\n", IPT_SAME_MAX_RANGE); - return false; - } - for (count = 0; count < mr->rangesize; count++) { - if (ntohl(mr->range[count].min_ip) > - ntohl(mr->range[count].max_ip)) { - pr_debug("same_check: min_ip is larger than max_ip in " - "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n", - NIPQUAD(mr->range[count].min_ip), - NIPQUAD(mr->range[count].max_ip)); - return false; - } - if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) { - pr_debug("same_check: bad MAP_IPS.\n"); - return false; - } - rangeip = (ntohl(mr->range[count].max_ip) - - ntohl(mr->range[count].min_ip) + 1); - mr->ipnum += rangeip; - - pr_debug("same_check: range %u, ipnum = %u\n", count, rangeip); - } - pr_debug("same_check: total ipaddresses = %u\n", mr->ipnum); - - mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL); - if (!mr->iparray) { - pr_debug("same_check: Couldn't allocate %Zu bytes " - "for %u ipaddresses!\n", - (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); - return false; - } - pr_debug("same_check: Allocated %Zu bytes for %u ipaddresses.\n", - (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); - - for (count = 0; count < mr->rangesize; count++) { - for (countess = ntohl(mr->range[count].min_ip); - countess <= ntohl(mr->range[count].max_ip); - countess++) { - mr->iparray[index] = countess; - pr_debug("same_check: Added ipaddress `%u.%u.%u.%u' " - "in index %u.\n", HIPQUAD(countess), index); - index++; - } - } - return true; -} - -static void -same_destroy(const struct xt_target *target, void *targinfo) -{ - struct ipt_same_info *mr = targinfo; - - kfree(mr->iparray); - - pr_debug("same_destroy: Deallocated %Zu bytes for %u ipaddresses.\n", - (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); -} - -static unsigned int -same_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - u_int32_t tmpip, aindex; - __be32 new_ip; - const struct ipt_same_info *same = targinfo; - struct nf_nat_range newrange; - const struct nf_conntrack_tuple *t; - - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || - hooknum == NF_IP_POST_ROUTING); - ct = nf_ct_get(skb, &ctinfo); - - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - - /* Base new source on real src ip and optionally dst ip, - giving some hope for consistency across reboots. - Here we calculate the index in same->iparray which - holds the ipaddress we should use */ - - tmpip = ntohl(t->src.u3.ip); - - if (!(same->info & IPT_SAME_NODST)) - tmpip += ntohl(t->dst.u3.ip); - aindex = tmpip % same->ipnum; - - new_ip = htonl(same->iparray[aindex]); - - pr_debug("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, " - "new src=%u.%u.%u.%u\n", - NIPQUAD(t->src.u3.ip), NIPQUAD(t->dst.u3.ip), NIPQUAD(new_ip)); - - /* Transfer from original range. */ - newrange = ((struct nf_nat_range) - { same->range[0].flags, new_ip, new_ip, - /* FIXME: Use ports from correct range! */ - same->range[0].min, same->range[0].max }); - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); -} - -static struct xt_target same_reg __read_mostly = { - .name = "SAME", - .family = AF_INET, - .target = same_target, - .targetsize = sizeof(struct ipt_same_info), - .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING), - .checkentry = same_check, - .destroy = same_destroy, - .me = THIS_MODULE, -}; - -static int __init ipt_same_init(void) -{ - return xt_register_target(&same_reg); -} - -static void __exit ipt_same_fini(void) -{ - xt_unregister_target(&same_reg); -} - -module_init(ipt_same_init); -module_exit(ipt_same_fini); - diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c deleted file mode 100644 index d4573baa7f2..00000000000 --- a/net/ipv4/netfilter/ipt_TOS.c +++ /dev/null @@ -1,87 +0,0 @@ -/* This is a module which is used for setting the TOS field of a packet. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/checksum.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_TOS.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables TOS mangling module"); - -static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - const struct ipt_tos_target_info *tosinfo = targinfo; - struct iphdr *iph = ip_hdr(skb); - - if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { - __u8 oldtos; - if (!skb_make_writable(skb, sizeof(struct iphdr))) - return NF_DROP; - iph = ip_hdr(skb); - oldtos = iph->tos; - iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos; - nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); - } - return XT_CONTINUE; -} - -static bool -checkentry(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; - - if (tos != IPTOS_LOWDELAY - && tos != IPTOS_THROUGHPUT - && tos != IPTOS_RELIABILITY - && tos != IPTOS_MINCOST - && tos != IPTOS_NORMALSVC) { - printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); - return false; - } - return true; -} - -static struct xt_target ipt_tos_reg __read_mostly = { - .name = "TOS", - .family = AF_INET, - .target = target, - .targetsize = sizeof(struct ipt_tos_target_info), - .table = "mangle", - .checkentry = checkentry, - .me = THIS_MODULE, -}; - -static int __init ipt_tos_init(void) -{ - return xt_register_target(&ipt_tos_reg); -} - -static void __exit ipt_tos_fini(void) -{ - xt_unregister_target(&ipt_tos_reg); -} - -module_init(ipt_tos_init); -module_exit(ipt_tos_fini); diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index c620a052766..30eed65e733 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -16,14 +16,13 @@ #include <linux/netfilter_ipv4/ipt_TTL.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("IP tables TTL modification module"); +MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target"); MODULE_LICENSE("GPL"); static unsigned int -ipt_ttl_target(struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - unsigned int hooknum, const struct xt_target *target, - const void *targinfo) +ttl_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct iphdr *iph; const struct ipt_TTL_info *info = targinfo; @@ -54,19 +53,18 @@ ipt_ttl_target(struct sk_buff *skb, } if (new_ttl != iph->ttl) { - nf_csum_replace2(&iph->check, htons(iph->ttl << 8), - htons(new_ttl << 8)); + csum_replace2(&iph->check, htons(iph->ttl << 8), + htons(new_ttl << 8)); iph->ttl = new_ttl; } return XT_CONTINUE; } -static bool ipt_ttl_checkentry(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +ttl_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_TTL_info *info = targinfo; @@ -80,25 +78,25 @@ static bool ipt_ttl_checkentry(const char *tablename, return true; } -static struct xt_target ipt_TTL __read_mostly = { +static struct xt_target ttl_tg_reg __read_mostly = { .name = "TTL", .family = AF_INET, - .target = ipt_ttl_target, + .target = ttl_tg, .targetsize = sizeof(struct ipt_TTL_info), .table = "mangle", - .checkentry = ipt_ttl_checkentry, + .checkentry = ttl_tg_check, .me = THIS_MODULE, }; -static int __init ipt_ttl_init(void) +static int __init ttl_tg_init(void) { - return xt_register_target(&ipt_TTL); + return xt_register_target(&ttl_tg_reg); } -static void __exit ipt_ttl_fini(void) +static void __exit ttl_tg_exit(void) { - xt_unregister_target(&ipt_TTL); + xt_unregister_target(&ttl_tg_reg); } -module_init(ipt_ttl_init); -module_exit(ipt_ttl_fini); +module_init(ttl_tg_init); +module_exit(ttl_tg_exit); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 212b830765a..b192756c6d0 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -43,13 +43,14 @@ #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_ULOG.h> +#include <net/netfilter/nf_log.h> #include <net/sock.h> #include <linux/bitops.h> #include <asm/unaligned.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("iptables userspace logging module"); +MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define ULOG_NL_EVENT 111 /* Harald's favorite number */ @@ -279,12 +280,10 @@ alloc_failure: spin_unlock_bh(&ulog_lock); } -static unsigned int ipt_ulog_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +ulog_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; @@ -318,11 +317,10 @@ static void ipt_logfn(unsigned int pf, ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } -static bool ipt_ulog_checkentry(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hookmask) +static bool +ulog_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hookmask) { const struct ipt_ulog_info *loginfo = targinfo; @@ -347,7 +345,7 @@ struct compat_ipt_ulog_info { char prefix[ULOG_PREFIX_LEN]; }; -static void compat_from_user(void *dst, void *src) +static void ulog_tg_compat_from_user(void *dst, void *src) { const struct compat_ipt_ulog_info *cl = src; struct ipt_ulog_info l = { @@ -360,7 +358,7 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &l, sizeof(l)); } -static int compat_to_user(void __user *dst, void *src) +static int ulog_tg_compat_to_user(void __user *dst, void *src) { const struct ipt_ulog_info *l = src; struct compat_ipt_ulog_info cl = { @@ -374,16 +372,16 @@ static int compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_target ipt_ulog_reg __read_mostly = { +static struct xt_target ulog_tg_reg __read_mostly = { .name = "ULOG", .family = AF_INET, - .target = ipt_ulog_target, + .target = ulog_tg, .targetsize = sizeof(struct ipt_ulog_info), - .checkentry = ipt_ulog_checkentry, + .checkentry = ulog_tg_check, #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_ipt_ulog_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = ulog_tg_compat_from_user, + .compat_to_user = ulog_tg_compat_to_user, #endif .me = THIS_MODULE, }; @@ -394,7 +392,7 @@ static struct nf_logger ipt_ulog_logger = { .me = THIS_MODULE, }; -static int __init ipt_ulog_init(void) +static int __init ulog_tg_init(void) { int ret, i; @@ -415,9 +413,9 @@ static int __init ipt_ulog_init(void) if (!nflognl) return -ENOMEM; - ret = xt_register_target(&ipt_ulog_reg); + ret = xt_register_target(&ulog_tg_reg); if (ret < 0) { - sock_release(nflognl->sk_socket); + netlink_kernel_release(nflognl); return ret; } if (nflog) @@ -426,7 +424,7 @@ static int __init ipt_ulog_init(void) return 0; } -static void __exit ipt_ulog_fini(void) +static void __exit ulog_tg_exit(void) { ulog_buff_t *ub; int i; @@ -435,8 +433,8 @@ static void __exit ipt_ulog_fini(void) if (nflog) nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ipt_ulog_reg); - sock_release(nflognl->sk_socket); + xt_unregister_target(&ulog_tg_reg); + netlink_kernel_release(nflognl); /* remove pending timers and free allocated skb's */ for (i = 0; i < ULOG_MAXNLGROUPS; i++) { @@ -453,5 +451,5 @@ static void __exit ipt_ulog_fini(void) } } -module_init(ipt_ulog_init); -module_exit(ipt_ulog_fini); +module_init(ulog_tg_init); +module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index 59f01f7ba6b..49587a49722 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -2,6 +2,7 @@ * iptables module to match inet_addr_type() of an ip. * * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> + * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,47 +21,119 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("iptables addrtype match"); +MODULE_DESCRIPTION("Xtables: address type match for IPv4"); -static inline bool match_type(__be32 addr, u_int16_t mask) +static inline bool match_type(const struct net_device *dev, __be32 addr, + u_int16_t mask) { - return !!(mask & (1 << inet_addr_type(addr))); + return !!(mask & (1 << inet_dev_addr_type(&init_net, dev, addr))); } -static bool match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct ipt_addrtype_info *info = matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; if (info->source) - ret &= match_type(iph->saddr, info->source)^info->invert_source; + ret &= match_type(NULL, iph->saddr, info->source) ^ + info->invert_source; if (info->dest) - ret &= match_type(iph->daddr, info->dest)^info->invert_dest; + ret &= match_type(NULL, iph->daddr, info->dest) ^ + info->invert_dest; return ret; } -static struct xt_match addrtype_match __read_mostly = { - .name = "addrtype", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_addrtype_info), - .me = THIS_MODULE +static bool +addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct ipt_addrtype_info_v1 *info = matchinfo; + const struct iphdr *iph = ip_hdr(skb); + const struct net_device *dev = NULL; + bool ret = true; + + if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) + dev = in; + else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) + dev = out; + + if (info->source) + ret &= match_type(dev, iph->saddr, info->source) ^ + (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type(dev, iph->daddr, info->dest) ^ + (info->flags & IPT_ADDRTYPE_INVERT_DEST); + return ret; +} + +static bool +addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) +{ + struct ipt_addrtype_info_v1 *info = matchinfo; + + if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { + printk(KERN_ERR "ipt_addrtype: both incoming and outgoing " + "interface limitation cannot be selected\n"); + return false; + } + + if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) && + info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { + printk(KERN_ERR "ipt_addrtype: output interface limitation " + "not valid in PRE_ROUTING and INPUT\n"); + return false; + } + + if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) && + info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { + printk(KERN_ERR "ipt_addrtype: input interface limitation " + "not valid in POST_ROUTING and OUTPUT\n"); + return false; + } + + return true; +} + +static struct xt_match addrtype_mt_reg[] __read_mostly = { + { + .name = "addrtype", + .family = AF_INET, + .match = addrtype_mt_v0, + .matchsize = sizeof(struct ipt_addrtype_info), + .me = THIS_MODULE + }, + { + .name = "addrtype", + .family = AF_INET, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct ipt_addrtype_info_v1), + .me = THIS_MODULE + } }; -static int __init ipt_addrtype_init(void) +static int __init addrtype_mt_init(void) { - return xt_register_match(&addrtype_match); + return xt_register_matches(addrtype_mt_reg, + ARRAY_SIZE(addrtype_mt_reg)); } -static void __exit ipt_addrtype_fini(void) +static void __exit addrtype_mt_exit(void) { - xt_unregister_match(&addrtype_match); + xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); } -module_init(ipt_addrtype_init); -module_exit(ipt_addrtype_fini); +module_init(addrtype_mt_init); +module_exit(addrtype_mt_exit); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index 61b017fd743..e977989629c 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -16,7 +16,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); -MODULE_DESCRIPTION("iptables AH SPI match module"); +MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match"); #ifdef DEBUG_CONNTRACK #define duprintf(format, args...) printk(format , ## args) @@ -37,14 +37,9 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +ah_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; @@ -72,11 +67,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *ip_void, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +ah_mt_check(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_ah *ahinfo = matchinfo; @@ -88,25 +81,25 @@ checkentry(const char *tablename, return true; } -static struct xt_match ah_match __read_mostly = { +static struct xt_match ah_mt_reg __read_mostly = { .name = "ah", .family = AF_INET, - .match = match, + .match = ah_mt, .matchsize = sizeof(struct ipt_ah), .proto = IPPROTO_AH, - .checkentry = checkentry, + .checkentry = ah_mt_check, .me = THIS_MODULE, }; -static int __init ipt_ah_init(void) +static int __init ah_mt_init(void) { - return xt_register_match(&ah_match); + return xt_register_match(&ah_mt_reg); } -static void __exit ipt_ah_fini(void) +static void __exit ah_mt_exit(void) { - xt_unregister_match(&ah_match); + xt_unregister_match(&ah_mt_reg); } -module_init(ipt_ah_init); -module_exit(ipt_ah_fini); +module_init(ah_mt_init); +module_exit(ah_mt_exit); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index d6925c67406..749de8284ce 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -19,7 +19,7 @@ #include <linux/netfilter_ipv4/ipt_ecn.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables ECN matching module"); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4"); MODULE_LICENSE("GPL"); static inline bool match_ip(const struct sk_buff *skb, @@ -67,10 +67,10 @@ static inline bool match_tcp(const struct sk_buff *skb, return true; } -static bool match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +ecn_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct ipt_ecn_info *info = matchinfo; @@ -88,9 +88,10 @@ static bool match(const struct sk_buff *skb, return true; } -static bool checkentry(const char *tablename, const void *ip_void, - const struct xt_match *match, - void *matchinfo, unsigned int hook_mask) +static bool +ecn_mt_check(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_ecn_info *info = matchinfo; const struct ipt_ip *ip = ip_void; @@ -111,24 +112,24 @@ static bool checkentry(const char *tablename, const void *ip_void, return true; } -static struct xt_match ecn_match __read_mostly = { +static struct xt_match ecn_mt_reg __read_mostly = { .name = "ecn", .family = AF_INET, - .match = match, + .match = ecn_mt, .matchsize = sizeof(struct ipt_ecn_info), - .checkentry = checkentry, + .checkentry = ecn_mt_check, .me = THIS_MODULE, }; -static int __init ipt_ecn_init(void) +static int __init ecn_mt_init(void) { - return xt_register_match(&ecn_match); + return xt_register_match(&ecn_mt_reg); } -static void __exit ipt_ecn_fini(void) +static void __exit ecn_mt_exit(void) { - xt_unregister_match(&ecn_match); + xt_unregister_match(&ecn_mt_reg); } -module_init(ipt_ecn_init); -module_exit(ipt_ecn_fini); +module_init(ecn_mt_init); +module_exit(ecn_mt_exit); diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c deleted file mode 100644 index 0106dc955a6..00000000000 --- a/net/ipv4/netfilter/ipt_iprange.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * iptables module to match IP address ranges - * - * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_iprange.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("iptables arbitrary IP range match module"); - -static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) -{ - const struct ipt_iprange_info *info = matchinfo; - const struct iphdr *iph = ip_hdr(skb); - - if (info->flags & IPRANGE_SRC) { - if ((ntohl(iph->saddr) < ntohl(info->src.min_ip) - || ntohl(iph->saddr) > ntohl(info->src.max_ip)) - ^ !!(info->flags & IPRANGE_SRC_INV)) { - pr_debug("src IP %u.%u.%u.%u NOT in range %s" - "%u.%u.%u.%u-%u.%u.%u.%u\n", - NIPQUAD(iph->saddr), - info->flags & IPRANGE_SRC_INV ? "(INV) " : "", - NIPQUAD(info->src.min_ip), - NIPQUAD(info->src.max_ip)); - return false; - } - } - if (info->flags & IPRANGE_DST) { - if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip) - || ntohl(iph->daddr) > ntohl(info->dst.max_ip)) - ^ !!(info->flags & IPRANGE_DST_INV)) { - pr_debug("dst IP %u.%u.%u.%u NOT in range %s" - "%u.%u.%u.%u-%u.%u.%u.%u\n", - NIPQUAD(iph->daddr), - info->flags & IPRANGE_DST_INV ? "(INV) " : "", - NIPQUAD(info->dst.min_ip), - NIPQUAD(info->dst.max_ip)); - return false; - } - } - return true; -} - -static struct xt_match iprange_match __read_mostly = { - .name = "iprange", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_iprange_info), - .me = THIS_MODULE -}; - -static int __init ipt_iprange_init(void) -{ - return xt_register_match(&iprange_match); -} - -static void __exit ipt_iprange_fini(void) -{ - xt_unregister_match(&iprange_match); -} - -module_init(ipt_iprange_init); -module_exit(ipt_iprange_fini); diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c deleted file mode 100644 index b14e77da7a3..00000000000 --- a/net/ipv4/netfilter/ipt_owner.c +++ /dev/null @@ -1,92 +0,0 @@ -/* Kernel module to match various things tied to sockets associated with - locally generated outgoing packets. */ - -/* (C) 2000 Marc Boucher <marc@mbsi.ca> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/file.h> -#include <linux/rcupdate.h> -#include <net/sock.h> - -#include <linux/netfilter_ipv4/ipt_owner.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables owner match"); - -static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) -{ - const struct ipt_owner_info *info = matchinfo; - - if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file) - return false; - - if(info->match & IPT_OWNER_UID) { - if ((skb->sk->sk_socket->file->f_uid != info->uid) ^ - !!(info->invert & IPT_OWNER_UID)) - return false; - } - - if(info->match & IPT_OWNER_GID) { - if ((skb->sk->sk_socket->file->f_gid != info->gid) ^ - !!(info->invert & IPT_OWNER_GID)) - return false; - } - - return true; -} - -static bool -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) -{ - const struct ipt_owner_info *info = matchinfo; - - if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { - printk("ipt_owner: pid, sid and command matching " - "not supported anymore\n"); - return false; - } - return true; -} - -static struct xt_match owner_match __read_mostly = { - .name = "owner", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_owner_info), - .hooks = (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING), - .checkentry = checkentry, - .me = THIS_MODULE, -}; - -static int __init ipt_owner_init(void) -{ - return xt_register_match(&owner_match); -} - -static void __exit ipt_owner_fini(void) -{ - xt_unregister_match(&owner_match); -} - -module_init(ipt_owner_init); -module_exit(ipt_owner_fini); diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 11d39fb5f38..e3154a99c08 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -30,7 +30,7 @@ #include <linux/netfilter_ipv4/ipt_recent.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("IP tables recently seen matching module"); +MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); MODULE_LICENSE("GPL"); static unsigned int ip_list_tot = 100; @@ -170,10 +170,10 @@ static void recent_table_flush(struct recent_table *t) } static bool -ipt_recent_match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +recent_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; @@ -236,9 +236,9 @@ out: } static bool -ipt_recent_checkentry(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +recent_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; @@ -293,8 +293,7 @@ out: return ret; } -static void -ipt_recent_destroy(const struct xt_match *match, void *matchinfo) +static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; @@ -455,17 +454,17 @@ static const struct file_operations recent_fops = { }; #endif /* CONFIG_PROC_FS */ -static struct xt_match recent_match __read_mostly = { +static struct xt_match recent_mt_reg __read_mostly = { .name = "recent", .family = AF_INET, - .match = ipt_recent_match, + .match = recent_mt, .matchsize = sizeof(struct ipt_recent_info), - .checkentry = ipt_recent_checkentry, - .destroy = ipt_recent_destroy, + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, .me = THIS_MODULE, }; -static int __init ipt_recent_init(void) +static int __init recent_mt_init(void) { int err; @@ -473,27 +472,27 @@ static int __init ipt_recent_init(void) return -EINVAL; ip_list_hash_size = 1 << fls(ip_list_tot); - err = xt_register_match(&recent_match); + err = xt_register_match(&recent_mt_reg); #ifdef CONFIG_PROC_FS if (err) return err; proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); if (proc_dir == NULL) { - xt_unregister_match(&recent_match); + xt_unregister_match(&recent_mt_reg); err = -ENOMEM; } #endif return err; } -static void __exit ipt_recent_exit(void) +static void __exit recent_mt_exit(void) { BUG_ON(!list_empty(&tables)); - xt_unregister_match(&recent_match); + xt_unregister_match(&recent_mt_reg); #ifdef CONFIG_PROC_FS remove_proc_entry("ipt_recent", init_net.proc_net); #endif } -module_init(ipt_recent_init); -module_exit(ipt_recent_exit); +module_init(recent_mt_init); +module_exit(recent_mt_exit); diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c deleted file mode 100644 index e740441c973..00000000000 --- a/net/ipv4/netfilter/ipt_tos.c +++ /dev/null @@ -1,55 +0,0 @@ -/* Kernel module to match TOS values. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter_ipv4/ipt_tos.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("iptables TOS match module"); - -static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) -{ - const struct ipt_tos_info *info = matchinfo; - - return (ip_hdr(skb)->tos == info->tos) ^ info->invert; -} - -static struct xt_match tos_match __read_mostly = { - .name = "tos", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_tos_info), - .me = THIS_MODULE, -}; - -static int __init ipt_multiport_init(void) -{ - return xt_register_match(&tos_match); -} - -static void __exit ipt_multiport_fini(void) -{ - xt_unregister_match(&tos_match); -} - -module_init(ipt_multiport_init); -module_exit(ipt_multiport_fini); diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index a439900a4ba..e0b8caeb710 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -15,13 +15,13 @@ #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("IP tables TTL matching module"); +MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); MODULE_LICENSE("GPL"); -static bool match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +ttl_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct ipt_ttl_info *info = matchinfo; const u8 ttl = ip_hdr(skb)->ttl; @@ -44,23 +44,23 @@ static bool match(const struct sk_buff *skb, return false; } -static struct xt_match ttl_match __read_mostly = { +static struct xt_match ttl_mt_reg __read_mostly = { .name = "ttl", .family = AF_INET, - .match = match, + .match = ttl_mt, .matchsize = sizeof(struct ipt_ttl_info), .me = THIS_MODULE, }; -static int __init ipt_ttl_init(void) +static int __init ttl_mt_init(void) { - return xt_register_match(&ttl_match); + return xt_register_match(&ttl_mt_reg); } -static void __exit ipt_ttl_fini(void) +static void __exit ttl_mt_exit(void) { - xt_unregister_match(&ttl_match); + xt_unregister_match(&ttl_mt_reg); } -module_init(ipt_ttl_init); -module_exit(ipt_ttl_fini); +module_init(ttl_mt_init); +module_exit(ttl_mt_exit); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index ba3262c6043..29bb4f9fbda 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -19,7 +19,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables filter table"); -#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) +#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT)) static struct { @@ -33,14 +35,14 @@ static struct .num_entries = 4, .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_LOCAL_IN] = 0, - [NF_IP_FORWARD] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, }, .underflow = { - [NF_IP_LOCAL_IN] = 0, - [NF_IP_FORWARD] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, }, }, .entries = { @@ -89,26 +91,26 @@ ipt_local_out_hook(unsigned int hook, return ipt_do_table(skb, hook, in, out, &packet_filter); } -static struct nf_hook_ops ipt_ops[] = { +static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_FILTER, }, { .hook = ipt_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_FORWARD, + .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_FILTER, }, { .hook = ipt_local_out_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_FILTER, }, }; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index b4360a69d5c..5c4be202430 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -21,11 +21,11 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mangle table"); -#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \ - (1 << NF_IP_LOCAL_IN) | \ - (1 << NF_IP_FORWARD) | \ - (1 << NF_IP_LOCAL_OUT) | \ - (1 << NF_IP_POST_ROUTING)) +#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_POST_ROUTING)) /* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ static struct @@ -40,18 +40,18 @@ static struct .num_entries = 6, .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4, + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, }, .underflow = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4, + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, }, }, .entries = { @@ -128,40 +128,40 @@ ipt_local_hook(unsigned int hook, return ret; } -static struct nf_hook_ops ipt_ops[] = { +static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_FORWARD, + .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_local_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_MANGLE, }, }; diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index f8678651250..dc34aa27453 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -7,7 +7,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <net/ip.h> -#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) +#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) static struct { @@ -21,12 +21,12 @@ static struct .num_entries = 3, .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) }, .underflow = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) }, }, .entries = { @@ -74,18 +74,18 @@ ipt_local_hook(unsigned int hook, } /* 'raw' is the very first table. */ -static struct nf_hook_ops ipt_ops[] = { +static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_hook, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_RAW, .owner = THIS_MODULE, }, { .hook = ipt_local_hook, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_RAW, .owner = THIS_MODULE, }, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 910dae732a0..ac3d61d8026 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -56,12 +56,6 @@ static int ipv4_print_tuple(struct seq_file *s, NIPQUAD(tuple->dst.u3.ip)); } -static int ipv4_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) { @@ -150,7 +144,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, /* Gather fragments. */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (nf_ct_ipv4_gather_frags(skb, - hooknum == NF_IP_PRE_ROUTING ? + hooknum == NF_INET_PRE_ROUTING ? IP_DEFRAG_CONNTRACK_IN : IP_DEFRAG_CONNTRACK_OUT)) return NF_STOLEN; @@ -185,61 +179,61 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, /* Connection tracking may drop packets, but never alters them, so make it the first hook. */ -static struct nf_hook_ops ipv4_conntrack_ops[] = { +static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_in, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_local, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_help, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_conntrack_help, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, }; @@ -363,10 +357,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NLA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), - &tuple->src.u3.ip); - NLA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), - &tuple->dst.u3.ip); + NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip); + NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip); return 0; nla_put_failure: @@ -384,8 +376,8 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[], if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; - t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_DST]); + t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]); return 0; } @@ -405,7 +397,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .print_tuple = ipv4_print_tuple, - .print_conntrack = ipv4_print_conntrack, .get_l4proto = ipv4_get_l4proto, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = ipv4_tuple_to_nlattr, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 741f3dfaa5a..543c02b74c9 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -121,10 +121,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) return -ENOSPC; - if (l3proto->print_conntrack(s, ct)) - return -ENOSPC; - - if (l4proto->print_conntrack(s, ct)) + if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) return -ENOSPC; if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index adcbaf6d429..4004a04c551 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -18,6 +18,7 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_log.h> static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; @@ -73,13 +74,6 @@ static int icmp_print_tuple(struct seq_file *s, ntohs(tuple->src.u.icmp.id)); } -/* Print out the private part of the conntrack. */ -static int icmp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* Returns verdict for packet, or -1 for invalid. */ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, @@ -128,7 +122,6 @@ static int icmp_new(struct nf_conn *conntrack, return 1; } -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int icmp_error_message(struct sk_buff *skb, @@ -195,7 +188,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, } /* See ip_conntrack_proto_tcp.c */ - if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, @@ -235,12 +228,9 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NLA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), - &t->src.u.icmp.id); - NLA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), - &t->dst.u.icmp.type); - NLA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), - &t->dst.u.icmp.code); + NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id); + NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type); + NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code); return 0; @@ -262,12 +252,9 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], || !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; - tuple->dst.u.icmp.type = - *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = - *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = - *(__be16 *)nla_data(tb[CTA_PROTO_ICMP_ID]); + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); if (tuple->dst.u.icmp.type >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type]) @@ -315,7 +302,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .print_tuple = icmp_print_tuple, - .print_conntrack = icmp_print_conntrack, .packet = icmp_packet, .new = icmp_new, .error = icmp_error, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 86b465b176b..e53ae1ef8f5 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -33,27 +33,28 @@ static DEFINE_RWLOCK(nf_nat_lock); -static struct nf_conntrack_l3proto *l3proto = NULL; +static struct nf_conntrack_l3proto *l3proto __read_mostly; /* Calculated at init based on memory size */ -static unsigned int nf_nat_htable_size; +static unsigned int nf_nat_htable_size __read_mostly; static int nf_nat_vmalloced; -static struct hlist_head *bysource; +static struct hlist_head *bysource __read_mostly; #define MAX_IP_NAT_PROTO 256 -static struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]; +static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] + __read_mostly; -static inline struct nf_nat_protocol * +static inline const struct nf_nat_protocol * __nf_nat_proto_find(u_int8_t protonum) { return rcu_dereference(nf_nat_protos[protonum]); } -struct nf_nat_protocol * +const struct nf_nat_protocol * nf_nat_proto_find_get(u_int8_t protonum) { - struct nf_nat_protocol *p; + const struct nf_nat_protocol *p; rcu_read_lock(); p = __nf_nat_proto_find(protonum); @@ -66,7 +67,7 @@ nf_nat_proto_find_get(u_int8_t protonum) EXPORT_SYMBOL_GPL(nf_nat_proto_find_get); void -nf_nat_proto_put(struct nf_nat_protocol *p) +nf_nat_proto_put(const struct nf_nat_protocol *p) { module_put(p->me); } @@ -76,10 +77,13 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put); static inline unsigned int hash_by_src(const struct nf_conntrack_tuple *tuple) { + unsigned int hash; + /* Original src, to ensure we map it consistently if poss. */ - return jhash_3words((__force u32)tuple->src.u3.ip, + hash = jhash_3words((__force u32)tuple->src.u3.ip, (__force u32)tuple->src.u.all, - tuple->dst.protonum, 0) % nf_nat_htable_size; + tuple->dst.protonum, 0); + return ((u64)hash * nf_nat_htable_size) >> 32; } /* Is this tuple already taken? (not by us) */ @@ -105,7 +109,7 @@ static int in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range) { - struct nf_nat_protocol *proto; + const struct nf_nat_protocol *proto; int ret = 0; /* If we are supposed to map IPs, then we must be in the @@ -210,12 +214,13 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple, maxip = ntohl(range->max_ip); j = jhash_2words((__force u32)tuple->src.u3.ip, (__force u32)tuple->dst.u3.ip, 0); - *var_ipp = htonl(minip + j % (maxip - minip + 1)); + j = ((u64)j * (maxip - minip + 1)) >> 32; + *var_ipp = htonl(minip + j); } -/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING, - * we change the source to map into the range. For NF_IP_PRE_ROUTING - * and NF_IP_LOCAL_OUT, we change the destination to map into the +/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, + * we change the source to map into the range. For NF_INET_PRE_ROUTING + * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __ip_conntrack_confirm and drop the packet. */ @@ -226,7 +231,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { - struct nf_nat_protocol *proto; + const struct nf_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -276,12 +281,11 @@ out: unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, - unsigned int hooknum) + enum nf_nat_manip_type maniptype) { struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); - enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); @@ -293,10 +297,8 @@ nf_nat_setup_info(struct nf_conn *ct, } } - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || - hooknum == NF_IP_POST_ROUTING || - hooknum == NF_IP_LOCAL_IN || - hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC || + maniptype == IP_NAT_MANIP_DST); BUG_ON(nf_nat_initialized(ct, maniptype)); /* What we've got will look like inverse of reply. Normally @@ -355,7 +357,7 @@ manip_pkt(u_int16_t proto, enum nf_nat_manip_type maniptype) { struct iphdr *iph; - struct nf_nat_protocol *p; + const struct nf_nat_protocol *p; if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) return 0; @@ -372,10 +374,10 @@ manip_pkt(u_int16_t proto, iph = (void *)skb->data + iphdroff; if (maniptype == IP_NAT_MANIP_SRC) { - nf_csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); + csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { - nf_csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); + csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return 1; @@ -515,7 +517,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); /* Protocol registration. */ -int nf_nat_protocol_register(struct nf_nat_protocol *proto) +int nf_nat_protocol_register(const struct nf_nat_protocol *proto) { int ret = 0; @@ -532,7 +534,7 @@ int nf_nat_protocol_register(struct nf_nat_protocol *proto) EXPORT_SYMBOL(nf_nat_protocol_register); /* Noone stores the protocol anywhere; simply delete it. */ -void nf_nat_protocol_unregister(struct nf_nat_protocol *proto) +void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) { write_lock_bh(&nf_nat_lock); rcu_assign_pointer(nf_nat_protos[proto->protonum], @@ -547,10 +549,8 @@ int nf_nat_port_range_to_nlattr(struct sk_buff *skb, const struct nf_nat_range *range) { - NLA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), - &range->min.tcp.port); - NLA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), - &range->max.tcp.port); + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.tcp.port); + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.tcp.port); return 0; @@ -568,8 +568,7 @@ nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) if (tb[CTA_PROTONAT_PORT_MIN]) { ret = 1; - range->min.tcp.port = - *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MIN]); + range->min.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); } if (!tb[CTA_PROTONAT_PORT_MAX]) { @@ -577,8 +576,7 @@ nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) range->max.tcp.port = range->min.tcp.port; } else { ret = 1; - range->max.tcp.port = - *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MAX]); + range->max.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); } return ret; diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 93e18ef114f..a121989fdad 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -76,7 +76,7 @@ static int set_addr(struct sk_buff *skb, static int set_h225_addr(struct sk_buff *skb, unsigned char **data, int dataoff, TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) { return set_addr(skb, data, dataoff, taddr->ipAddress.ip, addr->ip, port); @@ -86,7 +86,7 @@ static int set_h225_addr(struct sk_buff *skb, static int set_h245_addr(struct sk_buff *skb, unsigned char **data, int dataoff, H245_TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) { return set_addr(skb, data, dataoff, taddr->unicastAddress.iPAddress.network, @@ -103,7 +103,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; for (i = 0; i < count; i++) { if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) { @@ -155,7 +155,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; for (i = 0; i < count; i++) { if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) && @@ -389,18 +389,14 @@ static void ip_nat_q931_expect(struct nf_conn *new, /* Change src to where master sends to */ range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; - - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = this->saved_proto; range.min_ip = range.max_ip = new->master->tuplehash[!this->dir].tuple.src.u3.ip; - - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); } /****************************************************************************/ @@ -412,7 +408,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; int dir = CTINFO2DIR(ctinfo); u_int16_t nated_port = ntohs(port); - union nf_conntrack_address addr; + union nf_inet_addr addr; /* Set expectations for NAT */ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; @@ -479,17 +475,13 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new, /* Change src to where master sends to */ range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; - - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = this->saved_proto; range.min_ip = range.max_ip = this->saved_ip; - - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); } /****************************************************************************/ diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 8718da00ef2..4c0232842e7 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -20,6 +20,7 @@ #include <linux/netfilter_ipv4.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_protocol.h> @@ -180,8 +181,8 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, datalen, 0)); } } else - nf_proto_csum_replace2(&tcph->check, skb, - htons(oldlen), htons(datalen), 1); + inet_proto_csum_replace2(&tcph->check, skb, + htons(oldlen), htons(datalen), 1); if (rep_len != match_len) { set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); @@ -191,6 +192,8 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, /* Tell TCP window tracking about seq change */ nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, CTINFO2DIR(ctinfo)); + + nf_conntrack_event_cache(IPCT_NATSEQADJ, skb); } return 1; } @@ -270,8 +273,8 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, udph->check = CSUM_MANGLED_0; } } else - nf_proto_csum_replace2(&udph->check, skb, - htons(oldlen), htons(datalen), 1); + inet_proto_csum_replace2(&udph->check, skb, + htons(oldlen), htons(datalen), 1); return 1; } @@ -310,10 +313,10 @@ sack_adjust(struct sk_buff *skb, ntohl(sack->start_seq), new_start_seq, ntohl(sack->end_seq), new_end_seq); - nf_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - nf_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); @@ -397,8 +400,8 @@ nf_nat_seq_adjust(struct sk_buff *skb, else newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before); - nf_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - nf_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), @@ -430,15 +433,13 @@ void nf_nat_follow_master(struct nf_conn *ct, range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = exp->saved_proto; range.min_ip = range.max_ip = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); } EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 6817e7995f3..e63b944a2eb 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -93,8 +93,7 @@ static void pptp_nat_expected(struct nf_conn *ct, range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; range.min = range.max = exp->saved_proto; } - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = IP_NAT_RANGE_MAP_IPS; @@ -104,8 +103,7 @@ static void pptp_nat_expected(struct nf_conn *ct, range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; range.min = range.max = exp->saved_proto; } - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); } /* outbound packets == from PNS to PAC */ diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index b820f996035..9fa272e7311 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -135,9 +135,10 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, return 1; } -static struct nf_nat_protocol gre __read_mostly = { +static const struct nf_nat_protocol gre = { .name = "GRE", .protonum = IPPROTO_GRE, + .me = THIS_MODULE, .manip_pkt = gre_manip_pkt, .in_range = gre_in_range, .unique_tuple = gre_unique_tuple, diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index b9fc724388f..a0e44c953cb 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -65,13 +65,13 @@ icmp_manip_pkt(struct sk_buff *skb, return 0; hdr = (struct icmphdr *)(skb->data + hdroff); - nf_proto_csum_replace2(&hdr->checksum, skb, - hdr->un.echo.id, tuple->src.u.icmp.id, 0); + inet_proto_csum_replace2(&hdr->checksum, skb, + hdr->un.echo.id, tuple->src.u.icmp.id, 0); hdr->un.echo.id = tuple->src.u.icmp.id; return 1; } -struct nf_nat_protocol nf_nat_protocol_icmp = { +const struct nf_nat_protocol nf_nat_protocol_icmp = { .name = "ICMP", .protonum = IPPROTO_ICMP, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index 6bab2e18445..da23e9fbe67 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -132,12 +132,12 @@ tcp_manip_pkt(struct sk_buff *skb, if (hdrsize < sizeof(*hdr)) return 1; - nf_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); return 1; } -struct nf_nat_protocol nf_nat_protocol_tcp = { +const struct nf_nat_protocol nf_nat_protocol_tcp = { .name = "TCP", .protonum = IPPROTO_TCP, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index cbf1a61e290..10df4db078a 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -117,9 +117,9 @@ udp_manip_pkt(struct sk_buff *skb, portptr = &hdr->dest; } if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { - nf_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, skb, *portptr, newport, - 0); + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, + 0); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } @@ -127,7 +127,7 @@ udp_manip_pkt(struct sk_buff *skb, return 1; } -struct nf_nat_protocol nf_nat_protocol_udp = { +const struct nf_nat_protocol nf_nat_protocol_udp = { .name = "UDP", .protonum = IPPROTO_UDP, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c index cfd2742e970..a26efeb073c 100644 --- a/net/ipv4/netfilter/nf_nat_proto_unknown.c +++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c @@ -45,7 +45,7 @@ unknown_manip_pkt(struct sk_buff *skb, return 1; } -struct nf_nat_protocol nf_nat_unknown_protocol = { +const struct nf_nat_protocol nf_nat_unknown_protocol = { .name = "unknown", /* .me isn't set: getting a ref to this cannot fail. */ .manip_pkt = unknown_manip_pkt, diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 46b25ab5f78..519182269e7 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -24,7 +24,9 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_rule.h> -#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) +#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_POST_ROUTING) | \ + (1 << NF_INET_LOCAL_OUT)) static struct { @@ -38,14 +40,14 @@ static struct .num_entries = 4, .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, .underflow = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, }, .entries = { @@ -76,7 +78,7 @@ static unsigned int ipt_snat_target(struct sk_buff *skb, enum ip_conntrack_info ctinfo; const struct nf_nat_multi_range_compat *mr = targinfo; - NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING); + NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); @@ -85,7 +87,7 @@ static unsigned int ipt_snat_target(struct sk_buff *skb, ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); NF_CT_ASSERT(out); - return nf_nat_setup_info(ct, &mr->range[0], hooknum); + return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); } /* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ @@ -95,7 +97,7 @@ static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; struct rtable *rt; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(&init_net, &rt, &fl) != 0) return; if (rt->rt_src != srcip && !warned) { @@ -118,20 +120,20 @@ static unsigned int ipt_dnat_target(struct sk_buff *skb, enum ip_conntrack_info ctinfo; const struct nf_nat_multi_range_compat *mr = targinfo; - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || - hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - if (hooknum == NF_IP_LOCAL_OUT && + if (hooknum == NF_INET_LOCAL_OUT && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) warn_if_extra_mangle(ip_hdr(skb)->daddr, mr->range[0].min_ip); - return nf_nat_setup_info(ct, &mr->range[0], hooknum); + return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); } static bool ipt_snat_checkentry(const char *tablename, @@ -182,7 +184,7 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) pr_debug("Allocating NULL binding for %p (%u.%u.%u.%u)\n", ct, NIPQUAD(ip)); - return nf_nat_setup_info(ct, &range, hooknum); + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); } unsigned int @@ -201,7 +203,7 @@ alloc_null_binding_confirmed(struct nf_conn *ct, unsigned int hooknum) pr_debug("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n", ct, NIPQUAD(ip)); - return nf_nat_setup_info(ct, &range, hooknum); + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); } int nf_nat_rule_find(struct sk_buff *skb, @@ -227,7 +229,7 @@ static struct xt_target ipt_snat_reg __read_mostly = { .target = ipt_snat_target, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = 1 << NF_IP_POST_ROUTING, + .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = ipt_snat_checkentry, .family = AF_INET, }; @@ -237,7 +239,7 @@ static struct xt_target ipt_dnat_reg __read_mostly = { .target = ipt_dnat_target, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .checkentry = ipt_dnat_checkentry, .family = AF_INET, }; diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 8996ccb757d..606a170bf4c 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -228,15 +228,13 @@ static void ip_nat_sdp_expect(struct nf_conn *ct, range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = exp->saved_proto; range.min_ip = range.max_ip = exp->saved_ip; - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); } /* So, this packet has hit the connection tracking matching code. diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 03709d6b4b0..07f2a49926d 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -60,7 +60,7 @@ MODULE_ALIAS("ip_nat_snmp_basic"); #define SNMP_PORT 161 #define SNMP_TRAP_PORT 162 -#define NOCT1(n) (*(u8 *)n) +#define NOCT1(n) (*(u8 *)(n)) static int debug; static DEFINE_SPINLOCK(snmp_lock); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 7db76ea9af9..99b2c788d5a 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -137,7 +137,7 @@ nf_nat_fn(unsigned int hooknum, if (unlikely(nf_ct_is_confirmed(ct))) /* NAT module was loaded late */ ret = alloc_null_binding_confirmed(ct, hooknum); - else if (hooknum == NF_IP_LOCAL_IN) + else if (hooknum == NF_INET_LOCAL_IN) /* LOCAL_IN hook doesn't have a chain! */ ret = alloc_null_binding(ct, hooknum); else @@ -273,13 +273,13 @@ nf_nat_adjust(unsigned int hooknum, /* We must be after connection tracking and before packet filtering. */ -static struct nf_hook_ops nf_nat_ops[] = { +static struct nf_hook_ops nf_nat_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = nf_nat_in, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ @@ -287,7 +287,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_out, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* After conntrack, adjust sequence number */ @@ -295,7 +295,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_adjust, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SEQ_ADJUST, }, /* Before packet filtering, change destination */ @@ -303,7 +303,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_local_fn, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ @@ -311,7 +311,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_fn, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, /* After conntrack, adjust sequence number */ @@ -319,7 +319,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_adjust, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SEQ_ADJUST, }, }; @@ -332,7 +332,7 @@ static int __init nf_nat_standalone_init(void) #ifdef CONFIG_XFRM BUG_ON(ip_nat_decode_session != NULL); - ip_nat_decode_session = nat_decode_session; + rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); #endif ret = nf_nat_rule_init(); if (ret < 0) { @@ -350,7 +350,7 @@ static int __init nf_nat_standalone_init(void) nf_nat_rule_cleanup(); cleanup_decode_session: #ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; + rcu_assign_pointer(ip_nat_decode_session, NULL); synchronize_net(); #endif return ret; @@ -361,7 +361,7 @@ static void __exit nf_nat_standalone_fini(void) nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); nf_nat_rule_cleanup(); #ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; + rcu_assign_pointer(ip_nat_decode_session, NULL); synchronize_net(); #endif /* Conntrack caches are unregistered in nf_conntrack_cleanup */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ce34b281803..d63474c6b40 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -53,14 +53,16 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", - sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), + sock_prot_inuse_get(&tcp_prot), + atomic_read(&tcp_orphan_count), tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot)); - seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot)); - seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot)); + seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse_get(&udp_prot), + atomic_read(&udp_memory_allocated)); + seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(&udplite_prot)); + seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(&raw_prot)); seq_printf(seq, "FRAG: inuse %d memory %d\n", - ip_frag_nqueues(), ip_frag_mem()); + ip_frag_nqueues(&init_net), ip_frag_mem(&init_net)); return 0; } @@ -309,7 +311,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", - IPV4_DEVCONF_ALL(FORWARDING) ? 1 : 2, sysctl_ip_default_ttl); + IPV4_DEVCONF_ALL(&init_net, FORWARDING) ? 1 : 2, + sysctl_ip_default_ttl); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e7050f8eabe..85c08696abb 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -80,38 +80,51 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> -struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE]; -DEFINE_RWLOCK(raw_v4_lock); +static struct raw_hashinfo raw_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(), +}; -static void raw_v4_hash(struct sock *sk) +void raw_hash_sk(struct sock *sk, struct raw_hashinfo *h) { - struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num & - (RAWV4_HTABLE_SIZE - 1)]; + struct hlist_head *head; + + head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; - write_lock_bh(&raw_v4_lock); + write_lock_bh(&h->lock); sk_add_node(sk, head); - sock_prot_inc_use(sk->sk_prot); - write_unlock_bh(&raw_v4_lock); + sock_prot_inuse_add(sk->sk_prot, 1); + write_unlock_bh(&h->lock); } +EXPORT_SYMBOL_GPL(raw_hash_sk); -static void raw_v4_unhash(struct sock *sk) +void raw_unhash_sk(struct sock *sk, struct raw_hashinfo *h) { - write_lock_bh(&raw_v4_lock); + write_lock_bh(&h->lock); if (sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - write_unlock_bh(&raw_v4_lock); + sock_prot_inuse_add(sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(raw_unhash_sk); + +static void raw_v4_hash(struct sock *sk) +{ + raw_hash_sk(sk, &raw_v4_hashinfo); } -struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - __be32 raddr, __be32 laddr, - int dif) +static void raw_v4_unhash(struct sock *sk) +{ + raw_unhash_sk(sk, &raw_v4_hashinfo); +} + +static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, + unsigned short num, __be32 raddr, __be32 laddr, int dif) { struct hlist_node *node; sk_for_each_from(sk, node) { struct inet_sock *inet = inet_sk(sk); - if (inet->num == num && + if (sk->sk_net == net && inet->num == num && !(inet->daddr && inet->daddr != raddr) && !(inet->rcv_saddr && inet->rcv_saddr != laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) @@ -150,17 +163,20 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; int delivered = 0; + struct net *net; - read_lock(&raw_v4_lock); - head = &raw_v4_htable[hash]; + read_lock(&raw_v4_hashinfo.lock); + head = &raw_v4_hashinfo.ht[hash]; if (hlist_empty(head)) goto out; - sk = __raw_v4_lookup(__sk_head(head), iph->protocol, + + net = skb->dev->nd_net; + sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); @@ -173,16 +189,34 @@ int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) if (clone) raw_rcv(sk, clone); } - sk = __raw_v4_lookup(sk_next(sk), iph->protocol, + sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); } out: - read_unlock(&raw_v4_lock); + read_unlock(&raw_v4_hashinfo.lock); return delivered; } -void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) +int raw_local_deliver(struct sk_buff *skb, int protocol) +{ + int hash; + struct sock *raw_sk; + + hash = protocol & (RAW_HTABLE_SIZE - 1); + raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); + + /* If there maybe a raw socket we must check - if not we + * don't care less + */ + if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) + raw_sk = NULL; + + return raw_sk != NULL; + +} + +static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; @@ -236,12 +270,38 @@ void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) } } +void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) +{ + int hash; + struct sock *raw_sk; + struct iphdr *iph; + struct net *net; + + hash = protocol & (RAW_HTABLE_SIZE - 1); + + read_lock(&raw_v4_hashinfo.lock); + raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); + if (raw_sk != NULL) { + iph = (struct iphdr *)skb->data; + net = skb->dev->nd_net; + + while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, + iph->daddr, iph->saddr, + skb->dev->ifindex)) != NULL) { + raw_err(raw_sk, skb, info); + raw_sk = sk_next(raw_sk); + iph = (struct iphdr *)skb->data; + } + } + read_unlock(&raw_v4_hashinfo.lock); +} + static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ if (sock_queue_rcv_skb(sk, skb) < 0) { - /* FIXME: increment a raw drops counter here */ + atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } @@ -252,6 +312,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } @@ -320,7 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, icmp_out_count(((struct icmphdr *) skb_transport_header(skb))->type); - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; @@ -474,7 +535,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (msg->msg_flags & MSG_DONTROUTE) tos |= RTO_ONLINK; - if (MULTICAST(daddr)) { + if (ipv4_is_multicast(daddr)) { if (!ipc.oif) ipc.oif = inet->mc_index; if (!saddr) @@ -497,7 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&rt, &fl, sk, 1); + err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); } if (err) goto done; @@ -564,7 +625,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(sk->sk_net, addr->sin_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -789,22 +850,18 @@ struct proto raw_prot = { }; #ifdef CONFIG_PROC_FS -struct raw_iter_state { - int bucket; -}; - -#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private) - static struct sock *raw_get_first(struct seq_file *seq) { struct sock *sk; struct raw_iter_state* state = raw_seq_private(seq); - for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) { + for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; + ++state->bucket) { struct hlist_node *node; - sk_for_each(sk, node, &raw_v4_htable[state->bucket]) - if (sk->sk_family == PF_INET) + sk_for_each(sk, node, &state->h->ht[state->bucket]) + if (sk->sk_net == state->p.net && + sk->sk_family == state->family) goto found; } sk = NULL; @@ -820,10 +877,11 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) sk = sk_next(sk); try_again: ; - } while (sk && sk->sk_family != PF_INET); + } while (sk && sk->sk_net != state->p.net && + sk->sk_family != state->family); - if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { - sk = sk_head(&raw_v4_htable[state->bucket]); + if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { + sk = sk_head(&state->h->ht[state->bucket]); goto try_again; } return sk; @@ -839,13 +897,16 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) return pos ? NULL : sk; } -static void *raw_seq_start(struct seq_file *seq, loff_t *pos) +void *raw_seq_start(struct seq_file *seq, loff_t *pos) { - read_lock(&raw_v4_lock); + struct raw_iter_state *state = raw_seq_private(seq); + + read_lock(&state->h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } +EXPORT_SYMBOL_GPL(raw_seq_start); -static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) +void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; @@ -856,11 +917,15 @@ static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } +EXPORT_SYMBOL_GPL(raw_seq_next); -static void raw_seq_stop(struct seq_file *seq, void *v) +void raw_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&raw_v4_lock); + struct raw_iter_state *state = raw_seq_private(seq); + + read_unlock(&state->h->lock); } +EXPORT_SYMBOL_GPL(raw_seq_stop); static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i) { @@ -871,28 +936,30 @@ static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i) srcp = inet->num; sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d", i, src, srcp, dest, destp, sp->sk_state, atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp); + atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); return tmpbuf; } +#define TMPSZ 128 + static int raw_seq_show(struct seq_file *seq, void *v) { - char tmpbuf[129]; + char tmpbuf[TMPSZ+1]; if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-127s\n", + seq_printf(seq, "%-*s\n", TMPSZ-1, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode"); + "inode drops"); else { struct raw_iter_state *state = raw_seq_private(seq); - seq_printf(seq, "%-127s\n", + seq_printf(seq, "%-*s\n", TMPSZ-1, get_raw_sock(v, tmpbuf, state->bucket)); } return 0; @@ -905,29 +972,62 @@ static const struct seq_operations raw_seq_ops = { .show = raw_seq_show, }; -static int raw_seq_open(struct inode *inode, struct file *file) +int raw_seq_open(struct inode *ino, struct file *file, struct raw_hashinfo *h, + unsigned short family) { - return seq_open_private(file, &raw_seq_ops, + int err; + struct raw_iter_state *i; + + err = seq_open_net(ino, file, &raw_seq_ops, sizeof(struct raw_iter_state)); + if (err < 0) + return err; + + i = raw_seq_private((struct seq_file *)file->private_data); + i->h = h; + i->family = family; + return 0; +} +EXPORT_SYMBOL_GPL(raw_seq_open); + +static int raw_v4_seq_open(struct inode *inode, struct file *file) +{ + return raw_seq_open(inode, file, &raw_v4_hashinfo, PF_INET); } static const struct file_operations raw_seq_fops = { .owner = THIS_MODULE, - .open = raw_seq_open, + .open = raw_v4_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init raw_proc_init(void) +static __net_init int raw_init_net(struct net *net) { - if (!proc_net_fops_create(&init_net, "raw", S_IRUGO, &raw_seq_fops)) + if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) return -ENOMEM; + return 0; } +static __net_exit void raw_exit_net(struct net *net) +{ + proc_net_remove(net, "raw"); +} + +static __net_initdata struct pernet_operations raw_net_ops = { + .init = raw_init_net, + .exit = raw_exit_net, +}; + +int __init raw_proc_init(void) +{ + return register_pernet_subsys(&raw_net_ops); +} + void __init raw_proc_exit(void) { - proc_net_remove(&init_net, "raw"); + unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 28484f396b0..896c768e41a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -92,6 +92,7 @@ #include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> +#include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> @@ -132,13 +133,14 @@ static int ip_rt_mtu_expires = 10 * 60 * HZ; static int ip_rt_min_pmtu = 512 + 20 + 20; static int ip_rt_min_advmss = 256; static int ip_rt_secret_interval = 10 * 60 * HZ; +static int ip_rt_flush_expected; static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) static struct timer_list rt_flush_timer; -static void rt_check_expire(struct work_struct *work); -static DECLARE_DELAYED_WORK(expires_work, rt_check_expire); +static void rt_worker_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); static struct timer_list rt_secret_timer; /* @@ -152,7 +154,7 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(void); +static int rt_garbage_collect(struct dst_ops *ops); static struct dst_ops ipv4_dst_ops = { @@ -165,6 +167,7 @@ static struct dst_ops ipv4_dst_ops = { .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, + .local_out = ip_local_out, .entry_size = sizeof(struct rtable), }; @@ -232,16 +235,25 @@ struct rt_hash_bucket { static spinlock_t *rt_hash_locks; # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] -# define rt_hash_lock_init() { \ - int i; \ - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ - if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ - for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ - spin_lock_init(&rt_hash_locks[i]); \ - } + +static __init void rt_hash_lock_init(void) +{ + int i; + + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, + GFP_KERNEL); + if (!rt_hash_locks) + panic("IP: failed to allocate rt_hash_locks\n"); + + for (i = 0; i < RT_HASH_LOCK_SZ; i++) + spin_lock_init(&rt_hash_locks[i]); +} #else # define rt_hash_lock_addr(slot) NULL -# define rt_hash_lock_init() + +static inline void rt_hash_lock_init(void) +{ +} #endif static struct rt_hash_bucket *rt_hash_table; @@ -478,6 +490,83 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; +#ifdef CONFIG_NET_CLS_ROUTE +static int ip_rt_acct_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + unsigned int i; + + if ((offset & 3) || (length & 3)) + return -EIO; + + if (offset >= sizeof(struct ip_rt_acct) * 256) { + *eof = 1; + return 0; + } + + if (offset + length >= sizeof(struct ip_rt_acct) * 256) { + length = sizeof(struct ip_rt_acct) * 256 - offset; + *eof = 1; + } + + offset /= sizeof(u32); + + if (length > 0) { + u32 *dst = (u32 *) buffer; + + *start = buffer; + memset(dst, 0, length); + + for_each_possible_cpu(i) { + unsigned int j; + u32 *src; + + src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; + for (j = 0; j < length/4; j++) + dst[j] += src[j]; + } + } + return length; +} +#endif + +static __init int ip_rt_proc_init(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, + &rt_cache_seq_fops); + if (!pde) + goto err1; + + pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat); + if (!pde) + goto err2; + + pde->proc_fops = &rt_cpu_seq_fops; + +#ifdef CONFIG_NET_CLS_ROUTE + pde = create_proc_read_entry("rt_acct", 0, net->proc_net, + ip_rt_acct_read, NULL); + if (!pde) + goto err3; +#endif + return 0; + +#ifdef CONFIG_NET_CLS_ROUTE +err3: + remove_proc_entry("rt_cache", net->proc_net_stat); +#endif +err2: + remove_proc_entry("rt_cache", net->proc_net); +err1: + return -ENOMEM; +} +#else +static inline int ip_rt_proc_init(struct net *net) +{ + return 0; +} #endif /* CONFIG_PROC_FS */ static __inline__ void rt_free(struct rtable *rt) @@ -559,7 +648,41 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) (fl1->iif ^ fl2->iif)) == 0; } -static void rt_check_expire(struct work_struct *work) +static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) +{ + return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net; +} + +/* + * Perform a full scan of hash table and free all entries. + * Can be called by a softirq or a process. + * In the later case, we want to be reschedule if necessary + */ +static void rt_do_flush(int process_context) +{ + unsigned int i; + struct rtable *rth, *next; + + for (i = 0; i <= rt_hash_mask; i++) { + if (process_context && need_resched()) + cond_resched(); + rth = rt_hash_table[i].chain; + if (!rth) + continue; + + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + rt_hash_table[i].chain = NULL; + spin_unlock_bh(rt_hash_lock_addr(i)); + + for (; rth; rth = next) { + next = rth->u.dst.rt_next; + rt_free(rth); + } + } +} + +static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; @@ -605,33 +728,33 @@ static void rt_check_expire(struct work_struct *work) spin_unlock_bh(rt_hash_lock_addr(i)); } rover = i; +} + +/* + * rt_worker_func() is run in process context. + * If a whole flush was scheduled, it is done. + * Else, we call rt_check_expire() to scan part of the hash table + */ +static void rt_worker_func(struct work_struct *work) +{ + if (ip_rt_flush_expected) { + ip_rt_flush_expected = 0; + rt_do_flush(1); + } else + rt_check_expire(); schedule_delayed_work(&expires_work, ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter * in the case of a forced flush event. */ -static void rt_run_flush(unsigned long dummy) +static void rt_run_flush(unsigned long process_context) { - int i; - struct rtable *rth, *next; - rt_deadline = 0; get_random_bytes(&rt_hash_rnd, 4); - for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(rt_hash_lock_addr(i)); - rth = rt_hash_table[i].chain; - if (rth) - rt_hash_table[i].chain = NULL; - spin_unlock_bh(rt_hash_lock_addr(i)); - - for (; rth; rth = next) { - next = rth->u.dst.rt_next; - rt_free(rth); - } - } + rt_do_flush(process_context); } static DEFINE_SPINLOCK(rt_flush_lock); @@ -665,7 +788,7 @@ void rt_cache_flush(int delay) if (delay <= 0) { spin_unlock_bh(&rt_flush_lock); - rt_run_flush(0); + rt_run_flush(user_mode); return; } @@ -676,12 +799,17 @@ void rt_cache_flush(int delay) spin_unlock_bh(&rt_flush_lock); } +/* + * We change rt_hash_rnd and ask next rt_worker_func() invocation + * to perform a flush in process context + */ static void rt_secret_rebuild(unsigned long dummy) { - unsigned long now = jiffies; - - rt_cache_flush(0); - mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); + get_random_bytes(&rt_hash_rnd, 4); + ip_rt_flush_expected = 1; + cancel_delayed_work(&expires_work); + schedule_delayed_work(&expires_work, HZ/10); + mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval); } /* @@ -697,7 +825,7 @@ static void rt_secret_rebuild(unsigned long dummy) and when load increases it reduces to limit cache size. */ -static int rt_garbage_collect(void) +static int rt_garbage_collect(struct dst_ops *ops) { static unsigned long expire = RT_GC_TIMEOUT; static unsigned long last_gc; @@ -728,14 +856,14 @@ static int rt_garbage_collect(void) equilibrium = ipv4_dst_ops.gc_thresh; goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; if (goal > 0) { - equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); + equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; } } else { /* We are in dangerous area. Try to reduce cache really * aggressively. */ - goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); + goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; } @@ -838,7 +966,7 @@ restart: spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { - if (compare_keys(&rth->fl, &rt->fl)) { + if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { /* Put it first */ *rthp = rth->u.dst.rt_next; /* @@ -912,7 +1040,7 @@ restart: int saved_int = ip_rt_gc_min_interval; ip_rt_gc_elasticity = 1; ip_rt_gc_min_interval = 0; - rt_garbage_collect(); + rt_garbage_collect(&ipv4_dst_ops); ip_rt_gc_min_interval = saved_int; ip_rt_gc_elasticity = saved_elasticity; goto restart; @@ -1031,7 +1159,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, return; if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) + || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) + || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { @@ -1040,7 +1169,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { - if (inet_addr_type(new_gw) != RTN_UNICAST) + if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST) goto reject_redirect; } @@ -1291,7 +1420,8 @@ static __inline__ unsigned short guess_mtu(unsigned short old_mtu) return 68; } -unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) +unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, + unsigned short new_mtu) { int i; unsigned short old_mtu = ntohs(iph->tot_len); @@ -1314,7 +1444,8 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->fl.iif == 0 && - !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { + !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) && + rth->u.dst.dev->nd_net == net) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -1389,8 +1520,9 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != init_net.loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); + if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = + in_dev_get(dev->nd_net->loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1434,7 +1566,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(&rt->fl, &res) == 0) { + else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) { src = FIB_RES_PREFSRC(res); fib_res_put(&res); } else @@ -1509,12 +1641,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (in_dev == NULL) return -EINVAL; - if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || - skb->protocol != htons(ETH_P_IP)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ZERONET(saddr)) { - if (!LOCAL_MCAST(daddr)) + if (ipv4_is_zeronet(saddr)) { + if (!ipv4_is_local_multicast(daddr)) goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else if (fib_validate_source(saddr, 0, tos, 0, @@ -1556,7 +1688,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, } #ifdef CONFIG_IP_MROUTE - if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->u.dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); @@ -1643,7 +1775,7 @@ static inline int __mkroute_input(struct sk_buff *skb, if (err) flags |= RTCF_DIRECTSRC; - if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && + if (out_dev == in_dev && err && !(flags & RTCF_MASQ) && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; @@ -1652,7 +1784,7 @@ static inline int __mkroute_input(struct sk_buff *skb, /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. */ - if (out_dev == in_dev && !(flags & RTCF_DNAT)) { + if (out_dev == in_dev) { err = -EINVAL; goto cleanup; } @@ -1756,6 +1888,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, __be32 spec_dst; int err = -EINVAL; int free_res = 0; + struct net * net = dev->nd_net; /* IP on this device is disabled. */ @@ -1766,7 +1899,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_loopback(saddr)) goto martian_source; if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) @@ -1775,16 +1909,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ - if (ZERONET(saddr)) + if (ipv4_is_zeronet(saddr)) goto martian_source; - if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) + if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || + ipv4_is_loopback(daddr)) goto martian_destination; /* * Now we are ready to route packet. */ - if ((err = fib_lookup(&fl, &res)) != 0) { + if ((err = fib_lookup(net, &fl, &res)) != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; @@ -1799,7 +1934,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - init_net.loopback_dev->ifindex, + net->loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1825,7 +1960,7 @@ brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ZERONET(saddr)) + if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, @@ -1861,7 +1996,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; + rth->u.dst.dev = net->loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -1921,7 +2056,9 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable * rth; unsigned hash; int iif = dev->ifindex; + struct net *net; + net = skb->dev->nd_net; tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif); @@ -1933,7 +2070,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.iif == iif && rth->fl.oif == 0 && rth->fl.mark == skb->mark && - rth->fl.fl4_tos == tos) { + rth->fl.fl4_tos == tos && + rth->u.dst.dev->nd_net == net) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -1955,7 +2093,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, Note, that multicast routers are not affected, because route cache entry is created eventually. */ - if (MULTICAST(daddr)) { + if (ipv4_is_multicast(daddr)) { struct in_device *in_dev; rcu_read_lock(); @@ -1964,7 +2102,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + || (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif ) { rcu_read_unlock(); @@ -1990,14 +2129,14 @@ static inline int __mkroute_output(struct rtable **result, u32 tos = RT_FL_TOS(oldflp); int err = 0; - if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) + if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) return -EINVAL; if (fl->fl4_dst == htonl(0xFFFFFFFF)) res->type = RTN_BROADCAST; - else if (MULTICAST(fl->fl4_dst)) + else if (ipv4_is_multicast(fl->fl4_dst)) res->type = RTN_MULTICAST; - else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) + else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) return -EINVAL; if (dev_out->flags & IFF_LOOPBACK) @@ -2077,7 +2216,7 @@ static inline int __mkroute_output(struct rtable **result, #ifdef CONFIG_IP_MROUTE if (res->type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !LOCAL_MCAST(oldflp->fl4_dst)) { + !ipv4_is_local_multicast(oldflp->fl4_dst)) { rth->u.dst.input = ip_mr_input; rth->u.dst.output = ip_mc_output; } @@ -2119,7 +2258,8 @@ static inline int ip_mkroute_output(struct rtable **rp, * Major route resolver routine. */ -static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) +static int ip_route_output_slow(struct net *net, struct rtable **rp, + const struct flowi *oldflp) { u32 tos = RT_FL_TOS(oldflp); struct flowi fl = { .nl_u = { .ip4_u = @@ -2131,7 +2271,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = init_net.loopback_dev->ifindex, + .iif = net->loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2147,26 +2287,27 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) if (oldflp->fl4_src) { err = -EINVAL; - if (MULTICAST(oldflp->fl4_src) || - BADCLASS(oldflp->fl4_src) || - ZERONET(oldflp->fl4_src)) + if (ipv4_is_multicast(oldflp->fl4_src) || + ipv4_is_lbcast(oldflp->fl4_src) || + ipv4_is_zeronet(oldflp->fl4_src)) goto out; /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldflp->fl4_src); + dev_out = ip_dev_find(net, oldflp->fl4_src); if (dev_out == NULL) goto out; /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: - 1. ip_dev_find(saddr) can return wrong iface, if saddr is - assigned to multiple interfaces. + 1. ip_dev_find(net, saddr) can return wrong iface, if saddr + is assigned to multiple interfaces. 2. Moreover, we are allowed to send packets with saddr of another iface. --ANK */ if (oldflp->oif == 0 - && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + && (ipv4_is_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. @@ -2192,7 +2333,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) if (oldflp->oif) { - dev_out = dev_get_by_index(&init_net, oldflp->oif); + dev_out = dev_get_by_index(net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; @@ -2203,14 +2344,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) goto out; /* Wrong error code */ } - if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) { + if (ipv4_is_local_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF)) { if (!fl.fl4_src) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl.fl4_src) { - if (MULTICAST(oldflp->fl4_dst)) + if (ipv4_is_multicast(oldflp->fl4_dst)) fl.fl4_src = inet_select_addr(dev_out, 0, fl.fl4_scope); else if (!oldflp->fl4_dst) @@ -2225,15 +2367,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = init_net.loopback_dev; + dev_out = net->loopback_dev; dev_hold(dev_out); - fl.oif = init_net.loopback_dev->ifindex; + fl.oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(&fl, &res)) { + if (fib_lookup(net, &fl, &res)) { res.fi = NULL; if (oldflp->oif) { /* Apparently, routing tables are wrong. Assume, @@ -2272,7 +2414,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = init_net.loopback_dev; + dev_out = net->loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) @@ -2288,7 +2430,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) else #endif if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(&fl, &res); + fib_select_default(net, &fl, &res); if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); @@ -2311,7 +2453,8 @@ make_route: out: return err; } -int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) +int __ip_route_output_key(struct net *net, struct rtable **rp, + const struct flowi *flp) { unsigned hash; struct rtable *rth; @@ -2327,7 +2470,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) rth->fl.oif == flp->oif && rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK))) { + (IPTOS_RT_MASK | RTO_ONLINK)) && + rth->u.dst.dev->nd_net == net) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2338,7 +2482,7 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) } rcu_read_unlock_bh(); - return ip_route_output_slow(rp, flp); + return ip_route_output_slow(net, rp, flp); } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2357,12 +2501,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = { }; -static int ipv4_blackhole_output(struct sk_buff *skb) -{ - kfree_skb(skb); - return 0; -} - static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk) { struct rtable *ort = *rp; @@ -2374,8 +2512,8 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock atomic_set(&new->__refcnt, 1); new->__use = 1; - new->input = ipv4_blackhole_output; - new->output = ipv4_blackhole_output; + new->input = dst_discard; + new->output = dst_discard; memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); new->dev = ort->u.dst.dev; @@ -2406,11 +2544,12 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock return (rt ? 0 : -ENOMEM); } -int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, + struct sock *sk, int flags) { int err; - if ((err = __ip_route_output_key(rp, flp)) != 0) + if ((err = __ip_route_output_key(net, rp, flp)) != 0) return err; if (flp->proto) { @@ -2418,7 +2557,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, flp->fl4_src = (*rp)->rt_src; if (!flp->fl4_dst) flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); + err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, + flags ? XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) err = ipv4_dst_blackhole(rp, flp, sk); @@ -2430,9 +2570,9 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct rtable **rp, struct flowi *flp) +int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) { - return ip_route_output_flow(rp, flp, NULL, 0); + return ip_route_output_flow(net, rp, flp, NULL, 0); } static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, @@ -2499,8 +2639,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, #ifdef CONFIG_IP_MROUTE __be32 dst = rt->rt_dst; - if (MULTICAST(dst) && !LOCAL_MCAST(dst) && - IPV4_DEVCONF_ALL(MC_FORWARDING)) { + if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) { int err = ipmr_get_route(skb, r, nowait); if (err <= 0) { if (!nowait) { @@ -2531,6 +2671,7 @@ nla_put_failure: static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = in_skb->sk->sk_net; struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; @@ -2540,6 +2681,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void int err; struct sk_buff *skb; + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) goto errout; @@ -2595,7 +2739,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void }, .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, }; - err = ip_route_output_key(&rt, &fl); + err = ip_route_output_key(&init_net, &rt, &fl); } if (err) @@ -2610,7 +2754,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err <= 0) goto errout_free; - err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); errout: return err; @@ -2862,51 +3006,7 @@ ctl_table ipv4_route_table[] = { #endif #ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct *ip_rt_acct; - -/* This code sucks. But you should have seen it before! --RR */ - -/* IP route accounting ptr for this logical cpu number. */ -#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256) - -#ifdef CONFIG_PROC_FS -static int ip_rt_acct_read(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) -{ - unsigned int i; - - if ((offset & 3) || (length & 3)) - return -EIO; - - if (offset >= sizeof(struct ip_rt_acct) * 256) { - *eof = 1; - return 0; - } - - if (offset + length >= sizeof(struct ip_rt_acct) * 256) { - length = sizeof(struct ip_rt_acct) * 256 - offset; - *eof = 1; - } - - offset /= sizeof(u32); - - if (length > 0) { - u32 *dst = (u32 *) buffer; - - *start = buffer; - memset(dst, 0, length); - - for_each_possible_cpu(i) { - unsigned int j; - u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset; - - for (j = 0; j < length/4; j++) - dst[j] += src[j]; - } - } - return length; -} -#endif /* CONFIG_PROC_FS */ +struct ip_rt_acct *ip_rt_acct __read_mostly; #endif /* CONFIG_NET_CLS_ROUTE */ static __initdata unsigned long rhash_entries; @@ -2927,16 +3027,9 @@ int __init ip_rt_init(void) (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE - { - int order; - for (order = 0; - (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) - /* NOTHING */; - ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); - memset(ip_rt_acct, 0, PAGE_SIZE << order); - } #endif ipv4_dst_ops.kmem_cachep = @@ -2964,10 +3057,8 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - init_timer(&rt_flush_timer); - rt_flush_timer.function = rt_run_flush; - init_timer(&rt_secret_timer); - rt_secret_timer.function = rt_secret_rebuild; + setup_timer(&rt_flush_timer, rt_run_flush, 0); + setup_timer(&rt_secret_timer, rt_secret_rebuild, 0); /* All the timers, started at system startup tend to synchronize. Perturb it a bit. @@ -2979,20 +3070,8 @@ int __init ip_rt_init(void) ip_rt_secret_interval; add_timer(&rt_secret_timer); -#ifdef CONFIG_PROC_FS - { - struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ - if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) || - !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, - init_net.proc_net_stat))) { - return -ENOMEM; - } - rtstat_pde->proc_fops = &rt_cpu_seq_fops; - } -#ifdef CONFIG_NET_CLS_ROUTE - create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL); -#endif -#endif + if (ip_rt_proc_init(&init_net)) + printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2da1be0589a..f470fe4511d 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -264,7 +264,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, { .sport = th->dest, .dport = th->source } } }; security_req_classify_flow(req, &fl); - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { reqsk_free(req); goto out; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index bec6fe88065..82cdf23837e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -13,83 +13,20 @@ #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/seqlock.h> +#include <linux/init.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp.h> +#include <net/udp.h> #include <net/cipso_ipv4.h> #include <net/inet_frag.h> -/* From af_inet.c */ -extern int sysctl_ip_nonlocal_bind; - -#ifdef CONFIG_SYSCTL static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; -#endif - -struct ipv4_config ipv4_config; - -#ifdef CONFIG_SYSCTL - -static -int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int val = IPV4_DEVCONF_ALL(FORWARDING); - int ret; - - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - - if (write && IPV4_DEVCONF_ALL(FORWARDING) != val) - inet_forward_change(); - - return ret; -} - -static int ipv4_sysctl_forward_strategy(ctl_table *table, - int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int *valp = table->data; - int new; - - if (!newval || !newlen) - return 0; - - if (newlen != sizeof(int)) - return -EINVAL; - - if (get_user(new, (int __user *)newval)) - return -EFAULT; - - if (new == *valp) - return 0; - - if (oldval && oldlenp) { - size_t len; - - if (get_user(len, oldlenp)) - return -EFAULT; - - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, valp, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - *valp = new; - inet_forward_change(); - return 1; -} extern seqlock_t sysctl_port_range_lock; extern int sysctl_local_port_range[2]; @@ -256,7 +193,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam } -ctl_table ipv4_table[] = { +static struct ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, .procname = "tcp_timestamps", @@ -290,15 +227,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_FORWARD, - .procname = "ip_forward", - .data = &IPV4_DEVCONF_ALL(FORWARDING), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &ipv4_sysctl_forward, - .strategy = &ipv4_sysctl_forward_strategy - }, - { .ctl_name = NET_IPV4_DEFAULT_TTL, .procname = "ip_default_ttl", .data = &sysctl_ip_default_ttl, @@ -356,22 +284,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, - .procname = "ipfrag_high_thresh", - .data = &ip4_frags_ctl.high_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, - .procname = "ipfrag_low_thresh", - .data = &ip4_frags_ctl.low_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_IPV4_DYNADDR, .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, @@ -380,15 +292,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_TIME, - .procname = "ipfrag_time", - .data = &ip4_frags_ctl.timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies - }, - { .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME, .procname = "tcp_keepalive_time", .data = &sysctl_tcp_keepalive_time, @@ -731,23 +634,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, - .procname = "ipfrag_secret_interval", - .data = &ip4_frags_ctl.secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies - }, - { - .procname = "ipfrag_max_dist", - .data = &sysctl_ipfrag_max_dist, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &zero - }, - { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, @@ -885,9 +771,52 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "udp_mem", + .data = &sysctl_udp_mem, + .maxlen = sizeof(sysctl_udp_mem), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "udp_rmem_min", + .data = &sysctl_udp_rmem_min, + .maxlen = sizeof(sysctl_udp_rmem_min), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "udp_wmem_min", + .data = &sysctl_udp_wmem_min, + .maxlen = sizeof(sysctl_udp_wmem_min), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, { .ctl_name = 0 } }; -#endif /* CONFIG_SYSCTL */ +struct ctl_path net_ipv4_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { }, +}; +EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); + +static __init int sysctl_ipv4_init(void) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); + return hdr == NULL ? -ENOMEM : 0; +} -EXPORT_SYMBOL(ipv4_config); +__initcall(sysctl_ipv4_init); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e65182f7af..a0d373bd906 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -254,6 +254,10 @@ #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/skbuff.h> +#include <linux/splice.h> +#include <linux/net.h> +#include <linux/socket.h> #include <linux/random.h> #include <linux/bootmem.h> #include <linux/cache.h> @@ -265,6 +269,7 @@ #include <net/xfrm.h> #include <net/ip.h> #include <net/netdma.h> +#include <net/sock.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -292,9 +297,18 @@ EXPORT_SYMBOL(tcp_memory_allocated); EXPORT_SYMBOL(tcp_sockets_allocated); /* + * TCP splice context + */ +struct tcp_splice_state { + struct pipe_inode_info *pipe; + size_t len; + unsigned int flags; +}; + +/* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. - * All the sk_stream_mem_schedule() is of this nature: accounting + * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ int tcp_memory_pressure __read_mostly; @@ -471,7 +485,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk_charge_skb(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; } @@ -482,7 +497,6 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, if (flags & MSG_OOB) { tp->urg_mode = 1; tp->snd_up = tp->write_seq; - TCP_SKB_CB(skb)->sacked |= TCPCB_URG; } } @@ -501,6 +515,145 @@ static inline void tcp_push(struct sock *sk, int flags, int mss_now, } } +static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct tcp_splice_state *tss = rd_desc->arg.data; + + return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); +} + +static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) +{ + /* Store TCP splice context information in read_descriptor_t. */ + read_descriptor_t rd_desc = { + .arg.data = tss, + }; + + return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); +} + +/** + * tcp_splice_read - splice data from TCP socket to a pipe + * @sock: socket to splice from + * @ppos: position (not valid) + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given socket and fill them into a pipe. + * + **/ +ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct tcp_splice_state tss = { + .pipe = pipe, + .len = len, + .flags = flags, + }; + long timeo; + ssize_t spliced; + int ret; + + /* + * We can't seek on a socket input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + ret = spliced = 0; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); + while (tss.len) { + ret = __tcp_splice_read(sk, &tss); + if (ret < 0) + break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + ret = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_state == TCP_CLOSE) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + if (!sock_flag(sk, SOCK_DONE)) + ret = -ENOTCONN; + break; + } + if (!timeo) { + ret = -EAGAIN; + break; + } + sk_wait_data(sk, &timeo); + if (signal_pending(current)) { + ret = sock_intr_errno(timeo); + break; + } + continue; + } + tss.len -= ret; + spliced += ret; + + release_sock(sk); + lock_sock(sk); + + if (sk->sk_err || sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || + signal_pending(current)) + break; + } + + release_sock(sk); + + if (spliced) + return spliced; + + return ret; +} + +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +{ + struct sk_buff *skb; + + /* The TCP header must be at least 32-bit aligned. */ + size = ALIGN(size, 4); + + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); + if (skb) { + if (sk_wmem_schedule(sk, skb->truesize)) { + /* + * Make sure that we have exactly size bytes + * available to the caller, no more, no less. + */ + skb_reserve(skb, skb_tailroom(skb) - size); + return skb; + } + __kfree_skb(skb); + } else { + sk->sk_prot->enter_memory_pressure(); + sk_stream_moderate_sndbuf(sk); + } + return NULL; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -537,8 +690,7 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, 0, 0, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -555,7 +707,7 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; if (can_coalesce) { @@ -569,7 +721,7 @@ new_segment: skb->data_len += copy; skb->truesize += copy; sk->sk_wmem_queued += copy; - sk->sk_forward_alloc -= copy; + sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; @@ -718,8 +870,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, select_size(sk), - 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, select_size(sk), + sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -776,7 +928,7 @@ new_segment: if (copy > PAGE_SIZE - off) copy = PAGE_SIZE - off; - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; if (!page) { @@ -867,7 +1019,7 @@ do_fault: * reset, where we can be unlinking the send_head. */ tcp_check_send_head(sk, skb); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); } do_error: @@ -1500,6 +1652,41 @@ recv_urg: goto out; } +void tcp_set_state(struct sock *sk, int state) +{ + int oldstate = sk->sk_state; + + switch (state) { + case TCP_ESTABLISHED: + if (oldstate != TCP_ESTABLISHED) + TCP_INC_STATS(TCP_MIB_CURRESTAB); + break; + + case TCP_CLOSE: + if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) + TCP_INC_STATS(TCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(&tcp_hashinfo, sk); + /* fall through */ + default: + if (oldstate==TCP_ESTABLISHED) + TCP_DEC_STATS(TCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; + +#ifdef STATE_TRACE + SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); +#endif +} +EXPORT_SYMBOL_GPL(tcp_set_state); + /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some @@ -1586,7 +1773,7 @@ void tcp_close(struct sock *sk, long timeout) __kfree_skb(skb); } - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of @@ -1689,7 +1876,7 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (tcp_too_many_orphans(sk, atomic_read(sk->sk_prot->orphan_count))) { if (net_ratelimit()) @@ -2411,7 +2598,6 @@ void tcp_done(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_done); -extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; @@ -2430,9 +2616,7 @@ void __init tcp_init(void) unsigned long limit; int order, i, max_share; - if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) - __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), - sizeof(skb->cb)); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", @@ -2509,11 +2693,11 @@ void __init tcp_init(void) limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; sysctl_tcp_wmem[1] = 16*1024; sysctl_tcp_wmem[2] = max(64*1024, max_share); - sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; sysctl_tcp_rmem[1] = 87380; sysctl_tcp_rmem[2] = max(87380, max_share); @@ -2532,6 +2716,7 @@ EXPORT_SYMBOL(tcp_poll); EXPORT_SYMBOL(tcp_read_sock); EXPORT_SYMBOL(tcp_recvmsg); EXPORT_SYMBOL(tcp_sendmsg); +EXPORT_SYMBOL(tcp_splice_read); EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 5dba0fc8f57..5212ed9b0c9 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -136,8 +136,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int data_acked) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 55fca1820c3..3a6be23d222 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -274,6 +274,27 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } +/* RFC2861 Check whether we are limited by application or congestion window + * This is the inverse of cwnd check in tcp_tso_should_defer + */ +int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 left; + + if (in_flight >= tp->snd_cwnd) + return 1; + + if (!sk_can_gso(sk)) + return 0; + + left = tp->snd_cwnd - in_flight; + if (sysctl_tcp_tso_win_divisor) + return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd; + else + return left <= tcp_max_burst(tp); +} +EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); /* * Slow start is used when congestion window is less than slow start @@ -324,7 +345,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 80bd084a9f9..3aa0b23c1ea 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -246,8 +246,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int data_acked) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 14a073d8b60..8b6caaf75bb 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,8 +109,7 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 adk, - u32 in_flight, int data_acked) +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 5215691f276..af99776146f 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -225,8 +225,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int data_acked) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index b3e55cf5617..44618b67591 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -85,8 +85,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); @@ -103,7 +102,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, return; if (!ca->hybla_en) - return tcp_reno_cong_avoid(sk, ack, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight); if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 5aa5f5496d6..1eba160b72d 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -256,8 +256,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b39f0d86e44..fa2c85ca5bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -105,6 +105,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -120,8 +121,7 @@ int sysctl_tcp_abc __read_mostly; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -132,7 +132,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ - len = skb_shinfo(skb)->gso_size ?: skb->len; + len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { icsk->icsk_ack.rcv_mss = len; } else { @@ -172,8 +172,8 @@ static void tcp_incr_quickack(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); - if (quickacks==0) - quickacks=2; + if (quickacks == 0) + quickacks = 2; if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } @@ -198,7 +198,7 @@ static inline int tcp_in_quickack_mode(const struct sock *sk) static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) { - if (tp->ecn_flags&TCP_ECN_OK) + if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -215,7 +215,7 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) { - if (tp->ecn_flags&TCP_ECN_OK) { + if (tp->ecn_flags & TCP_ECN_OK) { if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) tp->ecn_flags |= TCP_ECN_DEMAND_CWR; /* Funny extension: if ECT is not set on a segment, @@ -228,19 +228,19 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) { - if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK)) + if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return 1; return 0; } @@ -289,8 +289,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; + int truesize = tcp_win_from_space(skb->truesize) >> 1; + int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -302,8 +302,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -317,12 +316,13 @@ static void tcp_grow_window(struct sock *sk, * will fit to rcvbuf in future. */ if (tcp_win_from_space(skb->truesize) <= skb->len) - incr = 2*tp->advmss; + incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); if (incr) { - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, + tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; } } @@ -397,10 +397,9 @@ static void tcp_clamp_window(struct sock *sk) sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) - tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); + tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } - /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. @@ -413,7 +412,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); - hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, tp->rcv_wnd / 2); hint = min(hint, TCP_MIN_RCVMSS); hint = max(hint, TCP_MIN_MSS); @@ -470,16 +469,15 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, - jiffies - tp->rcv_rtt_est.time, - 1); + tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; tp->rcv_rtt_est.time = tcp_time_stamp; } -static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, + const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && @@ -502,8 +500,7 @@ void tcp_rcv_space_adjust(struct sock *sk) goto new_measure; time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || - tp->rcv_rtt_est.rtt == 0) + if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; space = 2 * (tp->copied_seq - tp->rcvq_space.seq); @@ -579,7 +576,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } else { int m = now - icsk->icsk_ack.lrcvtime; - if (m <= TCP_ATO_MIN/2) { + if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { @@ -591,7 +588,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; @@ -608,7 +605,7 @@ static u32 tcp_rto_min(struct sock *sk) u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst->metrics[RTAX_RTO_MIN-1]; + rto_min = dst->metrics[RTAX_RTO_MIN - 1]; return rto_min; } @@ -671,14 +668,14 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; + tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. */ - tp->srtt = m<<3; /* take the measured time to be rtt */ - tp->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev = m << 1; /* make sure rto = 3*rtt */ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } @@ -732,7 +729,7 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); - if (dst && (dst->flags&DST_HOST)) { + if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; @@ -742,7 +739,7 @@ void tcp_update_metrics(struct sock *sk) * Reset our results. */ if (!(dst_metric_locked(dst, RTAX_RTT))) - dst->metrics[RTAX_RTT-1] = 0; + dst->metrics[RTAX_RTT - 1] = 0; return; } @@ -754,9 +751,9 @@ void tcp_update_metrics(struct sock *sk) */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) - dst->metrics[RTAX_RTT-1] = tp->srtt; + dst->metrics[RTAX_RTT - 1] = tp->srtt; else - dst->metrics[RTAX_RTT-1] -= (m>>3); + dst->metrics[RTAX_RTT - 1] -= (m >> 3); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { @@ -769,7 +766,7 @@ void tcp_update_metrics(struct sock *sk) m = tp->mdev; if (m >= dst_metric(dst, RTAX_RTTVAR)) - dst->metrics[RTAX_RTTVAR-1] = m; + dst->metrics[RTAX_RTTVAR - 1] = m; else dst->metrics[RTAX_RTTVAR-1] -= (dst->metrics[RTAX_RTTVAR-1] - m)>>2; @@ -783,7 +780,7 @@ void tcp_update_metrics(struct sock *sk) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; if (!dst_metric_locked(dst, RTAX_CWND) && tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; + dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ @@ -863,6 +860,9 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) */ static void tcp_disable_fack(struct tcp_sock *tp) { + /* RFC3517 uses different metric in lost marker => reset on change */ + if (tcp_is_fack(tp)) + tp->lost_skb_hint = NULL; tp->rx_opt.sack_ok &= ~2; } @@ -1112,16 +1112,22 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, * * Search retransmitted skbs from write_queue that were sent when snd_nxt was * less than what is now known to be received by the other end (derived from - * SACK blocks by the caller). Also calculate the lowest snd_nxt among the - * remaining retransmitted skbs to avoid some costly processing per ACKs. + * highest SACK block). Also calculate the lowest snd_nxt among the remaining + * retransmitted skbs to avoid some costly processing per ACKs. */ -static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) +static void tcp_mark_lost_retrans(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int flag = 0; int cnt = 0; u32 new_low_seq = tp->snd_nxt; + u32 received_upto = tcp_highest_sack_seq(tp); + + if (!tcp_is_fack(tp) || !tp->retrans_out || + !after(received_upto, tp->lost_retrans_low) || + icsk->icsk_ca_state != TCP_CA_Recovery) + return; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; @@ -1149,9 +1155,8 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - flag |= FLAG_DATA_SACKED; - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; @@ -1161,8 +1166,6 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (tp->retrans_out) tp->lost_retrans_low = new_low_seq; - - return flag; } static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, @@ -1230,34 +1233,205 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } +static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, + int *reord, int dup_sack, int fack_count) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 sacked = TCP_SKB_CB(skb)->sacked; + int flag = 0; + + /* Account D-SACK for retransmitted packet. */ + if (dup_sack && (sacked & TCPCB_RETRANS)) { + if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + if (sacked & TCPCB_SACKED_ACKED) + *reord = min(fack_count, *reord); + } + + /* Nothing to do; acked frame is about to be dropped (was ACKed). */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + return flag; + + if (!(sacked & TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= + ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= tcp_skb_pcount(skb); + tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } else { + if (!(sacked & TCPCB_RETRANS)) { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (before(TCP_SKB_CB(skb)->seq, + tcp_highest_sack_seq(tp))) + *reord = min(fack_count, *reord); + + /* SACK enhanced F-RTO (RFC4138; Appendix B) */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + flag |= FLAG_ONLY_ORIG_SACKED; + } + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out += tcp_skb_pcount(skb); + + fack_count += tcp_skb_pcount(skb); + + /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ + if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq)) + tp->lost_cnt_hint += tcp_skb_pcount(skb); + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + tcp_advance_highest_sack(sk, skb); + } + + /* D-SACK. We can detect redundant retransmission in S|R and plain R + * frames and clear it. undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; + } + + return flag; +} + +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, + struct tcp_sack_block *next_dup, + u32 start_seq, u32 end_seq, + int dup_sack_in, int *fack_count, + int *reord, int *flag) +{ + tcp_for_write_queue_from(skb, sk) { + int in_sack = 0; + int dup_sack = dup_sack_in; + + if (skb == tcp_send_head(sk)) + break; + + /* queue is in-order => we can short-circuit the walk early */ + if (!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + if ((next_dup != NULL) && + before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { + in_sack = tcp_match_skb_to_sack(sk, skb, + next_dup->start_seq, + next_dup->end_seq); + if (in_sack > 0) + dup_sack = 1; + } + + if (in_sack <= 0) + in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, + end_seq); + if (unlikely(in_sack < 0)) + break; + + if (in_sack) + *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, + *fack_count); + + *fack_count += tcp_skb_pcount(skb); + } + return skb; +} + +/* Avoid all extra work that is being done by sacktag while walking in + * a normal way + */ +static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, + u32 skip_to_seq) +{ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + + if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) + break; + } + return skb; +} + +static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, + struct sock *sk, + struct tcp_sack_block *next_dup, + u32 skip_to_seq, + int *fack_count, int *reord, + int *flag) +{ + if (next_dup == NULL) + return skb; + + if (before(next_dup->start_seq, skip_to_seq)) { + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); + tcp_sacktag_walk(skb, sk, NULL, + next_dup->start_seq, next_dup->end_seq, + 1, fack_count, reord, flag); + } + + return skb; +} + +static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) +{ + return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); +} + static int -tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) +tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, + u32 prior_snd_una) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); - struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); - struct sk_buff *cached_skb; - int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; + struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); + struct tcp_sack_block sp[4]; + struct tcp_sack_block *cache; + struct sk_buff *skb; + int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE) >> 3; + int used_sacks; int reord = tp->packets_out; - int prior_fackets; - u32 highest_sack_end_seq = tp->lost_retrans_low; int flag = 0; int found_dup_sack = 0; - int cached_fack_count; - int i; + int fack_count; + int i, j; int first_sack_index; - int force_one_sack; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) tp->fackets_out = 0; - tp->highest_sack = tp->snd_una; + tcp_highest_sack_reset(sk); } - prior_fackets = tp->fackets_out; - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; @@ -1272,78 +1446,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (!tp->packets_out) goto out; - /* SACK fastpath: - * if the only SACK change is the increase of the end_seq of - * the first block then only apply that SACK block - * and use retrans queue hinting otherwise slowpath */ - force_one_sack = 1; - for (i = 0; i < num_sacks; i++) { - __be32 start_seq = sp[i].start_seq; - __be32 end_seq = sp[i].end_seq; - - if (i == 0) { - if (tp->recv_sack_cache[i].start_seq != start_seq) - force_one_sack = 0; - } else { - if ((tp->recv_sack_cache[i].start_seq != start_seq) || - (tp->recv_sack_cache[i].end_seq != end_seq)) - force_one_sack = 0; - } - tp->recv_sack_cache[i].start_seq = start_seq; - tp->recv_sack_cache[i].end_seq = end_seq; - } - /* Clear the rest of the cache sack blocks so they won't match mistakenly. */ - for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) { - tp->recv_sack_cache[i].start_seq = 0; - tp->recv_sack_cache[i].end_seq = 0; - } - + used_sacks = 0; first_sack_index = 0; - if (force_one_sack) - num_sacks = 1; - else { - int j; - tp->fastpath_skb_hint = NULL; - - /* order SACK blocks to allow in order walk of the retrans queue */ - for (i = num_sacks-1; i > 0; i--) { - for (j = 0; j < i; j++){ - if (after(ntohl(sp[j].start_seq), - ntohl(sp[j+1].start_seq))){ - struct tcp_sack_block_wire tmp; - - tmp = sp[j]; - sp[j] = sp[j+1]; - sp[j+1] = tmp; - - /* Track where the first SACK block goes to */ - if (j == first_sack_index) - first_sack_index = j+1; - } - - } - } - } - - /* Use SACK fastpath hint if valid */ - cached_skb = tp->fastpath_skb_hint; - cached_fack_count = tp->fastpath_cnt_hint; - if (!cached_skb) { - cached_skb = tcp_write_queue_head(sk); - cached_fack_count = 0; - } - for (i = 0; i < num_sacks; i++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count; - int dup_sack = (found_dup_sack && (i == first_sack_index)); - int next_dup = (found_dup_sack && (i+1 == first_sack_index)); + int dup_sack = !i && found_dup_sack; - sp++; + sp[used_sacks].start_seq = ntohl(get_unaligned(&sp_wire[i].start_seq)); + sp[used_sacks].end_seq = ntohl(get_unaligned(&sp_wire[i].end_seq)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (!tcp_is_sackblock_valid(tp, dup_sack, + sp[used_sacks].start_seq, + sp[used_sacks].end_seq)) { if (dup_sack) { if (!tp->undo_marker) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); @@ -1352,169 +1465,148 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && - !after(end_seq, tp->snd_una)) + !after(sp[used_sacks].end_seq, tp->snd_una)) continue; NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); } + if (i == 0) + first_sack_index = -1; continue; } - skb = cached_skb; - fack_count = cached_fack_count; - - /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) - flag |= FLAG_DATA_LOST; - - tcp_for_write_queue_from(skb, sk) { - int in_sack = 0; - u8 sacked; - - if (skb == tcp_send_head(sk)) - break; - - cached_skb = skb; - cached_fack_count = fack_count; - if (i == first_sack_index) { - tp->fastpath_skb_hint = skb; - tp->fastpath_cnt_hint = fack_count; - } + /* Ignore very old stuff early */ + if (!after(sp[used_sacks].end_seq, prior_snd_una)) + continue; - /* The retransmission queue is always in order, so - * we can short-circuit the walk early. - */ - if (!before(TCP_SKB_CB(skb)->seq, end_seq)) - break; + used_sacks++; + } - dup_sack = (found_dup_sack && (i == first_sack_index)); + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = used_sacks - 1; i > 0; i--) { + for (j = 0; j < i; j++) { + if (after(sp[j].start_seq, sp[j + 1].start_seq)) { + struct tcp_sack_block tmp; - /* Due to sorting DSACK may reside within this SACK block! */ - if (next_dup) { - u32 dup_start = ntohl(sp->start_seq); - u32 dup_end = ntohl(sp->end_seq); + tmp = sp[j]; + sp[j] = sp[j + 1]; + sp[j + 1] = tmp; - if (before(TCP_SKB_CB(skb)->seq, dup_end)) { - in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end); - if (in_sack > 0) - dup_sack = 1; - } + /* Track where the first SACK block goes to */ + if (j == first_sack_index) + first_sack_index = j + 1; } + } + } - /* DSACK info lost if out-of-mem, try SACK still */ - if (in_sack <= 0) - in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); - if (unlikely(in_sack < 0)) - break; + skb = tcp_write_queue_head(sk); + fack_count = 0; + i = 0; - sacked = TCP_SKB_CB(skb)->sacked; + if (!tp->sacked_out) { + /* It's already past, so skip checking against it */ + cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); + } else { + cache = tp->recv_sack_cache; + /* Skip empty blocks in at head of the cache */ + while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && + !cache->end_seq) + cache++; + } - /* Account D-SACK for retransmitted packet. */ - if ((dup_sack && in_sack) && - (sacked & TCPCB_RETRANS) && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) - tp->undo_retrans--; + while (i < used_sacks) { + u32 start_seq = sp[i].start_seq; + u32 end_seq = sp[i].end_seq; + int dup_sack = (found_dup_sack && (i == first_sack_index)); + struct tcp_sack_block *next_dup = NULL; - /* The frame is ACKed. */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { - if (sacked&TCPCB_RETRANS) { - if ((dup_sack && in_sack) && - (sacked&TCPCB_SACKED_ACKED)) - reord = min(fack_count, reord); - } + if (found_dup_sack && ((i + 1) == first_sack_index)) + next_dup = &sp[i + 1]; - /* Nothing to do; acked frame is about to be dropped. */ - fack_count += tcp_skb_pcount(skb); - continue; - } + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + flag |= FLAG_DATA_LOST; - if (!in_sack) { - fack_count += tcp_skb_pcount(skb); - continue; + /* Skip too early cached blocks */ + while (tcp_sack_cache_ok(tp, cache) && + !before(start_seq, cache->end_seq)) + cache++; + + /* Can skip some work by looking recv_sack_cache? */ + if (tcp_sack_cache_ok(tp, cache) && !dup_sack && + after(end_seq, cache->start_seq)) { + + /* Head todo? */ + if (before(start_seq, cache->start_seq)) { + skb = tcp_sacktag_skip(skb, sk, start_seq); + skb = tcp_sacktag_walk(skb, sk, next_dup, + start_seq, + cache->start_seq, + dup_sack, &fack_count, + &reord, &flag); } - if (!(sacked&TCPCB_SACKED_ACKED)) { - if (sacked & TCPCB_SACKED_RETRANS) { - /* If the segment is not tagged as lost, - * we do not clear RETRANS, believing - * that retransmission is still in flight. - */ - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); - tp->lost_out -= tcp_skb_pcount(skb); - tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } else { - if (!(sacked & TCPCB_RETRANS)) { - /* New sack for not retransmitted frame, - * which was in hole. It is reordering. - */ - if (fack_count < prior_fackets) - reord = min(fack_count, reord); - - /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) - flag |= FLAG_ONLY_ORIG_SACKED; - } - - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - tp->lost_out -= tcp_skb_pcount(skb); + /* Rest of the block already fully processed? */ + if (!after(end_seq, cache->end_seq)) + goto advance_sp; - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } + skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, + cache->end_seq, + &fack_count, &reord, + &flag); - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; - flag |= FLAG_DATA_SACKED; - tp->sacked_out += tcp_skb_pcount(skb); - - fack_count += tcp_skb_pcount(skb); - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; - - if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) { - tp->highest_sack = TCP_SKB_CB(skb)->seq; - highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq; - } - } else { - if (dup_sack && (sacked&TCPCB_RETRANS)) - reord = min(fack_count, reord); - - fack_count += tcp_skb_pcount(skb); + /* ...tail remains todo... */ + if (tcp_highest_sack_seq(tp) == cache->end_seq) { + /* ...but better entrypoint exists! */ + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; + cache++; + goto walk; } - /* D-SACK. We can detect redundant retransmission - * in S|R and plain R frames and clear it. - * undo_retrans is decreased above, L|R frames - * are accounted above as well. - */ - if (dup_sack && - (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - tp->retransmit_skb_hint = NULL; - } + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); + /* Check overlap against next cached too (past this one already) */ + cache++; + continue; + } + + if (!before(start_seq, tcp_highest_sack_seq(tp))) { + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; } + skb = tcp_sacktag_skip(skb, sk, start_seq); + +walk: + skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, + dup_sack, &fack_count, &reord, &flag); +advance_sp: /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct * due to in-order walk */ if (after(end_seq, tp->frto_highmark)) flag &= ~FLAG_ONLY_ORIG_SACKED; + + i++; } - if (tp->retrans_out && - after(highest_sack_end_seq, tp->lost_retrans_low) && - icsk->icsk_ca_state == TCP_CA_Recovery) - flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); + /* Clear the head of the cache sack blocks so we can skip it next time */ + for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { + tp->recv_sack_cache[i].start_seq = 0; + tp->recv_sack_cache[i].end_seq = 0; + } + for (j = 0; j < used_sacks; j++) + tp->recv_sack_cache[i++] = sp[j]; + + tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + if ((reord < tp->fackets_out) && + ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, tp->fackets_out - reord, 0); @@ -1565,10 +1657,10 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 >= tp->sacked_out) + if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else - tp->sacked_out -= acked-1; + tp->sacked_out -= acked - 1; } tcp_check_reno_reordering(sk, acked); tcp_verify_left_out(tp); @@ -1602,10 +1694,10 @@ int tcp_use_frto(struct sock *sk) tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) return 0; /* Short-circuit when first non-SACKed skb has been checked */ - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) break; } return 1; @@ -1715,7 +1807,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... */ - if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { /* For some reason this R-bit might get cleared? */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out += tcp_skb_pcount(skb); @@ -1728,7 +1820,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) } /* Don't lost mark skbs that were fwd transmitted after RTO */ - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) && + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) && !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1743,7 +1835,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); @@ -1810,7 +1902,7 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { @@ -1822,7 +1914,7 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_verify_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); @@ -1830,18 +1922,15 @@ void tcp_enter_loss(struct sock *sk, int how) tp->frto_counter = 0; } -static int tcp_check_sack_reneging(struct sock *sk) +/* If ACK arrived pointing to a remembered SACK, it means that our + * remembered SACKs do not reflect real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * + * Do processing similar to RTO timeout. + */ +static int tcp_check_sack_reneging(struct sock *sk, int flag) { - struct sk_buff *skb; - - /* If ACK arrived pointing to a remembered SACK, - * it means that our remembered SACKs do not reflect - * real state of receiver i.e. - * receiver _host_ is heavily congested (or buggy). - * Do processing similar to RTO timeout. - */ - if ((skb = tcp_write_queue_head(sk)) != NULL && - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); @@ -1857,7 +1946,27 @@ static int tcp_check_sack_reneging(struct sock *sk) static inline int tcp_fackets_out(struct tcp_sock *tp) { - return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; +} + +/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs + * counter when SACK is enabled (without SACK, sacked_out is used for + * that purpose). + * + * Instead, with FACK TCP uses fackets_out that includes both SACKed + * segments up to the highest received SACK block so far and holes in + * between them. + * + * With reordering, holes may still be in flight, so RFC3517 recovery + * uses pure sacked_out (total number of SACKed segments) even though + * it violates the RFC that uses duplicate ACKs, often these are equal + * but when e.g. out-of-window ACKs or packet duplication occurs, + * they differ. Since neither occurs due to loss, TCP should really + * ignore them. + */ +static inline int tcp_dupack_heurestics(struct tcp_sock *tp) +{ + return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) @@ -1980,13 +2089,13 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_fackets_out(tp) > tp->reordering) + if (tcp_dupack_heurestics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk)) + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -2010,17 +2119,18 @@ static int tcp_time_to_recover(struct sock *sk) * retransmitted past LOST markings in the first place? I'm not fully sure * about undo and end of connection cases, which can cause R without L? */ -static void tcp_verify_retransmit_hint(struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((tp->retransmit_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = NULL; } -/* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, int packets) +/* Mark head of queue up as lost. With RFC3517 SACK, the packets is + * is against sacked "cnt", otherwise it's against facked "cnt" + */ +static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -2042,8 +2152,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; - cnt += tcp_skb_pcount(skb); - if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + + if (tcp_is_fack(tp) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + cnt += tcp_skb_pcount(skb); + + if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) || + after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -2056,17 +2171,22 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk) +static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_fack(tp)) { + if (tcp_is_reno(tp)) { + tcp_mark_head_lost(sk, 1, fast_rexmit); + } else if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, lost); + tcp_mark_head_lost(sk, lost, fast_rexmit); } else { - tcp_mark_head_lost(sk, 1); + int sacked_upto = tp->sacked_out - tp->reordering; + if (sacked_upto < 0) + sacked_upto = 0; + tcp_mark_head_lost(sk, sacked_upto, fast_rexmit); } /* New heuristics: it is possible only after we switched @@ -2074,7 +2194,7 @@ static void tcp_update_scoreboard(struct sock *sk) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -2105,7 +2225,7 @@ static void tcp_update_scoreboard(struct sock *sk) static inline void tcp_moderate_cwnd(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tcp_packets_in_flight(tp) + tcp_max_burst(tp)); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2125,15 +2245,15 @@ static void tcp_cwnd_down(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; - if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || - (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) { - tp->snd_cwnd_cnt = decr&1; + if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || + (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { + tp->snd_cwnd_cnt = decr & 1; decr >>= 1; if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) tp->snd_cwnd -= decr; - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_stamp = tcp_time_stamp; } } @@ -2177,7 +2297,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) if (icsk->icsk_ca_ops->undo_cwnd) tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); else - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -2196,8 +2316,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) static inline int tcp_may_undo(struct tcp_sock *tp) { - return tp->undo_marker && - (!tp->undo_retrans || tcp_packet_delayed(tp)); + return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } /* People celebrate: "We love our President!" */ @@ -2247,7 +2366,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; + int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -2316,7 +2435,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (tp->retrans_out == 0) tp->retrans_stamp = 0; - if (flag&FLAG_ECE) + if (flag & FLAG_ECE) tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { @@ -2362,7 +2481,6 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } - /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2374,38 +2492,35 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void -tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) +static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); - int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) && - (tp->fackets_out > tp->reordering)); + int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); + int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && + (tcp_fackets_out(tp) > tp->reordering)); + int fast_rexmit = 0; - /* Some technical things: - * 1. Reno does not count dupacks (sacked_out) automatically. */ - if (!tp->packets_out) + if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag&FLAG_ECE) + if (flag & FLAG_ECE) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tp->sacked_out && tcp_check_sack_reneging(sk)) + if (tcp_check_sack_reneging(sk, flag)) return; /* C. Process data loss notification, provided it is valid. */ - if ((flag&FLAG_DATA_LOST) && + if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); + tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -2465,7 +2580,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) do_lost = tcp_try_undo_partial(sk, pkts_acked); break; case TCP_CA_Loss: - if (flag&FLAG_DATA_ACKED) + if (flag & FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); @@ -2515,7 +2630,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tp->undo_retrans = tp->retrans_out; if (icsk->icsk_ca_state < TCP_CA_CWR) { - if (!(flag&FLAG_ECE)) + if (!(flag & FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); @@ -2524,10 +2639,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); + fast_rexmit = 1; } - if (do_lost || tcp_head_timedout(sk)) - tcp_update_scoreboard(sk); + if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + tcp_update_scoreboard(sk, fast_rexmit); tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } @@ -2591,11 +2707,10 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight, good); + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -2609,7 +2724,8 @@ static void tcp_rearm_rto(struct sock *sk) if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } } @@ -2638,8 +2754,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, - int prior_fackets) +static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2647,8 +2762,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, u32 now = tcp_time_stamp; int fully_acked = 1; int flag = 0; - int prior_packets = tp->packets_out; - u32 cnt = 0; + u32 pkts_acked = 0; u32 reord = tp->packets_out; s32 seq_rtt = -1; s32 ca_seq_rtt = -1; @@ -2657,7 +2771,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u32 end_seq; - u32 packets_acked; + u32 acked_pcount; u8 sacked = scb->sacked; /* Determine how many packets and what bytes were acked, tso and else */ @@ -2666,14 +2780,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, !after(tp->snd_una, scb->seq)) break; - packets_acked = tcp_tso_acked(sk, skb); - if (!packets_acked) + acked_pcount = tcp_tso_acked(sk, skb); + if (!acked_pcount) break; fully_acked = 0; end_seq = tp->snd_una; } else { - packets_acked = tcp_skb_pcount(skb); + acked_pcount = tcp_skb_pcount(skb); end_seq = scb->end_seq; } @@ -2683,44 +2797,34 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, tcp_mtup_probe_success(sk, skb); } - if (sacked) { - if (sacked & TCPCB_RETRANS) { - if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= packets_acked; - flag |= FLAG_RETRANS_DATA_ACKED; - ca_seq_rtt = -1; - seq_rtt = -1; - if ((flag & FLAG_DATA_ACKED) || - (packets_acked > 1)) - flag |= FLAG_NONHEAD_RETRANS_ACKED; - } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } - if (!(sacked & TCPCB_SACKED_ACKED)) - reord = min(cnt, reord); - } - - if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= packets_acked; - if (sacked & TCPCB_LOST) - tp->lost_out -= packets_acked; - - if ((sacked & TCPCB_URG) && tp->urg_mode && - !before(end_seq, tp->snd_up)) - tp->urg_mode = 0; + if (sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= acked_pcount; + flag |= FLAG_RETRANS_DATA_ACKED; + ca_seq_rtt = -1; + seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; if (seq_rtt < 0) { seq_rtt = ca_seq_rtt; } - reord = min(cnt, reord); + if (!(sacked & TCPCB_SACKED_ACKED)) + reord = min(pkts_acked, reord); } - tp->packets_out -= packets_acked; - cnt += packets_acked; + + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= acked_pcount; + if (sacked & TCPCB_LOST) + tp->lost_out -= acked_pcount; + + if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up))) + tp->urg_mode = 0; + + tp->packets_out -= acked_pcount; + pkts_acked += acked_pcount; /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -2740,12 +2844,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, break; tcp_unlink_write_queue(skb, sk); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); tcp_clear_all_retrans_hints(tp); } + if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + flag |= FLAG_SACK_RENEGING; + if (flag & FLAG_ACKED) { - u32 pkts_acked = prior_packets - tp->packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -2761,9 +2867,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, } tp->fackets_out -= min(pkts_acked, tp->fackets_out); - /* hint's skb might be NULL but we don't need to care */ - tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, - tp->fastpath_cnt_hint); + if (ca_ops->pkts_acked) { s32 rtt_us = -1; @@ -2806,7 +2910,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, } } #endif - *seq_rtt_p = seq_rtt; return flag; } @@ -2817,8 +2920,7 @@ static void tcp_ack_probe(struct sock *sk) /* Was it a usable window open? */ - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) { + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -2847,8 +2949,9 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, - const u32 ack_seq, const u32 nwin) +static inline int tcp_may_update_window(const struct tcp_sock *tp, + const u32 ack, const u32 ack_seq, + const u32 nwin) { return (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -2917,7 +3020,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk) static void tcp_undo_spur_to_response(struct sock *sk, int flag) { - if (flag&FLAG_ECE) + if (flag & FLAG_ECE) tcp_ratehalving_spur_to_response(sk); else tcp_undo_cwr(sk, 1); @@ -2960,7 +3063,7 @@ static int tcp_process_frto(struct sock *sk, int flag) tcp_verify_left_out(tp); /* Duplicate the behavior from Loss state (fastretrans_alert) */ - if (flag&FLAG_DATA_ACKED) + if (flag & FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || @@ -2977,16 +3080,16 @@ static int tcp_process_frto(struct sock *sk, int flag) * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate */ - if (!(flag&FLAG_ANY_PROGRESS) && (flag&FLAG_NOT_DUP)) + if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) return 1; - if (!(flag&FLAG_DATA_ACKED)) { + if (!(flag & FLAG_DATA_ACKED)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); return 1; } } else { - if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); @@ -2994,10 +3097,12 @@ static int tcp_process_frto(struct sock *sk, int flag) } if ((tp->frto_counter >= 2) && - (!(flag&FLAG_FORWARD_PROGRESS) || - ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + (!(flag & FLAG_FORWARD_PROGRESS) || + ((flag & FLAG_DATA_SACKED) && + !(flag & FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ - if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + if (!(flag & FLAG_FORWARD_PROGRESS) && + (flag & FLAG_NOT_DUP)) return 1; tcp_enter_frto_loss(sk, 3, flag); @@ -3043,7 +3148,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; u32 prior_fackets; - s32 seq_rtt; int prior_packets; int frto_cwnd = 0; @@ -3064,13 +3168,14 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tp->bytes_acked += ack - prior_snd_una; else if (icsk->icsk_ca_state == TCP_CA_Loss) /* we assume just one segment left network */ - tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache); + tp->bytes_acked += min(ack - prior_snd_una, + tp->mss_cache); } prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); - if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. @@ -3109,7 +3214,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, prior_fackets); + flag |= tcp_clean_rtx_queue(sk, prior_fackets); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); @@ -3121,14 +3226,15 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, prior_in_flight, 0); - tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); + tcp_cong_avoid(sk, ack, prior_in_flight); + tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, + flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) - tcp_cong_avoid(sk, ack, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, prior_in_flight); } - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) dst_confirm(sk->sk_dst_cache); return 1; @@ -3153,100 +3259,99 @@ uninteresting_ack: return 0; } - /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. */ -void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) +void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, + int estab) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); - int length=(th->doff*4)-sizeof(struct tcphdr); + int length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; while (length > 0) { - int opcode=*ptr++; + int opcode = *ptr++; int opsize; switch (opcode) { - case TCPOPT_EOL: + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - return; /* don't parse partial options */ - switch (opcode) { - case TCPOPT_MSS: - if (opsize==TCPOLEN_MSS && th->syn && !estab) { - u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); - if (in_mss) { - if (opt_rx->user_mss && opt_rx->user_mss < in_mss) - in_mss = opt_rx->user_mss; - opt_rx->mss_clamp = in_mss; - } - } - break; - case TCPOPT_WINDOW: - if (opsize==TCPOLEN_WINDOW && th->syn && !estab) - if (sysctl_tcp_window_scaling) { - __u8 snd_wscale = *(__u8 *) ptr; - opt_rx->wscale_ok = 1; - if (snd_wscale > 14) { - if (net_ratelimit()) - printk(KERN_INFO "tcp_parse_options: Illegal window " - "scaling value %d >14 received.\n", - snd_wscale); - snd_wscale = 14; - } - opt_rx->snd_wscale = snd_wscale; - } - break; - case TCPOPT_TIMESTAMP: - if (opsize==TCPOLEN_TIMESTAMP) { - if ((estab && opt_rx->tstamp_ok) || - (!estab && sysctl_tcp_timestamps)) { - opt_rx->saw_tstamp = 1; - opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr)); - opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4))); - } + if (opsize > length) + return; /* don't parse partial options */ + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); + if (in_mss) { + if (opt_rx->user_mss && + opt_rx->user_mss < in_mss) + in_mss = opt_rx->user_mss; + opt_rx->mss_clamp = in_mss; } - break; - case TCPOPT_SACK_PERM: - if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { - if (sysctl_tcp_sack) { - opt_rx->sack_ok = 1; - tcp_sack_reset(opt_rx); - } + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW && th->syn && + !estab && sysctl_tcp_window_scaling) { + __u8 snd_wscale = *(__u8 *)ptr; + opt_rx->wscale_ok = 1; + if (snd_wscale > 14) { + if (net_ratelimit()) + printk(KERN_INFO "tcp_parse_options: Illegal window " + "scaling value %d >14 received.\n", + snd_wscale); + snd_wscale = 14; } - break; + opt_rx->snd_wscale = snd_wscale; + } + break; + case TCPOPT_TIMESTAMP: + if ((opsize == TCPOLEN_TIMESTAMP) && + ((estab && opt_rx->tstamp_ok) || + (!estab && sysctl_tcp_timestamps))) { + opt_rx->saw_tstamp = 1; + opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr)); + opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4))); + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM && th->syn && + !estab && sysctl_tcp_sack) { + opt_rx->sack_ok = 1; + tcp_sack_reset(opt_rx); + } + break; - case TCPOPT_SACK: - if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && - !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && - opt_rx->sack_ok) { - TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; - } - break; + case TCPOPT_SACK: + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && + opt_rx->sack_ok) { + TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; + } + break; #ifdef CONFIG_TCP_MD5SIG - case TCPOPT_MD5SIG: - /* - * The MD5 Hash has already been - * checked (see tcp_v{4,6}_do_rcv()). - */ - break; + case TCPOPT_MD5SIG: + /* + * The MD5 Hash has already been + * checked (see tcp_v{4,6}_do_rcv()). + */ + break; #endif - } + } - ptr+=opsize-2; - length-=opsize; + ptr += opsize-2; + length -= opsize; } } } @@ -3257,7 +3362,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, struct tcp_sock *tp) { - if (th->doff == sizeof(struct tcphdr)>>2) { + if (th->doff == sizeof(struct tcphdr) >> 2) { tp->rx_opt.saw_tstamp = 0; return 0; } else if (tp->rx_opt.tstamp_ok && @@ -3342,7 +3447,8 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) +static inline int tcp_paws_discard(const struct sock *sk, + const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && @@ -3374,16 +3480,16 @@ static void tcp_reset(struct sock *sk) { /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { - case TCP_SYN_SENT: - sk->sk_err = ECONNREFUSED; - break; - case TCP_CLOSE_WAIT: - sk->sk_err = EPIPE; - break; - case TCP_CLOSE: - return; - default: - sk->sk_err = ECONNRESET; + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; } if (!sock_flag(sk, SOCK_DEAD)) @@ -3416,43 +3522,43 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - /* Move to CLOSE_WAIT */ - tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; - break; + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + inet_csk(sk)->icsk_ack.pingpong = 1; + break; - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - /* Received a retransmission of the FIN, do - * nothing. - */ - break; - case TCP_LAST_ACK: - /* RFC793: Remain in the LAST-ACK state. */ - break; + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; - case TCP_FIN_WAIT1: - /* This case occurs when a simultaneous close - * happens, we must ack the received FIN and - * enter the CLOSING state. - */ - tcp_send_ack(sk); - tcp_set_state(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_send_ack(sk); - tcp_time_wait(sk, TCP_TIME_WAIT, 0); - break; - default: - /* Only TCP_LISTEN and TCP_CLOSE are left, in these - * cases we should never reach this piece of code. - */ - printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", - __FUNCTION__, sk->sk_state); - break; + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", + __FUNCTION__, sk->sk_state); + break; } /* It _is_ possible, that we have something out-of-order _after_ FIN. @@ -3461,7 +3567,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) __skb_queue_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -3469,13 +3575,14 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, 1, POLL_HUP); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else - sk_wake_async(sk, 1, POLL_IN); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } } -static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, + u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) @@ -3498,7 +3605,8 @@ static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, + 4 - tp->rx_opt.tstamp_ok); } } @@ -3538,12 +3646,12 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) { int this_sack; struct tcp_sack_block *sp = &tp->selective_acks[0]; - struct tcp_sack_block *swalk = sp+1; + struct tcp_sack_block *swalk = sp + 1; /* See if the recent change to the first SACK eats into * or hits the sequence space of other SACK blocks, if so coalesce. */ - for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) { + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) { if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { int i; @@ -3551,16 +3659,19 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. */ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); - for (i=this_sack; i < tp->rx_opt.num_sacks; i++) - sp[i] = sp[i+1]; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + + tp->rx_opt.dsack, + 4 - tp->rx_opt.tstamp_ok); + for (i = this_sack; i < tp->rx_opt.num_sacks; i++) + sp[i] = sp[i + 1]; continue; } this_sack++, swalk++; } } -static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +static inline void tcp_sack_swap(struct tcp_sack_block *sack1, + struct tcp_sack_block *sack2) { __u32 tmp; @@ -3583,11 +3694,11 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) if (!cur_sacks) goto new_sack; - for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) { + for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { /* Rotate this_sack to the first one. */ - for (; this_sack>0; this_sack--, sp--) - tcp_sack_swap(sp, sp-1); + for (; this_sack > 0; this_sack--, sp--) + tcp_sack_swap(sp, sp - 1); if (cur_sacks > 1) tcp_sack_maybe_coalesce(tp); return; @@ -3606,14 +3717,15 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) sp--; } for (; this_sack > 0; this_sack--, sp--) - *sp = *(sp-1); + *sp = *(sp - 1); new_sack: /* Build the new head SACK, and we're done. */ sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, + 4 - tp->rx_opt.tstamp_ok); } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -3631,7 +3743,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) return; } - for (this_sack = 0; this_sack < num_sacks; ) { + for (this_sack = 0; this_sack < num_sacks;) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; @@ -3650,7 +3762,9 @@ static void tcp_sack_remove(struct tcp_sock *tp) } if (num_sacks != tp->rx_opt.num_sacks) { tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + + tp->rx_opt.dsack, + 4 - tp->rx_opt.tstamp_ok); } } @@ -3703,14 +3817,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; - __skb_pull(skb, th->doff*4); + __skb_pull(skb, th->doff * 4); TCP_ECN_accept_cwr(tp, skb); if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks, - 4 - tp->rx_opt.tstamp_ok); + 4 - tp->rx_opt.tstamp_ok); } /* Queue data for delivery to the user. @@ -3726,7 +3840,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && sock_owned_by_user(sk) && !tp->urg_data) { int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); + tp->ucopy.len); __set_current_state(TASK_RUNNING); @@ -3744,12 +3858,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) queue_and_out: if (eaten < 0 && (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_stream_rmem_schedule(sk, skb))) { + !sk_rmem_schedule(sk, skb->truesize))) { if (tcp_prune_queue(sk) < 0 || - !sk_stream_rmem_schedule(sk, skb)) + !sk_rmem_schedule(sk, skb->truesize)) goto drop; } - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -3818,9 +3932,9 @@ drop: TCP_ECN_check_ce(tp, skb); if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_stream_rmem_schedule(sk, skb)) { + !sk_rmem_schedule(sk, skb->truesize)) { if (tcp_prune_queue(sk) < 0 || - !sk_stream_rmem_schedule(sk, skb)) + !sk_rmem_schedule(sk, skb->truesize)) goto drop; } @@ -3831,7 +3945,7 @@ drop: SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ @@ -3843,7 +3957,7 @@ drop: tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; } - __skb_queue_head(&tp->out_of_order_queue,skb); + __skb_queue_head(&tp->out_of_order_queue, skb); } else { struct sk_buff *skb1 = tp->out_of_order_queue.prev; u32 seq = TCP_SKB_CB(skb)->seq; @@ -3866,10 +3980,10 @@ drop: if (!after(TCP_SKB_CB(skb1)->seq, seq)) break; } while ((skb1 = skb1->prev) != - (struct sk_buff*)&tp->out_of_order_queue); + (struct sk_buff *)&tp->out_of_order_queue); /* Do skb overlap to previous one? */ - if (skb1 != (struct sk_buff*)&tp->out_of_order_queue && + if (skb1 != (struct sk_buff *)&tp->out_of_order_queue && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ @@ -3879,7 +3993,8 @@ drop: } if (after(seq, TCP_SKB_CB(skb1)->seq)) { /* Partial overlap. */ - tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq); + tcp_dsack_set(tp, seq, + TCP_SKB_CB(skb1)->end_seq); } else { skb1 = skb1->prev; } @@ -3888,15 +4003,17 @@ drop: /* And clean segments covered by new one as whole. */ while ((skb1 = skb->next) != - (struct sk_buff*)&tp->out_of_order_queue && + (struct sk_buff *)&tp->out_of_order_queue && after(end_seq, TCP_SKB_CB(skb1)->seq)) { - if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); - break; - } - __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); - __kfree_skb(skb1); + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + end_seq); + break; + } + __skb_unlink(skb1, &tp->out_of_order_queue); + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); } add_sack: @@ -3919,7 +4036,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ - for (skb = head; skb != tail; ) { + for (skb = head; skb != tail;) { /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; @@ -3957,9 +4074,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, /* Too big header? This can happen with IPv6. */ if (copy < 0) return; - if (end-start < copy) - copy = end-start; - nskb = alloc_skb(copy+header, GFP_ATOMIC); + if (end - start < copy) + copy = end - start; + nskb = alloc_skb(copy + header, GFP_ATOMIC); if (!nskb) return; @@ -3973,7 +4090,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); - sk_stream_set_owner_r(nskb, sk); + skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4069,9 +4186,9 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); tcp_collapse(sk, &sk->sk_receive_queue, sk->sk_receive_queue.next, - (struct sk_buff*)&sk->sk_receive_queue, + (struct sk_buff *)&sk->sk_receive_queue, tp->copied_seq, tp->rcv_nxt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -4091,7 +4208,7 @@ static int tcp_prune_queue(struct sock *sk) */ if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); } if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) @@ -4108,7 +4225,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } - /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. @@ -4170,8 +4286,8 @@ static void tcp_new_space(struct sock *sk) int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, - tp->reordering + 1); - sndmem *= 2*demanded; + tp->reordering + 1); + sndmem *= 2 * demanded; if (sndmem > sk->sk_sndbuf) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -4212,8 +4328,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && - skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && skb_peek(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { @@ -4241,7 +4356,7 @@ static inline void tcp_ack_snd_check(struct sock *sk) * either form (or just set the sysctl tcp_stdurg). */ -static void tcp_check_urg(struct sock * sk, struct tcphdr * th) +static void tcp_check_urg(struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); @@ -4290,8 +4405,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * buggy users. */ if (tp->urg_seq == tp->copied_seq && tp->urg_data && - !sock_flag(sk, SOCK_URGINLINE) && - tp->copied_seq != tp->rcv_nxt) { + !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { @@ -4300,8 +4414,8 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) } } - tp->urg_data = TCP_URG_NOTYET; - tp->urg_seq = ptr; + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = ptr; /* Disable header prediction. */ tp->pred_flags = 0; @@ -4314,7 +4428,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) /* Check if we get a new urgent pointer - normally not. */ if (th->urg) - tcp_check_urg(sk,th); + tcp_check_urg(sk, th); /* Do we wait for any urgent data? - normally not... */ if (tp->urg_data == TCP_URG_NOTYET) { @@ -4356,7 +4470,8 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) return err; } -static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static __sum16 __tcp_checksum_complete_user(struct sock *sk, + struct sk_buff *skb) { __sum16 result; @@ -4370,14 +4485,16 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb return result; } -static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static inline int tcp_checksum_complete_user(struct sock *sk, + struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && - __tcp_checksum_complete_user(sk, skb); + __tcp_checksum_complete_user(sk, skb); } #ifdef CONFIG_NET_DMA -static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, + int hlen) { struct tcp_sock *tp = tcp_sk(sk); int chunk = skb->len - hlen; @@ -4393,7 +4510,9 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); + skb, hlen, + tp->ucopy.iov, chunk, + tp->ucopy.pinned_list); if (dma_cookie < 0) goto out; @@ -4475,7 +4594,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { int tcp_header_len = tp->tcp_header_len; /* Timestamp header prediction: tcp_header_len @@ -4544,7 +4663,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, eaten = 1; } #endif - if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && !copied_early) { __set_current_state(TASK_RUNNING); if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) @@ -4591,9 +4711,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - __skb_pull(skb,tcp_header_len); + __skb_pull(skb, tcp_header_len); __skb_queue_tail(&sk->sk_receive_queue, skb); - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } @@ -4623,7 +4743,7 @@ no_ack: } slow_path: - if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb)) + if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; /* @@ -4830,7 +4950,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); - sk_wake_async(sk, 0, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (sk->sk_write_pending || @@ -4873,7 +4993,8 @@ discard: } /* PAWS check. */ - if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0)) + if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && + tcp_paws_check(&tp->rx_opt, 0)) goto discard_and_undo; if (th->syn) { @@ -4908,7 +5029,6 @@ discard: tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); - tcp_send_synack(sk); #if 0 /* Note, we could accept data and URG from this segment. @@ -4940,7 +5060,6 @@ reset_and_undo: return 1; } - /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. @@ -5060,9 +5179,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * are not waked up, because sk->sk_sleep == * NULL and sk->sk_socket == NULL. */ - if (sk->sk_socket) { - sk_wake_async(sk,0,POLL_OUT); - } + if (sk->sk_socket) + sk_wake_async(sk, + SOCK_WAKE_IO, POLL_OUT); tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << @@ -5074,8 +5193,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * and does not calculate rtt. * Fix it at least with timestamps. */ - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - !tp->srtt) + if (tp->rx_opt.saw_tstamp && + tp->rx_opt.rcv_tsecr && !tp->srtt) tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 652c32368cc..9aea88b8d4f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -99,7 +99,7 @@ static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, struct tcphdr *th, int protocol, - int tcplen); + unsigned int tcplen); #endif struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { @@ -1020,7 +1020,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, struct tcphdr *th, int protocol, - int tcplen) + unsigned int tcplen) { struct scatterlist sg[4]; __u16 data_len; @@ -1113,7 +1113,7 @@ int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, struct dst_entry *dst, struct request_sock *req, struct tcphdr *th, int protocol, - int tcplen) + unsigned int tcplen) { __be32 saddr, daddr; @@ -1478,7 +1478,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif - __inet_hash(&tcp_hashinfo, newsk, 0); + __inet_hash_nolisten(&tcp_hashinfo, newsk); __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index e7f5ef92cbd..ce3c41ff50b 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight); } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f4c1eef89af..89f0188885c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -61,27 +61,24 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -static inline void tcp_packets_out_inc(struct sock *sk, - const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - int orig = tp->packets_out; + unsigned int prior_packets = tp->packets_out; + + tcp_advance_send_head(sk, skb); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + + /* Don't override Nagle indefinately with F-RTO */ + if (tp->frto_counter == 2) + tp->frto_counter = 3; tp->packets_out += tcp_skb_pcount(skb); - if (!orig) + if (!prior_packets) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } -static void update_send_head(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_advance_send_head(sk, skb); - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, skb); -} - /* SND.NXT, if window was not shrunk. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( @@ -92,10 +89,10 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) + if (!before(tcp_wnd_end(tp), tp->snd_nxt)) return tp->snd_nxt; else - return tp->snd_una+tp->snd_wnd; + return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. @@ -224,14 +221,14 @@ void tcp_select_initial_window(int __space, __u32 mss, * following RFC2414. Senders, not following this RFC, * will be satisfied with 2. */ - if (mss > (1<<*rcv_wscale)) { + if (mss > (1 << *rcv_wscale)) { int init_cwnd = 4; - if (mss > 1460*3) + if (mss > 1460 * 3) init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; - if (*rcv_wnd > init_cwnd*mss) - *rcv_wnd = init_cwnd*mss; + if (*rcv_wnd > init_cwnd * mss) + *rcv_wnd = init_cwnd * mss; } /* Set the clamp no higher than max representable value */ @@ -281,11 +278,10 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -static inline void TCP_ECN_send_synack(struct tcp_sock *tp, - struct sk_buff *skb) +static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; - if (!(tp->ecn_flags&TCP_ECN_OK)) + if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; } @@ -295,7 +291,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = 0; if (sysctl_tcp_ecn) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; tp->ecn_flags = TCP_ECN_OK; } } @@ -317,7 +313,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); - if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { + if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; @@ -331,6 +327,26 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, } } +/* Constructs common control bits of non-data skb. If SYN/FIN is present, + * auto increment end seqno. + */ +static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +{ + skb->csum = 0; + + TCP_SKB_CB(skb)->flags = flags; + TCP_SKB_CB(skb)->sacked = 0; + + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + + TCP_SKB_CB(skb)->seq = seq; + if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) + seq++; + TCP_SKB_CB(skb)->end_seq = seq; +} + static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, __u32 tstamp, __u8 **md5_hash) { @@ -434,7 +450,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); - *md5_hash = (__u8 *) ptr; + *md5_hash = (__u8 *)ptr; } #endif } @@ -450,7 +466,8 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; @@ -554,8 +571,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = 0; if (unlikely(tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); + between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { + th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } @@ -619,7 +636,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, #undef SYSCTL_FLAG_SACK } - /* This routine just queue's the buffer * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, @@ -633,10 +649,12 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) tp->write_seq = TCP_SKB_CB(skb)->end_seq; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk_charge_skb(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); } -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { if (skb->len <= mss_now || !sk_can_gso(sk)) { /* Avoid the costly divide in the normal @@ -653,23 +671,18 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned } /* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. Another important thing is to - * tweak SACK fastpath hint too as it would overwrite all changes unless - * hint is also changed. + * skb is counted to fackets_out or not. */ -static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, +static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, int decr) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->sacked_out || tcp_is_reno(tp)) return; - if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq)) + if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) tp->fackets_out -= decr; - - /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */ - if (tp->fastpath_skb_hint != NULL && - after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) - tp->fastpath_cnt_hint -= decr; } /* Function to create two new TCP segments. Shrinks the given segment @@ -677,7 +690,8 @@ static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now) +int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + unsigned int mss_now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -702,7 +716,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; skb->truesize -= nlen; @@ -712,20 +727,16 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; - if (tcp_is_sack(tp) && tp->sacked_out && - (TCP_SKB_CB(skb)->seq == tp->highest_sack)) - tp->highest_sack = TCP_SKB_CB(buff)->seq; - /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + buff->csum = csum_partial_copy_nocheck(skb->data + len, + skb_put(buff, nsize), nsize, 0); skb_trim(skb, len); @@ -772,7 +783,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss tcp_dec_pcount_approx_int(&tp->sacked_out, diff); tcp_verify_left_out(tp); } - tcp_adjust_fackets_out(tp, skb, diff); + tcp_adjust_fackets_out(sk, skb, diff); } /* Link BUFF into the send queue. */ @@ -792,7 +803,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = len; k = 0; - for (i=0; i<skb_shinfo(skb)->nr_frags; i++) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { if (skb_shinfo(skb)->frags[i].size <= eat) { put_page(skb_shinfo(skb)->frags[i].page); eat -= skb_shinfo(skb)->frags[i].size; @@ -815,8 +826,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; /* If len == headlen, we avoid __skb_pull to preserve alignment. */ @@ -830,7 +840,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) skb->truesize -= len; sk->sk_wmem_queued -= len; - sk->sk_forward_alloc += len; + sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); /* Any change of skb->len requires recalculation of tso @@ -898,6 +908,15 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } +/* Bound MSS / TSO packet size with the half of the window */ +static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +{ + if (tp->max_window && pktsize > (tp->max_window >> 1)) + return max(tp->max_window >> 1, 68U - tp->tcp_header_len); + else + return pktsize; +} + /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -920,7 +939,6 @@ void tcp_mtup_init(struct sock *sk) NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ - unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); @@ -931,10 +949,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); - - /* Bound mss with half of window */ - if (tp->max_window && mss_now > (tp->max_window>>1)) - mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); + mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -988,11 +1003,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); - if (tp->max_window && - (xmit_size_goal > (tp->max_window >> 1))) - xmit_size_goal = max((tp->max_window >> 1), - 68U - tp->tcp_header_len); - + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); xmit_size_goal -= (xmit_size_goal % mss_now); } tp->xmit_size_goal = xmit_size_goal; @@ -1001,13 +1012,11 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) } /* Congestion window validation. (RFC2861) */ - static void tcp_cwnd_validate(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out = tp->packets_out; - if (packets_out >= tp->snd_cwnd) { + if (tp->packets_out >= tp->snd_cwnd) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1022,19 +1031,35 @@ static void tcp_cwnd_validate(struct sock *sk) } } -static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +/* Returns the portion of skb which can be sent right away without + * introducing MSS oddities to segment boundaries. In rare cases where + * mss_now != mss_cache, we will request caller to create a small skb + * per input skb which could be mostly avoided here (if desired). + */ +static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now, unsigned int cwnd) { - u32 window, cwnd_len; + struct tcp_sock *tp = tcp_sk(sk); + u32 needed, window, cwnd_len; - window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; cwnd_len = mss_now * cwnd; - return min(window, cwnd_len); + + if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) + return cwnd_len; + + if (skb == tcp_write_queue_tail(sk) && cwnd_len <= skb->len) + return cwnd_len; + + needed = min(skb->len, window); + return needed - needed % mss_now; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, + struct sk_buff *skb) { u32 in_flight, cwnd; @@ -1054,13 +1079,12 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || - (tso_segs > 1 && - tcp_skb_mss(skb) != mss_now)) { + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1080,16 +1104,13 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ - static inline int tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned mss_now, int nonagle) { return (skb->len < mss_now && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } /* Return non-zero if the Nagle test allows this packet to be @@ -1121,14 +1142,15 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, } /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; - return !after(end_seq, tp->snd_una + tp->snd_wnd); + return !after(end_seq, tcp_wnd_end(tp)); } /* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) @@ -1147,8 +1169,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return 0; cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && - !tcp_snd_wnd_test(tp, skb, cur_mss)) + if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) cwnd_quota = 0; return cwnd_quota; @@ -1172,7 +1193,8 @@ int tcp_may_send_now(struct sock *sk) * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now) +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + unsigned int mss_now) { struct sk_buff *buff; int nlen = skb->len - len; @@ -1182,11 +1204,12 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now); - buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC); if (unlikely(buff == NULL)) return -ENOMEM; - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -1197,7 +1220,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; /* This packet was never sent out yet, so no SACK bits. */ @@ -1235,15 +1258,15 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; /* Defer for less than two clock ticks. */ - if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1) + if (tp->tso_deferred && + ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) goto send_now; in_flight = tcp_packets_in_flight(tp); - BUG_ON(tcp_skb_pcount(skb) <= 1 || - (tp->snd_cwnd <= in_flight)); + BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); - send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; @@ -1274,7 +1297,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) } /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies<<1); + tp->tso_deferred = 1 | (jiffies << 1); return 1; @@ -1286,7 +1309,8 @@ send_now: /* Create a new MTU probe if we are ready. * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, - * -1 otherwise */ + * -1 otherwise + */ static int tcp_mtu_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1295,7 +1319,6 @@ static int tcp_mtu_probe(struct sock *sk) int len; int probe_size; int size_needed; - unsigned int pif; int copy; int mss_now; @@ -1312,7 +1335,7 @@ static int tcp_mtu_probe(struct sock *sk) /* Very simple search strategy: just double the MSS. */ mss_now = tcp_current_mss(sk, 0); - probe_size = 2*tp->mss_cache; + probe_size = 2 * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { /* TODO: set timer for probe_converge_event */ @@ -1325,14 +1348,12 @@ static int tcp_mtu_probe(struct sock *sk) if (tp->snd_wnd < size_needed) return -1; - if (after(tp->snd_nxt + size_needed, tp->snd_una + tp->snd_wnd)) + if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; - /* Do we need to wait to drain cwnd? */ - pif = tcp_packets_in_flight(tp); - if (pif + 2 > tp->snd_cwnd) { - /* With no packets in flight, don't stall. */ - if (pif == 0) + /* Do we need to wait to drain cwnd? With none in flight, don't stall */ + if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) { + if (!tcp_packets_in_flight(tp)) return -1; else return 0; @@ -1341,10 +1362,10 @@ static int tcp_mtu_probe(struct sock *sk) /* We're allowed to probe. Build it now. */ if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) return -1; - sk_charge_skb(sk, nskb); + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); - tcp_insert_write_queue_before(nskb, skb, sk); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; @@ -1353,30 +1374,32 @@ static int tcp_mtu_probe(struct sock *sk) nskb->csum = 0; nskb->ip_summed = skb->ip_summed; - len = 0; - while (len < probe_size) { - next = tcp_write_queue_next(sk, skb); + tcp_insert_write_queue_before(nskb, skb, sk); + len = 0; + tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); if (nskb->ip_summed) skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); else nskb->csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), copy, nskb->csum); + skb_put(nskb, copy), + copy, nskb->csum); if (skb->len <= copy) { /* We've eaten all the data from this skb. * Throw it away. */ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; tcp_unlink_write_queue(skb, sk); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_partial(skb->data, skb->len, 0); + skb->csum = csum_partial(skb->data, + skb->len, 0); } else { __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(sk, skb, mss_now); @@ -1385,7 +1408,9 @@ static int tcp_mtu_probe(struct sock *sk) } len += copy; - skb = next; + + if (len >= probe_size) + break; } tcp_init_tso_segs(sk, nskb, nskb->len); @@ -1394,9 +1419,9 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->when = tcp_time_stamp; if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending - * effectively two packets. */ + * effectively two packets. */ tp->snd_cwnd--; - update_send_head(sk, nskb); + tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; @@ -1408,7 +1433,6 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } - /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -1464,17 +1488,9 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) } limit = mss_now; - if (tso_segs > 1) { - limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); - - if (skb->len < limit) { - unsigned int trim = skb->len % mss_now; - - if (trim) - limit = skb->len - trim; - } - } + if (tso_segs > 1) + limit = tcp_mss_split_point(sk, skb, mss_now, + cwnd_quota); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now))) @@ -1488,7 +1504,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ - update_send_head(sk, skb); + tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts++; @@ -1521,7 +1537,6 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { - struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); unsigned int tso_segs, cwnd_quota; @@ -1536,17 +1551,9 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) BUG_ON(!tso_segs); limit = mss_now; - if (tso_segs > 1) { - limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); - - if (skb->len < limit) { - unsigned int trim = skb->len % mss_now; - - if (trim) - limit = skb->len - trim; - } - } + if (tso_segs > 1) + limit = tcp_mss_split_point(sk, skb, mss_now, + cwnd_quota); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now))) @@ -1556,7 +1563,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { - update_send_head(sk, skb); + tcp_event_new_data_sent(sk, skb); tcp_cwnd_validate(sk); return; } @@ -1633,11 +1640,12 @@ u32 __tcp_select_window(struct sock *sk) if (mss > full_space) mss = full_space; - if (free_space < full_space/2) { + if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_memory_pressure) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, + 4U * tp->advmss); if (free_space < mss) return 0; @@ -1670,9 +1678,9 @@ u32 __tcp_select_window(struct sock *sk) * is too small. */ if (window <= free_space - mss || window > free_space) - window = (free_space/mss)*mss; + window = (free_space / mss) * mss; else if (mss == full_space && - free_space > window + full_space/2) + free_space > window + (full_space >> 1)) window = free_space; } @@ -1680,86 +1688,82 @@ u32 __tcp_select_window(struct sock *sk) } /* Attempt to collapse two adjacent SKB's during retransmission. */ -static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, + int mss_now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + int skb_size, next_skb_size; + u16 flags; /* The first test we must make is that neither of these two * SKB's are still referenced by someone else. */ - if (!skb_cloned(skb) && !skb_cloned(next_skb)) { - int skb_size = skb->len, next_skb_size = next_skb->len; - u16 flags = TCP_SKB_CB(skb)->flags; + if (skb_cloned(skb) || skb_cloned(next_skb)) + return; - /* Also punt if next skb has been SACK'd. */ - if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) - return; + skb_size = skb->len; + next_skb_size = next_skb->len; + flags = TCP_SKB_CB(skb)->flags; - /* Next skb is out of window. */ - if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd)) - return; + /* Also punt if next skb has been SACK'd. */ + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; - /* Punt if not enough space exists in the first SKB for - * the data in the second, or the total combined payload - * would exceed the MSS. - */ - if ((next_skb_size > skb_tailroom(skb)) || - ((skb_size + next_skb_size) > mss_now)) - return; + /* Next skb is out of window. */ + if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp))) + return; - BUG_ON(tcp_skb_pcount(skb) != 1 || - tcp_skb_pcount(next_skb) != 1); + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; - if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out && - (TCP_SKB_CB(next_skb)->seq == tp->highest_sack))) - return; + BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ - tcp_unlink_write_queue(next_skb, sk); + tcp_highest_sack_combine(sk, next_skb, skb); - skb_copy_from_linear_data(next_skb, - skb_put(skb, next_skb_size), - next_skb_size); + /* Ok. We will be able to collapse the packet. */ + tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_PARTIAL; + skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), + next_skb_size); - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); + if (next_skb->ip_summed == CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_PARTIAL; - /* Update sequence range on original skb. */ - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - /* Merge over control information. */ - flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags = flags; + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - /* All done, get rid of second SKB and account for it so - * packet counting does not break. - */ - TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(next_skb); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(next_skb); - /* Reno case is special. Sigh... */ - if (tcp_is_reno(tp) && tp->sacked_out) - tcp_dec_pcount_approx(&tp->sacked_out, next_skb); - - tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb)); - tp->packets_out -= tcp_skb_pcount(next_skb); - - /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); - /* manually tune sacktag skb hint */ - if (tp->fastpath_skb_hint == next_skb) { - tp->fastpath_skb_hint = skb; - tp->fastpath_cnt_hint -= tcp_skb_pcount(skb); - } + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags = flags; - sk_stream_free_skb(sk, next_skb); - } + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= tcp_skb_pcount(next_skb); + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST) + tp->lost_out -= tcp_skb_pcount(next_skb); + /* Reno case is special. Sigh... */ + if (tcp_is_reno(tp) && tp->sacked_out) + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + + tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb)); + tp->packets_out -= tcp_skb_pcount(next_skb); + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + + sk_wmem_free_skb(sk, next_skb); } /* Do a simple retransmit without using the backoff mechanisms in @@ -1778,12 +1782,12 @@ void tcp_simple_retransmit(struct sock *sk) if (skb == tcp_send_head(sk)) break; if (skb->len > mss && - !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { - if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); lost = 1; @@ -1848,7 +1852,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd) + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; @@ -1862,8 +1866,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (skb->len < (cur_mss >> 1)) && (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (!tcp_skb_is_last(sk, skb)) && - (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && - (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && + (skb_shinfo(skb)->nr_frags == 0 && + skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && + (tcp_skb_pcount(skb) == 1 && + tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); @@ -1878,12 +1884,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { - TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + /* Reuse, even though it does some unnecessary work */ + tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, + TCP_SKB_CB(skb)->flags); skb->ip_summed = CHECKSUM_NONE; - skb->csum = 0; } } @@ -1901,7 +1905,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->total_retrans++; #if FASTRETRANS_DEBUG > 0 - if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { if (net_ratelimit()) printk(KERN_DEBUG "retrans_out leaked.\n"); } @@ -1943,7 +1947,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; packet_cnt = tp->retransmit_cnt_hint; - }else{ + } else { skb = tcp_write_queue_head(sk); packet_cnt = 0; } @@ -1970,7 +1974,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) return; if (sacked & TCPCB_LOST) { - if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (tcp_retransmit_skb(sk, skb)) { tp->retransmit_skb_hint = NULL; return; @@ -2028,7 +2032,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; tp->forward_skb_hint = skb; - if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) break; if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) @@ -2052,7 +2056,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } } - /* Send a fin. The caller locks the socket for us. This cannot be * allowed to fail queueing a FIN frame under any circumstances. */ @@ -2083,16 +2086,9 @@ void tcp_send_fin(struct sock *sk) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - TCP_SKB_CB(skb)->seq = tp->write_seq; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + tcp_init_nondata_skb(skb, tp->write_seq, + TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); @@ -2116,16 +2112,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - + tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + TCPCB_FLAG_ACK | TCPCB_FLAG_RST); /* Send it off. */ - TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk); - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); @@ -2138,14 +2127,14 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) */ int tcp_send_synack(struct sock *sk) { - struct sk_buff* skb; + struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { + if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; } - if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) { + if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) @@ -2153,8 +2142,9 @@ int tcp_send_synack(struct sock *sk) tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); - sk_stream_free_skb(sk, skb); - sk_charge_skb(sk, nskb); + sk_wmem_free_skb(sk, skb); + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -2168,8 +2158,8 @@ int tcp_send_synack(struct sock *sk) /* * Prepare a SYN-ACK. */ -struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req) +struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); @@ -2212,12 +2202,11 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_ECN_make_synack(req, th); th->source = inet_sk(sk)->sport; th->dest = ireq->rmt_port; - TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + /* Setting of flags are superfluous here for callers (and ECE is + * not even correctly set) + */ + tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, + TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -2249,7 +2238,6 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, NULL) ); - skb->csum = 0; th->doff = (tcp_header_size >> 2); TCP_INC_STATS(TCP_MIB_OUTSEGS); @@ -2341,23 +2329,17 @@ int tcp_connect(struct sock *sk) /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; - TCP_ECN_send_syn(sk, buff); - TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->gso_segs = 1; - skb_shinfo(buff)->gso_size = 0; - skb_shinfo(buff)->gso_type = 0; - buff->csum = 0; tp->snd_nxt = tp->write_seq; - TCP_SKB_CB(buff)->seq = tp->write_seq++; - TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); + TCP_ECN_send_syn(sk, buff); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); __tcp_add_write_queue_tail(sk, buff); - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); @@ -2386,9 +2368,10 @@ void tcp_send_delayed_ack(struct sock *sk) if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); - int max_ato = HZ/2; + int max_ato = HZ / 2; - if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) + if (icsk->icsk_ack.pingpong || + (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ @@ -2398,7 +2381,7 @@ void tcp_send_delayed_ack(struct sock *sk) * directly. */ if (tp->srtt) { - int rtt = max(tp->srtt>>3, TCP_DELACK_MIN); + int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; @@ -2432,37 +2415,32 @@ void tcp_send_delayed_ack(struct sock *sk) /* This routine sends an ack and also updates the window. */ void tcp_send_ack(struct sock *sk) { - /* If we have been reset, we may not send again. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *buff; + struct sk_buff *buff; - /* We are not putting this on the write queue, so - * tcp_transmit_skb() will set the ownership to this - * sock. - */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); - if (buff == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); - return; - } + /* If we have been reset, we may not send again. */ + if (sk->sk_state == TCP_CLOSE) + return; - /* Reserve space for headers and prepare control bits. */ - skb_reserve(buff, MAX_TCP_HEADER); - buff->csum = 0; - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->gso_segs = 1; - skb_shinfo(buff)->gso_size = 0; - skb_shinfo(buff)->gso_type = 0; - - /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk); - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (buff == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); + return; } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_TCP_HEADER); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); + + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } /* This routine sends a packet with an out of date sequence @@ -2488,66 +2466,57 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ - TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) { - if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if ((skb = tcp_send_head(sk)) != NULL && - before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { - int err; - unsigned int mss = tcp_current_mss(sk, 0); - unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; - - if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) - tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; - - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) - */ - if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || - skb->len > mss) { - seg_size = min(seg_size, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) - return -1; - } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + if (sk->sk_state == TCP_CLOSE) + return -1; + + if ((skb = tcp_send_head(sk)) != NULL && + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { + int err; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); - if (!err) { - update_send_head(sk, skb); - } - return err; - } else { - if (tp->urg_mode && - between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF)) - tcp_xmit_probe_skb(sk, TCPCB_URG); - return tcp_xmit_probe_skb(sk, 0); - } + if (tcp_fragment(sk, skb, seg_size, mss)) + return -1; + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(sk, skb, mss); + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_event_new_data_sent(sk, skb); + return err; + } else { + if (tp->urg_mode && + between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + tcp_xmit_probe_skb(sk, 1); + return tcp_xmit_probe_skb(sk, 0); } - return -1; } /* A window probe timeout has occurred. If window is not closed send diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index be27a33a1c6..2747ec7bfb6 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,8 +15,7 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d8970ecfcfc..803d758a2b1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -114,13 +114,31 @@ static int tcp_orphan_retries(struct sock *sk, int alive) return retries; } +static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) +{ + /* Black hole detection */ + if (sysctl_tcp_mtu_probing) { + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + struct tcp_sock *tp = tcp_sk(sk); + int mss; + + mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; + mss = min(sysctl_tcp_base_mss, mss); + mss = max(mss, 68 - tp->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + } +} + /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); int retry_until; - int mss; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) @@ -129,18 +147,7 @@ static int tcp_write_timeout(struct sock *sk) } else { if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { /* Black hole detection */ - if (sysctl_tcp_mtu_probing) { - if (!icsk->icsk_mtup.enabled) { - icsk->icsk_mtup.enabled = 1; - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } else { - mss = min(sysctl_tcp_base_mss, - tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2); - mss = max(mss, 68 - tp->tcp_header_len); - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } - } + tcp_mtu_probing(icsk, sk); dst_negative_advice(&sk->sk_dst_cache); } @@ -179,7 +186,7 @@ static void tcp_delack_timer(unsigned long data) goto out_unlock; } - sk_stream_mem_reclaim(sk); + sk_mem_reclaim_partial(sk); if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; @@ -219,7 +226,7 @@ static void tcp_delack_timer(unsigned long data) out: if (tcp_memory_pressure) - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); @@ -413,7 +420,7 @@ static void tcp_write_timer(unsigned long data) TCP_CHECK_TIMER(sk); out: - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); @@ -507,7 +514,7 @@ static void tcp_keepalive_timer (unsigned long data) } TCP_CHECK_TIMER(sk); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); resched: inet_csk_reset_keepalive_timer (sk, elapsed); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 007304e9984..be24d6ee34b 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -162,14 +162,13 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) } EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) - return tcp_reno_cong_avoid(sk, ack, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight); /* The key players are v_beg_snd_una and v_beg_snd_nxt. * @@ -228,7 +227,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight); } else { u32 rtt, target_cwnd, diff; diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 8fb2aee0b1a..d16689e9851 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,14 +114,13 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) tcp_veno_init(sk); } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) - return tcp_reno_cong_avoid(sk, ack, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight); /* limited by applications */ if (!tcp_is_cwnd_limited(sk, in_flight)) @@ -132,7 +131,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight); } else { u32 rtt, target_cwnd; diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index c107fba7430..e03b10183a8 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -69,8 +69,7 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 03c400ca14c..2fb8d731026 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -82,6 +82,7 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <linux/bootmem.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> @@ -110,10 +111,25 @@ */ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; +EXPORT_SYMBOL(udp_statistics); + +DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; +EXPORT_SYMBOL(udp_stats_in6); struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); +int sysctl_udp_mem[3] __read_mostly; +int sysctl_udp_rmem_min __read_mostly; +int sysctl_udp_wmem_min __read_mostly; + +EXPORT_SYMBOL(sysctl_udp_mem); +EXPORT_SYMBOL(sysctl_udp_rmem_min); +EXPORT_SYMBOL(sysctl_udp_wmem_min); + +atomic_t udp_memory_allocated; +EXPORT_SYMBOL(udp_memory_allocated); + static inline int __udp_lib_lport_inuse(__u16 num, const struct hlist_head udptable[]) { @@ -214,7 +230,7 @@ gotit: if (sk_unhashed(sk)) { head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); - sock_prot_inc_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, 1); } error = 0; fail: @@ -402,7 +418,7 @@ out: void udp_err(struct sk_buff *skb, u32 info) { - return __udp4_lib_err(skb, info, udp_hash); + __udp4_lib_err(skb, info, udp_hash); } /* @@ -471,6 +487,7 @@ static int udp_push_pending_frames(struct sock *sk) struct sk_buff *skb; struct udphdr *uh; int err = 0; + int is_udplite = IS_UDPLITE(sk); __wsum csum = 0; /* Grab the skbuff where UDP header space exists. */ @@ -486,7 +503,7 @@ static int udp_push_pending_frames(struct sock *sk) uh->len = htons(up->len); uh->check = 0; - if (up->pcflag) /* UDP-Lite */ + if (is_udplite) /* UDP-Lite */ csum = udplite_csum_outgoing(sk, skb); else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ @@ -514,7 +531,7 @@ out: up->len = 0; up->pending = 0; if (!err) - UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, up->pcflag); + UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -531,7 +548,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, __be32 daddr, faddr, saddr; __be16 dport; u8 tos; - int err, is_udplite = up->pcflag; + int err, is_udplite = IS_UDPLITE(sk); int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); @@ -621,7 +638,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, connected = 0; } - if (MULTICAST(daddr)) { + if (ipv4_is_multicast(daddr)) { if (!ipc.oif) ipc.oif = inet->mc_index; if (!saddr) @@ -643,7 +660,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { .sport = inet->sport, .dport = dport } } }; security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&rt, &fl, sk, 1); + err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); @@ -825,6 +842,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; unsigned int ulen, copied; + int peeked; int err; int is_udplite = IS_UDPLITE(sk); @@ -838,7 +856,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return ip_recv_error(sk, msg, len); try_again: - skb = skb_recv_datagram(sk, flags, noblock, &err); + skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, &err); if (!skb) goto out; @@ -873,6 +892,9 @@ try_again: if (err) goto out_free; + if (!peeked) + UDP_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite); + sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ @@ -891,14 +913,17 @@ try_again: err = ulen; out_free: + lock_sock(sk); skb_free_datagram(sk, skb); + release_sock(sk); out: return err; csum_copy_err: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); - - skb_kill_datagram(sk, skb, flags); + lock_sock(sk); + if (!skb_kill_datagram(sk, skb, flags)) + UDP_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); + release_sock(sk); if (noblock) return -EAGAIN; @@ -940,6 +965,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; + int is_udplite = IS_UDPLITE(sk); /* * Charge it to the socket, dropping if the queue is full. @@ -967,7 +993,8 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) ret = (*up->encap_rcv)(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag); + UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } @@ -978,7 +1005,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { /* * MIB statistics other than incrementing the error count are @@ -1019,15 +1046,14 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) - UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag); + UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; } - UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag); return 0; drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag); + UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -1062,7 +1088,15 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { - int ret = udp_queue_rcv_skb(sk, skb1); + int ret = 0; + + bh_lock_sock_nested(sk); + if (!sock_owned_by_user(sk)) + ret = udp_queue_rcv_skb(sk, skb1); + else + sk_add_backlog(sk, skb1); + bh_unlock_sock(sk); + if (ret > 0) /* we should probably re-process instead * of dropping packets here. */ @@ -1155,7 +1189,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], inet_iif(skb), udptable); if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + int ret = 0; + bh_lock_sock_nested(sk); + if (!sock_owned_by_user(sk)) + ret = udp_queue_rcv_skb(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1236,6 +1276,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val; int err = 0; + int is_udplite = IS_UDPLITE(sk); if (optlen<sizeof(int)) return -EINVAL; @@ -1277,7 +1318,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* The sender sets actual checksum coverage length via this option. * The case coverage > packet length is handled by send module. */ case UDPLITE_SEND_CSCOV: - if (!up->pcflag) /* Disable the option on UDP sockets */ + if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; @@ -1289,7 +1330,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, * sense, this should be set to at least 8 (as done below). If zero is * used, this again means full checksum coverage. */ case UDPLITE_RECV_CSCOV: - if (!up->pcflag) /* Disable the option on UDP sockets */ + if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; @@ -1449,6 +1490,10 @@ struct proto udp_prot = { .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem = &sysctl_udp_wmem_min, + .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, @@ -1505,6 +1550,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) } static void *udp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(udp_hash_lock) { read_lock(&udp_hash_lock); return *pos ? udp_get_idx(seq, *pos-1) : (void *)1; @@ -1524,6 +1570,7 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void udp_seq_stop(struct seq_file *seq, void *v) + __releases(udp_hash_lock) { read_unlock(&udp_hash_lock); } @@ -1644,6 +1691,25 @@ void udp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ +void __init udp_init(void) +{ + unsigned long limit; + + /* Set the pressure threshold up by the same strategy of TCP. It is a + * fraction of global memory that is up to 1/2 at 256 MB, decreasing + * toward zero with the amount of memory, with a floor of 128 pages. + */ + limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); + limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = max(limit, 128UL); + sysctl_udp_mem[0] = limit / 4 * 3; + sysctl_udp_mem[1] = limit; + sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; + + sysctl_udp_rmem_min = SK_MEM_QUANTUM; + sysctl_udp_wmem_min = SK_MEM_QUANTUM; +} + EXPORT_SYMBOL(udp_disconnect); EXPORT_SYMBOL(udp_hash); EXPORT_SYMBOL(udp_hash_lock); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index f5baeb3e8b8..001b881ca36 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -35,7 +35,7 @@ static int udplite_rcv(struct sk_buff *skb) static void udplite_err(struct sk_buff *skb, u32 info) { - return __udp4_lib_err(skb, info, udplite_hash); + __udp4_lib_err(skb, info, udplite_hash); } static struct net_protocol udplite_protocol = { diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 5e95c8a07ef..390dcb1354a 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -16,7 +16,11 @@ #include <net/ip.h> #include <net/xfrm.h> -#ifdef CONFIG_NETFILTER +int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return xfrm4_extract_header(skb); +} + static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) { if (skb->dst == NULL) { @@ -31,129 +35,35 @@ drop: kfree_skb(skb); return NET_RX_DROP; } -#endif int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { - int err; - __be32 seq; - struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH]; - struct xfrm_state *x; - int xfrm_nr = 0; - int decaps = 0; - unsigned int nhoff = offsetof(struct iphdr, protocol); - - seq = 0; - if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) - goto drop; - - do { - const struct iphdr *iph = ip_hdr(skb); - - if (xfrm_nr == XFRM_MAX_DEPTH) - goto drop; - - x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, - nexthdr, AF_INET); - if (x == NULL) - goto drop; - - spin_lock(&x->lock); - if (unlikely(x->km.state != XFRM_STATE_VALID)) - goto drop_unlock; - - if ((x->encap ? x->encap->encap_type : 0) != encap_type) - goto drop_unlock; - - if (x->props.replay_window && xfrm_replay_check(x, seq)) - goto drop_unlock; - - if (xfrm_state_check_expire(x)) - goto drop_unlock; - - nexthdr = x->type->input(x, skb); - if (nexthdr <= 0) - goto drop_unlock; - - skb_network_header(skb)[nhoff] = nexthdr; - - /* only the first xfrm gets the encap type */ - encap_type = 0; - - if (x->props.replay_window) - xfrm_replay_advance(x, seq); - - x->curlft.bytes += skb->len; - x->curlft.packets++; - - spin_unlock(&x->lock); - - xfrm_vec[xfrm_nr++] = x; - - if (x->outer_mode->input(x, skb)) - goto drop; - - if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { - decaps = 1; - break; - } - - err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); - if (err < 0) - goto drop; - } while (!err); - - /* Allocate new secpath or COW existing one. */ - - if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { - struct sec_path *sp; - sp = secpath_dup(skb->sp); - if (!sp) - goto drop; - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; - } - if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) - goto drop; - - memcpy(skb->sp->xvec + skb->sp->len, xfrm_vec, - xfrm_nr * sizeof(xfrm_vec[0])); - skb->sp->len += xfrm_nr; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + return xfrm_input(skb, nexthdr, spi, encap_type); +} +EXPORT_SYMBOL(xfrm4_rcv_encap); - nf_reset(skb); +int xfrm4_transport_finish(struct sk_buff *skb, int async) +{ + struct iphdr *iph = ip_hdr(skb); - if (decaps) { - dst_release(skb->dst); - skb->dst = NULL; - netif_rx(skb); - return 0; - } else { -#ifdef CONFIG_NETFILTER - __skb_push(skb, skb->data - skb_network_header(skb)); - ip_hdr(skb)->tot_len = htons(skb->len); - ip_send_check(ip_hdr(skb)); + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; - NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, - xfrm4_rcv_encap_finish); - return 0; -#else - return -ip_hdr(skb)->protocol; +#ifndef CONFIG_NETFILTER + if (!async) + return -iph->protocol; #endif - } -drop_unlock: - spin_unlock(&x->lock); - xfrm_state_put(x); -drop: - while (--xfrm_nr >= 0) - xfrm_state_put(xfrm_vec[xfrm_nr]); + __skb_push(skb, skb->data - skb_network_header(skb)); + iph->tot_len = htons(skb->len); + ip_send_check(iph); - kfree_skb(skb); + NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + xfrm4_rcv_encap_finish); return 0; } -EXPORT_SYMBOL(xfrm4_rcv_encap); /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e42e122414b..e093a7b59e1 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -17,6 +17,21 @@ #include <net/ip.h> #include <net/xfrm.h> +static void xfrm4_beet_make_header(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->ihl = 5; + iph->version = 4; + + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; + iph->tos = XFRM_MODE_SKB_CB(skb)->tos; + + iph->id = XFRM_MODE_SKB_CB(skb)->id; + iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; + iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; +} + /* Add encapsulation header. * * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. @@ -40,10 +55,12 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) offsetof(struct iphdr, protocol); skb->transport_header = skb->network_header + sizeof(*iph); + xfrm4_beet_make_header(skb); + ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen); top_iph = ip_hdr(skb); - memmove(top_iph, iph, sizeof(*iph)); + if (unlikely(optlen)) { BUG_ON(optlen < 0); @@ -65,43 +82,46 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); - int phlen = 0; + struct iphdr *iph; int optlen = 0; - u8 ph_nexthdr = 0; int err = -EINVAL; - if (unlikely(iph->protocol == IPPROTO_BEETPH)) { + if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { struct ip_beet_phdr *ph; + int phlen; if (!pskb_may_pull(skb, sizeof(*ph))) goto out; - ph = (struct ip_beet_phdr *)(ipip_hdr(skb) + 1); + + ph = (struct ip_beet_phdr *)skb->data; phlen = sizeof(*ph) + ph->padlen; optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); if (optlen < 0 || optlen & 3 || optlen > 250) goto out; - if (!pskb_may_pull(skb, phlen + optlen)) - goto out; - skb->len -= phlen + optlen; + XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; - ph_nexthdr = ph->nexthdr; + if (!pskb_may_pull(skb, phlen)); + goto out; + __skb_pull(skb, phlen); } - skb_set_network_header(skb, phlen - sizeof(*iph)); - memmove(skb_network_header(skb), iph, sizeof(*iph)); - skb_set_transport_header(skb, phlen + optlen); - skb->data = skb_transport_header(skb); + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + memmove(skb->data - skb->mac_len, skb_mac_header(skb), + skb->mac_len); + skb_set_mac_header(skb, -skb->mac_len); + + xfrm4_beet_make_header(skb); iph = ip_hdr(skb); - iph->ihl = (sizeof(*iph) + optlen) / 4; - iph->tot_len = htons(skb->len + iph->ihl * 4); + + iph->ihl += optlen / 4; + iph->tot_len = htons(skb->len); iph->daddr = x->sel.daddr.a4; iph->saddr = x->sel.saddr.a4; - if (ph_nexthdr) - iph->protocol = ph_nexthdr; iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); err = 0; @@ -110,8 +130,10 @@ out: } static struct xfrm_mode xfrm4_beet_mode = { - .input = xfrm4_beet_input, - .output = xfrm4_beet_output, + .input2 = xfrm4_beet_input, + .input = xfrm_prepare_input, + .output2 = xfrm4_beet_output, + .output = xfrm4_prepare_output, .owner = THIS_MODULE, .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e4deecba6dd..8dee617ee90 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -16,92 +16,60 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { - struct iphdr *outer_iph = ip_hdr(skb); struct iphdr *inner_iph = ipip_hdr(skb); - if (INET_ECN_is_ce(outer_iph->tos)) + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP_ECN_set_ce(inner_iph); } -static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) -{ - if (INET_ECN_is_ce(iph->tos)) - IP6_ECN_set_ce(ipv6_hdr(skb)); -} - /* Add encapsulation header. * * The top IP header will be constructed per RFC 2401. */ -static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { struct dst_entry *dst = skb->dst; - struct xfrm_dst *xdst = (struct xfrm_dst*)dst; - struct iphdr *iph, *top_iph; + struct iphdr *top_iph; int flags; - iph = ip_hdr(skb); - skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + sizeof(*iph); + skb->transport_header = skb->network_header + sizeof(*top_iph); top_iph = ip_hdr(skb); top_iph->ihl = 5; top_iph->version = 4; - flags = x->props.flags; + top_iph->protocol = x->inner_mode->afinfo->proto; /* DS disclosed */ - if (xdst->route->ops->family == AF_INET) { - top_iph->protocol = IPPROTO_IPIP; - top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (iph->frag_off & htons(IP_DF)); - } -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else { - struct ipv6hdr *ipv6h = (struct ipv6hdr*)iph; - top_iph->protocol = IPPROTO_IPV6; - top_iph->tos = INET_ECN_encapsulate(iph->tos, ipv6_get_dsfield(ipv6h)); - top_iph->frag_off = 0; - } -#endif + top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, + XFRM_MODE_SKB_CB(skb)->tos); + flags = x->props.flags; if (flags & XFRM_STATE_NOECN) IP_ECN_clear(top_iph); - if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst->child, 0); + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + 0 : XFRM_MODE_SKB_CB(skb)->frag_off; + ip_select_ident(top_iph, dst->child, NULL); top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; - skb->protocol = htons(ETH_P_IP); - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); return 0; } -static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); const unsigned char *old_mac; int err = -EINVAL; - switch (iph->protocol){ - case IPPROTO_IPIP: - break; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - case IPPROTO_IPV6: - break; -#endif - default: - goto out; - } + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) + goto out; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; @@ -110,20 +78,11 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) goto out; - iph = ip_hdr(skb); - if (iph->protocol == IPPROTO_IPIP) { - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, ipip_hdr(skb)); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - } -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else { - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(iph, skb); - skb->protocol = htons(ETH_P_IPV6); - } -#endif + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); @@ -135,19 +94,21 @@ out: } static struct xfrm_mode xfrm4_tunnel_mode = { - .input = xfrm4_tunnel_input, - .output = xfrm4_tunnel_output, + .input2 = xfrm4_mode_tunnel_input, + .input = xfrm_prepare_input, + .output2 = xfrm4_mode_tunnel_output, + .output = xfrm4_prepare_output, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, }; -static int __init xfrm4_tunnel_init(void) +static int __init xfrm4_mode_tunnel_init(void) { return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); } -static void __exit xfrm4_tunnel_exit(void) +static void __exit xfrm4_mode_tunnel_exit(void) { int err; @@ -155,7 +116,7 @@ static void __exit xfrm4_tunnel_exit(void) BUG_ON(err); } -module_init(xfrm4_tunnel_init); -module_exit(xfrm4_tunnel_exit); +module_init(xfrm4_mode_tunnel_init); +module_exit(xfrm4_mode_tunnel_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index c4a7156962b..d5a58a81802 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -8,11 +8,12 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter_ipv4.h> +#include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/icmp.h> @@ -25,8 +26,6 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; @@ -40,106 +39,54 @@ out: return ret; } -static inline int xfrm4_output_one(struct sk_buff *skb) +int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct iphdr *iph; int err; - if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { - err = xfrm4_tunnel_check_size(skb); - if (err) - goto error_nolock; - } - - err = xfrm_output(skb); + err = xfrm4_tunnel_check_size(skb); if (err) - goto error_nolock; + return err; - iph = ip_hdr(skb); - iph->tot_len = htons(skb->len); - ip_send_check(iph); + XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; - IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; - err = 0; - -out_exit: - return err; -error_nolock: - kfree_skb(skb); - goto out_exit; + return xfrm4_extract_header(skb); } -static int xfrm4_output_finish2(struct sk_buff *skb) +int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - while (likely((err = xfrm4_output_one(skb)) == 0)) { - nf_reset(skb); - - err = nf_hook(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); - if (unlikely(err != 1)) - break; + err = x->inner_mode->afinfo->extract_output(x, skb); + if (err) + return err; - if (!skb->dst->xfrm) - return dst_output(skb); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - err = nf_hook(PF_INET, NF_IP_POST_ROUTING, skb, NULL, - skb->dst->dev, xfrm4_output_finish2); - if (unlikely(err != 1)) - break; - } + skb->protocol = htons(ETH_P_IP); - return err; + return x->outer_mode->output2(x, skb); } +EXPORT_SYMBOL(xfrm4_prepare_output); static int xfrm4_output_finish(struct sk_buff *skb) { - struct sk_buff *segs; - #ifdef CONFIG_NETFILTER if (!skb->dst->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } -#endif - if (!skb_is_gso(skb)) - return xfrm4_output_finish2(skb); + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; +#endif skb->protocol = htons(ETH_P_IP); - segs = skb_gso_segment(skb, 0); - kfree_skb(skb); - if (unlikely(IS_ERR(segs))) - return PTR_ERR(segs); - - do { - struct sk_buff *nskb = segs->next; - int err; - - segs->next = NULL; - err = xfrm4_output_finish2(segs); - - if (unlikely(err)) { - while ((segs = nskb)) { - nskb = segs->next; - segs->next = NULL; - kfree_skb(segs); - } - return err; - } - - segs = nskb; - } while (segs); - - return 0; + return xfrm_output(skb); } int xfrm4_output(struct sk_buff *skb) { - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, - xfrm4_output_finish, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, + NULL, skb->dst->dev, xfrm4_output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cc86fb110dd..3783e3ee56a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -8,36 +8,54 @@ * */ -#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/kernel.h> #include <linux/inetdevice.h> +#include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, + xfrm_address_t *daddr) { - return __ip_route_output_key((struct rtable**)dst, fl); -} - -static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) -{ - struct rtable *rt; - struct flowi fl_tunnel = { + struct flowi fl = { .nl_u = { .ip4_u = { + .tos = tos, .daddr = daddr->a4, }, }, }; + struct dst_entry *dst; + struct rtable *rt; + int err; - if (!xfrm4_dst_lookup((struct xfrm_dst **)&rt, &fl_tunnel)) { - saddr->a4 = rt->rt_src; - dst_release(&rt->u.dst); - return 0; - } - return -EHOSTUNREACH; + if (saddr) + fl.fl4_src = saddr->a4; + + err = __ip_route_output_key(&init_net, &rt, &fl); + dst = &rt->u.dst; + if (err) + dst = ERR_PTR(err); + return dst; +} + +static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) +{ + struct dst_entry *dst; + struct rtable *rt; + + dst = xfrm4_dst_lookup(0, NULL, daddr); + if (IS_ERR(dst)) + return -EHOSTUNREACH; + + rt = (struct rtable *)dst; + saddr->a4 = rt->rt_src; + dst_release(dst); + return 0; } static struct dst_entry * @@ -61,142 +79,49 @@ __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) return dst; } -/* Allocate chain of dst_entry's, attach known xfrm's, calculate - * all the metrics... Shortly, bundle a bundle. - */ +static int xfrm4_get_tos(struct flowi *fl) +{ + return fl->fl4_tos; +} -static int -__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, - struct flowi *fl, struct dst_entry **dst_p) +static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) { - struct dst_entry *dst, *dst_prev; - struct rtable *rt0 = (struct rtable*)(*dst_p); - struct rtable *rt = rt0; - struct flowi fl_tunnel = { - .nl_u = { - .ip4_u = { - .saddr = fl->fl4_src, - .daddr = fl->fl4_dst, - .tos = fl->fl4_tos - } - } - }; - int i; - int err; - int header_len = 0; - int trailer_len = 0; + return 0; +} - dst = dst_prev = NULL; - dst_hold(&rt->u.dst); +static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) +{ + struct rtable *rt = (struct rtable *)xdst->route; - for (i = 0; i < nx; i++) { - struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); - struct xfrm_dst *xdst; + xdst->u.rt.fl = rt->fl; - if (unlikely(dst1 == NULL)) { - err = -ENOBUFS; - dst_release(&rt->u.dst); - goto error; - } + xdst->u.dst.dev = dev; + dev_hold(dev); - if (!dst) - dst = dst1; - else { - dst_prev->child = dst1; - dst1->flags |= DST_NOHASH; - dst_clone(dst1); - } + xdst->u.rt.idev = in_dev_get(dev); + if (!xdst->u.rt.idev) + return -ENODEV; - xdst = (struct xfrm_dst *)dst1; - xdst->route = &rt->u.dst; - xdst->genid = xfrm[i]->genid; - - dst1->next = dst_prev; - dst_prev = dst1; - - header_len += xfrm[i]->props.header_len; - trailer_len += xfrm[i]->props.trailer_len; - - if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { - unsigned short encap_family = xfrm[i]->props.family; - switch (encap_family) { - case AF_INET: - fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4; - fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4; - break; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - case AF_INET6: - ipv6_addr_copy(&fl_tunnel.fl6_dst, (struct in6_addr*)&xfrm[i]->id.daddr.a6); - ipv6_addr_copy(&fl_tunnel.fl6_src, (struct in6_addr*)&xfrm[i]->props.saddr.a6); - break; -#endif - default: - BUG_ON(1); - } - err = xfrm_dst_lookup((struct xfrm_dst **)&rt, - &fl_tunnel, encap_family); - if (err) - goto error; - } else - dst_hold(&rt->u.dst); - } + xdst->u.rt.peer = rt->peer; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); - dst_prev->child = &rt->u.dst; - dst->path = &rt->u.dst; - - *dst_p = dst; - dst = dst_prev; - - dst_prev = *dst_p; - i = 0; - for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { - struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; - x->u.rt.fl = *fl; - - dst_prev->xfrm = xfrm[i++]; - dst_prev->dev = rt->u.dst.dev; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); - dst_prev->obsolete = -1; - dst_prev->flags |= DST_HOST; - dst_prev->lastuse = jiffies; - dst_prev->header_len = header_len; - dst_prev->nfheader_len = 0; - dst_prev->trailer_len = trailer_len; - memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); - - /* Copy neighbout for reachability confirmation */ - dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); - dst_prev->input = rt->u.dst.input; - dst_prev->output = dst_prev->xfrm->outer_mode->afinfo->output; - if (rt0->peer) - atomic_inc(&rt0->peer->refcnt); - x->u.rt.peer = rt0->peer; - /* Sheit... I remember I did this right. Apparently, - * it was magically lost, so this code needs audit */ - x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); - x->u.rt.rt_type = rt0->rt_type; - x->u.rt.rt_src = rt0->rt_src; - x->u.rt.rt_dst = rt0->rt_dst; - x->u.rt.rt_gateway = rt0->rt_gateway; - x->u.rt.rt_spec_dst = rt0->rt_spec_dst; - x->u.rt.idev = rt0->idev; - in_dev_hold(rt0->idev); - header_len -= x->u.dst.xfrm->props.header_len; - trailer_len -= x->u.dst.xfrm->props.trailer_len; - } + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | + RTCF_LOCAL); + xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_src = rt->rt_src; + xdst->u.rt.rt_dst = rt->rt_dst; + xdst->u.rt.rt_gateway = rt->rt_gateway; + xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; - xfrm_init_pmtu(dst); return 0; - -error: - if (dst) - dst_free(dst); - return err; } static void -_decode_session4(struct sk_buff *skb, struct flowi *fl) +_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) { struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; @@ -212,8 +137,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports = (__be16 *)xprth; - fl->fl_ip_sport = ports[0]; - fl->fl_ip_dport = ports[1]; + fl->fl_ip_sport = ports[!!reverse]; + fl->fl_ip_dport = ports[!reverse]; } break; @@ -255,12 +180,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) } } fl->proto = iph->protocol; - fl->fl4_dst = iph->daddr; - fl->fl4_src = iph->saddr; + fl->fl4_dst = reverse ? iph->saddr : iph->daddr; + fl->fl4_src = reverse ? iph->daddr : iph->saddr; fl->fl4_tos = iph->tos; } -static inline int xfrm4_garbage_collect(void) +static inline int xfrm4_garbage_collect(struct dst_ops *ops) { xfrm4_policy_afinfo.garbage_collect(); return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); @@ -295,7 +220,8 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt.idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); + struct in_device *loopback_idev = + in_dev_get(dev->nd_net->loopback_dev); BUG_ON(!loopback_idev); do { @@ -318,6 +244,7 @@ static struct dst_ops xfrm4_dst_ops = { .update_pmtu = xfrm4_update_pmtu, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, + .local_out = __ip_local_out, .gc_thresh = 1024, .entry_size = sizeof(struct xfrm_dst), }; @@ -328,8 +255,10 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .find_bundle = __xfrm4_find_bundle, - .bundle_create = __xfrm4_bundle_create, .decode_session = _decode_session4, + .get_tos = xfrm4_get_tos, + .init_path = xfrm4_init_path, + .fill_dst = xfrm4_fill_dst, }; static void __init xfrm4_policy_init(void) diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 13d54a1c333..fdeebe68a37 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -11,6 +11,7 @@ #include <net/xfrm.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> +#include <linux/netfilter_ipv4.h> static struct xfrm_state_afinfo xfrm4_state_afinfo; @@ -47,12 +48,31 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, x->props.family = AF_INET; } +int xfrm4_extract_header(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->id = iph->id; + XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; + XFRM_MODE_SKB_CB(skb)->tos = iph->tos; + XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; + memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + + return 0; +} + static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, + .proto = IPPROTO_IPIP, + .eth_proto = htons(ETH_P_IP), .owner = THIS_MODULE, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, .output = xfrm4_output, + .extract_input = xfrm4_extract_input, + .extract_output = xfrm4_extract_output, + .transport_finish = xfrm4_transport_finish, }; void __init xfrm4_state_init(void) diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 87c23a73d28..24f3aa0f2a3 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -5,11 +5,12 @@ obj-$(CONFIG_IPV6) += ipv6.o ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ + addrlabel.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ - exthdrs.o sysctl_net_ipv6.o datagram.o \ - ip6_flowlabel.o inet6_connection_sock.o + exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o +ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o ipv6-$(CONFIG_NETFILTER) += netfilter.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e8c347579da..e40213db9e4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -101,8 +101,16 @@ #define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) #ifdef CONFIG_SYSCTL -static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p); -static void addrconf_sysctl_unregister(struct ipv6_devconf *p); +static void addrconf_sysctl_register(struct inet6_dev *idev); +static void addrconf_sysctl_unregister(struct inet6_dev *idev); +#else +static inline void addrconf_sysctl_register(struct inet6_dev *idev) +{ +} + +static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) +{ +} #endif #ifdef CONFIG_IPV6_PRIVACY @@ -141,7 +149,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo); -static int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev); +static int ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev); static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); @@ -256,16 +265,13 @@ static void addrconf_mod_timer(struct inet6_ifaddr *ifp, static int snmp6_alloc_dev(struct inet6_dev *idev) { if (snmp_mib_init((void **)idev->stats.ipv6, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + sizeof(struct ipstats_mib)) < 0) goto err_ip; if (snmp_mib_init((void **)idev->stats.icmpv6, - sizeof(struct icmpv6_mib), - __alignof__(struct icmpv6_mib)) < 0) + sizeof(struct icmpv6_mib)) < 0) goto err_icmp; if (snmp_mib_init((void **)idev->stats.icmpv6msg, - sizeof(struct icmpv6msg_mib), - __alignof__(struct icmpv6msg_mib)) < 0) + sizeof(struct icmpv6msg_mib)) < 0) goto err_icmpmsg; return 0; @@ -329,7 +335,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) rwlock_init(&ndev->lock); ndev->dev = dev; - memcpy(&ndev->cnf, &ipv6_devconf_dflt, sizeof(ndev->cnf)); + memcpy(&ndev->cnf, dev->nd_net->ipv6.devconf_dflt, sizeof(ndev->cnf)); ndev->cnf.mtu6 = dev->mtu; ndev->cnf.sysctl = NULL; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -366,9 +372,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) in6_dev_hold(ndev); #ifdef CONFIG_IPV6_PRIVACY - init_timer(&ndev->regen_timer); - ndev->regen_timer.function = ipv6_regen_rndid; - ndev->regen_timer.data = (unsigned long) ndev; + setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev); if ((dev->flags&IFF_LOOPBACK) || dev->type == ARPHRD_TUNNEL || #if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) @@ -379,6 +383,13 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) "%s: Disabled Privacy Extensions\n", dev->name); ndev->cnf.use_tempaddr = -1; + + if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) { + printk(KERN_INFO + "%s: Disabled Multicast RS\n", + dev->name); + ndev->cnf.rtr_solicits = 0; + } } else { in6_dev_hold(ndev); ipv6_regen_rndid((unsigned long) ndev); @@ -390,13 +401,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) ipv6_mc_init_dev(ndev); ndev->tstamp = jiffies; -#ifdef CONFIG_SYSCTL - neigh_sysctl_register(dev, ndev->nd_parms, NET_IPV6, - NET_IPV6_NEIGH, "ipv6", - &ndisc_ifinfo_sysctl_change, - NULL); - addrconf_sysctl_register(ndev, &ndev->cnf); -#endif + addrconf_sysctl_register(ndev); /* protected by rtnl_lock */ rcu_assign_pointer(dev->ip6_ptr, ndev); @@ -452,18 +457,18 @@ static void dev_forward_change(struct inet6_dev *idev) } -static void addrconf_forward_change(void) +static void addrconf_forward_change(struct net *net, __s32 newf) { struct net_device *dev; struct inet6_dev *idev; read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding); - idev->cnf.forwarding = ipv6_devconf.forwarding; + int changed = (!idev->cnf.forwarding) ^ (!newf); + idev->cnf.forwarding = newf; if (changed) dev_forward_change(idev); } @@ -471,6 +476,25 @@ static void addrconf_forward_change(void) } read_unlock(&dev_base_lock); } + +static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +{ + struct net *net; + + net = (struct net *)table->extra2; + if (p == &net->ipv6.devconf_dflt->forwarding) + return; + + if (p == &net->ipv6.devconf_all->forwarding) { + __s32 newf = net->ipv6.devconf_all->forwarding; + net->ipv6.devconf_dflt->forwarding = newf; + addrconf_forward_change(net, newf); + } else if ((!*p) ^ (!old)) + dev_forward_change((struct inet6_dev *)table->extra1); + + if (*p) + rt6_purge_dflt_routers(); +} #endif /* Nobody refers to this ifaddr, destroy it */ @@ -537,7 +561,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, write_lock(&addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(addr, idev->dev)) { + if (ipv6_chk_same_addr(&init_net, addr, idev->dev)) { ADBG(("ipv6_add_addr: already assigned\n")); err = -EEXIST; goto out; @@ -876,35 +900,6 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -/* static matching label */ -static inline int ipv6_saddr_label(const struct in6_addr *addr, int type) -{ - /* - * prefix (longest match) label - * ----------------------------- - * ::1/128 0 - * ::/0 1 - * 2002::/16 2 - * ::/96 3 - * ::ffff:0:0/96 4 - * fc00::/7 5 - * 2001::/32 6 - */ - if (type & IPV6_ADDR_LOOPBACK) - return 0; - else if (type & IPV6_ADDR_COMPATv4) - return 3; - else if (type & IPV6_ADDR_MAPPED) - return 4; - else if (addr->s6_addr32[0] == htonl(0x20010000)) - return 6; - else if (addr->s6_addr16[0] == htons(0x2002)) - return 2; - else if ((addr->s6_addr[0] & 0xfe) == 0xfc) - return 5; - return 1; -} - int ipv6_dev_get_saddr(struct net_device *daddr_dev, struct in6_addr *daddr, struct in6_addr *saddr) { @@ -912,7 +907,8 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, struct inet6_ifaddr *ifa_result = NULL; int daddr_type = __ipv6_addr_type(daddr); int daddr_scope = __ipv6_addr_src_scope(daddr_type); - u32 daddr_label = ipv6_saddr_label(daddr, daddr_type); + int daddr_ifindex = daddr_dev ? daddr_dev->ifindex : 0; + u32 daddr_label = ipv6_addr_label(daddr, daddr_type, daddr_ifindex); struct net_device *dev; memset(&hiscore, 0, sizeof(hiscore)); @@ -1085,11 +1081,15 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, /* Rule 6: Prefer matching label */ if (hiscore.rule < 6) { - if (ipv6_saddr_label(&ifa_result->addr, hiscore.addr_type) == daddr_label) + if (ipv6_addr_label(&ifa_result->addr, + hiscore.addr_type, + ifa_result->idev->dev->ifindex) == daddr_label) hiscore.attrs |= IPV6_SADDR_SCORE_LABEL; hiscore.rule++; } - if (ipv6_saddr_label(&ifa->addr, score.addr_type) == daddr_label) { + if (ipv6_addr_label(&ifa->addr, + score.addr_type, + ifa->idev->dev->ifindex) == daddr_label) { score.attrs |= IPV6_SADDR_SCORE_LABEL; if (!(hiscore.attrs & IPV6_SADDR_SCORE_LABEL)) { score.rule = 6; @@ -1207,13 +1207,16 @@ static int ipv6_count_addresses(struct inet6_dev *idev) return cnt; } -int ipv6_chk_addr(struct in6_addr *addr, struct net_device *dev, int strict) +int ipv6_chk_addr(struct net *net, struct in6_addr *addr, + struct net_device *dev, int strict) { struct inet6_ifaddr * ifp; u8 hash = ipv6_addr_hash(addr); read_lock_bh(&addrconf_hash_lock); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ifp->idev->dev->nd_net != net) + continue; if (ipv6_addr_equal(&ifp->addr, addr) && !(ifp->flags&IFA_F_TENTATIVE)) { if (dev == NULL || ifp->idev->dev == dev || @@ -1224,16 +1227,18 @@ int ipv6_chk_addr(struct in6_addr *addr, struct net_device *dev, int strict) read_unlock_bh(&addrconf_hash_lock); return ifp != NULL; } - EXPORT_SYMBOL(ipv6_chk_addr); static -int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev) +int ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev) { struct inet6_ifaddr * ifp; u8 hash = ipv6_addr_hash(addr); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ifp->idev->dev->nd_net != net) + continue; if (ipv6_addr_equal(&ifp->addr, addr)) { if (dev == NULL || ifp->idev->dev == dev) break; @@ -1242,13 +1247,16 @@ int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev) return ifp != NULL; } -struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *dev, int strict) +struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, struct in6_addr *addr, + struct net_device *dev, int strict) { struct inet6_ifaddr * ifp; u8 hash = ipv6_addr_hash(addr); read_lock_bh(&addrconf_hash_lock); for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ifp->idev->dev->nd_net != net) + continue; if (ipv6_addr_equal(&ifp->addr, addr)) { if (dev == NULL || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { @@ -1435,6 +1443,9 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) return addrconf_ifid_arcnet(eui, dev); case ARPHRD_INFINIBAND: return addrconf_ifid_infiniband(eui, dev); + case ARPHRD_SIT: + if (dev->priv_flags & IFF_ISATAP) + return ipv6_isatap_eui64(eui, *(__be32 *)dev->dev_addr); } return -1; } @@ -1470,7 +1481,7 @@ regen: * * - Reserved subnet anycast (RFC 2526) * 11111101 11....11 1xxxxxxx - * - ISATAP (draft-ietf-ngtrans-isatap-13.txt) 5.1 + * - ISATAP (RFC4214) 6.1 * 00-00-5E-FE-xx-xx-xx-xx * - value 0 * - XXX: already assigned to an address on the device @@ -1731,7 +1742,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) ok: - ifp = ipv6_get_ifaddr(&addr, dev, 1); + ifp = ipv6_get_ifaddr(&init_net, &addr, dev, 1); if (ifp == NULL && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -1889,7 +1900,7 @@ int addrconf_set_dstaddr(void __user *arg) p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPV6; p.iph.ttl = 64; - ifr.ifr_ifru.ifru_data = (void __user *)&p; + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; oldfs = get_fs(); set_fs(KERNEL_DS); err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); @@ -2201,6 +2212,16 @@ static void addrconf_sit_config(struct net_device *dev) return; } + if (dev->priv_flags & IFF_ISATAP) { + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 64, dev, 0, 0); + if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) + addrconf_add_linklocal(idev, &addr); + return; + } + sit_add_v4_addrs(idev); if (dev->flags&IFF_POINTOPOINT) { @@ -2385,15 +2406,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_CHANGENAME: if (idev) { snmp6_unregister_dev(idev); -#ifdef CONFIG_SYSCTL - addrconf_sysctl_unregister(&idev->cnf); - neigh_sysctl_unregister(idev->nd_parms); - neigh_sysctl_register(dev, idev->nd_parms, - NET_IPV6, NET_IPV6_NEIGH, "ipv6", - &ndisc_ifinfo_sysctl_change, - NULL); - addrconf_sysctl_register(idev, &idev->cnf); -#endif + addrconf_sysctl_unregister(idev); + addrconf_sysctl_register(idev); err = snmp6_register_dev(idev); if (err) return notifier_from_errno(err); @@ -2517,10 +2531,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* Shot the device (if unregistered) */ if (how == 1) { -#ifdef CONFIG_SYSCTL - addrconf_sysctl_unregister(&idev->cnf); - neigh_sysctl_unregister(idev->nd_parms); -#endif + addrconf_sysctl_unregister(idev); neigh_parms_release(&nd_tbl, idev->nd_parms); neigh_ifdown(&nd_tbl, dev); in6_dev_put(idev); @@ -2734,6 +2745,7 @@ static void addrconf_dad_run(struct inet6_dev *idev) { #ifdef CONFIG_PROC_FS struct if6_iter_state { + struct seq_net_private p; int bucket; }; @@ -2741,9 +2753,13 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) { struct inet6_ifaddr *ifa = NULL; struct if6_iter_state *state = seq->private; + struct net *net = state->p.net; for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { ifa = inet6_addr_lst[state->bucket]; + + while (ifa && ifa->idev->dev->nd_net != net) + ifa = ifa->lst_next; if (ifa) break; } @@ -2753,13 +2769,22 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct inet6_ifaddr *ifa) { struct if6_iter_state *state = seq->private; + struct net *net = state->p.net; ifa = ifa->lst_next; try_again: + if (ifa) { + if (ifa->idev->dev->nd_net != net) { + ifa = ifa->lst_next; + goto try_again; + } + } + if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { ifa = inet6_addr_lst[state->bucket]; goto try_again; } + return ifa; } @@ -2774,6 +2799,7 @@ static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos) } static void *if6_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(addrconf_hash_lock) { read_lock_bh(&addrconf_hash_lock); return if6_get_idx(seq, *pos); @@ -2789,6 +2815,7 @@ static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void if6_seq_stop(struct seq_file *seq, void *v) + __releases(addrconf_hash_lock) { read_unlock_bh(&addrconf_hash_lock); } @@ -2816,8 +2843,8 @@ static const struct seq_operations if6_seq_ops = { static int if6_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &if6_seq_ops, - sizeof(struct if6_iter_state)); + return seq_open_net(inode, file, &if6_seq_ops, + sizeof(struct if6_iter_state)); } static const struct file_operations if6_fops = { @@ -2825,31 +2852,48 @@ static const struct file_operations if6_fops = { .open = if6_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init if6_proc_init(void) +static int if6_proc_net_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "if_inet6", S_IRUGO, &if6_fops)) + if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops)) return -ENOMEM; return 0; } +static void if6_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "if_inet6"); +} + +static struct pernet_operations if6_proc_net_ops = { + .init = if6_proc_net_init, + .exit = if6_proc_net_exit, +}; + +int __init if6_proc_init(void) +{ + return register_pernet_subsys(&if6_proc_net_ops); +} + void if6_proc_exit(void) { - proc_net_remove(&init_net, "if_inet6"); + unregister_pernet_subsys(&if6_proc_net_ops); } #endif /* CONFIG_PROC_FS */ #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) /* Check if address is a home address configured on any interface. */ -int ipv6_chk_home_addr(struct in6_addr *addr) +int ipv6_chk_home_addr(struct net *net, struct in6_addr *addr) { int ret = 0; struct inet6_ifaddr * ifp; u8 hash = ipv6_addr_hash(addr); read_lock_bh(&addrconf_hash_lock); for (ifp = inet6_addr_lst[hash]; ifp; ifp = ifp->lst_next) { + if (ifp->idev->dev->nd_net != net) + continue; if (ipv6_addr_cmp(&ifp->addr, addr) == 0 && (ifp->flags & IFA_F_HOMEADDRESS)) { ret = 1; @@ -2997,11 +3041,15 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { static int inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *pfx; int err; + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); if (err < 0) return err; @@ -3054,6 +3102,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, static int inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *pfx; @@ -3063,6 +3112,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) u8 ifa_flags; int err; + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); if (err < 0) return err; @@ -3090,7 +3142,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) /* We ignore other flags so far. */ ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS); - ifa = ipv6_get_ifaddr(pfx, dev, 1); + ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (ifa == NULL) { /* * It would be best to check for !NLM_F_CREATE here but @@ -3283,11 +3335,11 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, ifa = ifa->if_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - if ((err = inet6_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWADDR, - NLM_F_MULTI)) <= 0) - goto done; + err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR, + NLM_F_MULTI); } break; case MULTICAST_ADDR: @@ -3296,11 +3348,11 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, ifmca = ifmca->next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - if ((err = inet6_fill_ifmcaddr(skb, ifmca, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_GETMULTICAST, - NLM_F_MULTI)) <= 0) - goto done; + err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_GETMULTICAST, + NLM_F_MULTI); } break; case ANYCAST_ADDR: @@ -3309,11 +3361,11 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, ifaca = ifaca->aca_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - if ((err = inet6_fill_ifacaddr(skb, ifaca, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_GETANYCAST, - NLM_F_MULTI)) <= 0) - goto done; + err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_GETANYCAST, + NLM_F_MULTI); } break; default: @@ -3321,14 +3373,12 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, } read_unlock_bh(&idev->lock); in6_dev_put(idev); + + if (err <= 0) + break; cont: idx++; } -done: - if (err <= 0) { - read_unlock_bh(&idev->lock); - in6_dev_put(idev); - } cb->args[0] = idx; cb->args[1] = ip_idx; return skb->len; @@ -3336,26 +3386,42 @@ done: static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; enum addr_type_t type = UNICAST_ADDR; + + if (net != &init_net) + return 0; + return inet6_dump_addr(skb, cb, type); } static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; enum addr_type_t type = MULTICAST_ADDR; + + if (net != &init_net) + return 0; + return inet6_dump_addr(skb, cb, type); } static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; enum addr_type_t type = ANYCAST_ADDR; + + if (net != &init_net) + return 0; + return inet6_dump_addr(skb, cb, type); } static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = in_skb->sk->sk_net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *addr = NULL; @@ -3364,6 +3430,9 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, struct sk_buff *skb; int err; + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); if (err < 0) goto errout; @@ -3378,7 +3447,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, if (ifm->ifa_index) dev = __dev_get_by_index(&init_net, ifm->ifa_index); - if ((ifa = ipv6_get_ifaddr(addr, dev, 1)) == NULL) { + if ((ifa = ipv6_get_ifaddr(net, addr, dev, 1)) == NULL) { err = -EADDRNOTAVAIL; goto errout; } @@ -3396,7 +3465,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, kfree_skb(skb); goto errout_ifa; } - err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); errout_ifa: in6_ifa_put(ifa); errout: @@ -3419,10 +3488,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + err = rtnl_notify(skb, &init_net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_IFADDR, err); } static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, @@ -3581,11 +3650,15 @@ nla_put_failure: static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int idx, err; int s_idx = cb->args[0]; struct net_device *dev; struct inet6_dev *idev; + if (net != &init_net) + return 0; + read_lock(&dev_base_lock); idx = 0; for_each_netdev(&init_net, dev) { @@ -3623,10 +3696,10 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + err = rtnl_notify(skb, &init_net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_IFADDR, err); } static inline size_t inet6_prefix_nlmsg_size(void) @@ -3692,10 +3765,10 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + err = rtnl_notify(skb, &init_net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV6_PREFIX, err); + rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_PREFIX, err); } static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -3746,22 +3819,8 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (write && valp != &ipv6_devconf_dflt.forwarding) { - if (valp != &ipv6_devconf.forwarding) { - if ((!*valp) ^ (!val)) { - struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; - if (idev == NULL) - return ret; - dev_forward_change(idev); - } - } else { - ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding; - addrconf_forward_change(); - } - if (*valp) - rt6_purge_dflt_routers(); - } - + if (write) + addrconf_fixup_forwarding(ctl, valp, val); return ret; } @@ -3772,6 +3831,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table, void __user *newval, size_t newlen) { int *valp = table->data; + int val = *valp; int new; if (!newval || !newlen) @@ -3796,26 +3856,8 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table, } } - if (valp != &ipv6_devconf_dflt.forwarding) { - if (valp != &ipv6_devconf.forwarding) { - struct inet6_dev *idev = (struct inet6_dev *)table->extra1; - int changed; - if (unlikely(idev == NULL)) - return -ENODEV; - changed = (!*valp) ^ (!new); - *valp = new; - if (changed) - dev_forward_change(idev); - } else { - *valp = new; - addrconf_forward_change(); - } - - if (*valp) - rt6_purge_dflt_routers(); - } else - *valp = new; - + *valp = new; + addrconf_fixup_forwarding(table, valp, val); return 1; } @@ -3823,10 +3865,7 @@ static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; ctl_table addrconf_vars[__NET_IPV6_MAX]; - ctl_table addrconf_dev[2]; - ctl_table addrconf_conf_dir[2]; - ctl_table addrconf_proto_dir[2]; - ctl_table addrconf_root_dir[2]; + char *dev_name; } addrconf_sysctl __read_mostly = { .sysctl_header = NULL, .addrconf_vars = { @@ -4047,72 +4086,33 @@ static struct addrconf_sysctl_table .ctl_name = 0, /* sentinel */ } }, - .addrconf_dev = { - { - .ctl_name = NET_PROTO_CONF_ALL, - .procname = "all", - .mode = 0555, - .child = addrconf_sysctl.addrconf_vars, - }, - { - .ctl_name = 0, /* sentinel */ - } - }, - .addrconf_conf_dir = { - { - .ctl_name = NET_IPV6_CONF, - .procname = "conf", - .mode = 0555, - .child = addrconf_sysctl.addrconf_dev, - }, - { - .ctl_name = 0, /* sentinel */ - } - }, - .addrconf_proto_dir = { - { - .ctl_name = NET_IPV6, - .procname = "ipv6", - .mode = 0555, - .child = addrconf_sysctl.addrconf_conf_dir, - }, - { - .ctl_name = 0, /* sentinel */ - } - }, - .addrconf_root_dir = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = addrconf_sysctl.addrconf_proto_dir, - }, - { - .ctl_name = 0, /* sentinel */ - } - }, }; -static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +static int __addrconf_sysctl_register(struct net *net, char *dev_name, + int ctl_name, struct inet6_dev *idev, struct ipv6_devconf *p) { int i; - struct net_device *dev = idev ? idev->dev : NULL; struct addrconf_sysctl_table *t; - char *dev_name = NULL; + +#define ADDRCONF_CTL_PATH_DEV 3 + + struct ctl_path addrconf_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv6", .ctl_name = NET_IPV6, }, + { .procname = "conf", .ctl_name = NET_IPV6_CONF, }, + { /* to be set */ }, + { }, + }; + t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) - return; + goto out; + for (i=0; t->addrconf_vars[i].data; i++) { t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ - } - if (dev) { - dev_name = dev->name; - t->addrconf_dev[0].ctl_name = dev->ifindex; - } else { - dev_name = "default"; - t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + t->addrconf_vars[i].extra2 = net; } /* @@ -4120,47 +4120,126 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). */ - dev_name = kstrdup(dev_name, GFP_KERNEL); - if (!dev_name) - goto free; - - t->addrconf_dev[0].procname = dev_name; + t->dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!t->dev_name) + goto free; - t->addrconf_dev[0].child = t->addrconf_vars; - t->addrconf_conf_dir[0].child = t->addrconf_dev; - t->addrconf_proto_dir[0].child = t->addrconf_conf_dir; - t->addrconf_root_dir[0].child = t->addrconf_proto_dir; + addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name; + addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].ctl_name = ctl_name; - t->sysctl_header = register_sysctl_table(t->addrconf_root_dir); + t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path, + t->addrconf_vars); if (t->sysctl_header == NULL) goto free_procname; - else - p->sysctl = t; - return; - /* error path */ - free_procname: - kfree(dev_name); - free: + p->sysctl = t; + return 0; + +free_procname: + kfree(t->dev_name); +free: kfree(t); +out: + return -ENOBUFS; +} - return; +static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) +{ + struct addrconf_sysctl_table *t; + + if (p->sysctl == NULL) + return; + + t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); } -static void addrconf_sysctl_unregister(struct ipv6_devconf *p) +static void addrconf_sysctl_register(struct inet6_dev *idev) { - if (p->sysctl) { - struct addrconf_sysctl_table *t = p->sysctl; - p->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); - kfree(t->addrconf_dev[0].procname); - kfree(t); - } + neigh_sysctl_register(idev->dev, idev->nd_parms, NET_IPV6, + NET_IPV6_NEIGH, "ipv6", + &ndisc_ifinfo_sysctl_change, + NULL); + __addrconf_sysctl_register(idev->dev->nd_net, idev->dev->name, + idev->dev->ifindex, idev, &idev->cnf); +} + +static void addrconf_sysctl_unregister(struct inet6_dev *idev) +{ + __addrconf_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->nd_parms); } #endif +static int addrconf_init_net(struct net *net) +{ + int err; + struct ipv6_devconf *all, *dflt; + + err = -ENOMEM; + all = &ipv6_devconf; + dflt = &ipv6_devconf_dflt; + + if (net != &init_net) { + all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; + + dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; + } + + net->ipv6.devconf_all = all; + net->ipv6.devconf_dflt = dflt; + +#ifdef CONFIG_SYSCTL + err = __addrconf_sysctl_register(net, "all", NET_PROTO_CONF_ALL, + NULL, all); + if (err < 0) + goto err_reg_all; + + err = __addrconf_sysctl_register(net, "default", NET_PROTO_CONF_DEFAULT, + NULL, dflt); + if (err < 0) + goto err_reg_dflt; +#endif + return 0; + +#ifdef CONFIG_SYSCTL +err_reg_dflt: + __addrconf_sysctl_unregister(all); +err_reg_all: + kfree(dflt); +#endif +err_alloc_dflt: + kfree(all); +err_alloc_all: + return err; +} + +static void addrconf_exit_net(struct net *net) +{ +#ifdef CONFIG_SYSCTL + __addrconf_sysctl_unregister(net->ipv6.devconf_dflt); + __addrconf_sysctl_unregister(net->ipv6.devconf_all); +#endif + if (net != &init_net) { + kfree(net->ipv6.devconf_dflt); + kfree(net->ipv6.devconf_all); + } +} + +static struct pernet_operations addrconf_ops = { + .init = addrconf_init_net, + .exit = addrconf_exit_net, +}; + /* * Device notifier */ @@ -4185,7 +4264,15 @@ EXPORT_SYMBOL(unregister_inet6addr_notifier); int __init addrconf_init(void) { - int err = 0; + int err; + + if ((err = ipv6_addr_label_init()) < 0) { + printk(KERN_CRIT "IPv6 Addrconf: cannot initialize default policy table: %d.\n", + err); + return err; + } + + register_pernet_subsys(&addrconf_ops); /* The addrconf netdev notifier requires that loopback_dev * has it's ipv6 private information allocated and setup @@ -4210,7 +4297,7 @@ int __init addrconf_init(void) err = -ENOMEM; rtnl_unlock(); if (err) - return err; + goto errlo; ip6_null_entry.u.dst.dev = init_net.loopback_dev; ip6_null_entry.rt6i_idev = in6_dev_get(init_net.loopback_dev); @@ -4236,20 +4323,18 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr); __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr); -#ifdef CONFIG_SYSCTL - addrconf_sysctl.sysctl_header = - register_sysctl_table(addrconf_sysctl.addrconf_root_dir); - addrconf_sysctl_register(NULL, &ipv6_devconf_dflt); -#endif + ipv6_addr_label_rtnl_register(); return 0; errout: unregister_netdevice_notifier(&ipv6_dev_notf); +errlo: + unregister_pernet_subsys(&addrconf_ops); return err; } -void __exit addrconf_cleanup(void) +void addrconf_cleanup(void) { struct net_device *dev; struct inet6_ifaddr *ifa; @@ -4257,10 +4342,7 @@ void __exit addrconf_cleanup(void) unregister_netdevice_notifier(&ipv6_dev_notf); -#ifdef CONFIG_SYSCTL - addrconf_sysctl_unregister(&ipv6_devconf_dflt); - addrconf_sysctl_unregister(&ipv6_devconf); -#endif + unregister_pernet_subsys(&addrconf_ops); rtnl_lock(); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c new file mode 100644 index 00000000000..a3c5a72218f --- /dev/null +++ b/net/ipv6/addrlabel.c @@ -0,0 +1,561 @@ +/* + * IPv6 Address Label subsystem + * for the IPv6 "Default" Source Address Selection + * + * Copyright (C)2007 USAGI/WIDE Project + */ +/* + * Author: + * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> + */ + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/in6.h> +#include <net/addrconf.h> +#include <linux/if_addrlabel.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +#if 0 +#define ADDRLABEL(x...) printk(x) +#else +#define ADDRLABEL(x...) do { ; } while(0) +#endif + +/* + * Policy Table + */ +struct ip6addrlbl_entry +{ + struct in6_addr prefix; + int prefixlen; + int ifindex; + int addrtype; + u32 label; + struct hlist_node list; + atomic_t refcnt; + struct rcu_head rcu; +}; + +static struct ip6addrlbl_table +{ + struct hlist_head head; + spinlock_t lock; + u32 seq; +} ip6addrlbl_table; + +/* + * Default policy table (RFC3484 + extensions) + * + * prefix addr_type label + * ------------------------------------------------------------------------- + * ::1/128 LOOPBACK 0 + * ::/0 N/A 1 + * 2002::/16 N/A 2 + * ::/96 COMPATv4 3 + * ::ffff:0:0/96 V4MAPPED 4 + * fc00::/7 N/A 5 ULA (RFC 4193) + * 2001::/32 N/A 6 Teredo (RFC 4380) + * + * Note: 0xffffffff is used if we do not have any policies. + */ + +#define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL + +static const __initdata struct ip6addrlbl_init_table +{ + const struct in6_addr *prefix; + int prefixlen; + u32 label; +} ip6addrlbl_init_table[] = { + { /* ::/0 */ + .prefix = &in6addr_any, + .label = 1, + },{ /* fc00::/7 */ + .prefix = &(struct in6_addr){{{ 0xfc }}}, + .prefixlen = 7, + .label = 5, + },{ /* 2002::/16 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x02 }}}, + .prefixlen = 16, + .label = 2, + },{ /* 2001::/32 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x01 }}}, + .prefixlen = 32, + .label = 6, + },{ /* ::ffff:0:0 */ + .prefix = &(struct in6_addr){{{ [10] = 0xff, [11] = 0xff }}}, + .prefixlen = 96, + .label = 4, + },{ /* ::/96 */ + .prefix = &in6addr_any, + .prefixlen = 96, + .label = 3, + },{ /* ::1/128 */ + .prefix = &in6addr_loopback, + .prefixlen = 128, + .label = 0, + } +}; + +/* Object management */ +static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) +{ + kfree(p); +} + +static void ip6addrlbl_free_rcu(struct rcu_head *h) +{ + ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); +} + +static inline int ip6addrlbl_hold(struct ip6addrlbl_entry *p) +{ + return atomic_inc_not_zero(&p->refcnt); +} + +static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) +{ + if (atomic_dec_and_test(&p->refcnt)) + call_rcu(&p->rcu, ip6addrlbl_free_rcu); +} + +/* Find label */ +static int __ip6addrlbl_match(struct ip6addrlbl_entry *p, + const struct in6_addr *addr, + int addrtype, int ifindex) +{ + if (p->ifindex && p->ifindex != ifindex) + return 0; + if (p->addrtype && p->addrtype != addrtype) + return 0; + if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen)) + return 0; + return 1; +} + +static struct ip6addrlbl_entry *__ipv6_addr_label(const struct in6_addr *addr, + int type, int ifindex) +{ + struct hlist_node *pos; + struct ip6addrlbl_entry *p; + hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(p, addr, type, ifindex)) + return p; + } + return NULL; +} + +u32 ipv6_addr_label(const struct in6_addr *addr, int type, int ifindex) +{ + u32 label; + struct ip6addrlbl_entry *p; + + type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK; + + rcu_read_lock(); + p = __ipv6_addr_label(addr, type, ifindex); + label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT; + rcu_read_unlock(); + + ADDRLABEL(KERN_DEBUG "%s(addr=" NIP6_FMT ", type=%d, ifindex=%d) => %08x\n", + __FUNCTION__, + NIP6(*addr), type, ifindex, + label); + + return label; +} + +/* allocate one entry */ +static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, + int prefixlen, int ifindex, + u32 label) +{ + struct ip6addrlbl_entry *newp; + int addrtype; + + ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d, label=%u)\n", + __FUNCTION__, + NIP6(*prefix), prefixlen, + ifindex, + (unsigned int)label); + + addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK); + + switch (addrtype) { + case IPV6_ADDR_MAPPED: + if (prefixlen > 96) + return ERR_PTR(-EINVAL); + if (prefixlen < 96) + addrtype = 0; + break; + case IPV6_ADDR_COMPATv4: + if (prefixlen != 96) + addrtype = 0; + break; + case IPV6_ADDR_LOOPBACK: + if (prefixlen != 128) + addrtype = 0; + break; + } + + newp = kmalloc(sizeof(*newp), GFP_KERNEL); + if (!newp) + return ERR_PTR(-ENOMEM); + + ipv6_addr_prefix(&newp->prefix, prefix, prefixlen); + newp->prefixlen = prefixlen; + newp->ifindex = ifindex; + newp->addrtype = addrtype; + newp->label = label; + INIT_HLIST_NODE(&newp->list); + atomic_set(&newp->refcnt, 1); + return newp; +} + +/* add a label */ +static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +{ + int ret = 0; + + ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", + __FUNCTION__, + newp, replace); + + if (hlist_empty(&ip6addrlbl_table.head)) { + hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + } else { + struct hlist_node *pos, *n; + struct ip6addrlbl_entry *p = NULL; + hlist_for_each_entry_safe(p, pos, n, + &ip6addrlbl_table.head, list) { + if (p->prefixlen == newp->prefixlen && + p->ifindex == newp->ifindex && + ipv6_addr_equal(&p->prefix, &newp->prefix)) { + if (!replace) { + ret = -EEXIST; + goto out; + } + hlist_replace_rcu(&p->list, &newp->list); + ip6addrlbl_put(p); + goto out; + } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || + (p->prefixlen < newp->prefixlen)) { + hlist_add_before_rcu(&newp->list, &p->list); + goto out; + } + } + hlist_add_after_rcu(&p->list, &newp->list); + } +out: + if (!ret) + ip6addrlbl_table.seq++; + return ret; +} + +/* add a label */ +static int ip6addrlbl_add(const struct in6_addr *prefix, int prefixlen, + int ifindex, u32 label, int replace) +{ + struct ip6addrlbl_entry *newp; + int ret = 0; + + ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", + __FUNCTION__, + NIP6(*prefix), prefixlen, + ifindex, + (unsigned int)label, + replace); + + newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); + if (IS_ERR(newp)) + return PTR_ERR(newp); + spin_lock(&ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(newp, replace); + spin_unlock(&ip6addrlbl_table.lock); + if (ret) + ip6addrlbl_free(newp); + return ret; +} + +/* remove a label */ +static int __ip6addrlbl_del(const struct in6_addr *prefix, int prefixlen, + int ifindex) +{ + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *pos, *n; + int ret = -ESRCH; + + ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d)\n", + __FUNCTION__, + NIP6(*prefix), prefixlen, + ifindex); + + hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + if (p->prefixlen == prefixlen && + p->ifindex == ifindex && + ipv6_addr_equal(&p->prefix, prefix)) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); + ret = 0; + break; + } + } + return ret; +} + +static int ip6addrlbl_del(const struct in6_addr *prefix, int prefixlen, + int ifindex) +{ + struct in6_addr prefix_buf; + int ret; + + ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d)\n", + __FUNCTION__, + NIP6(*prefix), prefixlen, + ifindex); + + ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); + spin_lock(&ip6addrlbl_table.lock); + ret = __ip6addrlbl_del(&prefix_buf, prefixlen, ifindex); + spin_unlock(&ip6addrlbl_table.lock); + return ret; +} + +/* add default label */ +static __init int ip6addrlbl_init(void) +{ + int err = 0; + int i; + + ADDRLABEL(KERN_DEBUG "%s()\n", __FUNCTION__); + + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { + int ret = ip6addrlbl_add(ip6addrlbl_init_table[i].prefix, + ip6addrlbl_init_table[i].prefixlen, + 0, + ip6addrlbl_init_table[i].label, 0); + /* XXX: should we free all rules when we catch an error? */ + if (ret && (!err || err != -ENOMEM)) + err = ret; + } + return err; +} + +int __init ipv6_addr_label_init(void) +{ + spin_lock_init(&ip6addrlbl_table.lock); + + return ip6addrlbl_init(); +} + +static const struct nla_policy ifal_policy[IFAL_MAX+1] = { + [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, + [IFAL_LABEL] = { .len = sizeof(u32), }, +}; + +static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, + void *arg) +{ + struct net *net = skb->sk->sk_net; + struct ifaddrlblmsg *ifal; + struct nlattr *tb[IFAL_MAX+1]; + struct in6_addr *pfx; + u32 label; + int err = 0; + + if (net != &init_net) + return 0; + + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + if (err < 0) + return err; + + ifal = nlmsg_data(nlh); + + if (ifal->ifal_family != AF_INET6 || + ifal->ifal_prefixlen > 128) + return -EINVAL; + + if (ifal->ifal_index && + !__dev_get_by_index(&init_net, ifal->ifal_index)) + return -EINVAL; + + if (!tb[IFAL_ADDRESS]) + return -EINVAL; + + pfx = nla_data(tb[IFAL_ADDRESS]); + if (!pfx) + return -EINVAL; + + if (!tb[IFAL_LABEL]) + return -EINVAL; + label = nla_get_u32(tb[IFAL_LABEL]); + if (label == IPV6_ADDR_LABEL_DEFAULT) + return -EINVAL; + + switch(nlh->nlmsg_type) { + case RTM_NEWADDRLABEL: + err = ip6addrlbl_add(pfx, ifal->ifal_prefixlen, + ifal->ifal_index, label, + nlh->nlmsg_flags & NLM_F_REPLACE); + break; + case RTM_DELADDRLABEL: + err = ip6addrlbl_del(pfx, ifal->ifal_prefixlen, + ifal->ifal_index); + break; + default: + err = -EOPNOTSUPP; + } + return err; +} + +static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh, + int prefixlen, int ifindex, u32 lseq) +{ + struct ifaddrlblmsg *ifal = nlmsg_data(nlh); + ifal->ifal_family = AF_INET6; + ifal->ifal_prefixlen = prefixlen; + ifal->ifal_flags = 0; + ifal->ifal_index = ifindex; + ifal->ifal_seq = lseq; +}; + +static int ip6addrlbl_fill(struct sk_buff *skb, + struct ip6addrlbl_entry *p, + u32 lseq, + u32 pid, u32 seq, int event, + unsigned int flags) +{ + struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq, event, + sizeof(struct ifaddrlblmsg), flags); + if (!nlh) + return -EMSGSIZE; + + ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq); + + if (nla_put(skb, IFAL_ADDRESS, 16, &p->prefix) < 0 || + nla_put_u32(skb, IFAL_LABEL, p->label) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = skb->sk->sk_net; + struct ip6addrlbl_entry *p; + struct hlist_node *pos; + int idx = 0, s_idx = cb->args[0]; + int err; + + if (net != &init_net) + return 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + if (idx >= s_idx) { + if ((err = ip6addrlbl_fill(skb, p, + ip6addrlbl_table.seq, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDRLABEL, + NLM_F_MULTI)) <= 0) + break; + } + idx++; + } + rcu_read_unlock(); + cb->args[0] = idx; + return skb->len; +} + +static inline int ip6addrlbl_msgsize(void) +{ + return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + + nla_total_size(16) /* IFAL_ADDRESS */ + + nla_total_size(4) /* IFAL_LABEL */ + ); +} + +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, + void *arg) +{ + struct net *net = in_skb->sk->sk_net; + struct ifaddrlblmsg *ifal; + struct nlattr *tb[IFAL_MAX+1]; + struct in6_addr *addr; + u32 lseq; + int err = 0; + struct ip6addrlbl_entry *p; + struct sk_buff *skb; + + if (net != &init_net) + return 0; + + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + if (err < 0) + return err; + + ifal = nlmsg_data(nlh); + + if (ifal->ifal_family != AF_INET6 || + ifal->ifal_prefixlen != 128) + return -EINVAL; + + if (ifal->ifal_index && + !__dev_get_by_index(&init_net, ifal->ifal_index)) + return -EINVAL; + + if (!tb[IFAL_ADDRESS]) + return -EINVAL; + + addr = nla_data(tb[IFAL_ADDRESS]); + if (!addr) + return -EINVAL; + + rcu_read_lock(); + p = __ipv6_addr_label(addr, ipv6_addr_type(addr), ifal->ifal_index); + if (p && ip6addrlbl_hold(p)) + p = NULL; + lseq = ip6addrlbl_table.seq; + rcu_read_unlock(); + + if (!p) { + err = -ESRCH; + goto out; + } + + if (!(skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL))) { + ip6addrlbl_put(p); + return -ENOBUFS; + } + + err = ip6addrlbl_fill(skb, p, lseq, + NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWADDRLABEL, 0); + + ip6addrlbl_put(p); + + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto out; + } + + err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); +out: + return err; +} + +void __init ipv6_addr_label_rtnl_register(void) +{ + __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, NULL); + __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, NULL); + __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, ip6addrlbl_dump); +} + diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ecbd38894fd..bddac0e8780 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -66,9 +66,7 @@ MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); MODULE_LICENSE("GPL"); -int sysctl_ipv6_bindv6only __read_mostly; - -/* The inetsw table contains everything that inet_create needs to +/* The inetsw6 table contains everything that inet6_create needs to * build a new socket. */ static struct list_head inetsw6[SOCK_MAX]; @@ -193,7 +191,7 @@ lookup_protocol: np->mcast_hops = -1; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->ipv6only = sysctl_ipv6_bindv6only; + np->ipv6only = init_net.ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. @@ -280,7 +278,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { v4addr = addr->sin6_addr.s6_addr32[3]; - if (inet_addr_type(v4addr) != RTN_LOCAL) { + if (inet_addr_type(&init_net, v4addr) != RTN_LOCAL) { err = -EADDRNOTAVAIL; goto out; } @@ -314,7 +312,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) { + if (!ipv6_chk_addr(&init_net, &addr->sin6_addr, + dev, 0)) { if (dev) dev_put(dev); err = -EADDRNOTAVAIL; @@ -491,6 +490,7 @@ const struct proto_ops inet6_stream_ops = { .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, .sendpage = tcp_sendpage, + .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, @@ -528,57 +528,23 @@ static struct net_proto_family inet6_family_ops = { .owner = THIS_MODULE, }; -/* Same as inet6_dgram_ops, sans udp_poll. */ -static const struct proto_ops inet6_sockraw_ops = { - .family = PF_INET6, - .owner = THIS_MODULE, - .release = inet6_release, - .bind = inet6_bind, - .connect = inet_dgram_connect, /* ok */ - .socketpair = sock_no_socketpair, /* a do nothing */ - .accept = sock_no_accept, /* a do nothing */ - .getname = inet6_getname, - .poll = datagram_poll, /* ok */ - .ioctl = inet6_ioctl, /* must change */ - .listen = sock_no_listen, /* ok */ - .shutdown = inet_shutdown, /* ok */ - .setsockopt = sock_common_setsockopt, /* ok */ - .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = sock_common_recvmsg, /* ok */ - .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, -#ifdef CONFIG_COMPAT - .compat_setsockopt = compat_sock_common_setsockopt, - .compat_getsockopt = compat_sock_common_getsockopt, -#endif -}; - -static struct inet_protosw rawv6_protosw = { - .type = SOCK_RAW, - .protocol = IPPROTO_IP, /* wild card */ - .prot = &rawv6_prot, - .ops = &inet6_sockraw_ops, - .capability = CAP_NET_RAW, - .no_check = UDP_CSUM_DEFAULT, - .flags = INET_PROTOSW_REUSE, -}; - -void -inet6_register_protosw(struct inet_protosw *p) +int inet6_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; - int protocol = p->protocol; struct list_head *last_perm; + int protocol = p->protocol; + int ret; spin_lock_bh(&inetsw6_lock); + ret = -EINVAL; if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ answer = NULL; + ret = -EPERM; last_perm = &inetsw6[p->type]; list_for_each(lh, &inetsw6[p->type]) { answer = list_entry(lh, struct inet_protosw, list); @@ -602,9 +568,10 @@ inet6_register_protosw(struct inet_protosw *p) * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); + ret = 0; out: spin_unlock_bh(&inetsw6_lock); - return; + return ret; out_permanent: printk(KERN_ERR "Attempt to override permanent protocol %d.\n", @@ -713,20 +680,19 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static int __init init_ipv6_mibs(void) { - if (snmp_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + if (snmp_mib_init((void **)ipv6_statistics, + sizeof(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), - __alignof__(struct icmpv6_mib)) < 0) + if (snmp_mib_init((void **)icmpv6_statistics, + sizeof(struct icmpv6_mib)) < 0) goto err_icmp_mib; if (snmp_mib_init((void **)icmpv6msg_statistics, - sizeof (struct icmpv6msg_mib), __alignof__(struct icmpv6_mib)) < 0) + sizeof(struct icmpv6msg_mib)) < 0) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), - __alignof__(struct udp_mib)) < 0) + if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0) goto err_udp_mib; - if (snmp_mib_init((void **)udplite_stats_in6, sizeof (struct udp_mib), - __alignof__(struct udp_mib)) < 0) + if (snmp_mib_init((void **)udplite_stats_in6, + sizeof (struct udp_mib)) < 0) goto err_udplite_mib; return 0; @@ -752,6 +718,32 @@ static void cleanup_ipv6_mibs(void) snmp_mib_free((void **)udplite_stats_in6); } +static int inet6_net_init(struct net *net) +{ + net->ipv6.sysctl.bindv6only = 0; + net->ipv6.sysctl.flush_delay = 0; + net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; + net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; + net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; + net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; + net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; + net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + net->ipv6.sysctl.icmpv6_time = 1*HZ; + + return 0; +} + +static void inet6_net_exit(struct net *net) +{ + return; +} + +static struct pernet_operations inet6_net_ops = { + .init = inet6_net_init, + .exit = inet6_net_exit, +}; + static int __init inet6_init(void) { struct sk_buff *dummy_skb; @@ -768,7 +760,6 @@ static int __init inet6_init(void) __this_module.can_unload = &ipv6_unload; #endif #endif - err = proto_register(&tcpv6_prot, 1); if (err) goto out; @@ -793,14 +784,16 @@ static int __init inet6_init(void) /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. */ - inet6_register_protosw(&rawv6_protosw); + err = rawv6_init(); + if (err) + goto out_unregister_raw_proto; /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) */ err = sock_register(&inet6_family_ops); if (err) - goto out_unregister_raw_proto; + goto out_sock_register_fail; /* Initialise ipv6 mibs */ err = init_ipv6_mibs(); @@ -814,8 +807,14 @@ static int __init inet6_init(void) * able to communicate via both network protocols. */ + err = register_pernet_subsys(&inet6_net_ops); + if (err) + goto register_pernet_fail; + #ifdef CONFIG_SYSCTL - ipv6_sysctl_register(); + err = ipv6_sysctl_register(); + if (err) + goto sysctl_fail; #endif err = icmpv6_init(&inet6_family_ops); if (err) @@ -848,31 +847,61 @@ static int __init inet6_init(void) if (if6_proc_init()) goto proc_if6_fail; #endif - ip6_route_init(); - ip6_flowlabel_init(); + err = ip6_route_init(); + if (err) + goto ip6_route_fail; + err = ip6_flowlabel_init(); + if (err) + goto ip6_flowlabel_fail; err = addrconf_init(); if (err) goto addrconf_fail; /* Init v6 extension headers. */ - ipv6_rthdr_init(); - ipv6_frag_init(); - ipv6_nodata_init(); - ipv6_destopt_init(); + err = ipv6_exthdrs_init(); + if (err) + goto ipv6_exthdrs_fail; + + err = ipv6_frag_init(); + if (err) + goto ipv6_frag_fail; /* Init v6 transport protocols. */ - udpv6_init(); - udplitev6_init(); - tcpv6_init(); + err = udpv6_init(); + if (err) + goto udpv6_fail; - ipv6_packet_init(); - err = 0; + err = udplitev6_init(); + if (err) + goto udplitev6_fail; + + err = tcpv6_init(); + if (err) + goto tcpv6_fail; + + err = ipv6_packet_init(); + if (err) + goto ipv6_packet_fail; out: return err; +ipv6_packet_fail: + tcpv6_exit(); +tcpv6_fail: + udplitev6_exit(); +udplitev6_fail: + udpv6_exit(); +udpv6_fail: + ipv6_frag_exit(); +ipv6_frag_fail: + ipv6_exthdrs_exit(); +ipv6_exthdrs_fail: + addrconf_cleanup(); addrconf_fail: ip6_flowlabel_cleanup(); +ip6_flowlabel_fail: ip6_route_cleanup(); +ip6_route_fail: #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: @@ -899,10 +928,16 @@ ndisc_fail: icmp_fail: #ifdef CONFIG_SYSCTL ipv6_sysctl_unregister(); +sysctl_fail: #endif + unregister_pernet_subsys(&inet6_net_ops); +register_pernet_fail: cleanup_ipv6_mibs(); out_unregister_sock: sock_unregister(PF_INET6); + rtnl_unregister_all(PF_INET6); +out_sock_register_fail: + rawv6_exit(); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udplite_proto: @@ -922,9 +957,14 @@ static void __exit inet6_exit(void) /* Disallow any further netlink messages */ rtnl_unregister_all(PF_INET6); + udpv6_exit(); + udplitev6_exit(); + tcpv6_exit(); + /* Cleanup code parts. */ ipv6_packet_cleanup(); - + ipv6_frag_exit(); + ipv6_exthdrs_exit(); addrconf_cleanup(); ip6_flowlabel_cleanup(); ip6_route_cleanup(); @@ -943,9 +983,11 @@ static void __exit inet6_exit(void) igmp6_cleanup(); ndisc_cleanup(); icmpv6_cleanup(); + rawv6_exit(); #ifdef CONFIG_SYSCTL ipv6_sysctl_unregister(); #endif + unregister_pernet_subsys(&inet6_net_ops); cleanup_ipv6_mibs(); proto_unregister(&rawv6_prot); proto_unregister(&udplitev6_prot); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 4eaf55072b1..fb0d07a15e9 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -370,6 +370,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) ip6h->flow_lbl[2] = 0; ip6h->hop_limit = 0; + spin_lock(&x->lock); { u8 auth_data[MAX_AH_AUTH_LEN]; @@ -378,14 +379,15 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, hdr_len); err = ah_mac_digest(ahp, skb, ah->auth_data); if (err) - goto free_out; - err = -EINVAL; - if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) { - LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); - x->stats.integrity_failed++; - goto free_out; - } + goto unlock; + if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) + err = -EBADMSG; } +unlock: + spin_unlock(&x->lock); + + if (err) + goto free_out; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), tmp_hdr, hdr_len); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index f915c4df982..9c7f83fbc3a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -89,7 +89,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; - if (ipv6_chk_addr(addr, NULL, 0)) + if (ipv6_chk_addr(&init_net, addr, NULL, 0)) return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); @@ -504,6 +504,7 @@ static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) } static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { read_lock(&dev_base_lock); return ac6_get_idx(seq, *pos); @@ -518,6 +519,7 @@ static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ac6_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { struct ac6_iter_state *state = ac6_seq_private(seq); if (likely(state->idev != NULL)) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5d4245ab418..94fa6ae77cf 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -177,7 +177,7 @@ ipv4_connected: if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) { + if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) @@ -549,7 +549,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, return -ENODEV; } } - if (!ipv6_chk_addr(&src_info->ipi6_addr, dev, 0)) { + if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr, + dev, 0)) { if (dev) dev_put(dev); err = -EINVAL; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 44405325467..5bd5292ad9f 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -165,31 +165,32 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) goto out; } + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) { + ret = -EINVAL; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + spin_lock(&x->lock); + /* If integrity check is required, do this. */ if (esp->auth.icv_full_len) { u8 sum[alen]; ret = esp_mac_digest(esp, skb, 0, skb->len - alen); if (ret) - goto out; + goto unlock; if (skb_copy_bits(skb, skb->len - alen, sum, alen)) BUG(); if (unlikely(memcmp(esp->auth.work_icv, sum, alen))) { - x->stats.integrity_failed++; - ret = -EINVAL; - goto out; + ret = -EBADMSG; + goto unlock; } } - if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) { - ret = -EINVAL; - goto out; - } - - skb->ip_summed = CHECKSUM_NONE; - esph = (struct ip_esp_hdr *)skb->data; iph = ipv6_hdr(skb); @@ -198,15 +199,13 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) crypto_blkcipher_set_iv(tfm, esph->enc_data, esp->conf.ivlen); { - u8 nexthdr[2]; struct scatterlist *sg = &esp->sgbuf[0]; - u8 padlen; if (unlikely(nfrags > ESP_NUM_FAST_SG)) { sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); if (!sg) { ret = -ENOMEM; - goto out; + goto unlock; } } sg_init_table(sg, nfrags); @@ -216,8 +215,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) ret = crypto_blkcipher_decrypt(&desc, sg, sg, elen); if (unlikely(sg != &esp->sgbuf[0])) kfree(sg); - if (unlikely(ret)) - goto out; + } + +unlock: + spin_unlock(&x->lock); + + if (unlikely(ret)) + goto out; + + { + u8 nexthdr[2]; + u8 padlen; if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) BUG(); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 1e89efd38a0..3cd1c993d52 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -32,6 +32,7 @@ #include <linux/in6.h> #include <linux/icmpv6.h> +#include <net/dst.h> #include <net/sock.h> #include <net/snmp.h> @@ -307,38 +308,6 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) return -1; } -static struct inet6_protocol destopt_protocol = { - .handler = ipv6_destopt_rcv, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, -}; - -void __init ipv6_destopt_init(void) -{ - if (inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS) < 0) - printk(KERN_ERR "ipv6_destopt_init: Could not register protocol\n"); -} - -/******************************** - NONE header. No data in packet. - ********************************/ - -static int ipv6_nodata_rcv(struct sk_buff *skb) -{ - kfree_skb(skb); - return 0; -} - -static struct inet6_protocol nodata_protocol = { - .handler = ipv6_nodata_rcv, - .flags = INET6_PROTO_NOPOLICY, -}; - -void __init ipv6_nodata_init(void) -{ - if (inet6_add_protocol(&nodata_protocol, IPPROTO_NONE) < 0) - printk(KERN_ERR "ipv6_nodata_init: Could not register protocol\n"); -} - /******************************** Routing header. ********************************/ @@ -476,7 +445,7 @@ looped_back: kfree_skb(skb); return -1; } - if (!ipv6_chk_home_addr(addr)) { + if (!ipv6_chk_home_addr(&init_net, addr)) { IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); @@ -536,12 +505,48 @@ static struct inet6_protocol rthdr_protocol = { .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, }; -void __init ipv6_rthdr_init(void) +static struct inet6_protocol destopt_protocol = { + .handler = ipv6_destopt_rcv, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, +}; + +static struct inet6_protocol nodata_protocol = { + .handler = dst_discard, + .flags = INET6_PROTO_NOPOLICY, +}; + +int __init ipv6_exthdrs_init(void) { - if (inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING) < 0) - printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n"); + int ret; + + ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); + if (ret) + goto out; + + ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + if (ret) + goto out_rthdr; + + ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); + if (ret) + goto out_destopt; + +out: + return ret; +out_rthdr: + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); +out_destopt: + inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + goto out; }; +void ipv6_exthdrs_exit(void) +{ + inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); + inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); +} + /********************************** Hop-by-hop options. **********************************/ diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 428c6b0e26d..695c0ca8a41 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -223,7 +223,7 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib6_rule_default_pref(void) +static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) { return 0x3FFF; } @@ -249,6 +249,7 @@ static struct fib_rules_ops fib6_rules_ops = { .policy = fib6_rule_policy, .rules_list = LIST_HEAD_INIT(fib6_rules_ops.rules_list), .owner = THIS_MODULE, + .fro_net = &init_net, }; static int __init fib6_default_rules_init(void) @@ -265,10 +266,23 @@ static int __init fib6_default_rules_init(void) return 0; } -void __init fib6_rules_init(void) +int __init fib6_rules_init(void) { - BUG_ON(fib6_default_rules_init()); - fib_rules_register(&fib6_rules_ops); + int ret; + + ret = fib6_default_rules_init(); + if (ret) + goto out; + + ret = fib_rules_register(&fib6_rules_ops); + if (ret) + goto out_default_rules_init; +out: + return ret; + +out_default_rules_init: + fib_rules_cleanup_ops(&fib6_rules_ops); + goto out; } void fib6_rules_cleanup(void) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f1240688dc5..cbb5b9cf84a 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -63,6 +63,7 @@ #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> +#include <net/xfrm.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -86,7 +87,7 @@ static int icmpv6_rcv(struct sk_buff *skb); static struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, - .flags = INET6_PROTO_FINAL, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static __inline__ int icmpv6_xmit_lock(void) @@ -153,8 +154,6 @@ static int is_ineligible(struct sk_buff *skb) return 0; } -static int sysctl_icmpv6_time __read_mostly = 1*HZ; - /* * Check the ICMP output rate limit */ @@ -185,7 +184,7 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type, res = 1; } else { struct rt6_info *rt = (struct rt6_info *)dst; - int tmo = sysctl_icmpv6_time; + int tmo = init_net.ipv6.sysctl.icmpv6_time; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) @@ -310,8 +309,10 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, struct ipv6_pinfo *np; struct in6_addr *saddr = NULL; struct dst_entry *dst; + struct dst_entry *dst2; struct icmp6hdr tmp_hdr; struct flowi fl; + struct flowi fl2; struct icmpv6_msg msg; int iif = 0; int addr_type = 0; @@ -331,7 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, */ addr_type = ipv6_addr_type(&hdr->daddr); - if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0)) + if (ipv6_chk_addr(&init_net, &hdr->daddr, skb->dev, 0)) saddr = &hdr->daddr; /* @@ -418,9 +419,42 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, goto out_dst_release; } - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + /* No need to clone since we're just using its address. */ + dst2 = dst; + + err = xfrm_lookup(&dst, &fl, sk, 0); + switch (err) { + case 0: + if (dst != dst2) + goto route_done; + break; + case -EPERM: + dst = NULL; + break; + default: + goto out; + } + + if (xfrm_decode_session_reverse(skb, &fl2, AF_INET6)) + goto out; + + if (ip6_dst_lookup(sk, &dst2, &fl)) + goto out; + + err = xfrm_lookup(&dst2, &fl, sk, XFRM_LOOKUP_ICMP); + if (err == -ENOENT) { + if (!dst) + goto out; + goto route_done; + } + + dst_release(dst); + dst = dst2; + + if (err) goto out; +route_done: if (ipv6_addr_is_multicast(&fl.fl6_dst)) hlimit = np->mcast_hops; else @@ -555,9 +589,7 @@ out: static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) { - struct in6_addr *saddr, *daddr; struct inet6_protocol *ipprot; - struct sock *sk; int inner_offset; int hash; u8 nexthdr; @@ -579,9 +611,6 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) if (!pskb_may_pull(skb, inner_offset+8)) return; - saddr = &ipv6_hdr(skb)->saddr; - daddr = &ipv6_hdr(skb)->daddr; - /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed pmtu discovery. @@ -597,15 +626,7 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) ipprot->err_handler(skb, NULL, type, code, inner_offset, info); rcu_read_unlock(); - read_lock(&raw_v6_lock); - if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { - while ((sk = __raw_v6_lookup(sk, nexthdr, saddr, daddr, - IP6CB(skb)->iif))) { - rawv6_err(sk, skb, NULL, type, code, inner_offset, info); - sk = sk_next(sk); - } - } - read_unlock(&raw_v6_lock); + raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); } /* @@ -621,6 +642,25 @@ static int icmpv6_rcv(struct sk_buff *skb) struct icmp6hdr *hdr; int type; + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + int nh; + + if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags & + XFRM_STATE_ICMP)) + goto drop_no_count; + + if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(*orig_hdr))) + goto drop_no_count; + + nh = skb_network_offset(skb); + skb_set_network_header(skb, sizeof(*hdr)); + + if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) + goto drop_no_count; + + skb_set_network_header(skb, nh); + } + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; @@ -643,8 +683,7 @@ static int icmpv6_rcv(struct sk_buff *skb) } } - if (!pskb_pull(skb, sizeof(struct icmp6hdr))) - goto discard_it; + __skb_pull(skb, sizeof(*hdr)); hdr = icmp6_hdr(skb); @@ -730,6 +769,7 @@ static int icmpv6_rcv(struct sk_buff *skb) discard_it: ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS); +drop_no_count: kfree_skb(skb); return 0; } @@ -865,16 +905,26 @@ int icmpv6_err_convert(int type, int code, int *err) EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL -ctl_table ipv6_icmp_table[] = { +ctl_table ipv6_icmp_table_template[] = { { .ctl_name = NET_IPV6_ICMP_RATELIMIT, .procname = "ratelimit", - .data = &sysctl_icmpv6_time, + .data = &init_net.ipv6.sysctl.icmpv6_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec }, { .ctl_name = 0 }, }; + +struct ctl_table *ipv6_icmp_sysctl_init(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(ipv6_icmp_table_template, + sizeof(ipv6_icmp_table_template), + GFP_KERNEL); + return table; +} #endif diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 0765d8bd380..a66a7d8e281 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -43,7 +43,7 @@ void __inet6_hash(struct inet_hashinfo *hashinfo, } __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, 1); write_unlock(lock); } EXPORT_SYMBOL(__inet6_hash); @@ -216,7 +216,7 @@ unique: BUG_TRAP(sk_unhashed(sk)); __sk_add_node(sk, &head->chain); sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, 1); write_unlock(lock); if (twp != NULL) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 946cf389ab9..f93407cf651 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -361,6 +361,7 @@ end: static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; unsigned int h, s_h; unsigned int e = 0, s_e; struct rt6_rtnl_dump_arg arg; @@ -369,6 +370,9 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_node *node; int res = 0; + if (net != &init_net) + return 0; + s_h = cb->args[0]; s_e = cb->args[1]; @@ -677,13 +681,15 @@ static __inline__ void fib6_start_gc(struct rt6_info *rt) { if (ip6_fib_timer.expires == 0 && (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) - mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); + mod_timer(&ip6_fib_timer, jiffies + + init_net.ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(void) { if (ip6_fib_timer.expires == 0) - mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); + mod_timer(&ip6_fib_timer, jiffies + + init_net.ipv6.sysctl.ip6_rt_gc_interval); } /* @@ -1122,9 +1128,6 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt->u.dst.rt6_next = NULL; - if (fn->leaf == NULL && fn->fn_flags&RTN_TL_ROOT) - fn->leaf = &ip6_null_entry; - /* If it was last route, expunge its radix tree node */ if (fn->leaf == NULL) { fn->fn_flags &= ~RTN_RTINFO; @@ -1311,6 +1314,9 @@ static int fib6_walk(struct fib6_walker_t *w) static int fib6_clean_node(struct fib6_walker_t *w) { + struct nl_info info = { + .nl_net = &init_net, + }; int res; struct rt6_info *rt; struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); @@ -1319,7 +1325,7 @@ static int fib6_clean_node(struct fib6_walker_t *w) res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; - res = fib6_del(rt, NULL); + res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); @@ -1445,7 +1451,8 @@ void fib6_run_gc(unsigned long dummy) { if (dummy != ~0UL) { spin_lock_bh(&fib6_gc_lock); - gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval; + gc_args.timeout = dummy ? (int)dummy : + init_net.ipv6.sysctl.ip6_rt_gc_interval; } else { local_bh_disable(); if (!spin_trylock(&fib6_gc_lock)) { @@ -1453,7 +1460,7 @@ void fib6_run_gc(unsigned long dummy) local_bh_enable(); return; } - gc_args.timeout = ip6_rt_gc_interval; + gc_args.timeout = init_net.ipv6.sysctl.ip6_rt_gc_interval; } gc_args.more = 0; @@ -1461,7 +1468,8 @@ void fib6_run_gc(unsigned long dummy) fib6_clean_all(fib6_age, 0, NULL); if (gc_args.more) - mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); + mod_timer(&ip6_fib_timer, jiffies + + init_net.ipv6.sysctl.ip6_rt_gc_interval); else { del_timer(&ip6_fib_timer); ip6_fib_timer.expires = 0; @@ -1469,16 +1477,27 @@ void fib6_run_gc(unsigned long dummy) spin_unlock_bh(&fib6_gc_lock); } -void __init fib6_init(void) +int __init fib6_init(void) { + int ret; fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!fib6_node_kmem) + return -ENOMEM; fib6_tables_init(); - __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); + ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); + if (ret) + goto out_kmem_cache_create; +out: + return ret; + +out_kmem_cache_create: + kmem_cache_destroy(fib6_node_kmem); + goto out; } void fib6_gc_cleanup(void) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index b12cc22e774..2b7d9ee9883 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -629,6 +629,7 @@ static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(ip6_fl_lock) { read_lock_bh(&ip6_fl_lock); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -647,6 +648,7 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip6fl_seq_stop(struct seq_file *seq, void *v) + __releases(ip6_fl_lock) { read_unlock_bh(&ip6_fl_lock); } @@ -692,20 +694,36 @@ static const struct file_operations ip6fl_seq_fops = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif +static int ip6_flowlabel_proc_init(struct net *net) +{ + if (!proc_net_fops_create(net, "ip6_flowlabel", S_IRUGO, &ip6fl_seq_fops)) + return -ENOMEM; + return 0; +} -void ip6_flowlabel_init(void) +static void ip6_flowlabel_proc_fini(struct net *net) { -#ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "ip6_flowlabel", S_IRUGO, &ip6fl_seq_fops); + proc_net_remove(net, "ip6_flowlabel"); +} +#else +static inline int ip6_flowlabel_proc_init(struct net *net) +{ + return 0; +} +static inline void ip6_flowlabel_proc_fini(struct net *net) +{ + return ; +} #endif + +int ip6_flowlabel_init(void) +{ + return ip6_flowlabel_proc_init(&init_net); } void ip6_flowlabel_cleanup(void) { del_timer(&ip6_fl_gc_timer); -#ifdef CONFIG_PROC_FS - proc_net_remove(&init_net, "ip6_flowlabel"); -#endif + ip6_flowlabel_proc_fini(&init_net); } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index fac6f7f9dd7..178aebc0427 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -134,7 +134,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt rcu_read_unlock(); - return NF_HOOK(PF_INET6,NF_IP6_PRE_ROUTING, skb, dev, NULL, ip6_rcv_finish); + return NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, dev, NULL, + ip6_rcv_finish); err: IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS); drop: @@ -152,9 +153,8 @@ out: static int ip6_input_finish(struct sk_buff *skb) { struct inet6_protocol *ipprot; - struct sock *raw_sk; unsigned int nhoff; - int nexthdr; + int nexthdr, raw; u8 hash; struct inet6_dev *idev; @@ -170,9 +170,7 @@ resubmit: nhoff = IP6CB(skb)->nhoff; nexthdr = skb_network_header(skb)[nhoff]; - raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); - if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) - raw_sk = NULL; + raw = raw6_local_deliver(skb, nexthdr); hash = nexthdr & (MAX_INET_PROTOS - 1); if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { @@ -205,7 +203,7 @@ resubmit: else if (ret == 0) IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDELIVERS); } else { - if (!raw_sk) { + if (!raw) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { IP6_INC_STATS_BH(idev, IPSTATS_MIB_INUNKNOWNPROTOS); icmpv6_send(skb, ICMPV6_PARAMPROB, @@ -229,7 +227,8 @@ discard: int ip6_input(struct sk_buff *skb) { - return NF_HOOK(PF_INET6,NF_IP6_LOCAL_IN, skb, skb->dev, NULL, ip6_input_finish); + return NF_HOOK(PF_INET6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + ip6_input_finish); } int ip6_mc_input(struct sk_buff *skb) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3bef30e4a23..15c4f6cee3e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -29,7 +29,7 @@ */ #include <linux/errno.h> -#include <linux/types.h> +#include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/net.h> @@ -70,6 +70,31 @@ static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *f spin_unlock_bh(&ip6_id_lock); } +int __ip6_local_out(struct sk_buff *skb) +{ + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + ipv6_hdr(skb)->payload_len = htons(len); + + return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + dst_output); +} + +int ip6_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip6_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip6_local_out); + static int ip6_output_finish(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; @@ -120,8 +145,8 @@ static int ip6_output2(struct sk_buff *skb) is not supported in any case. */ if (newskb) - NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL, - newskb->dev, + NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, ip6_dev_loopback_xmit); if (ipv6_hdr(skb)->hop_limit == 0) { @@ -134,7 +159,8 @@ static int ip6_output2(struct sk_buff *skb) IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS); } - return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish); + return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev, + ip6_output_finish); } static inline int ip6_skb_dst_mtu(struct sk_buff *skb) @@ -236,7 +262,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) { IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTREQUESTS); - return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, + return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, dst_output); } @@ -423,7 +449,7 @@ int ip6_forward(struct sk_buff *skb) /* XXX: idev->cnf.proxy_ndp? */ if (ipv6_devconf.proxy_ndp && - pneigh_lookup(&nd_tbl, &hdr->daddr, skb->dev, 0)) { + pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) return ip6_input(skb); @@ -500,7 +526,8 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); - return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); + return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + ip6_forward_finish); error: IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); @@ -909,7 +936,8 @@ static int ip6_dst_lookup_tail(struct sock *sk, struct flowi fl_gw; int redirect; - ifp = ipv6_get_ifaddr(&fl->fl6_src, (*dst)->dev, 1); + ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src, + (*dst)->dev, 1); redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); if (ifp) @@ -1098,7 +1126,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0); + exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) - + rt->rt6i_nfheader_len; length += exthdrlen; transhdrlen += exthdrlen; } else { @@ -1113,7 +1142,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); - fragheaderlen = sizeof(struct ipv6hdr) + rt->u.dst.nfheader_len + (opt ? opt->opt_nflen : 0); + fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + + (opt ? opt->opt_nflen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { @@ -1401,10 +1431,6 @@ int ip6_push_pending_frames(struct sock *sk) *(__be32*)hdr = fl->fl6_flowlabel | htonl(0x60000000 | ((int)np->cork.tclass << 20)); - if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) - hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); - else - hdr->payload_len = 0; hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); @@ -1421,7 +1447,7 @@ int ip6_push_pending_frames(struct sock *sk) ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS); } - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); + err = ip6_local_out(skb); if (err) { if (err > 0) err = np->recverr ? net_xmit_errno(err) : 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5383b33db8c..9031e521c1d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -533,7 +533,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, fl.fl4_dst = eiph->saddr; fl.fl4_tos = RT_TOS(eiph->tos); fl.proto = IPPROTO_IPIP; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) goto out; skb2->dev = rt->u.dst.dev; @@ -545,7 +545,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, fl.fl4_dst = eiph->daddr; fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&rt, &fl) || + if (ip_route_output_key(&init_net, &rt, &fl) || rt->u.dst.dev->type != ARPHRD_TUNNEL) { ip_rt_put(rt); goto out; @@ -635,7 +635,7 @@ static void ip6ip6_dscp_ecn_decapsulate(struct ip6_tnl *t, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) - ipv6_copy_dscp(ipv6h, ipv6_hdr(skb)); + ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h))) IP6_ECN_set_ce(ipv6_hdr(skb)); @@ -653,8 +653,8 @@ static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) ldev = dev_get_by_index(&init_net, p->link); if ((ipv6_addr_is_multicast(&p->laddr) || - likely(ipv6_chk_addr(&p->laddr, ldev, 0))) && - likely(!ipv6_chk_addr(&p->raddr, NULL, 0))) + likely(ipv6_chk_addr(&init_net, &p->laddr, ldev, 0))) && + likely(!ipv6_chk_addr(&init_net, &p->raddr, NULL, 0))) ret = 1; if (ldev) @@ -788,12 +788,12 @@ static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) if (p->link) ldev = dev_get_by_index(&init_net, p->link); - if (unlikely(!ipv6_chk_addr(&p->laddr, ldev, 0))) + if (unlikely(!ipv6_chk_addr(&init_net, &p->laddr, ldev, 0))) printk(KERN_WARNING "%s xmit: Local address not yet configured!\n", p->name); else if (!ipv6_addr_is_multicast(&p->raddr) && - unlikely(ipv6_chk_addr(&p->raddr, NULL, 0))) + unlikely(ipv6_chk_addr(&init_net, &p->raddr, NULL, 0))) printk(KERN_WARNING "%s xmit: Routing loop! " "Remote address found on this node!\n", @@ -910,15 +910,13 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, *(__be32*)ipv6h = fl->fl6_flowlabel | htonl(0x60000000); dsfield = INET_ECN_encapsulate(0, dsfield); ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); - ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6_addr_copy(&ipv6h->saddr, &fl->fl6_src); ipv6_addr_copy(&ipv6h->daddr, &fl->fl6_dst); nf_reset(skb); pkt_len = skb->len; - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + err = ip6_local_out(skb); if (net_xmit_eval(err) == 0) { stats->tx_bytes += pkt_len; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 0cd4056f912..b276d04d6db 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -190,7 +190,6 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct xfrm_state *t = NULL; - u8 mode = XFRM_MODE_TUNNEL; t = xfrm_state_alloc(); if (!t) @@ -204,9 +203,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET6; - if (x->props.mode == XFRM_MODE_BEET) - mode = x->props.mode; - t->props.mode = mode; + t->props.mode = x->props.mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); if (xfrm_init_state(t)) @@ -405,22 +402,22 @@ static int ipcomp6_init_state(struct xfrm_state *x) if (x->encap) goto out; - err = -ENOMEM; - ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); - if (!ipcd) - goto out; - x->props.header_len = 0; switch (x->props.mode) { - case XFRM_MODE_BEET: case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct ipv6hdr); + break; default: - goto error; + goto out; } + err = -ENOMEM; + ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto out; + mutex_lock(&ipcomp6_resource_mutex); if (!ipcomp6_alloc_scratches()) goto error; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 8c5f80fd03a..bf2a686aa13 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -268,8 +268,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, struct inet_connection_sock *icsk = inet_csk(sk); local_bh_disable(); - sock_prot_dec_use(sk->sk_prot); - sock_prot_inc_use(&tcp_prot); + sock_prot_inuse_add(sk->sk_prot, -1); + sock_prot_inuse_add(&tcp_prot, 1); local_bh_enable(); sk->sk_prot = &tcp_prot; icsk->icsk_af_ops = &ipv4_specific; @@ -282,8 +282,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; local_bh_disable(); - sock_prot_dec_use(sk->sk_prot); - sock_prot_inc_use(prot); + sock_prot_inuse_add(sk->sk_prot, -1); + sock_prot_inuse_add(prot, 1); local_bh_enable(); sk->sk_prot = prot; sk->sk_socket->ops = &inet_dgram_ops; @@ -1128,9 +1128,10 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_ipv6_getsockopt); #endif -void __init ipv6_packet_init(void) +int __init ipv6_packet_init(void) { dev_add_pack(&ipv6_packet_type); + return 0; } void ipv6_packet_cleanup(void) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 331d728c203..ab228d1ea11 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -903,9 +903,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, struct in6_addr *addr) return -ENOMEM; } - init_timer(&mc->mca_timer); - mc->mca_timer.function = igmp6_timer_handler; - mc->mca_timer.data = (unsigned long) mc; + setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); ipv6_addr_copy(&mc->mca_addr, addr); mc->idev = idev; @@ -1450,7 +1448,7 @@ static inline int mld_dev_queue_xmit2(struct sk_buff *skb) static inline int mld_dev_queue_xmit(struct sk_buff *skb) { - return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, + return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev, mld_dev_queue_xmit2); } @@ -1471,7 +1469,7 @@ static void mld_sendpack(struct sk_buff *skb) pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), mldlen, 0)); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, + err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, mld_dev_queue_xmit); if (!err) { ICMP6MSGOUT_INC_STATS_BH(idev, ICMPV6_MLD2_REPORT); @@ -1815,7 +1813,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) idev = in6_dev_get(skb->dev); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, + err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, mld_dev_queue_xmit); if (!err) { ICMP6MSGOUT_INC_STATS(idev, type); @@ -2259,14 +2257,12 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) write_lock_bh(&idev->lock); rwlock_init(&idev->mc_lock); idev->mc_gq_running = 0; - init_timer(&idev->mc_gq_timer); - idev->mc_gq_timer.data = (unsigned long) idev; - idev->mc_gq_timer.function = &mld_gq_timer_expire; + setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire, + (unsigned long)idev); idev->mc_tomb = NULL; idev->mc_ifc_count = 0; - init_timer(&idev->mc_ifc_timer); - idev->mc_ifc_timer.data = (unsigned long) idev; - idev->mc_ifc_timer.function = &mld_ifc_timer_expire; + setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire, + (unsigned long)idev); idev->mc_qrv = MLD_QRV_DEFAULT; idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; idev->mc_v1_seen = 0; @@ -2377,6 +2373,7 @@ static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { read_lock(&dev_base_lock); return igmp6_mc_get_idx(seq, *pos); @@ -2391,6 +2388,7 @@ static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); if (likely(state->idev != NULL)) { @@ -2520,6 +2518,7 @@ static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { read_lock(&dev_base_lock); return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -2537,6 +2536,7 @@ static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (likely(state->im != NULL)) { diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 7fd841d4101..49d396620ea 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -34,11 +34,6 @@ #include <net/xfrm.h> #include <net/mip6.h> -static xfrm_address_t *mip6_xfrm_addr(struct xfrm_state *x, xfrm_address_t *addr) -{ - return x->coaddr; -} - static inline unsigned int calc_padlen(unsigned int len, unsigned int n) { return (n - len + 16) & 0x7; @@ -133,12 +128,15 @@ static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; + int err = destopt->nexthdr; + spin_lock(&x->lock); if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) - return -ENOENT; + err = -ENOENT; + spin_unlock(&x->lock); - return destopt->nexthdr; + return err; } /* Destination Option Header is inserted. @@ -337,25 +335,27 @@ static struct xfrm_type mip6_destopt_type = .description = "MIP6DESTOPT", .owner = THIS_MODULE, .proto = IPPROTO_DSTOPTS, - .flags = XFRM_TYPE_NON_FRAGMENT, + .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, .init_state = mip6_destopt_init_state, .destructor = mip6_destopt_destroy, .input = mip6_destopt_input, .output = mip6_destopt_output, .reject = mip6_destopt_reject, .hdr_offset = mip6_destopt_offset, - .local_addr = mip6_xfrm_addr, }; static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) { struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data; + int err = rt2->rt_hdr.nexthdr; + spin_lock(&x->lock); if (!ipv6_addr_equal(&rt2->addr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) - return -ENOENT; + err = -ENOENT; + spin_unlock(&x->lock); - return rt2->rt_hdr.nexthdr; + return err; } /* Routing Header type 2 is inserted. @@ -467,13 +467,12 @@ static struct xfrm_type mip6_rthdr_type = .description = "MIP6RT", .owner = THIS_MODULE, .proto = IPPROTO_ROUTING, - .flags = XFRM_TYPE_NON_FRAGMENT, + .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, .init_state = mip6_rthdr_init_state, .destructor = mip6_rthdr_destroy, .input = mip6_rthdr_input, .output = mip6_rthdr_output, .hdr_offset = mip6_rthdr_offset, - .remote_addr = mip6_xfrm_addr, }; static int __init mip6_init(void) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 85947eae5bf..0d33a7d3212 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -533,7 +533,8 @@ static void __ndisc_send(struct net_device *dev, idev = in6_dev_get(dst->dev); IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, + dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(idev, type); ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); @@ -555,7 +556,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, }; /* for anycast or proxy, solicited_addr != src_addr */ - ifp = ipv6_get_ifaddr(solicited_addr, dev, 1); + ifp = ipv6_get_ifaddr(&init_net, solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) @@ -615,7 +616,8 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, * suppress the inclusion of the sllao. */ if (send_sllao) { - struct inet6_ifaddr *ifp = ipv6_get_ifaddr(saddr, dev, 1); + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(&init_net, saddr, + dev, 1); if (ifp) { if (ifp->flags & IFA_F_OPTIMISTIC) { send_sllao = 0; @@ -652,7 +654,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); - if (skb && ipv6_chk_addr(&ipv6_hdr(skb)->saddr, dev, 1)) + if (skb && ipv6_chk_addr(&init_net, &ipv6_hdr(skb)->saddr, dev, 1)) saddr = &ipv6_hdr(skb)->saddr; if ((probes -= neigh->parms->ucast_probes) < 0) { @@ -740,7 +742,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) inc = ipv6_addr_is_multicast(daddr); - if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1)) != NULL) { + if ((ifp = ipv6_get_ifaddr(&init_net, &msg->target, dev, 1)) != NULL) { if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { @@ -788,7 +790,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) if (ipv6_chk_acast_addr(dev, &msg->target) || (idev->cnf.forwarding && (ipv6_devconf.proxy_ndp || idev->cnf.proxy_ndp) && - (pneigh = pneigh_lookup(&nd_tbl, + (pneigh = pneigh_lookup(&nd_tbl, &init_net, &msg->target, dev, 0)) != NULL)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && @@ -898,7 +900,7 @@ static void ndisc_recv_na(struct sk_buff *skb) return; } } - if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1))) { + if ((ifp = ipv6_get_ifaddr(&init_net, &msg->target, dev, 1))) { if (ifp->flags & IFA_F_TENTATIVE) { addrconf_dad_failure(ifp); return; @@ -929,7 +931,7 @@ static void ndisc_recv_na(struct sk_buff *skb) */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && ipv6_devconf.forwarding && ipv6_devconf.proxy_ndp && - pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) { + pneigh_lookup(&nd_tbl, &init_net, &msg->target, dev, 0)) { /* XXX: idev->cnf.prixy_ndp */ goto out; } @@ -1048,7 +1050,8 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) &ipv6_hdr(ra)->saddr); nlmsg_end(skb, nlh); - err = rtnl_notify(skb, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); + err = rtnl_notify(skb, &init_net, 0, RTNLGRP_ND_USEROPT, NULL, + GFP_ATOMIC); if (err < 0) goto errout; @@ -1058,7 +1061,7 @@ nla_put_failure: nlmsg_free(skb); err = -EMSGSIZE; errout: - rtnl_set_sk_err(RTNLGRP_ND_USEROPT, err); + rtnl_set_sk_err(&init_net, RTNLGRP_ND_USEROPT, err); } static void ndisc_router_discovery(struct sk_buff *skb) @@ -1294,11 +1297,11 @@ skip_defrtr: } if (ndopts.nd_useropts) { - struct nd_opt_hdr *opt; - for (opt = ndopts.nd_useropts; - opt; - opt = ndisc_next_useropt(opt, ndopts.nd_useropts_end)) { - ndisc_ra_useropt(skb, opt); + struct nd_opt_hdr *p; + for (p = ndopts.nd_useropts; + p; + p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) { + ndisc_ra_useropt(skb, p); } } @@ -1538,7 +1541,8 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, buff->dst = dst; idev = in6_dev_get(dst->dev); IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, dst_output); + err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, + dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(idev, NDISC_REDIRECT); ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index b1326c2bf8a..2e06724dc34 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -8,6 +8,7 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ip6_checksum.h> +#include <net/netfilter/nf_queue.h> int ip6_route_me_harder(struct sk_buff *skb) { @@ -56,11 +57,12 @@ struct ip6_rt_info { struct in6_addr saddr; }; -static void nf_ip6_saveroute(const struct sk_buff *skb, struct nf_info *info) +static void nf_ip6_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) { - struct ip6_rt_info *rt_info = nf_info_reroute(info); + struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (info->hook == NF_IP6_LOCAL_OUT) { + if (entry->hook == NF_INET_LOCAL_OUT) { struct ipv6hdr *iph = ipv6_hdr(skb); rt_info->daddr = iph->daddr; @@ -68,11 +70,12 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, struct nf_info *info) } } -static int nf_ip6_reroute(struct sk_buff *skb, const struct nf_info *info) +static int nf_ip6_reroute(struct sk_buff *skb, + const struct nf_queue_entry *entry) { - struct ip6_rt_info *rt_info = nf_info_reroute(info); + struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (info->hook == NF_IP6_LOCAL_OUT) { + if (entry->hook == NF_INET_LOCAL_OUT) { struct ipv6hdr *iph = ipv6_hdr(skb); if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr)) @@ -81,6 +84,12 @@ static int nf_ip6_reroute(struct sk_buff *skb, const struct nf_info *info) return 0; } +static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl) +{ + *dst = ip6_route_output(NULL, fl); + return (*dst)->error; +} + __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) { @@ -89,7 +98,7 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (hook != NF_IP6_PRE_ROUTING && hook != NF_IP6_LOCAL_IN) + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, @@ -115,9 +124,10 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, EXPORT_SYMBOL(nf_ip6_checksum); -static struct nf_afinfo nf_ip6_afinfo = { +static const struct nf_afinfo nf_ip6_afinfo = { .family = AF_INET6, .checksum = nf_ip6_checksum, + .route = nf_ip6_route, .saveroute = nf_ip6_saveroute, .reroute = nf_ip6_reroute, .route_key_size = sizeof(struct ip6_rt_info), diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 838b8ddee8c..4fc0b023cfd 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -2,12 +2,13 @@ # IP netfilter configuration # -menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" - depends on INET && IPV6 && NETFILTER && EXPERIMENTAL +menu "IPv6: Netfilter Configuration" + depends on INET && IPV6 && NETFILTER config NF_CONNTRACK_IPV6 - tristate "IPv6 connection tracking support (EXPERIMENTAL)" - depends on INET && IPV6 && EXPERIMENTAL && NF_CONNTRACK + tristate "IPv6 connection tracking support" + depends on INET && IPV6 && NF_CONNTRACK + default m if NETFILTER_ADVANCED=n ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -21,7 +22,8 @@ config NF_CONNTRACK_IPV6 config IP6_NF_QUEUE tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" - depends on INET && IPV6 && NETFILTER && EXPERIMENTAL + depends on INET && IPV6 && NETFILTER + depends on NETFILTER_ADVANCED ---help--- This option adds a queue handler to the kernel for IPv6 @@ -42,8 +44,9 @@ config IP6_NF_QUEUE config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" - depends on INET && IPV6 && EXPERIMENTAL + depends on INET && IPV6 select NETFILTER_XTABLES + default m if NETFILTER_ADVANCED=n help ip6tables is a general, extensible packet identification framework. Currently only the packet filtering and packet mangling subsystem @@ -54,8 +57,9 @@ config IP6_NF_IPTABLES # The simple matches. config IP6_NF_MATCH_RT - tristate "Routing header match support" + tristate '"rt" Routing header match support' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help rt matching allows you to match packets based on the routing header of the packet. @@ -63,8 +67,9 @@ config IP6_NF_MATCH_RT To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_OPTS - tristate "Hop-by-hop and Dst opts header match support" + tristate '"hopbyhop" and "dst" opts header match support' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help This allows one to match packets based on the hop-by-hop and destination options headers of a packet. @@ -72,8 +77,9 @@ config IP6_NF_MATCH_OPTS To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_FRAG - tristate "Fragmentation header match support" + tristate '"frag" Fragmentation header match support' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help frag matching allows you to match packets based on the fragmentation header of the packet. @@ -81,26 +87,19 @@ config IP6_NF_MATCH_FRAG To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_HL - tristate "HL match support" + tristate '"hl" match support' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help HL matching allows you to match packets based on the hop limit of the packet. To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_OWNER - tristate "Owner match support" - depends on IP6_NF_IPTABLES - help - Packet owner matching allows you to match locally-generated packets - based on who created them: the user, group, process or session. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_MATCH_IPV6HEADER - tristate "IPv6 Extension Headers Match" + tristate '"ipv6header" IPv6 Extension Headers Match' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help This module allows one to match packets based upon the ipv6 extension headers. @@ -108,24 +107,27 @@ config IP6_NF_MATCH_IPV6HEADER To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_AH - tristate "AH match support" + tristate '"ah" match support' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help This module allows one to match AH packets. To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_MH - tristate "MH match support" + tristate '"mh" match support' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help This module allows one to match MH packets. To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_EUI64 - tristate "EUI64 address check" + tristate '"eui64" address check' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help This module performs checking on the IPv6 source address Compares the last 64 bits with the EUI64 (delivered @@ -137,6 +139,7 @@ config IP6_NF_MATCH_EUI64 config IP6_NF_FILTER tristate "Packet filtering" depends on IP6_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -147,6 +150,7 @@ config IP6_NF_FILTER config IP6_NF_TARGET_LOG tristate "LOG target support" depends on IP6_NF_FILTER + default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in any iptables table which records the packet header to the syslog. @@ -156,6 +160,7 @@ config IP6_NF_TARGET_LOG config IP6_NF_TARGET_REJECT tristate "REJECT target support" depends on IP6_NF_FILTER + default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMPv6 error should be issued in response to an incoming packet, rather @@ -166,6 +171,7 @@ config IP6_NF_TARGET_REJECT config IP6_NF_MANGLE tristate "Packet mangling" depends on IP6_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -176,27 +182,29 @@ config IP6_NF_MANGLE config IP6_NF_TARGET_HL tristate 'HL (hoplimit) target support' depends on IP6_NF_MANGLE + depends on NETFILTER_ADVANCED help This option adds a `HL' target, which enables the user to decrement the hoplimit value of the IPv6 header or set it to a given (lower) value. - + While it is safe to decrement the hoplimit value, this option also enables functionality to increment and set the hoplimit value of the IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since you can easily create immortal packets that loop forever on the - network. + network. To compile it as a module, choose M here. If unsure, say N. config IP6_NF_RAW tristate 'raw table support (required for TRACE)' depends on IP6_NF_IPTABLES + depends on NETFILTER_ADVANCED help This option adds a `raw' table to ip6tables. This table is the very first in the netfilter framework and hooks in at the PREROUTING and OUTPUT chains. - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index e789ec44d23..fbf2c14ed88 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o -obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o # targets diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index e273605eef8..56b4ea6d29e 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -29,6 +29,7 @@ #include <net/sock.h> #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/netfilter/nf_queue.h> #include <linux/netfilter_ipv4/ip_queue.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> @@ -38,13 +39,7 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" -struct ipq_queue_entry { - struct list_head list; - struct nf_info *info; - struct sk_buff *skb; -}; - -typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); +typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; @@ -58,70 +53,13 @@ static struct sock *ipqnl __read_mostly; static LIST_HEAD(queue_list); static DEFINE_MUTEX(ipqnl_mutex); -static void -ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) -{ - local_bh_disable(); - nf_reinject(entry->skb, entry->info, verdict); - local_bh_enable(); - kfree(entry); -} - static inline void -__ipq_enqueue_entry(struct ipq_queue_entry *entry) +__ipq_enqueue_entry(struct nf_queue_entry *entry) { - list_add(&entry->list, &queue_list); + list_add_tail(&entry->list, &queue_list); queue_total++; } -/* - * Find and return a queued entry matched by cmpfn, or return the last - * entry if cmpfn is NULL. - */ -static inline struct ipq_queue_entry * -__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) -{ - struct list_head *p; - - list_for_each_prev(p, &queue_list) { - struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; - - if (!cmpfn || cmpfn(entry, data)) - return entry; - } - return NULL; -} - -static inline void -__ipq_dequeue_entry(struct ipq_queue_entry *entry) -{ - list_del(&entry->list); - queue_total--; -} - -static inline struct ipq_queue_entry * -__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) -{ - struct ipq_queue_entry *entry; - - entry = __ipq_find_entry(cmpfn, data); - if (entry == NULL) - return NULL; - - __ipq_dequeue_entry(entry); - return entry; -} - - -static inline void -__ipq_flush(int verdict) -{ - struct ipq_queue_entry *entry; - - while ((entry = __ipq_find_dequeue_entry(NULL, 0))) - ipq_issue_verdict(entry, verdict); -} - static inline int __ipq_set_mode(unsigned char mode, unsigned int range) { @@ -148,36 +86,64 @@ __ipq_set_mode(unsigned char mode, unsigned int range) return status; } +static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); + static inline void __ipq_reset(void) { peer_pid = 0; net_disable_timestamp(); __ipq_set_mode(IPQ_COPY_NONE, 0); - __ipq_flush(NF_DROP); + __ipq_flush(NULL, 0); } -static struct ipq_queue_entry * -ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +static struct nf_queue_entry * +ipq_find_dequeue_entry(unsigned long id) { - struct ipq_queue_entry *entry; + struct nf_queue_entry *entry = NULL, *i; write_lock_bh(&queue_lock); - entry = __ipq_find_dequeue_entry(cmpfn, data); + + list_for_each_entry(i, &queue_list, list) { + if ((unsigned long)i == id) { + entry = i; + break; + } + } + + if (entry) { + list_del(&entry->list); + queue_total--; + } + write_unlock_bh(&queue_lock); return entry; } static void -ipq_flush(int verdict) +__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue_total--; + nf_reinject(entry, NF_DROP); + } + } +} + +static void +ipq_flush(ipq_cmpfn cmpfn, unsigned long data) { write_lock_bh(&queue_lock); - __ipq_flush(verdict); + __ipq_flush(cmpfn, data); write_unlock_bh(&queue_lock); } static struct sk_buff * -ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) { sk_buff_data_t old_tail; size_t size = 0; @@ -234,20 +200,20 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->timestamp_sec = tv.tv_sec; pmsg->timestamp_usec = tv.tv_usec; pmsg->mark = entry->skb->mark; - pmsg->hook = entry->info->hook; + pmsg->hook = entry->hook; pmsg->hw_protocol = entry->skb->protocol; - if (entry->info->indev) - strcpy(pmsg->indev_name, entry->info->indev->name); + if (entry->indev) + strcpy(pmsg->indev_name, entry->indev->name); else pmsg->indev_name[0] = '\0'; - if (entry->info->outdev) - strcpy(pmsg->outdev_name, entry->info->outdev->name); + if (entry->outdev) + strcpy(pmsg->outdev_name, entry->outdev->name); else pmsg->outdev_name[0] = '\0'; - if (entry->info->indev && entry->skb->dev) { + if (entry->indev && entry->skb->dev) { pmsg->hw_type = entry->skb->dev->type; pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); } @@ -268,28 +234,17 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, - unsigned int queuenum, void *data) +ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { int status = -EINVAL; struct sk_buff *nskb; - struct ipq_queue_entry *entry; if (copy_mode == IPQ_COPY_NONE) return -EAGAIN; - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) { - printk(KERN_ERR "ip6_queue: OOM in ipq_enqueue_packet()\n"); - return -ENOMEM; - } - - entry->info = info; - entry->skb = skb; - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) - goto err_out_free; + return status; write_lock_bh(&queue_lock); @@ -323,14 +278,11 @@ err_out_free_nskb: err_out_unlock: write_unlock_bh(&queue_lock); - -err_out_free: - kfree(entry); return status; } static int -ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e) { int diff; int err; @@ -365,21 +317,15 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) return 0; } -static inline int -id_cmp(struct ipq_queue_entry *e, unsigned long id) -{ - return (id == (unsigned long )e); -} - static int ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) { - struct ipq_queue_entry *entry; + struct nf_queue_entry *entry; if (vmsg->value > NF_MAX_VERDICT) return -EINVAL; - entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + entry = ipq_find_dequeue_entry(vmsg->id); if (entry == NULL) return -ENOENT; else { @@ -389,7 +335,7 @@ ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) if (ipq_mangle_ipv6(vmsg, entry) < 0) verdict = NF_DROP; - ipq_issue_verdict(entry, verdict); + nf_reinject(entry, verdict); return 0; } } @@ -434,26 +380,32 @@ ipq_receive_peer(struct ipq_peer_msg *pmsg, } static int -dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { - if (entry->info->indev) - if (entry->info->indev->ifindex == ifindex) + if (entry->indev) + if (entry->indev->ifindex == ifindex) return 1; - if (entry->info->outdev) - if (entry->info->outdev->ifindex == ifindex) + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) return 1; - +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + if (entry->skb->nf_bridge->physindev && + entry->skb->nf_bridge->physindev->ifindex == ifindex) + return 1; + if (entry->skb->nf_bridge->physoutdev && + entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + return 1; + } +#endif return 0; } static void ipq_dev_drop(int ifindex) { - struct ipq_queue_entry *entry; - - while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) - ipq_issue_verdict(entry, NF_DROP); + ipq_flush(dev_cmp, ifindex); } #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) @@ -577,26 +529,6 @@ static ctl_table ipq_table[] = { { .ctl_name = 0 } }; -static ctl_table ipq_dir_table[] = { - { - .ctl_name = NET_IPV6, - .procname = "ipv6", - .mode = 0555, - .child = ipq_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipq_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipq_dir_table - }, - { .ctl_name = 0 } -}; - static int ip6_queue_show(struct seq_file *m, void *v) { read_lock_bh(&queue_lock); @@ -634,7 +566,7 @@ static const struct file_operations ip6_queue_proc_fops = { .owner = THIS_MODULE, }; -static struct nf_queue_handler nfqh = { +static const struct nf_queue_handler nfqh = { .name = "ip6_queue", .outfn = &ipq_enqueue_packet, }; @@ -662,7 +594,7 @@ static int __init ip6_queue_init(void) } register_netdevice_notifier(&ipq_dev_notifier); - ipq_sysctl_header = register_sysctl_table(ipq_root_table); + ipq_sysctl_header = register_sysctl_paths(net_ipv6_ctl_path, ipq_table); status = nf_register_queue_handler(PF_INET6, &nfqh); if (status < 0) { @@ -677,7 +609,7 @@ cleanup_sysctl: proc_net_remove(&init_net, IPQ_PROC_FS_NAME); cleanup_ipqnl: - sock_release(ipqnl->sk_socket); + netlink_kernel_release(ipqnl); mutex_lock(&ipqnl_mutex); mutex_unlock(&ipqnl_mutex); @@ -690,13 +622,13 @@ static void __exit ip6_queue_fini(void) { nf_unregister_queue_handlers(&nfqh); synchronize_net(); - ipq_flush(NF_DROP); + ipq_flush(NULL, 0); unregister_sysctl_table(ipq_sysctl_header); unregister_netdevice_notifier(&ipq_dev_notifier); proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - sock_release(ipqnl->sk_socket); + netlink_kernel_release(ipqnl); mutex_lock(&ipqnl_mutex); mutex_unlock(&ipqnl_mutex); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index acaba153793..dd7860fea61 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -19,21 +19,21 @@ #include <linux/poison.h> #include <linux/icmpv6.h> #include <net/ipv6.h> +#include <net/compat.h> #include <asm/uaccess.h> #include <linux/mutex.h> #include <linux/proc_fs.h> +#include <linux/err.h> #include <linux/cpumask.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_log.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); -#define IPV6_HDR_LEN (sizeof(struct ipv6hdr)) -#define IPV6_OPTHDR_LEN (sizeof(struct ipv6_opt_hdr)) - /*#define DEBUG_IP_FIREWALL*/ /*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ /*#define DEBUG_IP_FIREWALL_USER*/ @@ -76,12 +76,6 @@ do { \ Hence the start of any table is given by get_table() below. */ -#if 0 -#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) -#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) -#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) -#endif - /* Check for an extension */ int ip6t_ext_hdr(u8 nexthdr) @@ -96,6 +90,7 @@ ip6t_ext_hdr(u8 nexthdr) } /* Returns whether matches rule or not. */ +/* Performance critical - called for every packet */ static inline bool ip6_packet_match(const struct sk_buff *skb, const char *indev, @@ -108,7 +103,7 @@ ip6_packet_match(const struct sk_buff *skb, unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); -#define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg)) +#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg))) if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src), IP6T_INV_SRCIP) @@ -188,7 +183,7 @@ ip6_packet_match(const struct sk_buff *skb, } /* should be ip6 safe */ -static inline bool +static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { if (ipv6->flags & ~IP6T_F_MASK) { @@ -218,8 +213,9 @@ ip6t_error(struct sk_buff *skb, return NF_DROP; } -static inline -bool do_match(struct ip6t_entry_match *m, +/* Performance critical - called for every packet */ +static inline bool +do_match(struct ip6t_entry_match *m, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -242,6 +238,7 @@ get_entry(void *base, unsigned int offset) } /* All zeroes == unconditional rule. */ +/* Mildly perf critical (only if packet tracing is on) */ static inline int unconditional(const struct ip6t_ip6 *ipv6) { @@ -257,12 +254,12 @@ unconditional(const struct ip6t_ip6 *ipv6) #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) /* This cries for unification! */ -static const char *hooknames[] = { - [NF_IP6_PRE_ROUTING] = "PREROUTING", - [NF_IP6_LOCAL_IN] = "INPUT", - [NF_IP6_FORWARD] = "FORWARD", - [NF_IP6_LOCAL_OUT] = "OUTPUT", - [NF_IP6_POST_ROUTING] = "POSTROUTING", +static const char *const hooknames[] = { + [NF_INET_PRE_ROUTING] = "PREROUTING", + [NF_INET_LOCAL_IN] = "INPUT", + [NF_INET_FORWARD] = "FORWARD", + [NF_INET_LOCAL_OUT] = "OUTPUT", + [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { @@ -271,7 +268,7 @@ enum nf_ip_trace_comments { NF_IP6_TRACE_COMMENT_POLICY, }; -static const char *comments[] = { +static const char *const comments[] = { [NF_IP6_TRACE_COMMENT_RULE] = "rule", [NF_IP6_TRACE_COMMENT_RETURN] = "return", [NF_IP6_TRACE_COMMENT_POLICY] = "policy", @@ -287,6 +284,7 @@ static struct nf_loginfo trace_loginfo = { }, }; +/* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(struct ip6t_entry *s, struct ip6t_entry *e, char *hookname, char **chainname, @@ -378,8 +376,8 @@ ip6t_do_table(struct sk_buff *skb, * match it. */ read_lock_bh(&table->lock); - private = table->private; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + private = table->private; table_base = (void *)private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -399,9 +397,8 @@ ip6t_do_table(struct sk_buff *skb, goto no_match; ADD_COUNTER(e->counters, - ntohs(ipv6_hdr(skb)->payload_len) - + IPV6_HDR_LEN, - 1); + ntohs(ipv6_hdr(skb)->payload_len) + + sizeof(struct ipv6hdr), 1); t = ip6t_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -502,11 +499,9 @@ mark_source_chains(struct xt_table_info *newinfo, /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ - for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { + for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct ip6t_entry *e - = (struct ip6t_entry *)(entry0 + pos); - int visited = e->comefrom & (1 << hook); + struct ip6t_entry *e = (struct ip6t_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -517,14 +512,14 @@ mark_source_chains(struct xt_table_info *newinfo, for (;;) { struct ip6t_standard_target *t = (void *)ip6t_get_target(e); + int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_IP6_NUMHOOKS)) { + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { printk("iptables: loop hook %u pos %u %08X.\n", hook, pos, e->comefrom); return 0; } - e->comefrom - |= ((1 << hook) | (1 << NF_IP6_NUMHOOKS)); + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct ip6t_entry) @@ -544,10 +539,10 @@ mark_source_chains(struct xt_table_info *newinfo, /* Return: backtrack through the last big jump. */ do { - e->comefrom ^= (1<<NF_IP6_NUMHOOKS); + e->comefrom ^= (1<<NF_INET_NUMHOOKS); #ifdef DEBUG_IP_FIREWALL_USER if (e->comefrom - & (1 << NF_IP6_NUMHOOKS)) { + & (1 << NF_INET_NUMHOOKS)) { duprintf("Back unset " "on hook %u " "rule %u\n", @@ -604,7 +599,7 @@ mark_source_chains(struct xt_table_info *newinfo, return 1; } -static inline int +static int cleanup_match(struct ip6t_entry_match *m, unsigned int *i) { if (i && (*i)-- == 0) @@ -616,102 +611,135 @@ cleanup_match(struct ip6t_entry_match *m, unsigned int *i) return 0; } -static inline int -check_match(struct ip6t_entry_match *m, - const char *name, - const struct ip6t_ip6 *ipv6, - unsigned int hookmask, - unsigned int *i) +static int +check_entry(struct ip6t_entry *e, const char *name) +{ + struct ip6t_entry_target *t; + + if (!ip6_checkentry(&e->ipv6)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + if (e->target_offset + sizeof(struct ip6t_entry_target) > + e->next_offset) + return -EINVAL; + + t = ip6t_get_target(e); + if (e->target_offset + t->u.target_size > e->next_offset) + return -EINVAL; + + return 0; +} + +static int check_match(struct ip6t_entry_match *m, const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, unsigned int *i) +{ + struct xt_match *match; + int ret; + + match = m->u.kernel.match; + ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m), + name, hookmask, ipv6->proto, + ipv6->invflags & IP6T_INV_PROTO); + if (!ret && m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ipv6, match, m->data, + hookmask)) { + duprintf("ip_tables: check failed for `%s'.\n", + m->u.kernel.match->name); + ret = -EINVAL; + } + if (!ret) + (*i)++; + return ret; +} + +static int +find_check_match(struct ip6t_entry_match *m, + const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, + unsigned int *i) { struct xt_match *match; int ret; match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name, - m->u.user.revision), + m->u.user.revision), "ip6t_%s", m->u.user.name); if (IS_ERR(match) || !match) { - duprintf("check_match: `%s' not found\n", m->u.user.name); + duprintf("find_check_match: `%s' not found\n", m->u.user.name); return match ? PTR_ERR(match) : -ENOENT; } m->u.kernel.match = match; - ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m), - name, hookmask, ipv6->proto, - ipv6->invflags & IP6T_INV_PROTO); + ret = check_match(m, name, ipv6, hookmask, i); if (ret) goto err; - if (m->u.kernel.match->checkentry - && !m->u.kernel.match->checkentry(name, ipv6, match, m->data, - hookmask)) { - duprintf("ip_tables: check failed for `%s'.\n", - m->u.kernel.match->name); - ret = -EINVAL; - goto err; - } - - (*i)++; return 0; err: module_put(m->u.kernel.match->me); return ret; } -static struct xt_target ip6t_standard_target; - -static inline int -check_entry(struct ip6t_entry *e, const char *name, unsigned int size, - unsigned int *i) +static int check_target(struct ip6t_entry *e, const char *name) { struct ip6t_entry_target *t; struct xt_target *target; int ret; - unsigned int j; - if (!ip6_checkentry(&e->ipv6)) { - duprintf("ip_tables: ip check failed %p %s.\n", e, name); - return -EINVAL; + t = ip6t_get_target(e); + target = t->u.kernel.target; + ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t), + name, e->comefrom, e->ipv6.proto, + e->ipv6.invflags & IP6T_INV_PROTO); + if (!ret && t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, target, t->data, + e->comefrom)) { + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; } + return ret; +} - if (e->target_offset + sizeof(struct ip6t_entry_target) > - e->next_offset) - return -EINVAL; +static int +find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ip6t_entry_target *t; + struct xt_target *target; + int ret; + unsigned int j; + + ret = check_entry(e, name); + if (ret) + return ret; j = 0; - ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, e->comefrom, &j); + ret = IP6T_MATCH_ITERATE(e, find_check_match, name, &e->ipv6, + e->comefrom, &j); if (ret != 0) goto cleanup_matches; t = ip6t_get_target(e); - ret = -EINVAL; - if (e->target_offset + t->u.target_size > e->next_offset) - goto cleanup_matches; target = try_then_request_module(xt_find_target(AF_INET6, t->u.user.name, t->u.user.revision), "ip6t_%s", t->u.user.name); if (IS_ERR(target) || !target) { - duprintf("check_entry: `%s' not found\n", t->u.user.name); + duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = target ? PTR_ERR(target) : -ENOENT; goto cleanup_matches; } t->u.kernel.target = target; - ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t), - name, e->comefrom, e->ipv6.proto, - e->ipv6.invflags & IP6T_INV_PROTO); + ret = check_target(e, name); if (ret) goto err; - if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { - duprintf("ip_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - ret = -EINVAL; - goto err; - } - (*i)++; return 0; err: @@ -721,7 +749,7 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size, return ret; } -static inline int +static int check_entry_size_and_hooks(struct ip6t_entry *e, struct xt_table_info *newinfo, unsigned char *base, @@ -746,7 +774,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, } /* Check hooks & underflows */ - for (h = 0; h < NF_IP6_NUMHOOKS; h++) { + for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) @@ -764,7 +792,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, return 0; } -static inline int +static int cleanup_entry(struct ip6t_entry *e, unsigned int *i) { struct ip6t_entry_target *t; @@ -800,7 +828,7 @@ translate_table(const char *name, newinfo->number = number; /* Init all hooks to impossible value. */ - for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } @@ -824,7 +852,7 @@ translate_table(const char *name, } /* Check hooks all assigned */ - for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(valid_hooks & (1 << i))) continue; @@ -846,7 +874,7 @@ translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, - check_entry, name, size, &i); + find_check_entry, name, size, &i); if (ret != 0) { IP6T_ENTRY_ITERATE(entry0, newinfo->size, @@ -860,7 +888,7 @@ translate_table(const char *name, memcpy(newinfo->entries[i], entry0, newinfo->size); } - return 0; + return ret; } /* Gets counters. */ @@ -920,33 +948,49 @@ get_counters(const struct xt_table_info *t, } } -static int -copy_entries_to_user(unsigned int total_size, - struct xt_table *table, - void __user *userptr) +static struct xt_counters *alloc_counters(struct xt_table *table) { - unsigned int off, num, countersize; - struct ip6t_entry *e; + unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - int ret = 0; - void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc(countersize); + counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* First, sum counters... */ write_lock_bh(&table->lock); get_counters(private, counters); write_unlock_bh(&table->lock); - /* choose the copy that is on ourc node/cpu */ + return counters; +} + +static int +copy_entries_to_user(unsigned int total_size, + struct xt_table *table, + void __user *userptr) +{ + unsigned int off, num; + struct ip6t_entry *e; + struct xt_counters *counters; + struct xt_table_info *private = table->private; + int ret = 0; + void *loc_cpu_entry; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; @@ -1001,23 +1045,167 @@ copy_entries_to_user(unsigned int total_size, return ret; } +#ifdef CONFIG_COMPAT +static void compat_standard_from_user(void *dst, void *src) +{ + int v = *(compat_int_t *)src; + + if (v > 0) + v += xt_compat_calc_jump(AF_INET6, v); + memcpy(dst, &v, sizeof(v)); +} + +static int compat_standard_to_user(void __user *dst, void *src) +{ + compat_int_t cv = *(int *)src; + + if (cv > 0) + cv -= xt_compat_calc_jump(AF_INET6, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; +} + +static inline int +compat_calc_match(struct ip6t_entry_match *m, int *size) +{ + *size += xt_compat_match_offset(m->u.kernel.match); + return 0; +} + +static int compat_calc_entry(struct ip6t_entry *e, + const struct xt_table_info *info, + void *base, struct xt_table_info *newinfo) +{ + struct ip6t_entry_target *t; + unsigned int entry_offset; + int off, i, ret; + + off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + entry_offset = (void *)e - base; + IP6T_MATCH_ITERATE(e, compat_calc_match, &off); + t = ip6t_get_target(e); + off += xt_compat_target_offset(t->u.kernel.target); + newinfo->size -= off; + ret = xt_compat_add_offset(AF_INET6, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct ip6t_entry *)(base + info->hook_entry[i]))) + newinfo->hook_entry[i] -= off; + if (info->underflow[i] && + (e < (struct ip6t_entry *)(base + info->underflow[i]))) + newinfo->underflow[i] -= off; + } + return 0; +} + +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) +{ + void *loc_cpu_entry; + + if (!newinfo || !info) + return -EINVAL; + + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; + loc_cpu_entry = info->entries[raw_smp_processor_id()]; + return IP6T_ENTRY_ITERATE(loc_cpu_entry, info->size, + compat_calc_entry, info, loc_cpu_entry, + newinfo); +} +#endif + +static int get_info(void __user *user, int *len, int compat) +{ + char name[IP6T_TABLE_MAXNAMELEN]; + struct xt_table *t; + int ret; + + if (*len != sizeof(struct ip6t_getinfo)) { + duprintf("length %u != %zu\n", *len, + sizeof(struct ip6t_getinfo)); + return -EINVAL; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) + return -EFAULT; + + name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_lock(AF_INET6); +#endif + t = try_then_request_module(xt_find_table_lock(AF_INET6, name), + "ip6table_%s", name); + if (t && !IS_ERR(t)) { + struct ip6t_getinfo info; + struct xt_table_info *private = t->private; + +#ifdef CONFIG_COMPAT + if (compat) { + struct xt_table_info tmp; + ret = compat_table_info(private, &tmp); + xt_compat_flush_offsets(AF_INET6); + private = &tmp; + } +#endif + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, private->underflow, + sizeof(info.underflow)); + info.num_entries = private->number; + info.size = private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + xt_table_unlock(t); + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_unlock(AF_INET6); +#endif + return ret; +} + static int -get_entries(const struct ip6t_get_entries *entries, - struct ip6t_get_entries __user *uptr) +get_entries(struct ip6t_get_entries __user *uptr, int *len) { int ret; + struct ip6t_get_entries get; struct xt_table *t; - t = xt_find_table_lock(AF_INET6, entries->name); + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct ip6t_get_entries) + get.size) { + duprintf("get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + t = xt_find_table_lock(AF_INET6, get.name); if (t && !IS_ERR(t)) { struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); - if (entries->size == private->size) + if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - private->size, entries->size); + private->size, get.size); ret = -EINVAL; } module_put(t->me); @@ -1029,67 +1217,40 @@ get_entries(const struct ip6t_get_entries *entries, } static int -do_replace(void __user *user, unsigned int len) +__do_replace(const char *name, unsigned int valid_hooks, + struct xt_table_info *newinfo, unsigned int num_counters, + void __user *counters_ptr) { int ret; - struct ip6t_replace tmp; struct xt_table *t; - struct xt_table_info *newinfo, *oldinfo; + struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_entry, *loc_cpu_old_entry; + void *loc_cpu_old_entry; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) - return -EFAULT; - - /* overflow check */ - if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - - SMP_CACHE_BYTES) - return -ENOMEM; - if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) - return -ENOMEM; - - newinfo = xt_alloc_table_info(tmp.size); - if (!newinfo) - return -ENOMEM; - - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { - ret = -EFAULT; - goto free_newinfo; - } - - counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); + ret = 0; + counters = vmalloc_node(num_counters * sizeof(struct xt_counters), + numa_node_id()); if (!counters) { ret = -ENOMEM; - goto free_newinfo; + goto out; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); - if (ret != 0) - goto free_newinfo_counters; - - duprintf("ip_tables: Translated table\n"); - - t = try_then_request_module(xt_find_table_lock(AF_INET6, tmp.name), - "ip6table_%s", tmp.name); + t = try_then_request_module(xt_find_table_lock(AF_INET6, name), + "ip6table_%s", name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } /* You lied! */ - if (tmp.valid_hooks != t->valid_hooks) { + if (valid_hooks != t->valid_hooks) { duprintf("Valid hook crap: %08X vs %08X\n", - tmp.valid_hooks, t->valid_hooks); + valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } - oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); + oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; @@ -1107,10 +1268,11 @@ do_replace(void __user *user, unsigned int len) get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, + NULL); xt_free_table_info(oldinfo); - if (copy_to_user(tmp.counters, counters, - sizeof(struct xt_counters) * tmp.num_counters) != 0) + if (copy_to_user(counters_ptr, counters, + sizeof(struct xt_counters) * num_counters) != 0) ret = -EFAULT; vfree(counters); xt_table_unlock(t); @@ -1120,9 +1282,54 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); - free_newinfo_counters: vfree(counters); + out: + return ret; +} + +static int +do_replace(void __user *user, unsigned int len) +{ + int ret; + struct ip6t_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("ip_tables: Translated table\n"); + + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1151,31 +1358,59 @@ add_counter_to_entry(struct ip6t_entry *e, } static int -do_add_counters(void __user *user, unsigned int len) +do_add_counters(void __user *user, unsigned int len, int compat) { unsigned int i; - struct xt_counters_info tmp, *paddc; - struct xt_table_info *private; + struct xt_counters_info tmp; + struct xt_counters *paddc; + unsigned int num_counters; + char *name; + int size; + void *ptmp; struct xt_table *t; + struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; +#ifdef CONFIG_COMPAT + struct compat_xt_counters_info compat_tmp; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (compat) { + ptmp = &compat_tmp; + size = sizeof(struct compat_xt_counters_info); + } else +#endif + { + ptmp = &tmp; + size = sizeof(struct xt_counters_info); + } + + if (copy_from_user(ptmp, user, size) != 0) return -EFAULT; - if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) +#ifdef CONFIG_COMPAT + if (compat) { + num_counters = compat_tmp.num_counters; + name = compat_tmp.name; + } else +#endif + { + num_counters = tmp.num_counters; + name = tmp.name; + } + + if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len - size, numa_node_id()); if (!paddc) return -ENOMEM; - if (copy_from_user(paddc, user, len) != 0) { + if (copy_from_user(paddc, user + size, len - size) != 0) { ret = -EFAULT; goto free; } - t = xt_find_table_lock(AF_INET6, tmp.name); + t = xt_find_table_lock(AF_INET6, name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; @@ -1183,18 +1418,18 @@ do_add_counters(void __user *user, unsigned int len) write_lock_bh(&t->lock); private = t->private; - if (private->number != tmp.num_counters) { + if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + loc_cpu_entry = private->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, - paddc->counters, + paddc, &i); unlock_up_free: write_unlock_bh(&t->lock); @@ -1206,8 +1441,433 @@ do_add_counters(void __user *user, unsigned int len) return ret; } +#ifdef CONFIG_COMPAT +struct compat_ip6t_replace { + char name[IP6T_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_INET_NUMHOOKS]; + u32 underflow[NF_INET_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; /* struct ip6t_counters * */ + struct compat_ip6t_entry entries[0]; +}; + static int -do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, + compat_uint_t *size, struct xt_counters *counters, + unsigned int *i) +{ + struct ip6t_entry_target *t; + struct compat_ip6t_entry __user *ce; + u_int16_t target_offset, next_offset; + compat_uint_t origsize; + int ret; + + ret = -EFAULT; + origsize = *size; + ce = (struct compat_ip6t_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(struct ip6t_entry))) + goto out; + + if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) + goto out; + + *dstptr += sizeof(struct compat_ip6t_entry); + *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + + ret = IP6T_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); + target_offset = e->target_offset - (origsize - *size); + if (ret) + goto out; + t = ip6t_get_target(e); + ret = xt_compat_target_to_user(t, dstptr, size); + if (ret) + goto out; + ret = -EFAULT; + next_offset = e->next_offset - (origsize - *size); + if (put_user(target_offset, &ce->target_offset)) + goto out; + if (put_user(next_offset, &ce->next_offset)) + goto out; + + (*i)++; + return 0; +out: + return ret; +} + +static int +compat_find_calc_match(struct ip6t_entry_match *m, + const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, + int *size, int *i) +{ + struct xt_match *match; + + match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name, + m->u.user.revision), + "ip6t_%s", m->u.user.name); + if (IS_ERR(match) || !match) { + duprintf("compat_check_calc_match: `%s' not found\n", + m->u.user.name); + return match ? PTR_ERR(match) : -ENOENT; + } + m->u.kernel.match = match; + *size += xt_compat_match_offset(match); + + (*i)++; + return 0; +} + +static int +compat_release_match(struct ip6t_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + module_put(m->u.kernel.match->me); + return 0; +} + +static int +compat_release_entry(struct compat_ip6t_entry *e, unsigned int *i) +{ + struct ip6t_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + COMPAT_IP6T_MATCH_ITERATE(e, compat_release_match, NULL); + t = compat_ip6t_get_target(e); + module_put(t->u.kernel.target->me); + return 0; +} + +static int +check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + unsigned char *base, + unsigned char *limit, + unsigned int *hook_entries, + unsigned int *underflows, + unsigned int *i, + const char *name) +{ + struct ip6t_entry_target *t; + struct xt_target *target; + unsigned int entry_offset; + int ret, off, h, j; + + duprintf("check_compat_entry_size_and_hooks %p\n", e); + if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 + || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + duprintf("Bad offset %p, limit = %p\n", e, limit); + return -EINVAL; + } + + if (e->next_offset < sizeof(struct compat_ip6t_entry) + + sizeof(struct compat_xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct ip6t_entry *)e, name); + if (ret) + return ret; + + off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + entry_offset = (void *)e - (void *)base; + j = 0; + ret = COMPAT_IP6T_MATCH_ITERATE(e, compat_find_calc_match, name, + &e->ipv6, e->comefrom, &off, &j); + if (ret != 0) + goto release_matches; + + t = compat_ip6t_get_target(e); + target = try_then_request_module(xt_find_target(AF_INET6, + t->u.user.name, + t->u.user.revision), + "ip6t_%s", t->u.user.name); + if (IS_ERR(target) || !target) { + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); + ret = target ? PTR_ERR(target) : -ENOENT; + goto release_matches; + } + t->u.kernel.target = target; + + off += xt_compat_target_offset(target); + *size += off; + ret = xt_compat_add_offset(AF_INET6, entry_offset, off); + if (ret) + goto out; + + /* Check hooks & underflows */ + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* Clear counters and comefrom */ + memset(&e->counters, 0, sizeof(e->counters)); + e->comefrom = 0; + + (*i)++; + return 0; + +out: + module_put(t->u.kernel.target->me); +release_matches: + IP6T_MATCH_ITERATE(e, compat_release_match, &j); + return ret; +} + +static int +compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) +{ + struct ip6t_entry_target *t; + struct xt_target *target; + struct ip6t_entry *de; + unsigned int origsize; + int ret, h; + + ret = 0; + origsize = *size; + de = (struct ip6t_entry *)*dstptr; + memcpy(de, e, sizeof(struct ip6t_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); + + *dstptr += sizeof(struct ip6t_entry); + *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + + ret = COMPAT_IP6T_MATCH_ITERATE(e, xt_compat_match_from_user, + dstptr, size); + if (ret) + return ret; + de->target_offset = e->target_offset - (origsize - *size); + t = compat_ip6t_get_target(e); + target = t->u.kernel.target; + xt_compat_target_from_user(t, dstptr, size); + + de->next_offset = e->next_offset - (origsize - *size); + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)de - base < newinfo->hook_entry[h]) + newinfo->hook_entry[h] -= origsize - *size; + if ((unsigned char *)de - base < newinfo->underflow[h]) + newinfo->underflow[h] -= origsize - *size; + } + return ret; +} + +static int compat_check_entry(struct ip6t_entry *e, const char *name, + unsigned int *i) +{ + int j, ret; + + j = 0; + ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, + e->comefrom, &j); + if (ret) + goto cleanup_matches; + + ret = check_target(e, name); + if (ret) + goto cleanup_matches; + + (*i)++; + return 0; + + cleanup_matches: + IP6T_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static int +translate_compat_table(const char *name, + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) +{ + unsigned int i, j; + struct xt_table_info *newinfo, *info; + void *pos, *entry0, *entry1; + unsigned int size; + int ret; + + info = *pinfo; + entry0 = *pentry0; + size = total_size; + info->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + info->hook_entry[i] = 0xFFFFFFFF; + info->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_compat_table: size %u\n", info->size); + j = 0; + xt_compat_lock(AF_INET6); + /* Walk through entries, checking offsets. */ + ret = COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, + check_compat_entry_size_and_hooks, + info, &size, entry0, + entry0 + total_size, + hook_entries, underflows, &j, name); + if (ret != 0) + goto out_unlock; + + ret = -EINVAL; + if (j != number) { + duprintf("translate_compat_table: %u not %u entries\n", + j, number); + goto out_unlock; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (info->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + goto out_unlock; + } + if (info->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + goto out_unlock; + } + } + + ret = -ENOMEM; + newinfo = xt_alloc_table_info(size); + if (!newinfo) + goto out_unlock; + + newinfo->number = number; + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + newinfo->hook_entry[i] = info->hook_entry[i]; + newinfo->underflow[i] = info->underflow[i]; + } + entry1 = newinfo->entries[raw_smp_processor_id()]; + pos = entry1; + size = total_size; + ret = COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, + compat_copy_entry_from_user, + &pos, &size, name, newinfo, entry1); + xt_compat_flush_offsets(AF_INET6); + xt_compat_unlock(AF_INET6); + if (ret) + goto free_newinfo; + + ret = -ELOOP; + if (!mark_source_chains(newinfo, valid_hooks, entry1)) + goto free_newinfo; + + i = 0; + ret = IP6T_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, + name, &i); + if (ret) { + j -= i; + COMPAT_IP6T_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, + compat_release_entry, &j); + IP6T_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_free_table_info(newinfo); + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) + if (newinfo->entries[i] && newinfo->entries[i] != entry1) + memcpy(newinfo->entries[i], entry1, newinfo->size); + + *pinfo = newinfo; + *pentry0 = entry1; + xt_free_table_info(info); + return 0; + +free_newinfo: + xt_free_table_info(newinfo); +out: + COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + return ret; +out_unlock: + xt_compat_flush_offsets(AF_INET6); + xt_compat_unlock(AF_INET6); + goto out; +} + +static int +compat_do_replace(void __user *user, unsigned int len) +{ + int ret; + struct compat_ip6t_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.size >= INT_MAX / num_possible_cpus()) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_compat_table(tmp.name, tmp.valid_hooks, + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("compat_do_replace: Translated table\n"); + + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int +compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) { int ret; @@ -1216,11 +1876,11 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) switch (cmd) { case IP6T_SO_SET_REPLACE: - ret = do_replace(user, len); + ret = compat_do_replace(user, len); break; case IP6T_SO_SET_ADD_COUNTERS: - ret = do_add_counters(user, len); + ret = do_add_counters(user, len, 1); break; default: @@ -1231,75 +1891,155 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) return ret; } +struct compat_ip6t_get_entries { + char name[IP6T_TABLE_MAXNAMELEN]; + compat_uint_t size; + struct compat_ip6t_entry entrytable[0]; +}; + static int -do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, + void __user *userptr) +{ + struct xt_counters *counters; + struct xt_table_info *private = table->private; + void __user *pos; + unsigned int size; + int ret = 0; + void *loc_cpu_entry; + unsigned int i = 0; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + pos = userptr; + size = total_size; + ret = IP6T_ENTRY_ITERATE(loc_cpu_entry, total_size, + compat_copy_entry_to_user, + &pos, &size, counters, &i); + + vfree(counters); + return ret; +} + +static int +compat_get_entries(struct compat_ip6t_get_entries __user *uptr, int *len) { int ret; + struct compat_ip6t_get_entries get; + struct xt_table *t; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; + if (*len < sizeof(get)) { + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } - switch (cmd) { - case IP6T_SO_GET_INFO: { - char name[IP6T_TABLE_MAXNAMELEN]; - struct xt_table *t; + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; - if (*len != sizeof(struct ip6t_getinfo)) { - duprintf("length %u != %u\n", *len, - sizeof(struct ip6t_getinfo)); + if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) { + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + xt_compat_lock(AF_INET6); + t = xt_find_table_lock(AF_INET6, get.name); + if (t && !IS_ERR(t)) { + struct xt_table_info *private = t->private; + struct xt_table_info info; + duprintf("t->private->number = %u\n", private->number); + ret = compat_table_info(private, &info); + if (!ret && get.size == info.size) { + ret = compat_copy_entries_to_user(private->size, + t, uptr->entrytable); + } else if (!ret) { + duprintf("compat_get_entries: I've got %u not %u!\n", + private->size, get.size); ret = -EINVAL; - break; } + xt_compat_flush_offsets(AF_INET6); + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; - if (copy_from_user(name, user, sizeof(name)) != 0) { - ret = -EFAULT; - break; - } - name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; - - t = try_then_request_module(xt_find_table_lock(AF_INET6, name), - "ip6table_%s", name); - if (t && !IS_ERR(t)) { - struct ip6t_getinfo info; - struct xt_table_info *private = t->private; - - info.valid_hooks = t->valid_hooks; - memcpy(info.hook_entry, private->hook_entry, - sizeof(info.hook_entry)); - memcpy(info.underflow, private->underflow, - sizeof(info.underflow)); - info.num_entries = private->number; - info.size = private->size; - memcpy(info.name, name, sizeof(info.name)); - - if (copy_to_user(user, &info, *len) != 0) - ret = -EFAULT; - else - ret = 0; - xt_table_unlock(t); - module_put(t->me); - } else - ret = t ? PTR_ERR(t) : -ENOENT; + xt_compat_unlock(AF_INET6); + return ret; +} + +static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *); + +static int +compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: + ret = get_info(user, len, 1); + break; + case IP6T_SO_GET_ENTRIES: + ret = compat_get_entries(user, len); + break; + default: + ret = do_ip6t_get_ctl(sk, cmd, user, len); } - break; + return ret; +} +#endif - case IP6T_SO_GET_ENTRIES: { - struct ip6t_get_entries get; +static int +do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %u\n", *len, sizeof(get)); - ret = -EINVAL; - } else if (copy_from_user(&get, user, sizeof(get)) != 0) { - ret = -EFAULT; - } else if (*len != sizeof(struct ip6t_get_entries) + get.size) { - duprintf("get_entries: %u != %u\n", *len, - sizeof(struct ip6t_get_entries) + get.size); - ret = -EINVAL; - } else - ret = get_entries(&get, user); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = do_replace(user, len); break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len, 0); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; } + return ret; +} + +static int +do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: + ret = get_info(user, len, 0); + break; + + case IP6T_SO_GET_ENTRIES: + ret = get_entries(user, len); + break; + case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { struct ip6t_get_revision rev; @@ -1334,12 +2074,11 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -int ip6t_register_table(struct xt_table *table, - const struct ip6t_replace *repl) +int ip6t_register_table(struct xt_table *table, const struct ip6t_replace *repl) { int ret; struct xt_table_info *newinfo; - static struct xt_table_info bootstrap + struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; @@ -1347,7 +2086,7 @@ int ip6t_register_table(struct xt_table *table, if (!newinfo) return -ENOMEM; - /* choose the copy on our node/cpu */ + /* choose the copy on our node/cpu, but dont care about preemption */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -1403,17 +2142,18 @@ icmp6_match(const struct sk_buff *skb, unsigned int protoff, bool *hotdrop) { - struct icmp6hdr _icmp, *ic; + struct icmp6hdr _icmph, *ic; const struct ip6t_icmp *icmpinfo = matchinfo; /* Must not be a fragment. */ if (offset) return false; - ic = skb_header_pointer(skb, protoff, sizeof(_icmp), &_icmp); + ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); if (ic == NULL) { /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ + * can't. Hence, no choice but to drop. + */ duprintf("Dropping evil ICMP tinygram.\n"); *hotdrop = true; return false; @@ -1445,6 +2185,11 @@ static struct xt_target ip6t_standard_target __read_mostly = { .name = IP6T_STANDARD_TARGET, .targetsize = sizeof(int), .family = AF_INET6, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, +#endif }; static struct xt_target ip6t_error_target __read_mostly = { @@ -1459,15 +2204,21 @@ static struct nf_sockopt_ops ip6t_sockopts = { .set_optmin = IP6T_BASE_CTL, .set_optmax = IP6T_SO_SET_MAX+1, .set = do_ip6t_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_ip6t_set_ctl, +#endif .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_ip6t_get_ctl, +#endif .owner = THIS_MODULE, }; static struct xt_match icmp6_matchstruct __read_mostly = { .name = "icmp6", - .match = &icmp6_match, + .match = icmp6_match, .matchsize = sizeof(struct ip6t_icmp), .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, @@ -1516,6 +2267,7 @@ err1: static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); + xt_unregister_match(&icmp6_matchstruct); xt_unregister_target(&ip6t_error_target); xt_unregister_target(&ip6t_standard_target); diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c index 9afc836fd45..d5f8fd5f29d 100644 --- a/net/ipv6/netfilter/ip6t_HL.c +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -15,15 +15,13 @@ #include <linux/netfilter_ipv6/ip6t_HL.h> MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); -MODULE_DESCRIPTION("IP6 tables Hop Limit modification module"); +MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field modification target"); MODULE_LICENSE("GPL"); -static unsigned int ip6t_hl_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +hl_tg6(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct ipv6hdr *ip6h; const struct ip6t_HL_info *info = targinfo; @@ -58,11 +56,10 @@ static unsigned int ip6t_hl_target(struct sk_buff *skb, return XT_CONTINUE; } -static bool ip6t_hl_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +hl_tg6_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ip6t_HL_info *info = targinfo; @@ -79,25 +76,25 @@ static bool ip6t_hl_checkentry(const char *tablename, return true; } -static struct xt_target ip6t_HL __read_mostly = { +static struct xt_target hl_tg6_reg __read_mostly = { .name = "HL", .family = AF_INET6, - .target = ip6t_hl_target, + .target = hl_tg6, .targetsize = sizeof(struct ip6t_HL_info), .table = "mangle", - .checkentry = ip6t_hl_checkentry, + .checkentry = hl_tg6_check, .me = THIS_MODULE }; -static int __init ip6t_hl_init(void) +static int __init hl_tg6_init(void) { - return xt_register_target(&ip6t_HL); + return xt_register_target(&hl_tg6_reg); } -static void __exit ip6t_hl_fini(void) +static void __exit hl_tg6_exit(void) { - xt_unregister_target(&ip6t_HL); + xt_unregister_target(&hl_tg6_reg); } -module_init(ip6t_hl_init); -module_exit(ip6t_hl_fini); +module_init(hl_tg6_init); +module_exit(hl_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 7a48c342df4..86a613810b6 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -23,9 +23,10 @@ #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/nf_log.h> MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); -MODULE_DESCRIPTION("IP6 tables LOG target module"); +MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog"); MODULE_LICENSE("GPL"); struct in_device; @@ -362,7 +363,9 @@ static void dump_packet(const struct nf_loginfo *info, if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + printk("UID=%u GID=%u", + skb->sk->sk_socket->file->f_uid, + skb->sk->sk_socket->file->f_gid); read_unlock_bh(&skb->sk->sk_callback_lock); } } @@ -431,12 +434,9 @@ ip6t_log_packet(unsigned int pf, } static unsigned int -ip6t_log_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +log_tg6(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ip6t_log_info *loginfo = targinfo; struct nf_loginfo li; @@ -450,11 +450,10 @@ ip6t_log_target(struct sk_buff *skb, } -static bool ip6t_log_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +log_tg6_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ip6t_log_info *loginfo = targinfo; @@ -470,37 +469,37 @@ static bool ip6t_log_checkentry(const char *tablename, return true; } -static struct xt_target ip6t_log_reg __read_mostly = { +static struct xt_target log_tg6_reg __read_mostly = { .name = "LOG", .family = AF_INET6, - .target = ip6t_log_target, + .target = log_tg6, .targetsize = sizeof(struct ip6t_log_info), - .checkentry = ip6t_log_checkentry, + .checkentry = log_tg6_check, .me = THIS_MODULE, }; -static struct nf_logger ip6t_logger = { +static const struct nf_logger ip6t_logger = { .name = "ip6t_LOG", .logfn = &ip6t_log_packet, .me = THIS_MODULE, }; -static int __init ip6t_log_init(void) +static int __init log_tg6_init(void) { int ret; - ret = xt_register_target(&ip6t_log_reg); + ret = xt_register_target(&log_tg6_reg); if (ret < 0) return ret; nf_log_register(PF_INET6, &ip6t_logger); return 0; } -static void __exit ip6t_log_fini(void) +static void __exit log_tg6_exit(void) { nf_log_unregister(&ip6t_logger); - xt_unregister_target(&ip6t_log_reg); + xt_unregister_target(&log_tg6_reg); } -module_init(ip6t_log_init); -module_exit(ip6t_log_fini); +module_init(log_tg6_init); +module_exit(log_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 1a7d2917545..b23baa635fe 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -31,7 +31,7 @@ #include <linux/netfilter_ipv6/ip6t_REJECT.h> MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); -MODULE_DESCRIPTION("IP6 tables REJECT target module"); +MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); /* Send RST reply */ @@ -121,7 +121,6 @@ static void send_reset(struct sk_buff *oldskb) ip6h->version = 6; ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT); ip6h->nexthdr = IPPROTO_TCP; - ip6h->payload_len = htons(sizeof(struct tcphdr)); ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); @@ -159,25 +158,22 @@ static void send_reset(struct sk_buff *oldskb) nf_ct_attach(nskb, oldskb); - NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev, - dst_output); + ip6_local_out(nskb); } static inline void send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum) { - if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL) + if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = init_net.loopback_dev; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL); } -static unsigned int reject6_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +reject_tg6(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ip6t_reject_info *reject = targinfo; @@ -216,11 +212,10 @@ static unsigned int reject6_target(struct sk_buff *skb, return NF_DROP; } -static bool check(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +reject_tg6_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ip6t_reject_info *rejinfo = targinfo; const struct ip6t_entry *e = entry; @@ -239,27 +234,27 @@ static bool check(const char *tablename, return true; } -static struct xt_target ip6t_reject_reg __read_mostly = { +static struct xt_target reject_tg6_reg __read_mostly = { .name = "REJECT", .family = AF_INET6, - .target = reject6_target, + .target = reject_tg6, .targetsize = sizeof(struct ip6t_reject_info), .table = "filter", - .hooks = (1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | - (1 << NF_IP6_LOCAL_OUT), - .checkentry = check, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = reject_tg6_check, .me = THIS_MODULE }; -static int __init ip6t_reject_init(void) +static int __init reject_tg6_init(void) { - return xt_register_target(&ip6t_reject_reg); + return xt_register_target(&reject_tg6_reg); } -static void __exit ip6t_reject_fini(void) +static void __exit reject_tg6_exit(void) { - xt_unregister_target(&ip6t_reject_reg); + xt_unregister_target(&reject_tg6_reg); } -module_init(ip6t_reject_init); -module_exit(ip6t_reject_fini); +module_init(reject_tg6_init); +module_exit(reject_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index 2a25fe25e0e..429629fd63b 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -20,7 +20,7 @@ #include <linux/netfilter_ipv6/ip6t_ah.h> MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("IPv6 AH match"); +MODULE_DESCRIPTION("Xtables: IPv6 IPsec-AH match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the spi is matched by the range, 0 otherwise */ @@ -37,14 +37,9 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +ah_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct ip_auth_hdr _ah; const struct ip_auth_hdr *ah; @@ -100,11 +95,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +ah_mt6_check(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_ah *ahinfo = matchinfo; @@ -115,24 +108,24 @@ checkentry(const char *tablename, return true; } -static struct xt_match ah_match __read_mostly = { +static struct xt_match ah_mt6_reg __read_mostly = { .name = "ah", .family = AF_INET6, - .match = match, + .match = ah_mt6, .matchsize = sizeof(struct ip6t_ah), - .checkentry = checkentry, + .checkentry = ah_mt6_check, .me = THIS_MODULE, }; -static int __init ip6t_ah_init(void) +static int __init ah_mt6_init(void) { - return xt_register_match(&ah_match); + return xt_register_match(&ah_mt6_reg); } -static void __exit ip6t_ah_fini(void) +static void __exit ah_mt6_exit(void) { - xt_unregister_match(&ah_match); + xt_unregister_match(&ah_mt6_reg); } -module_init(ip6t_ah_init); -module_exit(ip6t_ah_fini); +module_init(ah_mt6_init); +module_exit(ah_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index 41df9a578c7..8f331f12b2e 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -15,19 +15,15 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> -MODULE_DESCRIPTION("IPv6 EUI64 address checking match"); +MODULE_DESCRIPTION("Xtables: IPv6 EUI64 address match"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +eui64_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { unsigned char eui64[8]; int i = 0; @@ -62,25 +58,25 @@ match(const struct sk_buff *skb, return false; } -static struct xt_match eui64_match __read_mostly = { +static struct xt_match eui64_mt6_reg __read_mostly = { .name = "eui64", .family = AF_INET6, - .match = match, + .match = eui64_mt6, .matchsize = sizeof(int), - .hooks = (1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN) | - (1 << NF_IP6_FORWARD), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), .me = THIS_MODULE, }; -static int __init ip6t_eui64_init(void) +static int __init eui64_mt6_init(void) { - return xt_register_match(&eui64_match); + return xt_register_match(&eui64_mt6_reg); } -static void __exit ip6t_eui64_fini(void) +static void __exit eui64_mt6_exit(void) { - xt_unregister_match(&eui64_match); + xt_unregister_match(&eui64_mt6_reg); } -module_init(ip6t_eui64_init); -module_exit(ip6t_eui64_fini); +module_init(eui64_mt6_init); +module_exit(eui64_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index 968aeba0207..e2bbc63dba5 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -19,7 +19,7 @@ #include <linux/netfilter_ipv6/ip6t_frag.h> MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("IPv6 FRAG match"); +MODULE_DESCRIPTION("Xtables: IPv6 fragment match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the id is matched by the range, 0 otherwise */ @@ -35,14 +35,10 @@ id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +frag_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { struct frag_hdr _frag; const struct frag_hdr *fh; @@ -116,11 +112,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +frag_mt6_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_frag *fraginfo = matchinfo; @@ -131,24 +125,24 @@ checkentry(const char *tablename, return true; } -static struct xt_match frag_match __read_mostly = { +static struct xt_match frag_mt6_reg __read_mostly = { .name = "frag", .family = AF_INET6, - .match = match, + .match = frag_mt6, .matchsize = sizeof(struct ip6t_frag), - .checkentry = checkentry, + .checkentry = frag_mt6_check, .me = THIS_MODULE, }; -static int __init ip6t_frag_init(void) +static int __init frag_mt6_init(void) { - return xt_register_match(&frag_match); + return xt_register_match(&frag_mt6_reg); } -static void __exit ip6t_frag_fini(void) +static void __exit frag_mt6_exit(void) { - xt_unregister_match(&frag_match); + xt_unregister_match(&frag_mt6_reg); } -module_init(ip6t_frag_init); -module_exit(ip6t_frag_fini); +module_init(frag_mt6_init); +module_exit(frag_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index e6ca6018b1e..62e39ace058 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -21,7 +21,7 @@ #include <linux/netfilter_ipv6/ip6t_opts.h> MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("IPv6 opts match"); +MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); MODULE_ALIAS("ip6t_dst"); @@ -42,14 +42,10 @@ MODULE_ALIAS("ip6t_dst"); */ static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +hbh_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { struct ipv6_opt_hdr _optsh; const struct ipv6_opt_hdr *oh; @@ -171,11 +167,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +hbh_mt6_check(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_opts *optsinfo = matchinfo; @@ -186,36 +180,36 @@ checkentry(const char *tablename, return true; } -static struct xt_match opts_match[] __read_mostly = { +static struct xt_match hbh_mt6_reg[] __read_mostly = { { .name = "hbh", .family = AF_INET6, - .match = match, + .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), - .checkentry = checkentry, + .checkentry = hbh_mt6_check, .me = THIS_MODULE, .data = NEXTHDR_HOP, }, { .name = "dst", .family = AF_INET6, - .match = match, + .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), - .checkentry = checkentry, + .checkentry = hbh_mt6_check, .me = THIS_MODULE, .data = NEXTHDR_DEST, }, }; -static int __init ip6t_hbh_init(void) +static int __init hbh_mt6_init(void) { - return xt_register_matches(opts_match, ARRAY_SIZE(opts_match)); + return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } -static void __exit ip6t_hbh_fini(void) +static void __exit hbh_mt6_exit(void) { - xt_unregister_matches(opts_match, ARRAY_SIZE(opts_match)); + xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } -module_init(ip6t_hbh_init); -module_exit(ip6t_hbh_fini); +module_init(hbh_mt6_init); +module_exit(hbh_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c index ca29ec00dc1..34567167384 100644 --- a/net/ipv6/netfilter/ip6t_hl.c +++ b/net/ipv6/netfilter/ip6t_hl.c @@ -16,13 +16,13 @@ #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); -MODULE_DESCRIPTION("IP tables Hop Limit matching module"); +MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field match"); MODULE_LICENSE("GPL"); -static bool match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +hl_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct ip6t_hl_info *info = matchinfo; const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -49,23 +49,23 @@ static bool match(const struct sk_buff *skb, return false; } -static struct xt_match hl_match __read_mostly = { +static struct xt_match hl_mt6_reg __read_mostly = { .name = "hl", .family = AF_INET6, - .match = match, + .match = hl_mt6, .matchsize = sizeof(struct ip6t_hl_info), .me = THIS_MODULE, }; -static int __init ip6t_hl_init(void) +static int __init hl_mt6_init(void) { - return xt_register_match(&hl_match); + return xt_register_match(&hl_mt6_reg); } -static void __exit ip6t_hl_fini(void) +static void __exit hl_mt6_exit(void) { - xt_unregister_match(&hl_match); + xt_unregister_match(&hl_mt6_reg); } -module_init(ip6t_hl_init); -module_exit(ip6t_hl_fini); +module_init(hl_mt6_init); +module_exit(hl_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 2c65c2f9a4a..3a940171f82 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -23,18 +23,14 @@ #include <linux/netfilter_ipv6/ip6t_ipv6header.h> MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("IPv6 headers match"); +MODULE_DESCRIPTION("Xtables: IPv6 header types match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool -ipv6header_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +ipv6header_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct ip6t_ipv6header_info *info = matchinfo; unsigned int temp; @@ -125,11 +121,9 @@ ipv6header_match(const struct sk_buff *skb, } static bool -ipv6header_checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +ipv6header_mt6_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_ipv6header_info *info = matchinfo; @@ -141,25 +135,25 @@ ipv6header_checkentry(const char *tablename, return true; } -static struct xt_match ip6t_ipv6header_match __read_mostly = { +static struct xt_match ipv6header_mt6_reg __read_mostly = { .name = "ipv6header", .family = AF_INET6, - .match = &ipv6header_match, + .match = ipv6header_mt6, .matchsize = sizeof(struct ip6t_ipv6header_info), - .checkentry = &ipv6header_checkentry, + .checkentry = ipv6header_mt6_check, .destroy = NULL, .me = THIS_MODULE, }; -static int __init ipv6header_init(void) +static int __init ipv6header_mt6_init(void) { - return xt_register_match(&ip6t_ipv6header_match); + return xt_register_match(&ipv6header_mt6_reg); } -static void __exit ipv6header_exit(void) +static void __exit ipv6header_mt6_exit(void) { - xt_unregister_match(&ip6t_ipv6header_match); + xt_unregister_match(&ipv6header_mt6_reg); } -module_init(ipv6header_init); -module_exit(ipv6header_exit); +module_init(ipv6header_mt6_init); +module_exit(ipv6header_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c index 0fa714092dc..e06678d07ec 100644 --- a/net/ipv6/netfilter/ip6t_mh.c +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -21,7 +21,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6t_mh.h> -MODULE_DESCRIPTION("ip6t_tables match for MH"); +MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match"); MODULE_LICENSE("GPL"); #ifdef DEBUG_IP_FIREWALL_USER @@ -38,14 +38,9 @@ type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +mh_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct ip6_mh _mh; const struct ip6_mh *mh; @@ -77,11 +72,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -mh_checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +mh_mt6_check(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_mh *mhinfo = matchinfo; @@ -89,25 +82,25 @@ mh_checkentry(const char *tablename, return !(mhinfo->invflags & ~IP6T_MH_INV_MASK); } -static struct xt_match mh_match __read_mostly = { +static struct xt_match mh_mt6_reg __read_mostly = { .name = "mh", .family = AF_INET6, - .checkentry = mh_checkentry, - .match = match, + .checkentry = mh_mt6_check, + .match = mh_mt6, .matchsize = sizeof(struct ip6t_mh), .proto = IPPROTO_MH, .me = THIS_MODULE, }; -static int __init ip6t_mh_init(void) +static int __init mh_mt6_init(void) { - return xt_register_match(&mh_match); + return xt_register_match(&mh_mt6_reg); } -static void __exit ip6t_mh_fini(void) +static void __exit mh_mt6_exit(void) { - xt_unregister_match(&mh_match); + xt_unregister_match(&mh_mt6_reg); } -module_init(ip6t_mh_init); -module_exit(ip6t_mh_fini); +module_init(mh_mt6_init); +module_exit(mh_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c deleted file mode 100644 index 6036613aef3..00000000000 --- a/net/ipv6/netfilter/ip6t_owner.c +++ /dev/null @@ -1,92 +0,0 @@ -/* Kernel module to match various things tied to sockets associated with - locally generated outgoing packets. */ - -/* (C) 2000-2001 Marc Boucher <marc@mbsi.ca> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/file.h> -#include <linux/rcupdate.h> -#include <net/sock.h> - -#include <linux/netfilter_ipv6/ip6t_owner.h> -#include <linux/netfilter_ipv6/ip6_tables.h> -#include <linux/netfilter/x_tables.h> - -MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("IP6 tables owner matching module"); -MODULE_LICENSE("GPL"); - - -static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) -{ - const struct ip6t_owner_info *info = matchinfo; - - if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file) - return false; - - if (info->match & IP6T_OWNER_UID) - if ((skb->sk->sk_socket->file->f_uid != info->uid) ^ - !!(info->invert & IP6T_OWNER_UID)) - return false; - - if (info->match & IP6T_OWNER_GID) - if ((skb->sk->sk_socket->file->f_gid != info->gid) ^ - !!(info->invert & IP6T_OWNER_GID)) - return false; - - return true; -} - -static bool -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) -{ - const struct ip6t_owner_info *info = matchinfo; - - if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) { - printk("ipt_owner: pid and sid matching " - "not supported anymore\n"); - return false; - } - return true; -} - -static struct xt_match owner_match __read_mostly = { - .name = "owner", - .family = AF_INET6, - .match = match, - .matchsize = sizeof(struct ip6t_owner_info), - .hooks = (1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING), - .checkentry = checkentry, - .me = THIS_MODULE, -}; - -static int __init ip6t_owner_init(void) -{ - return xt_register_match(&owner_match); -} - -static void __exit ip6t_owner_fini(void) -{ - xt_unregister_match(&owner_match); -} - -module_init(ip6t_owner_init); -module_exit(ip6t_owner_fini); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 357cea703bd..12a9efe9886 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -21,7 +21,7 @@ #include <linux/netfilter_ipv6/ip6t_rt.h> MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("IPv6 RT match"); +MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the id is matched by the range, 0 otherwise */ @@ -37,14 +37,9 @@ segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +rt_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct ipv6_rt_hdr _route; const struct ipv6_rt_hdr *rh; @@ -195,11 +190,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +rt_mt6_check(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_rt *rtinfo = matchinfo; @@ -218,24 +211,24 @@ checkentry(const char *tablename, return true; } -static struct xt_match rt_match __read_mostly = { +static struct xt_match rt_mt6_reg __read_mostly = { .name = "rt", .family = AF_INET6, - .match = match, + .match = rt_mt6, .matchsize = sizeof(struct ip6t_rt), - .checkentry = checkentry, + .checkentry = rt_mt6_check, .me = THIS_MODULE, }; -static int __init ip6t_rt_init(void) +static int __init rt_mt6_init(void) { - return xt_register_match(&rt_match); + return xt_register_match(&rt_mt6_reg); } -static void __exit ip6t_rt_fini(void) +static void __exit rt_mt6_exit(void) { - xt_unregister_match(&rt_match); + xt_unregister_match(&rt_mt6_reg); } -module_init(ip6t_rt_init); -module_exit(ip6t_rt_fini); +module_init(rt_mt6_init); +module_exit(rt_mt6_exit); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 1d26b202bf3..87d38d08aad 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -17,7 +17,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("ip6tables filter table"); -#define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT)) +#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT)) static struct { @@ -31,14 +33,14 @@ static struct .num_entries = 4, .size = sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), .hook_entry = { - [NF_IP6_LOCAL_IN] = 0, - [NF_IP6_FORWARD] = sizeof(struct ip6t_standard), - [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ip6t_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 }, .underflow = { - [NF_IP6_LOCAL_IN] = 0, - [NF_IP6_FORWARD] = sizeof(struct ip6t_standard), - [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ip6t_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 }, }, .entries = { @@ -88,26 +90,26 @@ ip6t_local_out_hook(unsigned int hook, return ip6t_do_table(skb, hook, in, out, &packet_filter); } -static struct nf_hook_ops ip6t_ops[] = { +static struct nf_hook_ops ip6t_ops[] __read_mostly = { { .hook = ip6t_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_FILTER, }, { .hook = ip6t_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_FORWARD, + .hooknum = NF_INET_FORWARD, .priority = NF_IP6_PRI_FILTER, }, { .hook = ip6t_local_out_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_FILTER, }, }; diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index a0b6381f1e8..d6082600bc5 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -15,11 +15,11 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("ip6tables mangle table"); -#define MANGLE_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | \ - (1 << NF_IP6_LOCAL_IN) | \ - (1 << NF_IP6_FORWARD) | \ - (1 << NF_IP6_LOCAL_OUT) | \ - (1 << NF_IP6_POST_ROUTING)) +#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_POST_ROUTING)) static struct { @@ -33,18 +33,18 @@ static struct .num_entries = 6, .size = sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error), .hook_entry = { - [NF_IP6_PRE_ROUTING] = 0, - [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard), - [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2, - [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, - [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_IN] = sizeof(struct ip6t_standard), + [NF_INET_FORWARD] = sizeof(struct ip6t_standard) * 2, + [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, + [NF_INET_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, }, .underflow = { - [NF_IP6_PRE_ROUTING] = 0, - [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard), - [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2, - [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, - [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_IN] = sizeof(struct ip6t_standard), + [NF_INET_FORWARD] = sizeof(struct ip6t_standard) * 2, + [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, + [NF_INET_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, }, }, .entries = { @@ -120,40 +120,40 @@ ip6t_local_hook(unsigned int hook, return ret; } -static struct nf_hook_ops ip6t_ops[] = { +static struct nf_hook_ops ip6t_ops[] __read_mostly = { { .hook = ip6t_route_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_MANGLE, }, { .hook = ip6t_local_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_MANGLE, }, { .hook = ip6t_route_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_FORWARD, + .hooknum = NF_INET_FORWARD, .priority = NF_IP6_PRI_MANGLE, }, { .hook = ip6t_local_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_MANGLE, }, { .hook = ip6t_route_hook, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_MANGLE, }, }; diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 8f7109f991e..eccbaaa104a 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -6,7 +6,7 @@ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> -#define RAW_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_OUT)) +#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) static struct { @@ -20,12 +20,12 @@ static struct .num_entries = 3, .size = sizeof(struct ip6t_standard) * 2 + sizeof(struct ip6t_error), .hook_entry = { - [NF_IP6_PRE_ROUTING] = 0, - [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) }, .underflow = { - [NF_IP6_PRE_ROUTING] = 0, - [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) }, }, .entries = { @@ -54,18 +54,18 @@ ip6t_hook(unsigned int hook, return ip6t_do_table(skb, hook, in, out, &packet_raw); } -static struct nf_hook_ops ip6t_ops[] = { +static struct nf_hook_ops ip6t_ops[] __read_mostly = { { .hook = ip6t_hook, .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_FIRST, .owner = THIS_MODULE, }, { .hook = ip6t_hook, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_FIRST, .owner = THIS_MODULE, }, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index ad74bab0504..2d7b0246475 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -60,12 +60,6 @@ static int ipv6_print_tuple(struct seq_file *s, NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); } -static int ipv6_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c * @@ -258,80 +252,51 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum, return ipv6_conntrack_in(hooknum, skb, in, out, okfn); } -static struct nf_hook_ops ipv6_conntrack_ops[] = { +static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { { .hook = ipv6_defrag, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv6_conntrack_in, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK, }, { .hook = ipv6_conntrack_local, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK, }, { .hook = ipv6_defrag, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv6_confirm, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { .hook = ipv6_confirm, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_IP6_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST-1, }, }; -#ifdef CONFIG_SYSCTL -static ctl_table nf_ct_ipv6_sysctl_table[] = { - { - .procname = "nf_conntrack_frag6_timeout", - .data = &nf_frags_ctl.timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH, - .procname = "nf_conntrack_frag6_low_thresh", - .data = &nf_frags_ctl.low_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, - .procname = "nf_conntrack_frag6_high_thresh", - .data = &nf_frags_ctl.high_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; -#endif - #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) #include <linux/netfilter/nfnetlink.h> @@ -376,7 +341,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, .print_tuple = ipv6_print_tuple, - .print_conntrack = ipv6_print_conntrack, .get_l4proto = ipv6_get_l4proto, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = ipv6_tuple_to_nlattr, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index fd9123f3dc0..da924c6b5f0 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -24,6 +24,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> +#include <net/netfilter/nf_log.h> static unsigned long nf_ct_icmpv6_timeout __read_mostly = 30*HZ; @@ -74,13 +75,6 @@ static int icmpv6_print_tuple(struct seq_file *s, ntohs(tuple->src.u.icmp.id)); } -/* Print out the private part of the conntrack. */ -static int icmpv6_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* Returns verdict for packet, or -1 for invalid. */ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, @@ -192,7 +186,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff, return -NF_ACCEPT; } - if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: ICMPv6 checksum failed\n"); @@ -213,12 +207,9 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff, static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NLA_PUT(skb, CTA_PROTO_ICMPV6_ID, sizeof(u_int16_t), - &t->src.u.icmp.id); - NLA_PUT(skb, CTA_PROTO_ICMPV6_TYPE, sizeof(u_int8_t), - &t->dst.u.icmp.type); - NLA_PUT(skb, CTA_PROTO_ICMPV6_CODE, sizeof(u_int8_t), - &t->dst.u.icmp.code); + NLA_PUT_BE16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id); + NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type); + NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code); return 0; @@ -240,12 +231,9 @@ static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], || !tb[CTA_PROTO_ICMPV6_ID]) return -EINVAL; - tuple->dst.u.icmp.type = - *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = - *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = - *(__be16 *)nla_data(tb[CTA_PROTO_ICMPV6_ID]); + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); if (tuple->dst.u.icmp.type < 128 || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) @@ -280,7 +268,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, .print_tuple = icmpv6_print_tuple, - .print_conntrack = icmpv6_print_conntrack, .packet = icmpv6_packet, .new = icmpv6_new, .error = icmpv6_error, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index e170c67c47a..022da6ce4c0 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -70,14 +70,37 @@ struct nf_ct_frag6_queue __u16 nhoffset; }; -struct inet_frags_ctl nf_frags_ctl __read_mostly = { - .high_thresh = 256 * 1024, - .low_thresh = 192 * 1024, - .timeout = IPV6_FRAG_TIMEOUT, - .secret_interval = 10 * 60 * HZ, -}; - static struct inet_frags nf_frags; +static struct netns_frags nf_init_frags; + +#ifdef CONFIG_SYSCTL +struct ctl_table nf_ct_ipv6_sysctl_table[] = { + { + .procname = "nf_conntrack_frag6_timeout", + .data = &nf_init_frags.timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH, + .procname = "nf_conntrack_frag6_low_thresh", + .data = &nf_init_frags.low_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, + .procname = "nf_conntrack_frag6_high_thresh", + .data = &nf_init_frags.high_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; +#endif static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr, struct in6_addr *daddr) @@ -125,7 +148,7 @@ static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &nf_frags.mem); + atomic_sub(skb->truesize, &nf_init_frags.mem); nf_skb_free(skb); kfree_skb(skb); } @@ -147,7 +170,7 @@ static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) static void nf_ct_frag6_evictor(void) { - inet_frag_evictor(&nf_frags); + inet_frag_evictor(&nf_init_frags, &nf_frags); } static void nf_ct_frag6_expire(unsigned long data) @@ -183,7 +206,7 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) arg.dst = dst; hash = ip6qhashfn(id, src, dst); - q = inet_frag_find(&nf_frags, &arg, hash); + q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); if (q == NULL) goto oom; @@ -352,7 +375,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, skb->dev = NULL; fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &nf_frags.mem); + atomic_add(skb->truesize, &nf_init_frags.mem); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -362,7 +385,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, fq->q.last_in |= FIRST_IN; } write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &nf_frags.lru_list); + list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); write_unlock(&nf_frags.lock); return 0; @@ -429,7 +452,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &nf_frags.mem); + atomic_add(clone->truesize, &nf_init_frags.mem); } /* We have to remove fragment header from datagram and to relocate @@ -443,7 +466,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &nf_frags.mem); + atomic_sub(head->truesize, &nf_init_frags.mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -453,7 +476,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &nf_frags.mem); + atomic_sub(fp->truesize, &nf_init_frags.mem); } head->next = NULL; @@ -603,7 +626,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) goto ret_orig; } - if (atomic_read(&nf_frags.mem) > nf_frags_ctl.high_thresh) + if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh) nf_ct_frag6_evictor(); fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); @@ -674,7 +697,6 @@ int nf_ct_frag6_kfree_frags(struct sk_buff *skb) int nf_ct_frag6_init(void) { - nf_frags.ctl = &nf_frags_ctl; nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; @@ -682,6 +704,11 @@ int nf_ct_frag6_init(void) nf_frags.qsize = sizeof(struct nf_ct_frag6_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; + nf_frags.secret_interval = 10 * 60 * HZ; + nf_init_frags.timeout = IPV6_FRAG_TIMEOUT; + nf_init_frags.high_thresh = 256 * 1024; + nf_init_frags.low_thresh = 192 * 1024; + inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); return 0; @@ -691,6 +718,6 @@ void nf_ct_frag6_cleanup(void) { inet_frags_fini(&nf_frags); - nf_frags_ctl.low_thresh = 0; + nf_init_frags.low_thresh = 0; nf_ct_frag6_evictor(); } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 44937616057..35e502a7249 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -27,6 +27,7 @@ #include <net/ip.h> #include <net/sock.h> #include <net/tcp.h> +#include <net/udp.h> #include <net/transp_v6.h> #include <net/ipv6.h> @@ -35,15 +36,15 @@ static struct proc_dir_entry *proc_net_devsnmp6; static int sockstat6_seq_show(struct seq_file *seq, void *v) { seq_printf(seq, "TCP6: inuse %d\n", - sock_prot_inuse(&tcpv6_prot)); + sock_prot_inuse_get(&tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", - sock_prot_inuse(&udpv6_prot)); + sock_prot_inuse_get(&udpv6_prot)); seq_printf(seq, "UDPLITE6: inuse %d\n", - sock_prot_inuse(&udplitev6_prot)); + sock_prot_inuse_get(&udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", - sock_prot_inuse(&rawv6_prot)); + sock_prot_inuse_get(&rawv6_prot)); seq_printf(seq, "FRAG6: inuse %d memory %d\n", - ip6_frag_nqueues(), ip6_frag_mem()); + ip6_frag_nqueues(&init_net), ip6_frag_mem(&init_net)); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 807260d0358..4d880551fe6 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -54,39 +54,31 @@ #include <net/mip6.h> #endif +#include <net/raw.h> #include <net/rawv6.h> #include <net/xfrm.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE]; -DEFINE_RWLOCK(raw_v6_lock); +static struct raw_hashinfo raw_v6_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(), +}; static void raw_v6_hash(struct sock *sk) { - struct hlist_head *list = &raw_v6_htable[inet_sk(sk)->num & - (RAWV6_HTABLE_SIZE - 1)]; - - write_lock_bh(&raw_v6_lock); - sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock_bh(&raw_v6_lock); + raw_hash_sk(sk, &raw_v6_hashinfo); } static void raw_v6_unhash(struct sock *sk) { - write_lock_bh(&raw_v6_lock); - if (sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - write_unlock_bh(&raw_v6_lock); + raw_unhash_sk(sk, &raw_v6_hashinfo); } -/* Grumble... icmp and ip_input want to get at this... */ -struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, - struct in6_addr *loc_addr, struct in6_addr *rmt_addr, - int dif) +static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, + unsigned short num, struct in6_addr *loc_addr, + struct in6_addr *rmt_addr, int dif) { struct hlist_node *node; int is_multicast = ipv6_addr_is_multicast(loc_addr); @@ -95,6 +87,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, if (inet_sk(sk)->num == num) { struct ipv6_pinfo *np = inet6_sk(sk); + if (sk->sk_net != net) + continue; + if (!ipv6_addr_any(&np->daddr) && !ipv6_addr_equal(&np->daddr, rmt_addr)) continue; @@ -167,21 +162,22 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); * * Caller owns SKB so we must make clones. */ -int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct in6_addr *saddr; struct in6_addr *daddr; struct sock *sk; int delivered = 0; __u8 hash; + struct net *net; saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; hash = nexthdr & (MAX_INET_PROTOS - 1); - read_lock(&raw_v6_lock); - sk = sk_head(&raw_v6_htable[hash]); + read_lock(&raw_v6_hashinfo.lock); + sk = sk_head(&raw_v6_hashinfo.ht[hash]); /* * The first socket found will be delivered after @@ -191,7 +187,8 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (sk == NULL) goto out; - sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, IP6CB(skb)->iif); + net = skb->dev->nd_net; + sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif); while (sk) { int filtered; @@ -234,14 +231,25 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) rawv6_rcv(sk, clone); } } - sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, + sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr, IP6CB(skb)->iif); } out: - read_unlock(&raw_v6_lock); + read_unlock(&raw_v6_hashinfo.lock); return delivered; } +int raw6_local_deliver(struct sk_buff *skb, int nexthdr) +{ + struct sock *raw_sk; + + raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]); + if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) + raw_sk = NULL; + + return raw_sk != NULL; +} + /* This cleans up af_inet6 a bit. -DaveM */ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -283,7 +291,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!sk->sk_bound_dev_if) goto out; - dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + dev = dev_get_by_index(sk->sk_net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; goto out; @@ -296,7 +304,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { err = -EADDRNOTAVAIL; - if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) { + if (!ipv6_chk_addr(sk->sk_net, &addr->sin6_addr, + dev, 0)) { if (dev) dev_put(dev); goto out; @@ -316,7 +325,7 @@ out: return err; } -void rawv6_err(struct sock *sk, struct sk_buff *skb, +static void rawv6_err(struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __be32 info) { @@ -350,18 +359,45 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, } } +void raw6_icmp_error(struct sk_buff *skb, int nexthdr, + int type, int code, int inner_offset, __be32 info) +{ + struct sock *sk; + int hash; + struct in6_addr *saddr, *daddr; + struct net *net; + + hash = nexthdr & (RAW_HTABLE_SIZE - 1); + + read_lock(&raw_v6_hashinfo.lock); + sk = sk_head(&raw_v6_hashinfo.ht[hash]); + if (sk != NULL) { + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + net = skb->dev->nd_net; + + while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, + IP6CB(skb)->iif))) { + rawv6_err(sk, skb, NULL, type, code, + inner_offset, info); + sk = sk_next(sk); + } + } + read_unlock(&raw_v6_hashinfo.lock); +} + static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { if ((raw6_sk(sk)->checksum || sk->sk_filter) && skb_checksum_complete(skb)) { - /* FIXME: increment a raw6 drops counter here */ + atomic_inc(&sk->sk_drops); kfree_skb(skb); return 0; } /* Charge it to the socket. */ if (sock_queue_rcv_skb(sk,skb)<0) { - /* FIXME: increment a raw6 drops counter here */ + atomic_inc(&sk->sk_drops); kfree_skb(skb); return 0; } @@ -382,6 +418,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } @@ -405,7 +442,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet->hdrincl) { if (skb_checksum_complete(skb)) { - /* FIXME: increment a raw6 drops counter here */ + atomic_inc(&sk->sk_drops); kfree_skb(skb); return 0; } @@ -496,7 +533,7 @@ csum_copy_err: as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; - /* FIXME: increment a raw6 drops counter here */ + atomic_inc(&sk->sk_drops); goto out; } @@ -618,7 +655,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, goto error_fault; IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); if (err > 0) err = np->recverr ? net_xmit_errno(err) : 0; @@ -843,7 +880,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) { + if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) @@ -1172,76 +1209,6 @@ struct proto rawv6_prot = { }; #ifdef CONFIG_PROC_FS -struct raw6_iter_state { - int bucket; -}; - -#define raw6_seq_private(seq) ((struct raw6_iter_state *)(seq)->private) - -static struct sock *raw6_get_first(struct seq_file *seq) -{ - struct sock *sk; - struct hlist_node *node; - struct raw6_iter_state* state = raw6_seq_private(seq); - - for (state->bucket = 0; state->bucket < RAWV6_HTABLE_SIZE; ++state->bucket) - sk_for_each(sk, node, &raw_v6_htable[state->bucket]) - if (sk->sk_family == PF_INET6) - goto out; - sk = NULL; -out: - return sk; -} - -static struct sock *raw6_get_next(struct seq_file *seq, struct sock *sk) -{ - struct raw6_iter_state* state = raw6_seq_private(seq); - - do { - sk = sk_next(sk); -try_again: - ; - } while (sk && sk->sk_family != PF_INET6); - - if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) { - sk = sk_head(&raw_v6_htable[state->bucket]); - goto try_again; - } - return sk; -} - -static struct sock *raw6_get_idx(struct seq_file *seq, loff_t pos) -{ - struct sock *sk = raw6_get_first(seq); - if (sk) - while (pos && (sk = raw6_get_next(seq, sk)) != NULL) - --pos; - return pos ? NULL : sk; -} - -static void *raw6_seq_start(struct seq_file *seq, loff_t *pos) -{ - read_lock(&raw_v6_lock); - return *pos ? raw6_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; -} - -static void *raw6_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct sock *sk; - - if (v == SEQ_START_TOKEN) - sk = raw6_get_first(seq); - else - sk = raw6_get_next(seq, v); - ++*pos; - return sk; -} - -static void raw6_seq_stop(struct seq_file *seq, void *v) -{ - read_unlock(&raw_v6_lock); -} - static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct ipv6_pinfo *np = inet6_sk(sp); @@ -1254,7 +1221,7 @@ static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet_sk(sp)->num; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n", + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, @@ -1266,7 +1233,7 @@ static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp); + atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } static int raw6_seq_show(struct seq_file *seq, void *v) @@ -1277,23 +1244,22 @@ static int raw6_seq_show(struct seq_file *seq, void *v) "local_address " "remote_address " "st tx_queue rx_queue tr tm->when retrnsmt" - " uid timeout inode\n"); + " uid timeout inode drops\n"); else - raw6_sock_seq_show(seq, v, raw6_seq_private(seq)->bucket); + raw6_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; } static const struct seq_operations raw6_seq_ops = { - .start = raw6_seq_start, - .next = raw6_seq_next, - .stop = raw6_seq_stop, + .start = raw_seq_start, + .next = raw_seq_next, + .stop = raw_seq_stop, .show = raw6_seq_show, }; static int raw6_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &raw6_seq_ops, - sizeof(struct raw6_iter_state)); + return raw_seq_open(inode, file, &raw_v6_hashinfo, PF_INET6); } static const struct file_operations raw6_seq_fops = { @@ -1301,18 +1267,86 @@ static const struct file_operations raw6_seq_fops = { .open = raw6_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init raw6_proc_init(void) +static int raw6_init_net(struct net *net) { - if (!proc_net_fops_create(&init_net, "raw6", S_IRUGO, &raw6_seq_fops)) + if (!proc_net_fops_create(net, "raw6", S_IRUGO, &raw6_seq_fops)) return -ENOMEM; + return 0; } +static void raw6_exit_net(struct net *net) +{ + proc_net_remove(net, "raw6"); +} + +static struct pernet_operations raw6_net_ops = { + .init = raw6_init_net, + .exit = raw6_exit_net, +}; + +int __init raw6_proc_init(void) +{ + return register_pernet_subsys(&raw6_net_ops); +} + void raw6_proc_exit(void) { - proc_net_remove(&init_net, "raw6"); + unregister_pernet_subsys(&raw6_net_ops); } #endif /* CONFIG_PROC_FS */ + +/* Same as inet6_dgram_ops, sans udp_poll. */ +static const struct proto_ops inet6_sockraw_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = datagram_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw rawv6_protosw = { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &rawv6_prot, + .ops = &inet6_sockraw_ops, + .capability = CAP_NET_RAW, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, +}; + +int __init rawv6_init(void) +{ + int ret; + + ret = inet6_register_protosw(&rawv6_protosw); + if (ret) + goto out; +out: + return ret; +} + +void rawv6_exit(void) +{ + inet6_unregister_protosw(&rawv6_protosw); +} diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 76c88a93b9b..f936d045a39 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -82,23 +82,16 @@ struct frag_queue __u16 nhoffset; }; -struct inet_frags_ctl ip6_frags_ctl __read_mostly = { - .high_thresh = 256 * 1024, - .low_thresh = 192 * 1024, - .timeout = IPV6_FRAG_TIMEOUT, - .secret_interval = 10 * 60 * HZ, -}; - static struct inet_frags ip6_frags; -int ip6_frag_nqueues(void) +int ip6_frag_nqueues(struct net *net) { - return ip6_frags.nqueues; + return net->ipv6.frags.nqueues; } -int ip6_frag_mem(void) +int ip6_frag_mem(struct net *net) { - return atomic_read(&ip6_frags.mem); + return atomic_read(&net->ipv6.frags.mem); } static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, @@ -156,11 +149,12 @@ int ip6_frag_match(struct inet_frag_queue *q, void *a) EXPORT_SYMBOL(ip6_frag_match); /* Memory Tracking Functions. */ -static inline void frag_kfree_skb(struct sk_buff *skb, int *work) +static inline void frag_kfree_skb(struct netns_frags *nf, + struct sk_buff *skb, int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &ip6_frags.mem); + atomic_sub(skb->truesize, &nf->mem); kfree_skb(skb); } @@ -190,11 +184,11 @@ static __inline__ void fq_kill(struct frag_queue *fq) inet_frag_kill(&fq->q, &ip6_frags); } -static void ip6_evictor(struct inet6_dev *idev) +static void ip6_evictor(struct net *net, struct inet6_dev *idev) { int evicted; - evicted = inet_frag_evictor(&ip6_frags); + evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags); if (evicted) IP6_ADD_STATS_BH(idev, IPSTATS_MIB_REASMFAILS, evicted); } @@ -241,7 +235,7 @@ out: } static __inline__ struct frag_queue * -fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst, +fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst, struct inet6_dev *idev) { struct inet_frag_queue *q; @@ -253,7 +247,7 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst, arg.dst = dst; hash = ip6qhashfn(id, src, dst); - q = inet_frag_find(&ip6_frags, &arg, hash); + q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); if (q == NULL) goto oom; @@ -396,7 +390,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.fragments = next; fq->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); + frag_kfree_skb(fq->q.net, free_it, NULL); } } @@ -416,7 +410,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &ip6_frags.mem); + atomic_add(skb->truesize, &fq->q.net->mem); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -430,7 +424,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, return ip6_frag_reasm(fq, prev, dev); write_lock(&ip6_frags.lock); - list_move_tail(&fq->q.lru_list, &ip6_frags.lru_list); + list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); write_unlock(&ip6_frags.lock); return -1; @@ -510,7 +504,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &ip6_frags.mem); + atomic_add(clone->truesize, &fq->q.net->mem); } /* We have to remove fragment header from datagram and to relocate @@ -525,7 +519,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &ip6_frags.mem); + atomic_sub(head->truesize, &fq->q.net->mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -535,7 +529,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &ip6_frags.mem); + atomic_sub(fp->truesize, &fq->q.net->mem); } head->next = NULL; @@ -575,6 +569,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net; IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS); @@ -605,10 +600,11 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - if (atomic_read(&ip6_frags.mem) > ip6_frags_ctl.high_thresh) - ip6_evictor(ip6_dst_idev(skb->dst)); + net = skb->dev->nd_net; + if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh) + ip6_evictor(net, ip6_dst_idev(skb->dst)); - if ((fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr, + if ((fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, ip6_dst_idev(skb->dst))) != NULL) { int ret; @@ -632,12 +628,127 @@ static struct inet6_protocol frag_protocol = .flags = INET6_PROTO_NOPOLICY, }; -void __init ipv6_frag_init(void) +#ifdef CONFIG_SYSCTL +static struct ctl_table ip6_frags_ctl_table[] = { + { + .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH, + .procname = "ip6frag_high_thresh", + .data = &init_net.ipv6.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH, + .procname = "ip6frag_low_thresh", + .data = &init_net.ipv6.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_TIME, + .procname = "ip6frag_time", + .data = &init_net.ipv6.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_IP6FRAG_SECRET_INTERVAL, + .procname = "ip6frag_secret_interval", + .data = &ip6_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { } +}; + +static int ip6_frags_sysctl_register(struct net *net) { - if (inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT) < 0) - printk(KERN_ERR "ipv6_frag_init: Could not register protocol\n"); + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip6_frags_ctl_table; + if (net != &init_net) { + table = kmemdup(table, sizeof(ip6_frags_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv6.frags.high_thresh; + table[1].data = &net->ipv6.frags.low_thresh; + table[2].data = &net->ipv6.frags.timeout; + table[3].mode &= ~0222; + } + + hdr = register_net_sysctl_table(net, net_ipv6_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv6.sysctl.frags_hdr = hdr; + return 0; + +err_reg: + if (net != &init_net) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void ip6_frags_sysctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); + kfree(table); +} +#else +static inline int ip6_frags_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void ip6_frags_sysctl_unregister(struct net *net) +{ +} +#endif + +static int ipv6_frags_init_net(struct net *net) +{ + net->ipv6.frags.high_thresh = 256 * 1024; + net->ipv6.frags.low_thresh = 192 * 1024; + net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + + inet_frags_init_net(&net->ipv6.frags); + + return ip6_frags_sysctl_register(net); +} + +static void ipv6_frags_exit_net(struct net *net) +{ + ip6_frags_sysctl_unregister(net); + inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); +} + +static struct pernet_operations ip6_frags_ops = { + .init = ipv6_frags_init_net, + .exit = ipv6_frags_exit_net, +}; + +int __init ipv6_frag_init(void) +{ + int ret; + + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + if (ret) + goto out; + + register_pernet_subsys(&ip6_frags_ops); - ip6_frags.ctl = &ip6_frags_ctl; ip6_frags.hashfn = ip6_hashfn; ip6_frags.constructor = ip6_frag_init; ip6_frags.destructor = NULL; @@ -645,5 +756,15 @@ void __init ipv6_frag_init(void) ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; + ip6_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip6_frags); +out: + return ret; +} + +void ipv6_frag_exit(void) +{ + inet_frags_fini(&ip6_frags); + unregister_pernet_subsys(&ip6_frags_ops); + inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 20083e0d399..4004c5f0b8d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -73,21 +73,13 @@ #define CLONE_OFFLINK_ROUTE 0 -static int ip6_rt_max_size = 4096; -static int ip6_rt_gc_min_interval = HZ / 2; -static int ip6_rt_gc_timeout = 60*HZ; -int ip6_rt_gc_interval = 30*HZ; -static int ip6_rt_gc_elasticity = 9; -static int ip6_rt_mtu_expires = 10*60*HZ; -static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; - static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(void); +static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct sk_buff *skb); @@ -113,6 +105,7 @@ static struct dst_ops ip6_dst_ops = { .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, + .local_out = ip6_local_out, .entry_size = sizeof(struct rt6_info), }; @@ -152,7 +145,6 @@ struct rt6_info ip6_null_entry = { static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct sk_buff *skb); -static int ip6_pkt_blk_hole(struct sk_buff *skb); struct rt6_info ip6_prohibit_entry = { .u = { @@ -181,8 +173,8 @@ struct rt6_info ip6_blk_hole_entry = { .obsolete = -1, .error = -EINVAL, .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = ip6_pkt_blk_hole, - .output = ip6_pkt_blk_hole, + .input = dst_discard, + .output = dst_discard, .ops = &ip6_dst_ops, .path = (struct dst_entry*)&ip6_blk_hole_entry, } @@ -216,9 +208,12 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *loopback_dev = + dev->nd_net->loopback_dev; - if (dev != init_net.loopback_dev && idev != NULL && idev->dev == dev) { - struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev); + if (dev != loopback_dev && idev != NULL && idev->dev == dev) { + struct inet6_dev *loopback_idev = + in6_dev_get(loopback_dev); if (loopback_idev != NULL) { rt->rt6i_idev = loopback_idev; in6_dev_put(idev); @@ -606,7 +601,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) int ip6_ins_rt(struct rt6_info *rt) { - return __ip6_ins_rt(rt, NULL); + struct nl_info info = { + .nl_net = &init_net, + }; + return __ip6_ins_rt(rt, &info); } static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr, @@ -782,12 +780,6 @@ struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) EXPORT_SYMBOL(ip6_route_output); -static int ip6_blackhole_output(struct sk_buff *skb) -{ - kfree_skb(skb); - return 0; -} - int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl) { struct rt6_info *ort = (struct rt6_info *) *dstp; @@ -800,8 +792,8 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl atomic_set(&new->__refcnt, 1); new->__use = 1; - new->input = ip6_blackhole_output; - new->output = ip6_blackhole_output; + new->input = dst_discard; + new->output = dst_discard; memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); new->dev = ort->u.dst.dev; @@ -896,8 +888,8 @@ static inline unsigned int ipv6_advmss(unsigned int mtu) { mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); - if (mtu < ip6_rt_min_advmss) - mtu = ip6_rt_min_advmss; + if (mtu < init_net.ipv6.sysctl.ip6_rt_min_advmss) + mtu = init_net.ipv6.sysctl.ip6_rt_min_advmss; /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and @@ -991,25 +983,25 @@ int ndisc_dst_gc(int *more) return freed; } -static int ip6_dst_gc(void) +static int ip6_dst_gc(struct dst_ops *ops) { static unsigned expire = 30*HZ; static unsigned long last_gc; unsigned long now = jiffies; - if (time_after(last_gc + ip6_rt_gc_min_interval, now) && - atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size) + if (time_after(last_gc + init_net.ipv6.sysctl.ip6_rt_gc_min_interval, now) && + atomic_read(&ip6_dst_ops.entries) <= init_net.ipv6.sysctl.ip6_rt_max_size) goto out; expire++; fib6_run_gc(expire); last_gc = now; if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh) - expire = ip6_rt_gc_timeout>>1; + expire = init_net.ipv6.sysctl.ip6_rt_gc_timeout>>1; out: - expire -= expire>>ip6_rt_gc_elasticity; - return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size); + expire -= expire>>init_net.ipv6.sysctl.ip6_rt_gc_elasticity; + return (atomic_read(&ip6_dst_ops.entries) > init_net.ipv6.sysctl.ip6_rt_max_size); } /* Clean host part of a prefix. Not necessary in radix tree, @@ -1269,7 +1261,10 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) int ip6_del_rt(struct rt6_info *rt) { - return __ip6_del_rt(rt, NULL); + struct nl_info info = { + .nl_net = &init_net, + }; + return __ip6_del_rt(rt, &info); } static int ip6_route_del(struct fib6_config *cfg) @@ -1514,7 +1509,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, rt->u.dst.metrics[RTAX_MTU-1] = pmtu; if (allfrag) rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; - dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires); + dst_set_expires(&rt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires); rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; goto out; } @@ -1540,7 +1535,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, * which is 10 mins. After 10 mins the decreased pmtu is expired * and detecting PMTU increase will be automatically happened. */ - dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); + dst_set_expires(&nrt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires); nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; ip6_ins_rt(nrt); @@ -1665,6 +1660,8 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *d return rt; } +EXPORT_SYMBOL(rt6_get_dflt_router); + struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) @@ -1766,8 +1763,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) * Drop the packet on the floor */ -static inline int ip6_pkt_drop(struct sk_buff *skb, int code, - int ipstats_mib_noroutes) +static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes) { int type; switch (ipstats_mib_noroutes) { @@ -1811,12 +1807,6 @@ static int ip6_pkt_prohibit_out(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } -static int ip6_pkt_blk_hole(struct sk_buff *skb) -{ - kfree_skb(skb); - return 0; -} - #endif /* @@ -2015,9 +2005,13 @@ errout: static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib6_config cfg; int err; + if (net != &init_net) + return -EINVAL; + err = rtm_to_fib6_config(skb, nlh, &cfg); if (err < 0) return err; @@ -2027,9 +2021,13 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib6_config cfg; int err; + if (net != &init_net) + return -EINVAL; + err = rtm_to_fib6_config(skb, nlh, &cfg); if (err < 0) return err; @@ -2164,6 +2162,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = in_skb->sk->sk_net; struct nlattr *tb[RTA_MAX+1]; struct rt6_info *rt; struct sk_buff *skb; @@ -2171,6 +2170,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void struct flowi fl; int err, iif = 0; + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); if (err < 0) goto errout; @@ -2230,7 +2232,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void goto errout; } - err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); errout: return err; } @@ -2238,32 +2240,29 @@ errout: void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) { struct sk_buff *skb; - u32 pid = 0, seq = 0; - struct nlmsghdr *nlh = NULL; - int err = -ENOBUFS; - - if (info) { - pid = info->pid; - nlh = info->nlh; - if (nlh) - seq = nlh->nlmsg_seq; - } + u32 seq; + int err; + + err = -ENOBUFS; + seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0; skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); if (skb == NULL) goto errout; - err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0); + err = rt6_fill_node(skb, rt, NULL, NULL, 0, + event, info->pid, seq, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any()); + err = rtnl_notify(skb, &init_net, info->pid, + RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err); + rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_ROUTE, err); } /* @@ -2353,28 +2352,61 @@ static const struct file_operations rt6_stats_seq_fops = { .llseek = seq_lseek, .release = single_release, }; + +static int ipv6_route_proc_init(struct net *net) +{ + int ret = -ENOMEM; + if (!proc_net_fops_create(net, "ipv6_route", + 0, &ipv6_route_proc_fops)) + goto out; + + if (!proc_net_fops_create(net, "rt6_stats", + S_IRUGO, &rt6_stats_seq_fops)) + goto out_ipv6_route; + + ret = 0; +out: + return ret; +out_ipv6_route: + proc_net_remove(net, "ipv6_route"); + goto out; +} + +static void ipv6_route_proc_fini(struct net *net) +{ + proc_net_remove(net, "ipv6_route"); + proc_net_remove(net, "rt6_stats"); +} +#else +static inline int ipv6_route_proc_init(struct net *net) +{ + return 0; +} +static inline void ipv6_route_proc_fini(struct net *net) +{ + return ; +} #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL -static int flush_delay; - static int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { + int delay = init_net.ipv6.sysctl.flush_delay; if (write) { proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay); + fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay); return 0; } else return -EINVAL; } -ctl_table ipv6_route_table[] = { +ctl_table ipv6_route_table_template[] = { { .procname = "flush", - .data = &flush_delay, + .data = &init_net.ipv6.sysctl.flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = &ipv6_sysctl_rtcache_flush @@ -2390,7 +2422,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_MAX_SIZE, .procname = "max_size", - .data = &ip6_rt_max_size, + .data = &init_net.ipv6.sysctl.ip6_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -2398,7 +2430,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL, .procname = "gc_min_interval", - .data = &ip6_rt_gc_min_interval, + .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -2407,7 +2439,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT, .procname = "gc_timeout", - .data = &ip6_rt_gc_timeout, + .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -2416,7 +2448,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL, .procname = "gc_interval", - .data = &ip6_rt_gc_interval, + .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -2425,7 +2457,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY, .procname = "gc_elasticity", - .data = &ip6_rt_gc_elasticity, + .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -2434,7 +2466,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES, .procname = "mtu_expires", - .data = &ip6_rt_mtu_expires, + .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -2443,7 +2475,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS, .procname = "min_adv_mss", - .data = &ip6_rt_min_advmss, + .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -2452,7 +2484,7 @@ ctl_table ipv6_route_table[] = { { .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, .procname = "gc_min_interval_ms", - .data = &ip6_rt_gc_min_interval, + .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_ms_jiffies, @@ -2461,42 +2493,74 @@ ctl_table ipv6_route_table[] = { { .ctl_name = 0 } }; +struct ctl_table *ipv6_route_sysctl_init(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(ipv6_route_table_template, + sizeof(ipv6_route_table_template), + GFP_KERNEL); + return table; +} #endif -void __init ip6_route_init(void) +int __init ip6_route_init(void) { + int ret; + ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN, NULL); + if (!ip6_dst_ops.kmem_cachep) + return -ENOMEM; + ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep; - fib6_init(); - proc_net_fops_create(&init_net, "ipv6_route", 0, &ipv6_route_proc_fops); - proc_net_fops_create(&init_net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); -#ifdef CONFIG_XFRM - xfrm6_init(); -#endif -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - fib6_rules_init(); -#endif + ret = fib6_init(); + if (ret) + goto out_kmem_cache; - __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL); - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL); - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL); + ret = ipv6_route_proc_init(&init_net); + if (ret) + goto out_fib6_init; + + ret = xfrm6_init(); + if (ret) + goto out_proc_init; + + ret = fib6_rules_init(); + if (ret) + goto xfrm6_init; + + ret = -ENOBUFS; + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) || + __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL)) + goto fib6_rules_init; + + ret = 0; +out: + return ret; + +fib6_rules_init: + fib6_rules_cleanup(); +xfrm6_init: + xfrm6_fini(); +out_proc_init: + ipv6_route_proc_fini(&init_net); +out_fib6_init: + rt6_ifdown(NULL); + fib6_gc_cleanup(); +out_kmem_cache: + kmem_cache_destroy(ip6_dst_ops.kmem_cachep); + goto out; } void ip6_route_cleanup(void) { -#ifdef CONFIG_IPV6_MULTIPLE_TABLES fib6_rules_cleanup(); -#endif -#ifdef CONFIG_PROC_FS - proc_net_remove(&init_net, "ipv6_route"); - proc_net_remove(&init_net, "rt6_stats"); -#endif -#ifdef CONFIG_XFRM + ipv6_route_proc_fini(&init_net); xfrm6_fini(); -#endif rt6_ifdown(NULL); fib6_gc_cleanup(); kmem_cache_destroy(ip6_dst_ops.kmem_cachep); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 71433d29d88..e77239d02bf 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -16,6 +16,7 @@ * Changes: * Roger Venning <r.venning@telstra.com>: 6to4 support * Nate Thompson <nate@thebog.net>: 6to4 support + * Fred L. Templin <fltemplin@acm.org>: isatap support */ #include <linux/module.h> @@ -182,6 +183,9 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int dev->init = ipip6_tunnel_init; nt->parms = *parms; + if (parms->i_flags & SIT_ISATAP) + dev->priv_flags |= IFF_ISATAP; + if (register_netdevice(dev) < 0) { free_netdev(dev); goto failed; @@ -364,6 +368,48 @@ static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) IP6_ECN_set_ce(ipv6_hdr(skb)); } +/* ISATAP (RFC4214) - check source address */ +static int +isatap_srcok(struct sk_buff *skb, struct iphdr *iph, struct net_device *dev) +{ + struct neighbour *neigh; + struct dst_entry *dst; + struct rt6_info *rt; + struct flowi fl; + struct in6_addr *addr6; + struct in6_addr rtr; + struct ipv6hdr *iph6; + int ok = 0; + + /* from onlink default router */ + ipv6_addr_set(&rtr, htonl(0xFE800000), 0, 0, 0); + ipv6_isatap_eui64(rtr.s6_addr + 8, iph->saddr); + if ((rt = rt6_get_dflt_router(&rtr, dev))) { + dst_release(&rt->u.dst); + return 1; + } + + iph6 = ipv6_hdr(skb); + memset(&fl, 0, sizeof(fl)); + fl.proto = iph6->nexthdr; + ipv6_addr_copy(&fl.fl6_dst, &iph6->saddr); + fl.oif = dev->ifindex; + security_skb_classify_flow(skb, &fl); + + dst = ip6_route_output(NULL, &fl); + if (!dst->error && (dst->dev == dev) && (neigh = dst->neighbour)) { + + addr6 = (struct in6_addr*)&neigh->primary_key; + + /* from correct previous hop */ + if (ipv6_addr_is_isatap(addr6) && + (addr6->s6_addr32[3] == iph->saddr)) + ok = 1; + } + dst_release(dst); + return ok; +} + static int ipip6_rcv(struct sk_buff *skb) { struct iphdr *iph; @@ -382,6 +428,14 @@ static int ipip6_rcv(struct sk_buff *skb) IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); skb->pkt_type = PACKET_HOST; + + if ((tunnel->dev->priv_flags & IFF_ISATAP) && + !isatap_srcok(skb, iph, tunnel->dev)) { + tunnel->stat.rx_errors++; + read_unlock(&ipip6_lock); + kfree_skb(skb); + return 0; + } tunnel->stat.rx_packets++; tunnel->stat.rx_bytes += skb->len; skb->dev = tunnel->dev; @@ -444,6 +498,29 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->protocol != htons(ETH_P_IPV6)) goto tx_error; + /* ISATAP (RFC4214) - must come before 6to4 */ + if (dev->priv_flags & IFF_ISATAP) { + struct neighbour *neigh = NULL; + + if (skb->dst) + neigh = skb->dst->neighbour; + + if (neigh == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "sit: nexthop == NULL\n"); + goto tx_error; + } + + addr6 = (struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) + dst = addr6->s6_addr32[3]; + else + goto tx_error; + } + if (!dst) dst = try_6to4(&iph6->daddr); @@ -480,7 +557,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .tos = RT_TOS(tos) } }, .oif = tunnel->parms.link, .proto = IPPROTO_IPV6 }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } @@ -592,6 +669,42 @@ tx_error: return 0; } +static void ipip6_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + if (iph->daddr) { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .oif = tunnel->parms.link, + .proto = IPPROTO_IPV6 }; + struct rtable *rt; + if (!ip_route_output_key(&init_net, &rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dev->iflink = tunnel->parms.link; +} + static int ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -663,6 +776,11 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if (cmd == SIOCCHGTUNNEL) { t->parms.iph.ttl = p.iph.ttl; t->parms.iph.tos = p.iph.tos; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipip6_tunnel_bind_dev(dev); + netdev_state_change(dev); + } } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; @@ -731,12 +849,9 @@ static void ipip6_tunnel_setup(struct net_device *dev) static int ipip6_tunnel_init(struct net_device *dev) { - struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -744,31 +859,7 @@ static int ipip6_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - if (iph->daddr) { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .oif = tunnel->parms.link, - .proto = IPPROTO_IPV6 }; - struct rtable *rt; - if (!ip_route_output_key(&rt, &fl)) { - tdev = rt->u.dst.dev; - ip_rt_put(rt); - } - dev->flags |= IFF_POINTOPOINT; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); - - if (tdev) { - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; - } - dev->iflink = tunnel->parms.link; + ipip6_tunnel_bind_dev(dev); return 0; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 68bb2548e46..408691b777c 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -14,66 +14,30 @@ #include <net/addrconf.h> #include <net/inet_frag.h> -#ifdef CONFIG_SYSCTL - -static ctl_table ipv6_table[] = { +static ctl_table ipv6_table_template[] = { { .ctl_name = NET_IPV6_ROUTE, .procname = "route", .maxlen = 0, .mode = 0555, - .child = ipv6_route_table + .child = ipv6_route_table_template }, { .ctl_name = NET_IPV6_ICMP, .procname = "icmp", .maxlen = 0, .mode = 0555, - .child = ipv6_icmp_table + .child = ipv6_icmp_table_template }, { .ctl_name = NET_IPV6_BINDV6ONLY, .procname = "bindv6only", - .data = &sysctl_ipv6_bindv6only, + .data = &init_net.ipv6.sysctl.bindv6only, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH, - .procname = "ip6frag_high_thresh", - .data = &ip6_frags_ctl.high_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH, - .procname = "ip6frag_low_thresh", - .data = &ip6_frags_ctl.low_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV6_IP6FRAG_TIME, - .procname = "ip6frag_time", - .data = &ip6_frags_ctl.timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, - { - .ctl_name = NET_IPV6_IP6FRAG_SECRET_INTERVAL, - .procname = "ip6frag_secret_interval", - .data = &ip6_frags_ctl.secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies - }, - { .ctl_name = NET_IPV6_MLD_MAX_MSF, .procname = "mld_max_msf", .data = &sysctl_mld_max_msf, @@ -84,39 +48,106 @@ static ctl_table ipv6_table[] = { { .ctl_name = 0 } }; -static struct ctl_table_header *ipv6_sysctl_header; - -static ctl_table ipv6_net_table[] = { - { - .ctl_name = NET_IPV6, - .procname = "ipv6", - .mode = 0555, - .child = ipv6_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipv6_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipv6_net_table - }, - { .ctl_name = 0 } +struct ctl_path net_ipv6_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv6", .ctl_name = NET_IPV6, }, + { }, }; +EXPORT_SYMBOL_GPL(net_ipv6_ctl_path); -void ipv6_sysctl_register(void) +static int ipv6_sysctl_net_init(struct net *net) { - ipv6_sysctl_header = register_sysctl_table(ipv6_root_table); + struct ctl_table *ipv6_table; + struct ctl_table *ipv6_route_table; + struct ctl_table *ipv6_icmp_table; + int err; + + err = -ENOMEM; + ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template), + GFP_KERNEL); + if (!ipv6_table) + goto out; + + ipv6_route_table = ipv6_route_sysctl_init(net); + if (!ipv6_route_table) + goto out_ipv6_table; + + ipv6_icmp_table = ipv6_icmp_sysctl_init(net); + if (!ipv6_icmp_table) + goto out_ipv6_route_table; + + ipv6_route_table[0].data = &net->ipv6.sysctl.flush_delay; + /* ipv6_route_table[1].data will be handled when we have + routes per namespace */ + ipv6_route_table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; + ipv6_route_table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + ipv6_route_table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; + ipv6_route_table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; + ipv6_route_table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; + ipv6_route_table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; + ipv6_route_table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; + ipv6_table[0].child = ipv6_route_table; + + ipv6_icmp_table[0].data = &net->ipv6.sysctl.icmpv6_time; + ipv6_table[1].child = ipv6_icmp_table; + + ipv6_table[2].data = &net->ipv6.sysctl.bindv6only; + + /* We don't want this value to be per namespace, it should be global + to all namespaces, so make it read-only when we are not in the + init network namespace */ + if (net != &init_net) + ipv6_table[3].mode = 0444; + + net->ipv6.sysctl.table = register_net_sysctl_table(net, net_ipv6_ctl_path, + ipv6_table); + if (!net->ipv6.sysctl.table) + return -ENOMEM; + + if (!net->ipv6.sysctl.table) + goto out_ipv6_icmp_table; + + err = 0; +out: + return err; + +out_ipv6_icmp_table: + kfree(ipv6_icmp_table); +out_ipv6_route_table: + kfree(ipv6_route_table); +out_ipv6_table: + kfree(ipv6_table); + goto out; } -void ipv6_sysctl_unregister(void) +static void ipv6_sysctl_net_exit(struct net *net) { - unregister_sysctl_table(ipv6_sysctl_header); -} + struct ctl_table *ipv6_table; + struct ctl_table *ipv6_route_table; + struct ctl_table *ipv6_icmp_table; -#endif /* CONFIG_SYSCTL */ + ipv6_table = net->ipv6.sysctl.table->ctl_table_arg; + ipv6_route_table = ipv6_table[0].child; + ipv6_icmp_table = ipv6_table[1].child; + unregister_net_sysctl_table(net->ipv6.sysctl.table); + kfree(ipv6_table); + kfree(ipv6_route_table); + kfree(ipv6_icmp_table); +} + +static struct pernet_operations ipv6_sysctl_net_ops = { + .init = ipv6_sysctl_net_init, + .exit = ipv6_sysctl_net_exit, +}; +int ipv6_sysctl_register(void) +{ + return register_pernet_subsys(&ipv6_sysctl_net_ops); +} + +void ipv6_sysctl_unregister(void) +{ + unregister_pernet_subsys(&ipv6_sysctl_net_ops); +} diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 93980c3b83e..00c08399837 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -265,7 +265,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) { + if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) @@ -733,7 +733,7 @@ static int tcp_v6_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, struct in6_addr *saddr, struct in6_addr *daddr, struct tcphdr *th, int protocol, - int tcplen) + unsigned int tcplen) { struct scatterlist sg[4]; __u16 data_len; @@ -818,7 +818,7 @@ static int tcp_v6_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, struct dst_entry *dst, struct request_sock *req, struct tcphdr *th, int protocol, - int tcplen) + unsigned int tcplen) { struct in6_addr *saddr, *daddr; @@ -985,7 +985,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; struct flowi fl; - int tot_len = sizeof(*th); + unsigned int tot_len = sizeof(*th); #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif @@ -1085,7 +1085,7 @@ static void tcp_v6_send_ack(struct tcp_timewait_sock *tw, struct tcphdr *th = tcp_hdr(skb), *t1; struct sk_buff *buff; struct flowi fl; - int tot_len = sizeof(struct tcphdr); + unsigned int tot_len = sizeof(struct tcphdr); __be32 *topt; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; @@ -2166,14 +2166,36 @@ static struct inet_protosw tcpv6_protosw = { INET_PROTOSW_ICSK, }; -void __init tcpv6_init(void) +int __init tcpv6_init(void) { + int ret; + + ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP); + if (ret) + goto out; + /* register inet6 protocol */ - if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0) - printk(KERN_ERR "tcpv6_init: Could not register protocol\n"); - inet6_register_protosw(&tcpv6_protosw); + ret = inet6_register_protosw(&tcpv6_protosw); + if (ret) + goto out_tcpv6_protocol; + + ret = inet_csk_ctl_sock_create(&tcp6_socket, PF_INET6, + SOCK_RAW, IPPROTO_TCP); + if (ret) + goto out_tcpv6_protosw; +out: + return ret; - if (inet_csk_ctl_sock_create(&tcp6_socket, PF_INET6, SOCK_RAW, - IPPROTO_TCP) < 0) - panic("Failed to create the TCPv6 control socket.\n"); +out_tcpv6_protocol: + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); +out_tcpv6_protosw: + inet6_unregister_protosw(&tcpv6_protosw); + goto out; +} + +void tcpv6_exit(void) +{ + sock_release(tcp6_socket); + inet6_unregister_protosw(&tcpv6_protosw); + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ee1cc3f8599..bd4b9df8f61 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -34,6 +34,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/init.h> +#include <linux/module.h> #include <linux/skbuff.h> #include <asm/uaccess.h> @@ -50,8 +51,6 @@ #include <linux/seq_file.h> #include "udp_impl.h" -DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; - static inline int udp_v6_get_port(struct sock *sk, unsigned short snum) { return udp_get_port(sk, snum, ipv6_rcv_saddr_equal); @@ -121,6 +120,7 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsigned int ulen, copied; + int peeked; int err; int is_udplite = IS_UDPLITE(sk); @@ -131,7 +131,8 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, return ipv6_recv_error(sk, msg, len); try_again: - skb = skb_recv_datagram(sk, flags, noblock, &err); + skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, &err); if (!skb) goto out; @@ -164,6 +165,9 @@ try_again: if (err) goto out_free; + if (!peeked) + UDP6_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite); + sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ @@ -200,13 +204,17 @@ try_again: err = ulen; out_free: + lock_sock(sk); skb_free_datagram(sk, skb); + release_sock(sk); out: return err; csum_copy_err: - UDP6_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); - skb_kill_datagram(sk, skb, flags); + lock_sock(sk); + if (!skb_kill_datagram(sk, skb, flags)) + UDP6_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); + release_sock(sk); if (flags & MSG_DONTWAIT) return -EAGAIN; @@ -251,13 +259,14 @@ static __inline__ void udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __be32 info ) { - return __udp6_lib_err(skb, opt, type, code, offset, info, udp_hash); + __udp6_lib_err(skb, opt, type, code, offset, info, udp_hash); } int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; + int is_udplite = IS_UDPLITE(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; @@ -265,7 +274,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). */ - if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { if (up->pcrlen == 0) { /* full coverage was set */ LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: partial coverage" @@ -289,13 +298,13 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) - UDP6_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag); + UDP6_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; } - UDP6_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag); + return 0; drop: - UDP6_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag); + UDP6_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -361,10 +370,21 @@ static int __udp6_lib_mcast_deliver(struct sk_buff *skb, struct in6_addr *saddr, while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr, uh->source, saddr, dif))) { struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); - if (buff) - udpv6_queue_rcv_skb(sk2, buff); + if (buff) { + bh_lock_sock_nested(sk2); + if (!sock_owned_by_user(sk2)) + udpv6_queue_rcv_skb(sk2, buff); + else + sk_add_backlog(sk2, buff); + bh_unlock_sock(sk2); + } } - udpv6_queue_rcv_skb(sk, skb); + bh_lock_sock_nested(sk); + if (!sock_owned_by_user(sk)) + udpv6_queue_rcv_skb(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); out: read_unlock(&udp_hash_lock); return 0; @@ -477,7 +497,12 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], /* deliver */ - udpv6_queue_rcv_skb(sk, skb); + bh_lock_sock_nested(sk); + if (!sock_owned_by_user(sk)) + udpv6_queue_rcv_skb(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); sock_put(sk); return 0; @@ -523,6 +548,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) struct inet_sock *inet = inet_sk(sk); struct flowi *fl = &inet->cork.fl; int err = 0; + int is_udplite = IS_UDPLITE(sk); __wsum csum = 0; /* Grab the skbuff where UDP header space exists. */ @@ -538,7 +564,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) uh->len = htons(up->len); uh->check = 0; - if (up->pcflag) + if (is_udplite) csum = udplite_csum_outgoing(sk, skb); else csum = udp_csum_outgoing(sk, skb); @@ -554,7 +580,7 @@ out: up->len = 0; up->pending = 0; if (!err) - UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, up->pcflag); + UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -578,7 +604,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; int connected = 0; - int is_udplite = up->pcflag; + int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); /* destination address check */ @@ -748,7 +774,7 @@ do_udp_sendmsg: if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) { + if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); if (err < 0) @@ -988,6 +1014,10 @@ struct proto udpv6_prot = { .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v6_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem = &sysctl_udp_wmem_min, + .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, @@ -1007,9 +1037,27 @@ static struct inet_protosw udpv6_protosw = { }; -void __init udpv6_init(void) +int __init udpv6_init(void) +{ + int ret; + + ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP); + if (ret) + goto out; + + ret = inet6_register_protosw(&udpv6_protosw); + if (ret) + goto out_udpv6_protocol; +out: + return ret; + +out_udpv6_protocol: + inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); + goto out; +} + +void udpv6_exit(void) { - if (inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP) < 0) - printk(KERN_ERR "udpv6_init: Could not register protocol\n"); - inet6_register_protosw(&udpv6_protosw); + inet6_unregister_protosw(&udpv6_protosw); + inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); } diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 2d3fda60123..21be3a83e7b 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -5,6 +5,7 @@ #include <net/protocol.h> #include <net/addrconf.h> #include <net/inet_common.h> +#include <net/transp_v6.h> extern int __udp6_lib_rcv(struct sk_buff *, struct hlist_head [], int ); extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 5a0379f7141..87d4202522e 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -26,7 +26,7 @@ static void udplitev6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __be32 info) { - return __udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash); + __udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash); } static struct inet6_protocol udplitev6_protocol = { @@ -77,12 +77,29 @@ static struct inet_protosw udplite6_protosw = { .flags = INET_PROTOSW_PERMANENT, }; -void __init udplitev6_init(void) +int __init udplitev6_init(void) { - if (inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE) < 0) - printk(KERN_ERR "%s: Could not register.\n", __FUNCTION__); + int ret; - inet6_register_protosw(&udplite6_protosw); + ret = inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); + if (ret) + goto out; + + ret = inet6_register_protosw(&udplite6_protosw); + if (ret) + goto out_udplitev6_protocol; +out: + return ret; + +out_udplitev6_protocol: + inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); + goto out; +} + +void udplitev6_exit(void) +{ + inet6_unregister_protosw(&udplite6_protosw); + inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); } #ifdef CONFIG_PROC_FS diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 515783707e8..a4714d76ae6 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -16,120 +16,37 @@ #include <net/ipv6.h> #include <net/xfrm.h> -int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) { - int err; - __be32 seq; - struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH]; - struct xfrm_state *x; - int xfrm_nr = 0; - int decaps = 0; - unsigned int nhoff; - - nhoff = IP6CB(skb)->nhoff; - - seq = 0; - if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) - goto drop; - - do { - struct ipv6hdr *iph = ipv6_hdr(skb); - - if (xfrm_nr == XFRM_MAX_DEPTH) - goto drop; - - x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, - nexthdr, AF_INET6); - if (x == NULL) - goto drop; - spin_lock(&x->lock); - if (unlikely(x->km.state != XFRM_STATE_VALID)) - goto drop_unlock; - - if (x->props.replay_window && xfrm_replay_check(x, seq)) - goto drop_unlock; - - if (xfrm_state_check_expire(x)) - goto drop_unlock; - - nexthdr = x->type->input(x, skb); - if (nexthdr <= 0) - goto drop_unlock; - - skb_network_header(skb)[nhoff] = nexthdr; - - if (x->props.replay_window) - xfrm_replay_advance(x, seq); - - x->curlft.bytes += skb->len; - x->curlft.packets++; - - spin_unlock(&x->lock); - - xfrm_vec[xfrm_nr++] = x; - - if (x->outer_mode->input(x, skb)) - goto drop; - - if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { - decaps = 1; - break; - } - - if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) < 0) - goto drop; - } while (!err); + return xfrm6_extract_header(skb); +} - /* Allocate new secpath or COW existing one. */ - if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { - struct sec_path *sp; - sp = secpath_dup(skb->sp); - if (!sp) - goto drop; - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; - } +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + return xfrm_input(skb, nexthdr, spi, 0); +} +EXPORT_SYMBOL(xfrm6_rcv_spi); - if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) - goto drop; +int xfrm6_transport_finish(struct sk_buff *skb, int async) +{ + skb_network_header(skb)[IP6CB(skb)->nhoff] = + XFRM_MODE_SKB_CB(skb)->protocol; - memcpy(skb->sp->xvec + skb->sp->len, xfrm_vec, - xfrm_nr * sizeof(xfrm_vec[0])); - skb->sp->len += xfrm_nr; - - nf_reset(skb); - - if (decaps) { - dst_release(skb->dst); - skb->dst = NULL; - netif_rx(skb); - return -1; - } else { -#ifdef CONFIG_NETFILTER - ipv6_hdr(skb)->payload_len = htons(skb->len); - __skb_push(skb, skb->data - skb_network_header(skb)); - - NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL, - ip6_rcv_finish); - return -1; -#else +#ifndef CONFIG_NETFILTER + if (!async) return 1; #endif - } -drop_unlock: - spin_unlock(&x->lock); - xfrm_state_put(x); -drop: - while (--xfrm_nr >= 0) - xfrm_state_put(xfrm_vec[xfrm_nr]); - kfree_skb(skb); + ipv6_hdr(skb)->payload_len = htons(skb->len); + __skb_push(skb, skb->data - skb_network_header(skb)); + + NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + ip6_rcv_finish); return -1; } -EXPORT_SYMBOL(xfrm6_rcv_spi); - int xfrm6_rcv(struct sk_buff *skb) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], @@ -144,10 +61,28 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, struct xfrm_state *x = NULL; int wildcard = 0; xfrm_address_t *xany; - struct xfrm_state *xfrm_vec_one = NULL; int nh = 0; int i = 0; + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(LINUX_MIB_XFRMINERROR); + goto drop; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + if (1 + skb->sp->len == XFRM_MAX_DEPTH) { + XFRM_INC_STATS(LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + xany = (xfrm_address_t *)&in6addr_any; for (i = 0; i < 3; i++) { @@ -200,47 +135,37 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, continue; } + spin_unlock(&x->lock); + nh = x->type->input(x, skb); if (nh <= 0) { - spin_unlock(&x->lock); xfrm_state_put(x); x = NULL; continue; } - x->curlft.bytes += skb->len; - x->curlft.packets++; - - spin_unlock(&x->lock); - - xfrm_vec_one = x; + /* Found a state */ break; } - if (!xfrm_vec_one) + if (!x) { + XFRM_INC_STATS(LINUX_MIB_XFRMINNOSTATES); + xfrm_audit_state_notfound_simple(skb, AF_INET6); goto drop; - - /* Allocate new secpath or COW existing one. */ - if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { - struct sec_path *sp; - sp = secpath_dup(skb->sp); - if (!sp) - goto drop; - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; } - if (1 + skb->sp->len > XFRM_MAX_DEPTH) - goto drop; + skb->sp->xvec[skb->sp->len++] = x; + + spin_lock(&x->lock); - skb->sp->xvec[skb->sp->len] = xfrm_vec_one; - skb->sp->len ++; + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); return 1; + drop: - if (xfrm_vec_one) - xfrm_state_put(xfrm_vec_one); return -1; } diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index 2bfb4f05c14..0527d11c1ae 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -19,31 +19,39 @@ #include <net/ipv6.h> #include <net/xfrm.h> +static void xfrm6_beet_make_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + iph->version = 6; + + memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, + sizeof(iph->flow_lbl)); + iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; + + ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); + iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; +} + /* Add encapsulation header. * * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. */ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) { - struct ipv6hdr *iph, *top_iph; - u8 *prevhdr; - int hdr_len; + struct ipv6hdr *top_iph; - iph = ipv6_hdr(skb); - - hdr_len = ip6_find_1stfragopt(skb, &prevhdr); - - skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); skb_set_network_header(skb, -x->props.header_len); - skb->transport_header = skb->network_header + hdr_len; - __skb_pull(skb, hdr_len); + skb->mac_header = skb->network_header + + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + xfrm6_beet_make_header(skb); top_iph = ipv6_hdr(skb); - memmove(top_iph, iph, hdr_len); ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); - return 0; } @@ -52,19 +60,21 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) struct ipv6hdr *ip6h; const unsigned char *old_mac; int size = sizeof(struct ipv6hdr); - int err = -EINVAL; + int err; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + err = skb_cow_head(skb, size + skb->mac_len); + if (err) goto out; - skb_push(skb, size); - memmove(skb->data, skb_network_header(skb), size); + __skb_push(skb, size); skb_reset_network_header(skb); old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); + xfrm6_beet_make_header(skb); + ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); @@ -75,8 +85,10 @@ out: } static struct xfrm_mode xfrm6_beet_mode = { - .input = xfrm6_beet_input, - .output = xfrm6_beet_output, + .input2 = xfrm6_beet_input, + .input = xfrm_prepare_input, + .output2 = xfrm6_beet_output, + .output = xfrm6_prepare_output, .owner = THIS_MODULE, .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index a7bc8c62317..63d5d493098 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -28,6 +28,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/spinlock.h> #include <linux/stringify.h> #include <linux/time.h> #include <net/ipv6.h> diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index fd84e221727..0c742faaa30 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -25,46 +25,29 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) IP6_ECN_set_ce(inner_iph); } -static inline void ip6ip_ecn_decapsulate(struct sk_buff *skb) -{ - if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6_hdr(skb)))) - IP_ECN_set_ce(ipip_hdr(skb)); -} - /* Add encapsulation header. * * The top IP header will be constructed per RFC 2401. */ -static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { struct dst_entry *dst = skb->dst; - struct xfrm_dst *xdst = (struct xfrm_dst*)dst; - struct ipv6hdr *iph, *top_iph; + struct ipv6hdr *top_iph; int dsfield; - iph = ipv6_hdr(skb); - skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); - skb->transport_header = skb->network_header + sizeof(*iph); + skb->transport_header = skb->network_header + sizeof(*top_iph); top_iph = ipv6_hdr(skb); top_iph->version = 6; - if (xdst->route->ops->family == AF_INET6) { - top_iph->priority = iph->priority; - top_iph->flow_lbl[0] = iph->flow_lbl[0]; - top_iph->flow_lbl[1] = iph->flow_lbl[1]; - top_iph->flow_lbl[2] = iph->flow_lbl[2]; - top_iph->nexthdr = IPPROTO_IPV6; - } else { - top_iph->priority = 0; - top_iph->flow_lbl[0] = 0; - top_iph->flow_lbl[1] = 0; - top_iph->flow_lbl[2] = 0; - top_iph->nexthdr = IPPROTO_IPIP; - } - dsfield = ipv6_get_dsfield(top_iph); + + memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, + sizeof(top_iph->flow_lbl)); + top_iph->nexthdr = x->inner_mode->afinfo->proto; + + dsfield = XFRM_MODE_SKB_CB(skb)->tos; dsfield = INET_ECN_encapsulate(dsfield, dsfield); if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; @@ -72,18 +55,15 @@ static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); - skb->protocol = htons(ETH_P_IPV6); return 0; } -static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; const unsigned char *old_mac; - const unsigned char *nh = skb_network_header(skb); - if (nh[IP6CB(skb)->nhoff] != IPPROTO_IPV6 && - nh[IP6CB(skb)->nhoff] != IPPROTO_IPIP) + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) goto out; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; @@ -92,17 +72,12 @@ static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) goto out; - nh = skb_network_header(skb); - if (nh[IP6CB(skb)->nhoff] == IPPROTO_IPV6) { - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(ipv6_hdr(skb), ipipv6_hdr(skb)); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(skb); - } else { - if (!(x->props.flags & XFRM_STATE_NOECN)) - ip6ip_ecn_decapsulate(skb); - skb->protocol = htons(ETH_P_IP); - } + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), + ipipv6_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); @@ -114,19 +89,21 @@ out: } static struct xfrm_mode xfrm6_tunnel_mode = { - .input = xfrm6_tunnel_input, - .output = xfrm6_tunnel_output, + .input2 = xfrm6_mode_tunnel_input, + .input = xfrm_prepare_input, + .output2 = xfrm6_mode_tunnel_output, + .output = xfrm6_prepare_output, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, }; -static int __init xfrm6_tunnel_init(void) +static int __init xfrm6_mode_tunnel_init(void) { return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); } -static void __exit xfrm6_tunnel_exit(void) +static void __exit xfrm6_mode_tunnel_exit(void) { int err; @@ -134,7 +111,7 @@ static void __exit xfrm6_tunnel_exit(void) BUG_ON(err); } -module_init(xfrm6_tunnel_init); -module_exit(xfrm6_tunnel_exit); +module_init(xfrm6_mode_tunnel_init); +module_exit(xfrm6_mode_tunnel_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 656976760ad..b34c58c6565 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -10,10 +10,12 @@ */ #include <linux/if_ether.h> -#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <linux/netfilter_ipv6.h> +#include <net/dst.h> #include <net/ipv6.h> #include <net/xfrm.h> @@ -43,97 +45,50 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) return ret; } -static inline int xfrm6_output_one(struct sk_buff *skb) +int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct ipv6hdr *iph; int err; - if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { - err = xfrm6_tunnel_check_size(skb); - if (err) - goto error_nolock; - } - - err = xfrm_output(skb); + err = xfrm6_tunnel_check_size(skb); if (err) - goto error_nolock; + return err; - iph = ipv6_hdr(skb); - iph->payload_len = htons(skb->len - sizeof(*iph)); + XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; - err = 0; - -out_exit: - return err; -error_nolock: - kfree_skb(skb); - goto out_exit; + return xfrm6_extract_header(skb); } -static int xfrm6_output_finish2(struct sk_buff *skb) +int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - while (likely((err = xfrm6_output_one(skb)) == 0)) { - nf_reset(skb); - - err = nf_hook(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); - if (unlikely(err != 1)) - break; + err = x->inner_mode->afinfo->extract_output(x, skb); + if (err) + return err; - if (!skb->dst->xfrm) - return dst_output(skb); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif - err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, - skb->dst->dev, xfrm6_output_finish2); - if (unlikely(err != 1)) - break; - } + skb->protocol = htons(ETH_P_IPV6); - return err; + return x->outer_mode->output2(x, skb); } +EXPORT_SYMBOL(xfrm6_prepare_output); static int xfrm6_output_finish(struct sk_buff *skb) { - struct sk_buff *segs; - - if (!skb_is_gso(skb)) - return xfrm6_output_finish2(skb); +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif skb->protocol = htons(ETH_P_IPV6); - segs = skb_gso_segment(skb, 0); - kfree_skb(skb); - if (unlikely(IS_ERR(segs))) - return PTR_ERR(segs); - - do { - struct sk_buff *nskb = segs->next; - int err; - - segs->next = NULL; - err = xfrm6_output_finish2(segs); - - if (unlikely(err)) { - while ((segs = nskb)) { - nskb = segs->next; - segs->next = NULL; - kfree_skb(segs); - } - return err; - } - - segs = nskb; - } while (segs); - - return 0; + return xfrm_output(skb); } int xfrm6_output(struct sk_buff *skb) { - return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev, + return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dst->dev, xfrm6_output_finish); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index b8e9eb445d7..c25a6b527fc 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -11,9 +11,11 @@ * */ -#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/kernel.h> #include <linux/netdevice.h> #include <net/addrconf.h> +#include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/ipv6.h> @@ -25,35 +27,40 @@ static struct dst_ops xfrm6_dst_ops; static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static int xfrm6_dst_lookup(struct xfrm_dst **xdst, struct flowi *fl) +static struct dst_entry *xfrm6_dst_lookup(int tos, xfrm_address_t *saddr, + xfrm_address_t *daddr) { - struct dst_entry *dst = ip6_route_output(NULL, fl); - int err = dst->error; - if (!err) - *xdst = (struct xfrm_dst *) dst; - else + struct flowi fl = {}; + struct dst_entry *dst; + int err; + + memcpy(&fl.fl6_dst, daddr, sizeof(fl.fl6_dst)); + if (saddr) + memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src)); + + dst = ip6_route_output(NULL, &fl); + + err = dst->error; + if (dst->error) { dst_release(dst); - return err; + dst = ERR_PTR(err); + } + + return dst; } static int xfrm6_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) { - struct rt6_info *rt; - struct flowi fl_tunnel = { - .nl_u = { - .ip6_u = { - .daddr = *(struct in6_addr *)&daddr->a6, - }, - }, - }; - - if (!xfrm6_dst_lookup((struct xfrm_dst **)&rt, &fl_tunnel)) { - ipv6_get_saddr(&rt->u.dst, (struct in6_addr *)&daddr->a6, - (struct in6_addr *)&saddr->a6); - dst_release(&rt->u.dst); - return 0; - } - return -EHOSTUNREACH; + struct dst_entry *dst; + + dst = xfrm6_dst_lookup(0, NULL, daddr); + if (IS_ERR(dst)) + return -EHOSTUNREACH; + + ipv6_get_saddr(dst, (struct in6_addr *)&daddr->a6, + (struct in6_addr *)&saddr->a6); + dst_release(dst); + return 0; } static struct dst_entry * @@ -86,177 +93,53 @@ __xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) return dst; } -static inline struct in6_addr* -__xfrm6_bundle_addr_remote(struct xfrm_state *x, struct in6_addr *addr) +static int xfrm6_get_tos(struct flowi *fl) { - return (x->type->remote_addr) ? - (struct in6_addr*)x->type->remote_addr(x, (xfrm_address_t *)addr) : - (struct in6_addr*)&x->id.daddr; + return 0; } -static inline struct in6_addr* -__xfrm6_bundle_addr_local(struct xfrm_state *x, struct in6_addr *addr) +static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) { - return (x->type->local_addr) ? - (struct in6_addr*)x->type->local_addr(x, (xfrm_address_t *)addr) : - (struct in6_addr*)&x->props.saddr; -} + if (dst->ops->family == AF_INET6) { + struct rt6_info *rt = (struct rt6_info*)dst; + if (rt->rt6i_node) + path->path_cookie = rt->rt6i_node->fn_sernum; + } -static inline void -__xfrm6_bundle_len_inc(int *len, int *nflen, struct xfrm_state *x) -{ - if (x->type->flags & XFRM_TYPE_NON_FRAGMENT) - *nflen += x->props.header_len; - else - *len += x->props.header_len; -} + path->u.rt6.rt6i_nfheader_len = nfheader_len; -static inline void -__xfrm6_bundle_len_dec(int *len, int *nflen, struct xfrm_state *x) -{ - if (x->type->flags & XFRM_TYPE_NON_FRAGMENT) - *nflen -= x->props.header_len; - else - *len -= x->props.header_len; + return 0; } -/* Allocate chain of dst_entry's, attach known xfrm's, calculate - * all the metrics... Shortly, bundle a bundle. - */ - -static int -__xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, - struct flowi *fl, struct dst_entry **dst_p) +static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) { - struct dst_entry *dst, *dst_prev; - struct rt6_info *rt0 = (struct rt6_info*)(*dst_p); - struct rt6_info *rt = rt0; - struct flowi fl_tunnel = { - .nl_u = { - .ip6_u = { - .saddr = fl->fl6_src, - .daddr = fl->fl6_dst, - } - } - }; - int i; - int err = 0; - int header_len = 0; - int nfheader_len = 0; - int trailer_len = 0; - - dst = dst_prev = NULL; - dst_hold(&rt->u.dst); - - for (i = 0; i < nx; i++) { - struct dst_entry *dst1 = dst_alloc(&xfrm6_dst_ops); - struct xfrm_dst *xdst; - - if (unlikely(dst1 == NULL)) { - err = -ENOBUFS; - dst_release(&rt->u.dst); - goto error; - } + struct rt6_info *rt = (struct rt6_info*)xdst->route; - if (!dst) - dst = dst1; - else { - dst_prev->child = dst1; - dst1->flags |= DST_NOHASH; - dst_clone(dst1); - } - - xdst = (struct xfrm_dst *)dst1; - xdst->route = &rt->u.dst; - xdst->genid = xfrm[i]->genid; - if (rt->rt6i_node) - xdst->route_cookie = rt->rt6i_node->fn_sernum; - - dst1->next = dst_prev; - dst_prev = dst1; - - __xfrm6_bundle_len_inc(&header_len, &nfheader_len, xfrm[i]); - trailer_len += xfrm[i]->props.trailer_len; - - if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { - unsigned short encap_family = xfrm[i]->props.family; - switch(encap_family) { - case AF_INET: - fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4; - fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4; - break; - case AF_INET6: - ipv6_addr_copy(&fl_tunnel.fl6_dst, __xfrm6_bundle_addr_remote(xfrm[i], &fl->fl6_dst)); - - ipv6_addr_copy(&fl_tunnel.fl6_src, __xfrm6_bundle_addr_local(xfrm[i], &fl->fl6_src)); - break; - default: - BUG_ON(1); - } + xdst->u.dst.dev = dev; + dev_hold(dev); - err = xfrm_dst_lookup((struct xfrm_dst **) &rt, - &fl_tunnel, encap_family); - if (err) - goto error; - } else - dst_hold(&rt->u.dst); - } + xdst->u.rt6.rt6i_idev = in6_dev_get(rt->u.dst.dev); + if (!xdst->u.rt6.rt6i_idev) + return -ENODEV; - dst_prev->child = &rt->u.dst; - dst->path = &rt->u.dst; + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | + RTF_LOCAL); + xdst->u.rt6.rt6i_metric = rt->rt6i_metric; + xdst->u.rt6.rt6i_node = rt->rt6i_node; if (rt->rt6i_node) - ((struct xfrm_dst *)dst)->path_cookie = rt->rt6i_node->fn_sernum; - - *dst_p = dst; - dst = dst_prev; - - dst_prev = *dst_p; - i = 0; - for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { - struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; - - dst_prev->xfrm = xfrm[i++]; - dst_prev->dev = rt->u.dst.dev; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); - dst_prev->obsolete = -1; - dst_prev->flags |= DST_HOST; - dst_prev->lastuse = jiffies; - dst_prev->header_len = header_len; - dst_prev->nfheader_len = nfheader_len; - dst_prev->trailer_len = trailer_len; - memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); - - /* Copy neighbour for reachability confirmation */ - dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); - dst_prev->input = rt->u.dst.input; - dst_prev->output = dst_prev->xfrm->outer_mode->afinfo->output; - /* Sheit... I remember I did this right. Apparently, - * it was magically lost, so this code needs audit */ - x->u.rt6.rt6i_flags = rt0->rt6i_flags&(RTF_ANYCAST|RTF_LOCAL); - x->u.rt6.rt6i_metric = rt0->rt6i_metric; - x->u.rt6.rt6i_node = rt0->rt6i_node; - x->u.rt6.rt6i_gateway = rt0->rt6i_gateway; - memcpy(&x->u.rt6.rt6i_gateway, &rt0->rt6i_gateway, sizeof(x->u.rt6.rt6i_gateway)); - x->u.rt6.rt6i_dst = rt0->rt6i_dst; - x->u.rt6.rt6i_src = rt0->rt6i_src; - x->u.rt6.rt6i_idev = rt0->rt6i_idev; - in6_dev_hold(rt0->rt6i_idev); - __xfrm6_bundle_len_dec(&header_len, &nfheader_len, x->u.dst.xfrm); - trailer_len -= x->u.dst.xfrm->props.trailer_len; - } + xdst->route_cookie = rt->rt6i_node->fn_sernum; + xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; + xdst->u.rt6.rt6i_dst = rt->rt6i_dst; + xdst->u.rt6.rt6i_src = rt->rt6i_src; - xfrm_init_pmtu(dst); return 0; - -error: - if (dst) - dst_free(dst); - return err; } static inline void -_decode_session6(struct sk_buff *skb, struct flowi *fl) +_decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) { u16 offset = skb_network_header_len(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -265,8 +148,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) u8 nexthdr = nh[IP6CB(skb)->nhoff]; memset(fl, 0, sizeof(struct flowi)); - ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr); - ipv6_addr_copy(&fl->fl6_src, &hdr->saddr); + ipv6_addr_copy(&fl->fl6_dst, reverse ? &hdr->saddr : &hdr->daddr); + ipv6_addr_copy(&fl->fl6_src, reverse ? &hdr->daddr : &hdr->saddr); while (pskb_may_pull(skb, nh + offset + 1 - skb->data)) { nh = skb_network_header(skb); @@ -289,8 +172,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) if (pskb_may_pull(skb, nh + offset + 4 - skb->data)) { __be16 *ports = (__be16 *)exthdr; - fl->fl_ip_sport = ports[0]; - fl->fl_ip_dport = ports[1]; + fl->fl_ip_sport = ports[!!reverse]; + fl->fl_ip_dport = ports[!reverse]; } fl->proto = nexthdr; return; @@ -329,7 +212,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) } } -static inline int xfrm6_garbage_collect(void) +static inline int xfrm6_garbage_collect(struct dst_ops *ops) { xfrm6_policy_afinfo.garbage_collect(); return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); @@ -362,7 +245,8 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { - struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev); + struct inet6_dev *loopback_idev = + in6_dev_get(dev->nd_net->loopback_dev); BUG_ON(!loopback_idev); do { @@ -385,6 +269,7 @@ static struct dst_ops xfrm6_dst_ops = { .update_pmtu = xfrm6_update_pmtu, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, + .local_out = __ip6_local_out, .gc_thresh = 1024, .entry_size = sizeof(struct xfrm_dst), }; @@ -395,13 +280,15 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .find_bundle = __xfrm6_find_bundle, - .bundle_create = __xfrm6_bundle_create, .decode_session = _decode_session6, + .get_tos = xfrm6_get_tos, + .init_path = xfrm6_init_path, + .fill_dst = xfrm6_fill_dst, }; -static void __init xfrm6_policy_init(void) +static int __init xfrm6_policy_init(void) { - xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); + return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); } static void xfrm6_policy_fini(void) @@ -409,10 +296,22 @@ static void xfrm6_policy_fini(void) xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } -void __init xfrm6_init(void) +int __init xfrm6_init(void) { - xfrm6_policy_init(); - xfrm6_state_init(); + int ret; + + ret = xfrm6_policy_init(); + if (ret) + goto out; + + ret = xfrm6_state_init(); + if (ret) + goto out_policy; +out: + return ret; +out_policy: + xfrm6_policy_fini(); + goto out; } void xfrm6_fini(void) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index b392bee396f..dc817e035e2 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -14,6 +14,8 @@ #include <net/xfrm.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> +#include <linux/netfilter_ipv6.h> +#include <net/dsfield.h> #include <net/ipv6.h> #include <net/addrconf.h> @@ -168,18 +170,37 @@ __xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) return 0; } +int xfrm6_extract_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + + return 0; +} + static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, + .proto = IPPROTO_IPV6, + .eth_proto = htons(ETH_P_IPV6), .owner = THIS_MODULE, .init_tempsel = __xfrm6_init_tempsel, .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, + .extract_input = xfrm6_extract_input, + .extract_output = xfrm6_extract_output, + .transport_finish = xfrm6_transport_finish, }; -void __init xfrm6_state_init(void) +int __init xfrm6_state_init(void) { - xfrm_state_register_afinfo(&xfrm6_state_afinfo); + return xfrm_state_register_afinfo(&xfrm6_state_afinfo); } void xfrm6_state_fini(void) diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c index 0cf52645053..92fef864e85 100644 --- a/net/ipx/sysctl_net_ipx.c +++ b/net/ipx/sysctl_net_ipx.c @@ -28,31 +28,17 @@ static struct ctl_table ipx_table[] = { { 0 }, }; -static struct ctl_table ipx_dir_table[] = { - { - .ctl_name = NET_IPX, - .procname = "ipx", - .mode = 0555, - .child = ipx_table, - }, - { 0 }, -}; - -static struct ctl_table ipx_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipx_dir_table, - }, - { 0 }, +static struct ctl_path ipx_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipx", .ctl_name = NET_IPX, }, + { } }; static struct ctl_table_header *ipx_table_header; void ipx_register_sysctl(void) { - ipx_table_header = register_sysctl_table(ipx_root_table); + ipx_table_header = register_sysctl_paths(ipx_path, ipx_table); } void ipx_unregister_sysctl(void) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 07dfa7fdd2a..240b0cbfb53 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2410,9 +2410,8 @@ bed: /* Set watchdog timer to expire in <val> ms. */ self->errno = 0; - init_timer(&self->watchdog); - self->watchdog.function = irda_discovery_timeout; - self->watchdog.data = (unsigned long) self; + setup_timer(&self->watchdog, irda_discovery_timeout, + (unsigned long)self); self->watchdog.expires = jiffies + (val * HZ/1000); add_timer(&(self->watchdog)); diff --git a/net/irda/ircomm/ircomm_core.c b/net/irda/ircomm/ircomm_core.c index 2d63fa8e155..b825399fc16 100644 --- a/net/irda/ircomm/ircomm_core.c +++ b/net/irda/ircomm/ircomm_core.c @@ -363,6 +363,18 @@ void ircomm_process_data(struct ircomm_cb *self, struct sk_buff *skb) clen = skb->data[0]; /* + * Input validation check: a stir4200/mcp2150 combinations sometimes + * results in frames with clen > remaining packet size. These are + * illegal; if we throw away just this frame then it seems to carry on + * fine + */ + if (unlikely(skb->len < (clen + 1))) { + IRDA_DEBUG(2, "%s() throwing away illegal frame\n", + __FUNCTION__ ); + return; + } + + /* * If there are any data hiding in the control channel, we must * deliver it first. The side effect is that the control channel * will be removed from the skb diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c index 435b563d29a..87185910d0e 100644 --- a/net/irda/irda_device.c +++ b/net/irda/irda_device.c @@ -57,20 +57,6 @@ static void __irda_task_delete(struct irda_task *task); static hashbin_t *dongles = NULL; static hashbin_t *tasks = NULL; -#ifdef CONFIG_IRDA_DEBUG -static const char *task_state[] = { - "IRDA_TASK_INIT", - "IRDA_TASK_DONE", - "IRDA_TASK_WAIT", - "IRDA_TASK_WAIT1", - "IRDA_TASK_WAIT2", - "IRDA_TASK_WAIT3", - "IRDA_TASK_CHILD_INIT", - "IRDA_TASK_CHILD_WAIT", - "IRDA_TASK_CHILD_DONE", -}; -#endif /* CONFIG_IRDA_DEBUG */ - static void irda_task_timer_expired(void *data); int __init irda_device_init( void) @@ -176,14 +162,6 @@ int irda_device_is_receiving(struct net_device *dev) return req.ifr_receiving; } -void irda_task_next_state(struct irda_task *task, IRDA_TASK_STATE state) -{ - IRDA_DEBUG(2, "%s(), state = %s\n", __FUNCTION__, task_state[state]); - - task->state = state; -} -EXPORT_SYMBOL(irda_task_next_state); - static void __irda_task_delete(struct irda_task *task) { del_timer(&task->timer); @@ -191,14 +169,13 @@ static void __irda_task_delete(struct irda_task *task) kfree(task); } -void irda_task_delete(struct irda_task *task) +static void irda_task_delete(struct irda_task *task) { /* Unregister task */ hashbin_remove(tasks, (long) task, NULL); __irda_task_delete(task); } -EXPORT_SYMBOL(irda_task_delete); /* * Function irda_task_kick (task) @@ -272,51 +249,6 @@ static int irda_task_kick(struct irda_task *task) } /* - * Function irda_task_execute (instance, function, finished) - * - * This function registers and tries to execute tasks that may take some - * time to complete. We do it this hairy way since we may have been - * called from interrupt context, so it's not possible to use - * schedule_timeout() - * Two important notes : - * o Make sure you irda_task_delete(task); in case you delete the - * calling instance. - * o No real need to lock when calling this function, but you may - * want to lock within the task handler. - * Jean II - */ -struct irda_task *irda_task_execute(void *instance, - IRDA_TASK_CALLBACK function, - IRDA_TASK_CALLBACK finished, - struct irda_task *parent, void *param) -{ - struct irda_task *task; - - IRDA_DEBUG(2, "%s()\n", __FUNCTION__); - - task = kmalloc(sizeof(struct irda_task), GFP_ATOMIC); - if (!task) - return NULL; - - task->state = IRDA_TASK_INIT; - task->instance = instance; - task->function = function; - task->finished = finished; - task->parent = parent; - task->param = param; - task->magic = IRDA_TASK_MAGIC; - - init_timer(&task->timer); - - /* Register task */ - hashbin_insert(tasks, (irda_queue_t *) task, (long) task, NULL); - - /* No time to waste, so lets get going! */ - return irda_task_kick(task) ? NULL : task; -} -EXPORT_SYMBOL(irda_task_execute); - -/* * Function irda_task_timer_expired (data) * * Task time has expired. We now try to execute task (again), and restart @@ -364,105 +296,6 @@ struct net_device *alloc_irdadev(int sizeof_priv) } EXPORT_SYMBOL(alloc_irdadev); -/* - * Function irda_device_init_dongle (self, type, qos) - * - * Initialize attached dongle. - * - * Important : request_module require us to call this function with - * a process context and irq enabled. - Jean II - */ -dongle_t *irda_device_dongle_init(struct net_device *dev, int type) -{ - struct dongle_reg *reg; - dongle_t *dongle = kzalloc(sizeof(dongle_t), GFP_KERNEL); - - might_sleep(); - - spin_lock(&dongles->hb_spinlock); - reg = hashbin_find(dongles, type, NULL); - -#ifdef CONFIG_KMOD - /* Try to load the module needed */ - if (!reg && capable(CAP_SYS_MODULE)) { - spin_unlock(&dongles->hb_spinlock); - - request_module("irda-dongle-%d", type); - - spin_lock(&dongles->hb_spinlock); - reg = hashbin_find(dongles, type, NULL); - } -#endif - - if (!reg || !try_module_get(reg->owner) ) { - IRDA_ERROR("IrDA: Unable to find requested dongle type %x\n", - type); - kfree(dongle); - dongle = NULL; - } - if (dongle) { - /* Bind the registration info to this particular instance */ - dongle->issue = reg; - dongle->dev = dev; - } - spin_unlock(&dongles->hb_spinlock); - return dongle; -} -EXPORT_SYMBOL(irda_device_dongle_init); - -/* - * Function irda_device_dongle_cleanup (dongle) - */ -int irda_device_dongle_cleanup(dongle_t *dongle) -{ - IRDA_ASSERT(dongle != NULL, return -1;); - - dongle->issue->close(dongle); - module_put(dongle->issue->owner); - kfree(dongle); - - return 0; -} -EXPORT_SYMBOL(irda_device_dongle_cleanup); - -/* - * Function irda_device_register_dongle (dongle) - */ -int irda_device_register_dongle(struct dongle_reg *new) -{ - spin_lock(&dongles->hb_spinlock); - /* Check if this dongle has been registered before */ - if (hashbin_find(dongles, new->type, NULL)) { - IRDA_MESSAGE("%s: Dongle type %x already registered\n", - __FUNCTION__, new->type); - } else { - /* Insert IrDA dongle into hashbin */ - hashbin_insert(dongles, (irda_queue_t *) new, new->type, NULL); - } - spin_unlock(&dongles->hb_spinlock); - - return 0; -} -EXPORT_SYMBOL(irda_device_register_dongle); - -/* - * Function irda_device_unregister_dongle (dongle) - * - * Unregister dongle, and remove dongle from list of registered dongles - * - */ -void irda_device_unregister_dongle(struct dongle_reg *dongle) -{ - struct dongle *node; - - spin_lock(&dongles->hb_spinlock); - node = hashbin_remove(dongles, dongle->type, NULL); - if (!node) - IRDA_ERROR("%s: dongle not found!\n", __FUNCTION__); - spin_unlock(&dongles->hb_spinlock); -} -EXPORT_SYMBOL(irda_device_unregister_dongle); - #ifdef CONFIG_ISA_DMA_API /* * Function setup_dma (idev, buffer, count, mode) diff --git a/net/irda/iriap.c b/net/irda/iriap.c index a86a5d83786..390a790886e 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -579,7 +579,7 @@ static void iriap_getvaluebyclass_response(struct iriap_cb *self, fp[n++] = ret_code; /* Insert list length (MSB first) */ - tmp_be16 = __constant_htons(0x0001); + tmp_be16 = htons(0x0001); memcpy(fp+n, &tmp_be16, 2); n += 2; /* Insert object identifier ( MSB first) */ diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c index 4c33bf5c835..6af86eba746 100644 --- a/net/irda/irlap_event.c +++ b/net/irda/irlap_event.c @@ -1199,6 +1199,19 @@ static int irlap_state_nrm_p(struct irlap_cb *self, IRLAP_EVENT event, switch (event) { case RECV_I_RSP: /* Optimize for the common case */ + if (unlikely(skb->len <= LAP_ADDR_HEADER + LAP_CTRL_HEADER)) { + /* + * Input validation check: a stir4200/mcp2150 + * combination sometimes results in an empty i:rsp. + * This makes no sense; we can just ignore the frame + * and send an rr:cmd immediately. This happens before + * changing nr or ns so triggers a retransmit + */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + /* Keep state */ + break; + } /* FIXME: must check for remote_busy below */ #ifdef CONFIG_IRDA_FAST_RR /* @@ -1514,9 +1527,15 @@ static int irlap_state_nrm_p(struct irlap_cb *self, IRLAP_EVENT event, /* N2 is the disconnect timer. Until we reach it, we retry */ if (self->retry_count < self->N2) { - /* Retry sending the pf bit to the secondary */ - irlap_wait_min_turn_around(self, &self->qos_tx); - irlap_send_rr_frame(self, CMD_FRAME); + if (skb_peek(&self->wx_list) == NULL) { + /* Retry sending the pf bit to the secondary */ + IRDA_DEBUG(4, "nrm_p: resending rr"); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + } else { + IRDA_DEBUG(4, "nrm_p: resend frames"); + irlap_resend_rejected_frames(self, CMD_FRAME); + } irlap_start_final_timer(self, self->final_timeout); self->retry_count++; diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index f24cb755908..135ac6907bb 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -103,9 +103,12 @@ int __init irlmp_init(void) irlmp->last_lsap_sel = 0x0f; /* Reserved 0x00-0x0f */ strcpy(sysctl_devname, "Linux"); - /* Do discovery every 3 seconds */ init_timer(&irlmp->discovery_timer); - irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout*HZ); + + /* Do discovery every 3 seconds, conditionaly */ + if (sysctl_discovery) + irlmp_start_discovery_timer(irlmp, + sysctl_discovery_timeout*HZ); return 0; } diff --git a/net/irda/irlmp_event.c b/net/irda/irlmp_event.c index 1bba87e7860..150cd3f1129 100644 --- a/net/irda/irlmp_event.c +++ b/net/irda/irlmp_event.c @@ -174,9 +174,7 @@ void irlmp_discovery_timer_expired(void *data) /* We always cleanup the log (active & passive discovery) */ irlmp_do_expiry(); - /* Active discovery is conditional */ - if (sysctl_discovery) - irlmp_do_discovery(sysctl_discovery_slots); + irlmp_do_discovery(sysctl_discovery_slots); /* Restart timer */ irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout * HZ); diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c index 565cbf0421c..9ab3df15425 100644 --- a/net/irda/irsysctl.c +++ b/net/irda/irsysctl.c @@ -29,6 +29,8 @@ #include <linux/init.h> #include <net/irda/irda.h> /* irda_debug */ +#include <net/irda/irlmp.h> +#include <net/irda/timer.h> #include <net/irda/irias_object.h> extern int sysctl_discovery; @@ -45,6 +47,8 @@ extern int sysctl_max_noreply_time; extern int sysctl_warn_noreply_time; extern int sysctl_lap_keepalive_time; +extern struct irlmp_cb *irlmp; + /* this is needed for the proc_dointvec_minmax - Jean II */ static int max_discovery_slots = 16; /* ??? */ static int min_discovery_slots = 1; @@ -85,6 +89,27 @@ static int do_devname(ctl_table *table, int write, struct file *filp, return ret; } + +static int do_discovery(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (ret) + return ret; + + if (irlmp == NULL) + return -ENODEV; + + if (sysctl_discovery) + irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout*HZ); + else + del_timer_sync(&irlmp->discovery_timer); + + return ret; +} + /* One file */ static ctl_table irda_table[] = { { @@ -93,7 +118,8 @@ static ctl_table irda_table[] = { .data = &sysctl_discovery, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = &do_discovery, + .strategy = &sysctl_intvec }, { .ctl_name = NET_IRDA_DEVNAME, @@ -234,28 +260,10 @@ static ctl_table irda_table[] = { { .ctl_name = 0 } }; -/* One directory */ -static ctl_table irda_net_table[] = { - { - .ctl_name = NET_IRDA, - .procname = "irda", - .maxlen = 0, - .mode = 0555, - .child = irda_table - }, - { .ctl_name = 0 } -}; - -/* The parent directory */ -static ctl_table irda_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .maxlen = 0, - .mode = 0555, - .child = irda_net_table - }, - { .ctl_name = 0 } +static struct ctl_path irda_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "irda", .ctl_name = NET_IRDA, }, + { } }; static struct ctl_table_header *irda_table_header; @@ -268,7 +276,7 @@ static struct ctl_table_header *irda_table_header; */ int __init irda_sysctl_register(void) { - irda_table_header = register_sysctl_table(irda_root_table); + irda_table_header = register_sysctl_paths(irda_path, irda_table); if (!irda_table_header) return -ENOMEM; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index aef66458035..2255e3c082e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -94,13 +94,6 @@ static void iucv_sock_clear_timer(struct sock *sk) sk_stop_timer(sk, &sk->sk_timer); } -static void iucv_sock_init_timer(struct sock *sk) -{ - init_timer(&sk->sk_timer); - sk->sk_timer.function = iucv_sock_timeout; - sk->sk_timer.data = (unsigned long)sk; -} - static struct sock *__iucv_get_sock_by_name(char *nm) { struct sock *sk; @@ -238,7 +231,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) sk->sk_protocol = proto; sk->sk_state = IUCV_OPEN; - iucv_sock_init_timer(sk); + setup_timer(&sk->sk_timer, iucv_sock_timeout, (unsigned long)sk); iucv_sock_link(&iucv_sk_list, sk); return sk; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 7698f6c459d..f13fe8821cb 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -1492,7 +1492,7 @@ static void iucv_tasklet_fn(unsigned long ignored) [0x08] = iucv_message_pending, [0x09] = iucv_message_pending, }; - struct list_head task_queue = LIST_HEAD_INIT(task_queue); + LIST_HEAD(task_queue); struct iucv_irq_list *p, *n; /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ @@ -1526,7 +1526,7 @@ static void iucv_tasklet_fn(unsigned long ignored) static void iucv_work_fn(struct work_struct *work) { typedef void iucv_irq_fn(struct iucv_irq_data *); - struct list_head work_queue = LIST_HEAD_INIT(work_queue); + LIST_HEAD(work_queue); struct iucv_irq_list *p, *n; /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ diff --git a/net/key/af_key.c b/net/key/af_key.c index 76dcd882f87..16b72b5570c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2291,8 +2291,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h return 0; out: - security_xfrm_policy_free(xp); - kfree(xp); + xfrm_policy_destroy(xp); return err; } @@ -3236,8 +3235,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, return xp; out: - security_xfrm_policy_free(xp); - kfree(xp); + xfrm_policy_destroy(xp); return NULL; } diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index a2e7aa63fd8..2ba1bc4f3c3 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -39,7 +39,7 @@ #include <linux/init.h> #include <net/lapb.h> -static struct list_head lapb_list = LIST_HEAD_INIT(lapb_list); +static LIST_HEAD(lapb_list); static DEFINE_RWLOCK(lapb_list_lock); /* diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 5c0b484237c..441bc18f996 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -831,25 +831,21 @@ static void llc_sk_init(struct sock* sk) llc->inc_cntr = llc->dec_cntr = 2; llc->dec_step = llc->connect_step = 1; - init_timer(&llc->ack_timer.timer); + setup_timer(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, + (unsigned long)sk); llc->ack_timer.expire = sysctl_llc2_ack_timeout; - llc->ack_timer.timer.data = (unsigned long)sk; - llc->ack_timer.timer.function = llc_conn_ack_tmr_cb; - init_timer(&llc->pf_cycle_timer.timer); + setup_timer(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, + (unsigned long)sk); llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout; - llc->pf_cycle_timer.timer.data = (unsigned long)sk; - llc->pf_cycle_timer.timer.function = llc_conn_pf_cycle_tmr_cb; - init_timer(&llc->rej_sent_timer.timer); + setup_timer(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, + (unsigned long)sk); llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout; - llc->rej_sent_timer.timer.data = (unsigned long)sk; - llc->rej_sent_timer.timer.function = llc_conn_rej_tmr_cb; - init_timer(&llc->busy_state_timer.timer); + setup_timer(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, + (unsigned long)sk); llc->busy_state_timer.expire = sysctl_llc2_busy_timeout; - llc->busy_state_timer.timer.data = (unsigned long)sk; - llc->busy_state_timer.timer.function = llc_conn_busy_tmr_cb; llc->n2 = 2; /* max retransmit */ llc->k = 2; /* tx win size, will adjust dynam */ diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 576355a192a..6f2ea209032 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -688,9 +688,8 @@ int __init llc_station_init(void) skb_queue_head_init(&llc_main_station.mac_pdu_q); skb_queue_head_init(&llc_main_station.ev_q.list); spin_lock_init(&llc_main_station.ev_q.lock); - init_timer(&llc_main_station.ack_timer); - llc_main_station.ack_timer.data = (unsigned long)&llc_main_station; - llc_main_station.ack_timer.function = llc_station_ack_tmr_cb; + setup_timer(&llc_main_station.ack_timer, llc_station_ack_tmr_cb, + (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; skb = alloc_skb(0, GFP_ATOMIC); diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 46992d03601..5bef1dcf18e 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -92,31 +92,17 @@ static struct ctl_table llc_table[] = { { 0 }, }; -static struct ctl_table llc_dir_table[] = { - { - .ctl_name = NET_LLC, - .procname = "llc", - .mode = 0555, - .child = llc_table, - }, - { 0 }, -}; - -static struct ctl_table llc_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = llc_dir_table, - }, - { 0 }, +static struct ctl_path llc_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "llc", .ctl_name = NET_LLC, }, + { } }; static struct ctl_table_header *llc_table_header; int __init llc_sysctl_init(void) { - llc_table_header = register_sysctl_table(llc_root_table); + llc_table_header = register_sysctl_paths(llc_path, llc_table); return llc_table_header ? 0 : -ENOMEM; } diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index ce176e691af..09c255002e5 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -10,27 +10,84 @@ config MAC80211 select CFG80211 select NET_SCH_FIFO ---help--- - This option enables the hardware independent IEEE 802.11 - networking stack. + This option enables the hardware independent IEEE 802.11 + networking stack. -config MAC80211_RCSIMPLE - bool "'simple' rate control algorithm" if EMBEDDED - default y - depends on MAC80211 +menu "Rate control algorithm selection" + depends on MAC80211 != n + +choice + prompt "Default rate control algorithm" + default MAC80211_RC_DEFAULT_PID + ---help--- + This option selects the default rate control algorithm + mac80211 will use. Note that this default can still be + overriden through the ieee80211_default_rc_algo module + parameter if different algorithms are available. + +config MAC80211_RC_DEFAULT_PID + bool "PID controller based rate control algorithm" + select MAC80211_RC_PID + ---help--- + Select the PID controller based rate control as the + default rate control algorithm. You should choose + this unless you know what you are doing. + +config MAC80211_RC_DEFAULT_SIMPLE + bool "Simple rate control algorithm" + select MAC80211_RC_SIMPLE + ---help--- + Select the simple rate control as the default rate + control algorithm. Note that this is a non-responsive, + dumb algorithm. You should choose the PID rate control + instead. + +config MAC80211_RC_DEFAULT_NONE + bool "No default algorithm" + depends on EMBEDDED help - This option allows you to turn off the 'simple' rate - control algorithm in mac80211. If you do turn it off, - you absolutely need another rate control algorithm. + Selecting this option will select no default algorithm + and allow you to not build any. Do not choose this + option unless you know your driver comes with another + suitable algorithm. +endchoice + +comment "Selecting 'y' for an algorithm will" +comment "build the algorithm into mac80211." + +config MAC80211_RC_DEFAULT + string + default "pid" if MAC80211_RC_DEFAULT_PID + default "simple" if MAC80211_RC_DEFAULT_SIMPLE + default "" - Say Y unless you know you will have another algorithm - available. +config MAC80211_RC_PID + tristate "PID controller based rate control algorithm" + ---help--- + This option enables a TX rate control algorithm for + mac80211 that uses a PID controller to select the TX + rate. + + Say Y or M unless you're sure you want to use a + different rate control algorithm. + +config MAC80211_RC_SIMPLE + tristate "Simple rate control algorithm (DEPRECATED)" + ---help--- + This option enables a very simple, non-responsive TX + rate control algorithm. This algorithm is deprecated + and will be removed from the kernel in the near future. + It has been replaced by the PID algorithm. + + Say N unless you know what you are doing. +endmenu config MAC80211_LEDS bool "Enable LED triggers" depends on MAC80211 && LEDS_TRIGGERS ---help--- - This option enables a few LED triggers for different - packet receive/transmit events. + This option enables a few LED triggers for different + packet receive/transmit events. config MAC80211_DEBUGFS bool "Export mac80211 internals in DebugFS" @@ -51,6 +108,16 @@ config MAC80211_DEBUG If you are not trying to debug or develop the ieee80211 subsystem, you most likely want to say N here. +config MAC80211_HT_DEBUG + bool "Enable HT debugging output" + depends on MAC80211_DEBUG + ---help--- + This option enables 802.11n High Throughput features + debug tracing output. + + If you are not trying to debug of develop the ieee80211 + subsystem, you most likely want to say N here. + config MAC80211_VERBOSE_DEBUG bool "Verbose debugging output" depends on MAC80211_DEBUG diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 1e6237b3484..54f46bc80cf 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -1,11 +1,15 @@ obj-$(CONFIG_MAC80211) += mac80211.o -mac80211-objs-$(CONFIG_MAC80211_LEDS) += ieee80211_led.o -mac80211-objs-$(CONFIG_MAC80211_DEBUGFS) += debugfs.o debugfs_sta.o debugfs_netdev.o debugfs_key.o -mac80211-objs-$(CONFIG_NET_SCHED) += wme.o -mac80211-objs-$(CONFIG_MAC80211_RCSIMPLE) += rc80211_simple.o +# objects for PID algorithm +rc80211_pid-y := rc80211_pid_algo.o +rc80211_pid-$(CONFIG_MAC80211_DEBUGFS) += rc80211_pid_debugfs.o -mac80211-objs := \ +# build helper for PID algorithm +rc-pid-y := $(rc80211_pid-y) +rc-pid-m := rc80211_pid.o + +# mac80211 objects +mac80211-y := \ ieee80211.o \ ieee80211_ioctl.o \ sta_info.o \ @@ -23,5 +27,22 @@ mac80211-objs := \ tx.o \ key.o \ util.o \ - event.o \ - $(mac80211-objs-y) + event.o + +mac80211-$(CONFIG_MAC80211_LEDS) += ieee80211_led.o +mac80211-$(CONFIG_NET_SCHED) += wme.o +mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ + debugfs.o \ + debugfs_sta.o \ + debugfs_netdev.o \ + debugfs_key.o + + +# Build rate control algorithm(s) +CFLAGS_rc80211_simple.o += -DRC80211_SIMPLE_COMPILE +CFLAGS_rc80211_pid_algo.o += -DRC80211_PID_COMPILE +mac80211-$(CONFIG_MAC80211_RC_SIMPLE) += rc80211_simple.o +mac80211-$(CONFIG_MAC80211_RC_PID) += $(rc-pid-$(CONFIG_MAC80211_RC_PID)) + +# Modular rate algorithms are assigned to mac80211-m - make separate modules +obj-m += $(mac80211-m) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9e2bc1fd023..22c9619ba77 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1,17 +1,20 @@ /* * mac80211 configuration hooks for cfg80211 * - * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2006, 2007 Johannes Berg <johannes@sipsolutions.net> * * This file is GPLv2 as found in COPYING. */ +#include <linux/ieee80211.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <net/net_namespace.h> +#include <linux/rcupdate.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "cfg.h" +#include "ieee80211_rate.h" static enum ieee80211_if_types nl80211_type_to_mac80211_type(enum nl80211_iftype type) @@ -90,7 +93,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex, sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_VLAN) + if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN) return -EOPNOTSUPP; ieee80211_if_reinit(dev); @@ -99,8 +102,553 @@ static int ieee80211_change_iface(struct wiphy *wiphy, int ifindex, return 0; } +static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, + u8 key_idx, u8 *mac_addr, + struct key_params *params) +{ + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta = NULL; + enum ieee80211_key_alg alg; + int ret; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + switch (params->cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + alg = ALG_WEP; + break; + case WLAN_CIPHER_SUITE_TKIP: + alg = ALG_TKIP; + break; + case WLAN_CIPHER_SUITE_CCMP: + alg = ALG_CCMP; + break; + default: + return -EINVAL; + } + + if (mac_addr) { + sta = sta_info_get(sdata->local, mac_addr); + if (!sta) + return -ENOENT; + } + + ret = 0; + if (!ieee80211_key_alloc(sdata, sta, alg, key_idx, + params->key_len, params->key)) + ret = -ENOMEM; + + if (sta) + sta_info_put(sta); + + return ret; +} + +static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, + u8 key_idx, u8 *mac_addr) +{ + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + int ret; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + if (mac_addr) { + sta = sta_info_get(sdata->local, mac_addr); + if (!sta) + return -ENOENT; + + ret = 0; + if (sta->key) + ieee80211_key_free(sta->key); + else + ret = -ENOENT; + + sta_info_put(sta); + return ret; + } + + if (!sdata->keys[key_idx]) + return -ENOENT; + + ieee80211_key_free(sdata->keys[key_idx]); + + return 0; +} + +static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, + u8 key_idx, u8 *mac_addr, void *cookie, + void (*callback)(void *cookie, + struct key_params *params)) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct sta_info *sta = NULL; + u8 seq[6] = {0}; + struct key_params params; + struct ieee80211_key *key; + u32 iv32; + u16 iv16; + int err = -ENOENT; + + if (mac_addr) { + sta = sta_info_get(sdata->local, mac_addr); + if (!sta) + goto out; + + key = sta->key; + } else + key = sdata->keys[key_idx]; + + if (!key) + goto out; + + memset(¶ms, 0, sizeof(params)); + + switch (key->conf.alg) { + case ALG_TKIP: + params.cipher = WLAN_CIPHER_SUITE_TKIP; + + iv32 = key->u.tkip.iv32; + iv16 = key->u.tkip.iv16; + + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + sdata->local->ops->get_tkip_seq) + sdata->local->ops->get_tkip_seq( + local_to_hw(sdata->local), + key->conf.hw_key_idx, + &iv32, &iv16); + + seq[0] = iv16 & 0xff; + seq[1] = (iv16 >> 8) & 0xff; + seq[2] = iv32 & 0xff; + seq[3] = (iv32 >> 8) & 0xff; + seq[4] = (iv32 >> 16) & 0xff; + seq[5] = (iv32 >> 24) & 0xff; + params.seq = seq; + params.seq_len = 6; + break; + case ALG_CCMP: + params.cipher = WLAN_CIPHER_SUITE_CCMP; + seq[0] = key->u.ccmp.tx_pn[5]; + seq[1] = key->u.ccmp.tx_pn[4]; + seq[2] = key->u.ccmp.tx_pn[3]; + seq[3] = key->u.ccmp.tx_pn[2]; + seq[4] = key->u.ccmp.tx_pn[1]; + seq[5] = key->u.ccmp.tx_pn[0]; + params.seq = seq; + params.seq_len = 6; + break; + case ALG_WEP: + if (key->conf.keylen == 5) + params.cipher = WLAN_CIPHER_SUITE_WEP40; + else + params.cipher = WLAN_CIPHER_SUITE_WEP104; + break; + } + + params.key = key->conf.key; + params.key_len = key->conf.keylen; + + callback(cookie, ¶ms); + err = 0; + + out: + if (sta) + sta_info_put(sta); + return err; +} + +static int ieee80211_config_default_key(struct wiphy *wiphy, + struct net_device *dev, + u8 key_idx) +{ + struct ieee80211_sub_if_data *sdata; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + ieee80211_set_default_key(sdata, key_idx); + + return 0; +} + +static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, + u8 *mac, struct station_stats *stats) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct sta_info *sta; + + sta = sta_info_get(local, mac); + if (!sta) + return -ENOENT; + + /* XXX: verify sta->dev == dev */ + + stats->filled = STATION_STAT_INACTIVE_TIME | + STATION_STAT_RX_BYTES | + STATION_STAT_TX_BYTES; + + stats->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); + stats->rx_bytes = sta->rx_bytes; + stats->tx_bytes = sta->tx_bytes; + + sta_info_put(sta); + + return 0; +} + +/* + * This handles both adding a beacon and setting new beacon info + */ +static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata, + struct beacon_parameters *params) +{ + struct beacon_data *new, *old; + int new_head_len, new_tail_len; + int size; + int err = -EINVAL; + + old = sdata->u.ap.beacon; + + /* head must not be zero-length */ + if (params->head && !params->head_len) + return -EINVAL; + + /* + * This is a kludge. beacon interval should really be part + * of the beacon information. + */ + if (params->interval) { + sdata->local->hw.conf.beacon_int = params->interval; + if (ieee80211_hw_config(sdata->local)) + return -EINVAL; + /* + * We updated some parameter so if below bails out + * it's not an error. + */ + err = 0; + } + + /* Need to have a beacon head if we don't have one yet */ + if (!params->head && !old) + return err; + + /* sorry, no way to start beaconing without dtim period */ + if (!params->dtim_period && !old) + return err; + + /* new or old head? */ + if (params->head) + new_head_len = params->head_len; + else + new_head_len = old->head_len; + + /* new or old tail? */ + if (params->tail || !old) + /* params->tail_len will be zero for !params->tail */ + new_tail_len = params->tail_len; + else + new_tail_len = old->tail_len; + + size = sizeof(*new) + new_head_len + new_tail_len; + + new = kzalloc(size, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* start filling the new info now */ + + /* new or old dtim period? */ + if (params->dtim_period) + new->dtim_period = params->dtim_period; + else + new->dtim_period = old->dtim_period; + + /* + * pointers go into the block we allocated, + * memory is | beacon_data | head | tail | + */ + new->head = ((u8 *) new) + sizeof(*new); + new->tail = new->head + new_head_len; + new->head_len = new_head_len; + new->tail_len = new_tail_len; + + /* copy in head */ + if (params->head) + memcpy(new->head, params->head, new_head_len); + else + memcpy(new->head, old->head, new_head_len); + + /* copy in optional tail */ + if (params->tail) + memcpy(new->tail, params->tail, new_tail_len); + else + if (old) + memcpy(new->tail, old->tail, new_tail_len); + + rcu_assign_pointer(sdata->u.ap.beacon, new); + + synchronize_rcu(); + + kfree(old); + + return ieee80211_if_config_beacon(sdata->dev); +} + +static int ieee80211_add_beacon(struct wiphy *wiphy, struct net_device *dev, + struct beacon_parameters *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct beacon_data *old; + + if (sdata->vif.type != IEEE80211_IF_TYPE_AP) + return -EINVAL; + + old = sdata->u.ap.beacon; + + if (old) + return -EALREADY; + + return ieee80211_config_beacon(sdata, params); +} + +static int ieee80211_set_beacon(struct wiphy *wiphy, struct net_device *dev, + struct beacon_parameters *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct beacon_data *old; + + if (sdata->vif.type != IEEE80211_IF_TYPE_AP) + return -EINVAL; + + old = sdata->u.ap.beacon; + + if (!old) + return -ENOENT; + + return ieee80211_config_beacon(sdata, params); +} + +static int ieee80211_del_beacon(struct wiphy *wiphy, struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct beacon_data *old; + + if (sdata->vif.type != IEEE80211_IF_TYPE_AP) + return -EINVAL; + + old = sdata->u.ap.beacon; + + if (!old) + return -ENOENT; + + rcu_assign_pointer(sdata->u.ap.beacon, NULL); + synchronize_rcu(); + kfree(old); + + return ieee80211_if_config_beacon(dev); +} + +/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ +struct iapp_layer2_update { + u8 da[ETH_ALEN]; /* broadcast */ + u8 sa[ETH_ALEN]; /* STA addr */ + __be16 len; /* 6 */ + u8 dsap; /* 0 */ + u8 ssap; /* 0 */ + u8 control; + u8 xid_info[3]; +} __attribute__ ((packed)); + +static void ieee80211_send_layer2_update(struct sta_info *sta) +{ + struct iapp_layer2_update *msg; + struct sk_buff *skb; + + /* Send Level 2 Update Frame to update forwarding tables in layer 2 + * bridge devices */ + + skb = dev_alloc_skb(sizeof(*msg)); + if (!skb) + return; + msg = (struct iapp_layer2_update *)skb_put(skb, sizeof(*msg)); + + /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) + * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ + + memset(msg->da, 0xff, ETH_ALEN); + memcpy(msg->sa, sta->addr, ETH_ALEN); + msg->len = htons(6); + msg->dsap = 0; + msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ + msg->control = 0xaf; /* XID response lsb.1111F101. + * F=0 (no poll command; unsolicited frame) */ + msg->xid_info[0] = 0x81; /* XID format identifier */ + msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ + msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ + + skb->dev = sta->dev; + skb->protocol = eth_type_trans(skb, sta->dev); + memset(skb->cb, 0, sizeof(skb->cb)); + netif_rx(skb); +} + +static void sta_apply_parameters(struct ieee80211_local *local, + struct sta_info *sta, + struct station_parameters *params) +{ + u32 rates; + int i, j; + struct ieee80211_hw_mode *mode; + + if (params->station_flags & STATION_FLAG_CHANGED) { + sta->flags &= ~WLAN_STA_AUTHORIZED; + if (params->station_flags & STATION_FLAG_AUTHORIZED) + sta->flags |= WLAN_STA_AUTHORIZED; + + sta->flags &= ~WLAN_STA_SHORT_PREAMBLE; + if (params->station_flags & STATION_FLAG_SHORT_PREAMBLE) + sta->flags |= WLAN_STA_SHORT_PREAMBLE; + + sta->flags &= ~WLAN_STA_WME; + if (params->station_flags & STATION_FLAG_WME) + sta->flags |= WLAN_STA_WME; + } + + if (params->aid) { + sta->aid = params->aid; + if (sta->aid > IEEE80211_MAX_AID) + sta->aid = 0; /* XXX: should this be an error? */ + } + + if (params->listen_interval >= 0) + sta->listen_interval = params->listen_interval; + + if (params->supported_rates) { + rates = 0; + mode = local->oper_hw_mode; + for (i = 0; i < params->supported_rates_len; i++) { + int rate = (params->supported_rates[i] & 0x7f) * 5; + for (j = 0; j < mode->num_rates; j++) { + if (mode->rates[j].rate == rate) + rates |= BIT(j); + } + } + sta->supp_rates = rates; + } +} + +static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, + u8 *mac, struct station_parameters *params) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct sta_info *sta; + struct ieee80211_sub_if_data *sdata; + + /* Prevent a race with changing the rate control algorithm */ + if (!netif_running(dev)) + return -ENETDOWN; + + /* XXX: get sta belonging to dev */ + sta = sta_info_get(local, mac); + if (sta) { + sta_info_put(sta); + return -EEXIST; + } + + if (params->vlan) { + sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); + + if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN || + sdata->vif.type != IEEE80211_IF_TYPE_AP) + return -EINVAL; + } else + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + sta = sta_info_add(local, dev, mac, GFP_KERNEL); + if (!sta) + return -ENOMEM; + + sta->dev = sdata->dev; + if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN || + sdata->vif.type == IEEE80211_IF_TYPE_AP) + ieee80211_send_layer2_update(sta); + + sta->flags = WLAN_STA_AUTH | WLAN_STA_ASSOC; + + sta_apply_parameters(local, sta, params); + + rate_control_rate_init(sta, local); + + sta_info_put(sta); + + return 0; +} + +static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, + u8 *mac) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct sta_info *sta; + + if (mac) { + /* XXX: get sta belonging to dev */ + sta = sta_info_get(local, mac); + if (!sta) + return -ENOENT; + + sta_info_free(sta); + sta_info_put(sta); + } else + sta_info_flush(local, dev); + + return 0; +} + +static int ieee80211_change_station(struct wiphy *wiphy, + struct net_device *dev, + u8 *mac, + struct station_parameters *params) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct sta_info *sta; + struct ieee80211_sub_if_data *vlansdata; + + /* XXX: get sta belonging to dev */ + sta = sta_info_get(local, mac); + if (!sta) + return -ENOENT; + + if (params->vlan && params->vlan != sta->dev) { + vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); + + if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN || + vlansdata->vif.type != IEEE80211_IF_TYPE_AP) + return -EINVAL; + + sta->dev = params->vlan; + ieee80211_send_layer2_update(sta); + } + + sta_apply_parameters(local, sta, params); + + sta_info_put(sta); + + return 0; +} + struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, .change_virtual_intf = ieee80211_change_iface, + .add_key = ieee80211_add_key, + .del_key = ieee80211_del_key, + .get_key = ieee80211_get_key, + .set_default_key = ieee80211_config_default_key, + .add_beacon = ieee80211_add_beacon, + .set_beacon = ieee80211_set_beacon, + .del_beacon = ieee80211_del_beacon, + .add_station = ieee80211_add_station, + .del_station = ieee80211_del_station, + .change_station = ieee80211_change_station, + .get_station = ieee80211_get_station, }; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index f0e6ab7eb62..829872a3ae8 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -91,8 +91,7 @@ static const struct file_operations name##_ops = { \ /* common attributes */ IEEE80211_IF_FILE(channel_use, channel_use, DEC); IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); -IEEE80211_IF_FILE(eapol, eapol, DEC); -IEEE80211_IF_FILE(ieee8021_x, ieee802_1x, DEC); +IEEE80211_IF_FILE(ieee802_1x_pac, ieee802_1x_pac, DEC); /* STA/IBSS attributes */ IEEE80211_IF_FILE(state, u.sta.state, DEC); @@ -119,13 +118,12 @@ static ssize_t ieee80211_if_fmt_flags( sdata->u.sta.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "", sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "", sdata->u.sta.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "", - sdata->flags & IEEE80211_SDATA_USE_PROTECTION ? "CTS prot\n" : ""); + sdata->bss_conf.use_cts_prot ? "CTS prot\n" : ""); } __IEEE80211_IF_FILE(flags); /* AP attributes */ IEEE80211_IF_FILE(num_sta_ps, u.ap.num_sta_ps, ATOMIC); -IEEE80211_IF_FILE(dtim_period, u.ap.dtim_period, DEC); IEEE80211_IF_FILE(dtim_count, u.ap.dtim_count, DEC); IEEE80211_IF_FILE(num_beacons, u.ap.num_beacons, DEC); IEEE80211_IF_FILE(force_unicast_rateidx, u.ap.force_unicast_rateidx, DEC); @@ -139,26 +137,6 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast( } __IEEE80211_IF_FILE(num_buffered_multicast); -static ssize_t ieee80211_if_fmt_beacon_head_len( - const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) -{ - if (sdata->u.ap.beacon_head) - return scnprintf(buf, buflen, "%d\n", - sdata->u.ap.beacon_head_len); - return scnprintf(buf, buflen, "\n"); -} -__IEEE80211_IF_FILE(beacon_head_len); - -static ssize_t ieee80211_if_fmt_beacon_tail_len( - const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) -{ - if (sdata->u.ap.beacon_tail) - return scnprintf(buf, buflen, "%d\n", - sdata->u.ap.beacon_tail_len); - return scnprintf(buf, buflen, "\n"); -} -__IEEE80211_IF_FILE(beacon_tail_len); - /* WDS attributes */ IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC); @@ -170,8 +148,7 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(channel_use, sta); DEBUGFS_ADD(drop_unencrypted, sta); - DEBUGFS_ADD(eapol, sta); - DEBUGFS_ADD(ieee8021_x, sta); + DEBUGFS_ADD(ieee802_1x_pac, sta); DEBUGFS_ADD(state, sta); DEBUGFS_ADD(bssid, sta); DEBUGFS_ADD(prev_bssid, sta); @@ -192,25 +169,20 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(channel_use, ap); DEBUGFS_ADD(drop_unencrypted, ap); - DEBUGFS_ADD(eapol, ap); - DEBUGFS_ADD(ieee8021_x, ap); + DEBUGFS_ADD(ieee802_1x_pac, ap); DEBUGFS_ADD(num_sta_ps, ap); - DEBUGFS_ADD(dtim_period, ap); DEBUGFS_ADD(dtim_count, ap); DEBUGFS_ADD(num_beacons, ap); DEBUGFS_ADD(force_unicast_rateidx, ap); DEBUGFS_ADD(max_ratectrl_rateidx, ap); DEBUGFS_ADD(num_buffered_multicast, ap); - DEBUGFS_ADD(beacon_head_len, ap); - DEBUGFS_ADD(beacon_tail_len, ap); } static void add_wds_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(channel_use, wds); DEBUGFS_ADD(drop_unencrypted, wds); - DEBUGFS_ADD(eapol, wds); - DEBUGFS_ADD(ieee8021_x, wds); + DEBUGFS_ADD(ieee802_1x_pac, wds); DEBUGFS_ADD(peer, wds); } @@ -218,8 +190,7 @@ static void add_vlan_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_ADD(channel_use, vlan); DEBUGFS_ADD(drop_unencrypted, vlan); - DEBUGFS_ADD(eapol, vlan); - DEBUGFS_ADD(ieee8021_x, vlan); + DEBUGFS_ADD(ieee802_1x_pac, vlan); } static void add_monitor_files(struct ieee80211_sub_if_data *sdata) @@ -231,7 +202,7 @@ static void add_files(struct ieee80211_sub_if_data *sdata) if (!sdata->debugfsdir) return; - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_STA: case IEEE80211_IF_TYPE_IBSS: add_sta_files(sdata); @@ -263,8 +234,7 @@ static void del_sta_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_DEL(channel_use, sta); DEBUGFS_DEL(drop_unencrypted, sta); - DEBUGFS_DEL(eapol, sta); - DEBUGFS_DEL(ieee8021_x, sta); + DEBUGFS_DEL(ieee802_1x_pac, sta); DEBUGFS_DEL(state, sta); DEBUGFS_DEL(bssid, sta); DEBUGFS_DEL(prev_bssid, sta); @@ -285,25 +255,20 @@ static void del_ap_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_DEL(channel_use, ap); DEBUGFS_DEL(drop_unencrypted, ap); - DEBUGFS_DEL(eapol, ap); - DEBUGFS_DEL(ieee8021_x, ap); + DEBUGFS_DEL(ieee802_1x_pac, ap); DEBUGFS_DEL(num_sta_ps, ap); - DEBUGFS_DEL(dtim_period, ap); DEBUGFS_DEL(dtim_count, ap); DEBUGFS_DEL(num_beacons, ap); DEBUGFS_DEL(force_unicast_rateidx, ap); DEBUGFS_DEL(max_ratectrl_rateidx, ap); DEBUGFS_DEL(num_buffered_multicast, ap); - DEBUGFS_DEL(beacon_head_len, ap); - DEBUGFS_DEL(beacon_tail_len, ap); } static void del_wds_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_DEL(channel_use, wds); DEBUGFS_DEL(drop_unencrypted, wds); - DEBUGFS_DEL(eapol, wds); - DEBUGFS_DEL(ieee8021_x, wds); + DEBUGFS_DEL(ieee802_1x_pac, wds); DEBUGFS_DEL(peer, wds); } @@ -311,8 +276,7 @@ static void del_vlan_files(struct ieee80211_sub_if_data *sdata) { DEBUGFS_DEL(channel_use, vlan); DEBUGFS_DEL(drop_unencrypted, vlan); - DEBUGFS_DEL(eapol, vlan); - DEBUGFS_DEL(ieee8021_x, vlan); + DEBUGFS_DEL(ieee802_1x_pac, vlan); } static void del_monitor_files(struct ieee80211_sub_if_data *sdata) @@ -362,7 +326,7 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata) void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata) { - del_files(sdata, sdata->type); + del_files(sdata, sdata->vif.type); debugfs_remove(sdata->debugfsdir); sdata->debugfsdir = NULL; } diff --git a/net/mac80211/ieee80211.c b/net/mac80211/ieee80211.c index 6378850d858..5dcc2d61551 100644 --- a/net/mac80211/ieee80211.c +++ b/net/mac80211/ieee80211.c @@ -34,6 +34,8 @@ #include "debugfs.h" #include "debugfs_netdev.h" +#define SUPP_MCS_SET_LEN 16 + /* * For seeing transmitted packets on monitor interfaces * we have a radiotap header too. @@ -175,21 +177,21 @@ static int ieee80211_open(struct net_device *dev) /* * check whether it may have the same address */ - if (!identical_mac_addr_allowed(sdata->type, - nsdata->type)) + if (!identical_mac_addr_allowed(sdata->vif.type, + nsdata->vif.type)) return -ENOTUNIQ; /* * can only add VLANs to enabled APs */ - if (sdata->type == IEEE80211_IF_TYPE_VLAN && - nsdata->type == IEEE80211_IF_TYPE_AP && + if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN && + nsdata->vif.type == IEEE80211_IF_TYPE_AP && netif_running(nsdata->dev)) sdata->u.vlan.ap = nsdata; } } - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_WDS: if (is_zero_ether_addr(sdata->u.wds.remote_addr)) return -ENOLINK; @@ -217,9 +219,10 @@ static int ieee80211_open(struct net_device *dev) if (res) return res; ieee80211_hw_config(local); + ieee80211_led_radio(local, local->hw.conf.radio_enabled); } - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_VLAN: list_add(&sdata->u.vlan.list, &sdata->u.vlan.ap->u.ap.vlans); /* no need to tell driver */ @@ -240,8 +243,8 @@ static int ieee80211_open(struct net_device *dev) sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET; /* fall through */ default: - conf.if_id = dev->ifindex; - conf.type = sdata->type; + conf.vif = &sdata->vif; + conf.type = sdata->vif.type; conf.mac_addr = dev->dev_addr; res = local->ops->add_interface(local_to_hw(local), &conf); if (res && !local->open_count && local->ops->stop) @@ -253,7 +256,7 @@ static int ieee80211_open(struct net_device *dev) ieee80211_reset_erp_info(dev); ieee80211_enable_keys(sdata); - if (sdata->type == IEEE80211_IF_TYPE_STA && + if (sdata->vif.type == IEEE80211_IF_TYPE_STA && !(sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)) netif_carrier_off(dev); else @@ -290,9 +293,20 @@ static int ieee80211_stop(struct net_device *dev) struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_if_init_conf conf; + struct sta_info *sta; + int i; sdata = IEEE80211_DEV_TO_SUB_IF(dev); + list_for_each_entry(sta, &local->sta_list, list) { + if (sta->dev == dev) + for (i = 0; i < STA_TID_NUM; i++) + ieee80211_sta_stop_rx_ba_session(sta->dev, + sta->addr, i, + WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_LEAVE_QBSS); + } + netif_stop_queue(dev); /* @@ -309,10 +323,17 @@ static int ieee80211_stop(struct net_device *dev) dev_mc_unsync(local->mdev, dev); - /* down all dependent devices, that is VLANs */ - if (sdata->type == IEEE80211_IF_TYPE_AP) { + /* APs need special treatment */ + if (sdata->vif.type == IEEE80211_IF_TYPE_AP) { struct ieee80211_sub_if_data *vlan, *tmp; + struct beacon_data *old_beacon = sdata->u.ap.beacon; + /* remove beacon */ + rcu_assign_pointer(sdata->u.ap.beacon, NULL); + synchronize_rcu(); + kfree(old_beacon); + + /* down all dependent devices, that is VLANs */ list_for_each_entry_safe(vlan, tmp, &sdata->u.ap.vlans, u.vlan.list) dev_close(vlan->dev); @@ -321,7 +342,7 @@ static int ieee80211_stop(struct net_device *dev) local->open_count--; - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_VLAN: list_del(&sdata->u.vlan.list); sdata->u.vlan.ap = NULL; @@ -350,11 +371,14 @@ static int ieee80211_stop(struct net_device *dev) synchronize_rcu(); skb_queue_purge(&sdata->u.sta.skb_queue); - if (!local->ops->hw_scan && - local->scan_dev == sdata->dev) { - local->sta_scanning = 0; - cancel_delayed_work(&local->scan_work); + if (local->scan_dev == sdata->dev) { + if (!local->ops->hw_scan) { + local->sta_sw_scanning = 0; + cancel_delayed_work(&local->scan_work); + } else + local->sta_hw_scanning = 0; } + flush_workqueue(local->hw.workqueue); sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; @@ -363,8 +387,8 @@ static int ieee80211_stop(struct net_device *dev) sdata->u.sta.extra_ie_len = 0; /* fall through */ default: - conf.if_id = dev->ifindex; - conf.type = sdata->type; + conf.vif = &sdata->vif; + conf.type = sdata->vif.type; conf.mac_addr = dev->dev_addr; /* disable all keys for as long as this netdev is down */ ieee80211_disable_keys(sdata); @@ -378,6 +402,8 @@ static int ieee80211_stop(struct net_device *dev) if (local->ops->stop) local->ops->stop(local_to_hw(local)); + ieee80211_led_radio(local, 0); + tasklet_disable(&local->tx_pending_tasklet); tasklet_disable(&local->tasklet); } @@ -485,20 +511,20 @@ static int __ieee80211_if_config(struct net_device *dev, return 0; memset(&conf, 0, sizeof(conf)); - conf.type = sdata->type; - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) { + conf.type = sdata->vif.type; + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { conf.bssid = sdata->u.sta.bssid; conf.ssid = sdata->u.sta.ssid; conf.ssid_len = sdata->u.sta.ssid_len; - } else if (sdata->type == IEEE80211_IF_TYPE_AP) { + } else if (sdata->vif.type == IEEE80211_IF_TYPE_AP) { conf.ssid = sdata->u.ap.ssid; conf.ssid_len = sdata->u.ap.ssid_len; conf.beacon = beacon; conf.beacon_control = control; } return local->ops->config_interface(local_to_hw(local), - dev->ifindex, &conf); + &sdata->vif, &conf); } int ieee80211_if_config(struct net_device *dev) @@ -510,11 +536,13 @@ int ieee80211_if_config_beacon(struct net_device *dev) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_tx_control control; + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sk_buff *skb; if (!(local->hw.flags & IEEE80211_HW_HOST_GEN_BEACON_TEMPLATE)) return 0; - skb = ieee80211_beacon_get(local_to_hw(local), dev->ifindex, &control); + skb = ieee80211_beacon_get(local_to_hw(local), &sdata->vif, + &control); if (!skb) return -ENOMEM; return __ieee80211_if_config(dev, skb, &control); @@ -526,7 +554,7 @@ int ieee80211_hw_config(struct ieee80211_local *local) struct ieee80211_channel *chan; int ret = 0; - if (local->sta_scanning) { + if (local->sta_sw_scanning) { chan = local->scan_channel; mode = local->scan_hw_mode; } else { @@ -560,25 +588,79 @@ int ieee80211_hw_config(struct ieee80211_local *local) return ret; } -void ieee80211_erp_info_change_notify(struct net_device *dev, u8 changes) +/** + * ieee80211_hw_config_ht should be used only after legacy configuration + * has been determined, as ht configuration depends upon the hardware's + * HT abilities for a _specific_ band. + */ +int ieee80211_hw_config_ht(struct ieee80211_local *local, int enable_ht, + struct ieee80211_ht_info *req_ht_cap, + struct ieee80211_ht_bss_info *req_bss_cap) { - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (local->ops->erp_ie_changed) - local->ops->erp_ie_changed(local_to_hw(local), changes, - !!(sdata->flags & IEEE80211_SDATA_USE_PROTECTION), - !(sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE)); + struct ieee80211_conf *conf = &local->hw.conf; + struct ieee80211_hw_mode *mode = conf->mode; + int i; + + /* HT is not supported */ + if (!mode->ht_info.ht_supported) { + conf->flags &= ~IEEE80211_CONF_SUPPORT_HT_MODE; + return -EOPNOTSUPP; + } + + /* disable HT */ + if (!enable_ht) { + conf->flags &= ~IEEE80211_CONF_SUPPORT_HT_MODE; + } else { + conf->flags |= IEEE80211_CONF_SUPPORT_HT_MODE; + conf->ht_conf.cap = req_ht_cap->cap & mode->ht_info.cap; + conf->ht_conf.cap &= ~(IEEE80211_HT_CAP_MIMO_PS); + conf->ht_conf.cap |= + mode->ht_info.cap & IEEE80211_HT_CAP_MIMO_PS; + conf->ht_bss_conf.primary_channel = + req_bss_cap->primary_channel; + conf->ht_bss_conf.bss_cap = req_bss_cap->bss_cap; + conf->ht_bss_conf.bss_op_mode = req_bss_cap->bss_op_mode; + for (i = 0; i < SUPP_MCS_SET_LEN; i++) + conf->ht_conf.supp_mcs_set[i] = + mode->ht_info.supp_mcs_set[i] & + req_ht_cap->supp_mcs_set[i]; + + /* In STA mode, this gives us indication + * to the AP's mode of operation */ + conf->ht_conf.ht_supported = 1; + conf->ht_conf.ampdu_factor = req_ht_cap->ampdu_factor; + conf->ht_conf.ampdu_density = req_ht_cap->ampdu_density; + } + + local->ops->conf_ht(local_to_hw(local), &local->hw.conf); + + return 0; +} + +void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, + u32 changed) +{ + struct ieee80211_local *local = sdata->local; + + if (!changed) + return; + + if (local->ops->bss_info_changed) + local->ops->bss_info_changed(local_to_hw(local), + &sdata->vif, + &sdata->bss_conf, + changed); } void ieee80211_reset_erp_info(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - sdata->flags &= ~(IEEE80211_SDATA_USE_PROTECTION | - IEEE80211_SDATA_SHORT_PREAMBLE); - ieee80211_erp_info_change_notify(dev, - IEEE80211_ERP_CHANGE_PROTECTION | - IEEE80211_ERP_CHANGE_PREAMBLE); + sdata->bss_conf.use_cts_prot = 0; + sdata->bss_conf.use_short_preamble = 0; + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_ERP_CTS_PROT | + BSS_CHANGED_ERP_PREAMBLE); } void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, @@ -635,7 +717,7 @@ static void ieee80211_tasklet_handler(unsigned long data) case IEEE80211_RX_MSG: /* status is in skb->cb */ memcpy(&rx_status, skb->cb, sizeof(rx_status)); - /* Clear skb->type in order to not confuse kernel + /* Clear skb->pkt_type in order to not confuse kernel * netstack. */ skb->pkt_type = 0; __ieee80211_rx(local_to_hw(local), skb, &rx_status); @@ -670,7 +752,7 @@ static void ieee80211_remove_tx_extra(struct ieee80211_local *local, struct ieee80211_tx_packet_data *pkt_data; pkt_data = (struct ieee80211_tx_packet_data *)skb->cb; - pkt_data->ifindex = control->ifindex; + pkt_data->ifindex = vif_to_sdata(control->vif)->dev->ifindex; pkt_data->flags = 0; if (control->flags & IEEE80211_TXCTL_REQ_TX_STATUS) pkt_data->flags |= IEEE80211_TXPD_REQ_TX_STATUS; @@ -678,6 +760,8 @@ static void ieee80211_remove_tx_extra(struct ieee80211_local *local, pkt_data->flags |= IEEE80211_TXPD_DO_NOT_ENCRYPT; if (control->flags & IEEE80211_TXCTL_REQUEUE) pkt_data->flags |= IEEE80211_TXPD_REQUEUE; + if (control->flags & IEEE80211_TXCTL_EAPOL_FRAME) + pkt_data->flags |= IEEE80211_TXPD_EAPOL_FRAME; pkt_data->queue = control->queue; hdrlen = ieee80211_get_hdrlen_from_skb(skb); @@ -805,10 +889,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb, sta_info_put(sta); return; } - } else { - /* FIXME: STUPID to call this with both local and local->mdev */ - rate_control_tx_status(local, local->mdev, skb, status); - } + } else + rate_control_tx_status(local->mdev, skb, status); ieee80211_led_tx(local, 0); @@ -894,7 +976,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb, if (!monitors || !skb) goto out; - if (sdata->type == IEEE80211_IF_TYPE_MNTR) { + if (sdata->vif.type == IEEE80211_IF_TYPE_MNTR) { if (!netif_running(sdata->dev)) continue; monitors--; @@ -1016,7 +1098,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, mdev->header_ops = &ieee80211_header_ops; mdev->set_multicast_list = ieee80211_master_set_multicast_list; - sdata->type = IEEE80211_IF_TYPE_AP; + sdata->vif.type = IEEE80211_IF_TYPE_AP; sdata->dev = mdev; sdata->local = local; sdata->u.ap.force_unicast_rateidx = -1; @@ -1260,33 +1342,38 @@ static int __init ieee80211_init(void) BUILD_BUG_ON(sizeof(struct ieee80211_tx_packet_data) > sizeof(skb->cb)); -#ifdef CONFIG_MAC80211_RCSIMPLE - ret = ieee80211_rate_control_register(&mac80211_rcsimple); + ret = rc80211_simple_init(); if (ret) - return ret; -#endif + goto fail; + + ret = rc80211_pid_init(); + if (ret) + goto fail_simple; ret = ieee80211_wme_register(); if (ret) { -#ifdef CONFIG_MAC80211_RCSIMPLE - ieee80211_rate_control_unregister(&mac80211_rcsimple); -#endif printk(KERN_DEBUG "ieee80211_init: failed to " "initialize WME (err=%d)\n", ret); - return ret; + goto fail_pid; } ieee80211_debugfs_netdev_init(); ieee80211_regdomain_init(); return 0; + + fail_pid: + rc80211_simple_exit(); + fail_simple: + rc80211_pid_exit(); + fail: + return ret; } static void __exit ieee80211_exit(void) { -#ifdef CONFIG_MAC80211_RCSIMPLE - ieee80211_rate_control_unregister(&mac80211_rcsimple); -#endif + rc80211_simple_exit(); + rc80211_pid_exit(); ieee80211_wme_unregister(); ieee80211_debugfs_netdev_exit(); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 72e1c93dd87..72ecbf7bf96 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -89,6 +89,8 @@ struct ieee80211_sta_bss { size_t rsn_ie_len; u8 *wmm_ie; size_t wmm_ie_len; + u8 *ht_ie; + size_t ht_ie_len; #define IEEE80211_MAX_SUPP_RATES 32 u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; size_t supp_rates_len; @@ -121,6 +123,7 @@ typedef enum { /* frame is destined to interface currently processed (incl. multicast frames) */ #define IEEE80211_TXRXD_RXRA_MATCH BIT(5) #define IEEE80211_TXRXD_TX_INJECTED BIT(6) +#define IEEE80211_TXRXD_RX_AMSDU BIT(7) struct ieee80211_txrx_data { struct sk_buff *skb; struct net_device *dev; @@ -161,6 +164,7 @@ struct ieee80211_txrx_data { #define IEEE80211_TXPD_REQ_TX_STATUS BIT(0) #define IEEE80211_TXPD_DO_NOT_ENCRYPT BIT(1) #define IEEE80211_TXPD_REQUEUE BIT(2) +#define IEEE80211_TXPD_EAPOL_FRAME BIT(3) /* Stored in sk_buff->cb */ struct ieee80211_tx_packet_data { int ifindex; @@ -186,9 +190,14 @@ typedef ieee80211_txrx_result (*ieee80211_tx_handler) typedef ieee80211_txrx_result (*ieee80211_rx_handler) (struct ieee80211_txrx_data *rx); +struct beacon_data { + u8 *head, *tail; + int head_len, tail_len; + int dtim_period; +}; + struct ieee80211_if_ap { - u8 *beacon_head, *beacon_tail; - int beacon_head_len, beacon_tail_len; + struct beacon_data *beacon; struct list_head vlans; @@ -201,7 +210,7 @@ struct ieee80211_if_ap { u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)]; atomic_t num_sta_ps; /* number of stations in PS mode */ struct sk_buff_head ps_bc_buf; - int dtim_period, dtim_count; + int dtim_count; int force_unicast_rateidx; /* forced TX rateidx for unicast frames */ int max_ratectrl_rateidx; /* max TX rateidx for rate control */ int num_beacons; /* number of TXed beacon frames for this BSS */ @@ -282,15 +291,9 @@ struct ieee80211_if_sta { /* flags used in struct ieee80211_sub_if_data.flags */ #define IEEE80211_SDATA_ALLMULTI BIT(0) #define IEEE80211_SDATA_PROMISC BIT(1) -#define IEEE80211_SDATA_USE_PROTECTION BIT(2) /* CTS protect ERP frames */ -/* use short preamble with IEEE 802.11b: this flag is set when the AP or beacon - * generator reports that there are no present stations that cannot support short - * preambles */ -#define IEEE80211_SDATA_SHORT_PREAMBLE BIT(3) -#define IEEE80211_SDATA_USERSPACE_MLME BIT(4) +#define IEEE80211_SDATA_USERSPACE_MLME BIT(2) struct ieee80211_sub_if_data { struct list_head list; - enum ieee80211_if_types type; struct wireless_dev wdev; @@ -303,11 +306,11 @@ struct ieee80211_sub_if_data { unsigned int flags; int drop_unencrypted; - int eapol; /* 0 = process EAPOL frames as normal data frames, - * 1 = send EAPOL frames through wlan#ap to hostapd - * (default) */ - int ieee802_1x; /* IEEE 802.1X PAE - drop packet to/from unauthorized - * port */ + /* + * IEEE 802.1X Port access control in effect, + * drop packets to/from unauthorized port + */ + int ieee802_1x_pac; u16 sequence; @@ -319,6 +322,15 @@ struct ieee80211_sub_if_data { struct ieee80211_key *keys[NUM_DEFAULT_KEYS]; struct ieee80211_key *default_key; + /* + * BSS configuration for this interface. + * + * FIXME: I feel bad putting this here when we already have a + * bss pointer, but the bss pointer is just wrong when + * you have multiple virtual STA mode interfaces... + * This needs to be fixed. + */ + struct ieee80211_bss_conf bss_conf; struct ieee80211_if_ap *bss; /* BSS that this device belongs to */ union { @@ -336,8 +348,7 @@ struct ieee80211_sub_if_data { struct { struct dentry *channel_use; struct dentry *drop_unencrypted; - struct dentry *eapol; - struct dentry *ieee8021_x; + struct dentry *ieee802_1x_pac; struct dentry *state; struct dentry *bssid; struct dentry *prev_bssid; @@ -356,30 +367,24 @@ struct ieee80211_sub_if_data { struct { struct dentry *channel_use; struct dentry *drop_unencrypted; - struct dentry *eapol; - struct dentry *ieee8021_x; + struct dentry *ieee802_1x_pac; struct dentry *num_sta_ps; - struct dentry *dtim_period; struct dentry *dtim_count; struct dentry *num_beacons; struct dentry *force_unicast_rateidx; struct dentry *max_ratectrl_rateidx; struct dentry *num_buffered_multicast; - struct dentry *beacon_head_len; - struct dentry *beacon_tail_len; } ap; struct { struct dentry *channel_use; struct dentry *drop_unencrypted; - struct dentry *eapol; - struct dentry *ieee8021_x; + struct dentry *ieee802_1x_pac; struct dentry *peer; } wds; struct { struct dentry *channel_use; struct dentry *drop_unencrypted; - struct dentry *eapol; - struct dentry *ieee8021_x; + struct dentry *ieee802_1x_pac; } vlan; struct { struct dentry *mode; @@ -387,8 +392,16 @@ struct ieee80211_sub_if_data { struct dentry *default_key; } debugfs; #endif + /* must be last, dynamically sized area in this! */ + struct ieee80211_vif vif; }; +static inline +struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) +{ + return container_of(p, struct ieee80211_sub_if_data, vif); +} + #define IEEE80211_DEV_TO_SUB_IF(dev) netdev_priv(dev) enum { @@ -470,7 +483,8 @@ struct ieee80211_local { struct list_head interfaces; - int sta_scanning; + bool sta_sw_scanning; + bool sta_hw_scanning; int scan_channel_idx; enum { SCAN_SET_CHANNEL, SCAN_SEND_PROBE } scan_state; unsigned long last_scan_completed; @@ -483,10 +497,6 @@ struct ieee80211_local { struct list_head sta_bss_list; struct ieee80211_sta_bss *sta_bss_hash[STA_HASH_SIZE]; spinlock_t sta_bss_lock; -#define IEEE80211_SCAN_MATCH_SSID BIT(0) -#define IEEE80211_SCAN_WPA_ONLY BIT(1) -#define IEEE80211_SCAN_EXTRA_INFO BIT(2) - int scan_flags; /* SNMP counters */ /* dot11CountersTable */ @@ -503,8 +513,9 @@ struct ieee80211_local { #ifdef CONFIG_MAC80211_LEDS int tx_led_counter, rx_led_counter; - struct led_trigger *tx_led, *rx_led, *assoc_led; - char tx_led_name[32], rx_led_name[32], assoc_led_name[32]; + struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led; + char tx_led_name[32], rx_led_name[32], + assoc_led_name[32], radio_led_name[32]; #endif u32 channel_use; @@ -708,6 +719,9 @@ int ieee80211_if_update_wds(struct net_device *dev, u8 *remote_addr); void ieee80211_if_setup(struct net_device *dev); struct ieee80211_rate *ieee80211_get_rate(struct ieee80211_local *local, int phymode, int hwrate); +int ieee80211_hw_config_ht(struct ieee80211_local *local, int enable_ht, + struct ieee80211_ht_info *req_ht_cap, + struct ieee80211_ht_bss_info *req_bss_cap); /* ieee80211_ioctl.c */ extern const struct iw_handler_def ieee80211_iw_handler_def; @@ -749,7 +763,8 @@ int ieee80211_sta_req_scan(struct net_device *dev, u8 *ssid, size_t ssid_len); void ieee80211_sta_req_auth(struct net_device *dev, struct ieee80211_if_sta *ifsta); int ieee80211_sta_scan_results(struct net_device *dev, char *buf, size_t len); -void ieee80211_sta_rx_scan(struct net_device *dev, struct sk_buff *skb, +ieee80211_txrx_result ieee80211_sta_rx_scan(struct net_device *dev, + struct sk_buff *skb, struct ieee80211_rx_status *rx_status); void ieee80211_rx_bss_list_init(struct net_device *dev); void ieee80211_rx_bss_list_deinit(struct net_device *dev); @@ -759,9 +774,17 @@ struct sta_info * ieee80211_ibss_add_sta(struct net_device *dev, u8 *addr); int ieee80211_sta_deauthenticate(struct net_device *dev, u16 reason); int ieee80211_sta_disassociate(struct net_device *dev, u16 reason); -void ieee80211_erp_info_change_notify(struct net_device *dev, u8 changes); +void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, + u32 changed); void ieee80211_reset_erp_info(struct net_device *dev); - +int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie, + struct ieee80211_ht_info *ht_info); +int ieee80211_ht_addt_info_ie_to_ht_bss_info( + struct ieee80211_ht_addt_info *ht_add_info_ie, + struct ieee80211_ht_bss_info *bss_info); +void ieee80211_sta_stop_rx_ba_session(struct net_device *dev, u8 *da, + u16 tid, u16 initiator, u16 reason); +void sta_rx_agg_session_timer_expired(unsigned long data); /* ieee80211_iface.c */ int ieee80211_if_add(struct net_device *dev, const char *name, struct net_device **new_dev, int type); @@ -793,8 +816,8 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); extern void *mac80211_wiphy_privid; /* for wiphy privid */ extern const unsigned char rfc1042_header[6]; extern const unsigned char bridge_tunnel_header[6]; -u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len); -int ieee80211_is_eapol(const struct sk_buff *skb); +u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, + enum ieee80211_if_types type); int ieee80211_frame_duration(struct ieee80211_local *local, size_t len, int rate, int erp, int short_preamble); void mac80211_ev_michael_mic_failure(struct net_device *dev, int keyidx, diff --git a/net/mac80211/ieee80211_iface.c b/net/mac80211/ieee80211_iface.c index 43e505d2945..92f1eb2da31 100644 --- a/net/mac80211/ieee80211_iface.c +++ b/net/mac80211/ieee80211_iface.c @@ -22,7 +22,6 @@ void ieee80211_if_sdata_init(struct ieee80211_sub_if_data *sdata) /* Default values for sub-interface parameters */ sdata->drop_unencrypted = 0; - sdata->eapol = 1; for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) skb_queue_head_init(&sdata->fragments[i].skb_list); @@ -48,7 +47,7 @@ int ieee80211_if_add(struct net_device *dev, const char *name, int ret; ASSERT_RTNL(); - ndev = alloc_netdev(sizeof(struct ieee80211_sub_if_data), + ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name, ieee80211_if_setup); if (!ndev) return -ENOMEM; @@ -67,7 +66,7 @@ int ieee80211_if_add(struct net_device *dev, const char *name, sdata = IEEE80211_DEV_TO_SUB_IF(ndev); ndev->ieee80211_ptr = &sdata->wdev; sdata->wdev.wiphy = local->hw.wiphy; - sdata->type = IEEE80211_IF_TYPE_AP; + sdata->vif.type = IEEE80211_IF_TYPE_AP; sdata->dev = ndev; sdata->local = local; ieee80211_if_sdata_init(sdata); @@ -99,7 +98,7 @@ fail: void ieee80211_if_set_type(struct net_device *dev, int type) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - int oldtype = sdata->type; + int oldtype = sdata->vif.type; /* * We need to call this function on the master interface @@ -117,7 +116,7 @@ void ieee80211_if_set_type(struct net_device *dev, int type) /* most have no BSS pointer */ sdata->bss = NULL; - sdata->type = type; + sdata->vif.type = type; switch (type) { case IEEE80211_IF_TYPE_WDS: @@ -127,7 +126,6 @@ void ieee80211_if_set_type(struct net_device *dev, int type) sdata->u.vlan.ap = NULL; break; case IEEE80211_IF_TYPE_AP: - sdata->u.ap.dtim_period = 2; sdata->u.ap.force_unicast_rateidx = -1; sdata->u.ap.max_ratectrl_rateidx = -1; skb_queue_head_init(&sdata->u.ap.ps_bc_buf); @@ -182,7 +180,7 @@ void ieee80211_if_reinit(struct net_device *dev) ieee80211_if_sdata_deinit(sdata); - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_INVALID: /* cannot happen */ WARN_ON(1); @@ -208,8 +206,7 @@ void ieee80211_if_reinit(struct net_device *dev) } } - kfree(sdata->u.ap.beacon_head); - kfree(sdata->u.ap.beacon_tail); + kfree(sdata->u.ap.beacon); while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) { local->total_ps_buffered--; @@ -280,7 +277,7 @@ int ieee80211_if_remove(struct net_device *dev, const char *name, int id) ASSERT_RTNL(); list_for_each_entry_safe(sdata, n, &local->interfaces, list) { - if ((sdata->type == id || id == -1) && + if ((sdata->vif.type == id || id == -1) && strcmp(name, sdata->dev->name) == 0 && sdata->dev != local->mdev) { list_del_rcu(&sdata->list); diff --git a/net/mac80211/ieee80211_ioctl.c b/net/mac80211/ieee80211_ioctl.c index 308bbe4a133..5024d373383 100644 --- a/net/mac80211/ieee80211_ioctl.c +++ b/net/mac80211/ieee80211_ioctl.c @@ -21,6 +21,7 @@ #include <net/mac80211.h> #include "ieee80211_i.h" +#include "ieee80211_led.h" #include "ieee80211_rate.h" #include "wpa.h" #include "aes_ccm.h" @@ -111,8 +112,8 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev, if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) return -EOPNOTSUPP; - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { int ret = ieee80211_sta_set_extra_ie(dev, extra, data->length); if (ret) return ret; @@ -218,6 +219,8 @@ static int ieee80211_ioctl_giwrange(struct net_device *dev, IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); + range->scan_capa |= IW_SCAN_CAPA_ESSID; + return 0; } @@ -229,7 +232,7 @@ static int ieee80211_ioctl_siwmode(struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int type; - if (sdata->type == IEEE80211_IF_TYPE_VLAN) + if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN) return -EOPNOTSUPP; switch (*mode) { @@ -246,7 +249,7 @@ static int ieee80211_ioctl_siwmode(struct net_device *dev, return -EINVAL; } - if (type == sdata->type) + if (type == sdata->vif.type) return 0; if (netif_running(dev)) return -EBUSY; @@ -265,7 +268,7 @@ static int ieee80211_ioctl_giwmode(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_AP: *mode = IW_MODE_MASTER; break; @@ -315,7 +318,7 @@ int ieee80211_set_channel(struct ieee80211_local *local, int channel, int freq) } if (set) { - if (local->sta_scanning) + if (local->sta_sw_scanning) ret = 0; else ret = ieee80211_hw_config(local); @@ -333,13 +336,13 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev, struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_STA) + if (sdata->vif.type == IEEE80211_IF_TYPE_STA) sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL; /* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */ if (freq->e == 0) { if (freq->m < 0) { - if (sdata->type == IEEE80211_IF_TYPE_STA) + if (sdata->vif.type == IEEE80211_IF_TYPE_STA) sdata->u.sta.flags |= IEEE80211_STA_AUTO_CHANNEL_SEL; return 0; @@ -385,8 +388,8 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev, len--; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { int ret; if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { if (len > IEEE80211_MAX_SSID_LEN) @@ -406,7 +409,7 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev, return 0; } - if (sdata->type == IEEE80211_IF_TYPE_AP) { + if (sdata->vif.type == IEEE80211_IF_TYPE_AP) { memcpy(sdata->u.ap.ssid, ssid, len); memset(sdata->u.ap.ssid + len, 0, IEEE80211_MAX_SSID_LEN - len); @@ -425,8 +428,8 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { int res = ieee80211_sta_get_ssid(dev, ssid, &len); if (res == 0) { data->length = len; @@ -436,7 +439,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev, return res; } - if (sdata->type == IEEE80211_IF_TYPE_AP) { + if (sdata->vif.type == IEEE80211_IF_TYPE_AP) { len = sdata->u.ap.ssid_len; if (len > IW_ESSID_MAX_SIZE) len = IW_ESSID_MAX_SIZE; @@ -456,8 +459,8 @@ static int ieee80211_ioctl_siwap(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { int ret; if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { memcpy(sdata->u.sta.bssid, (u8 *) &ap_addr->sa_data, @@ -476,7 +479,7 @@ static int ieee80211_ioctl_siwap(struct net_device *dev, return ret; ieee80211_sta_req_auth(dev, &sdata->u.sta); return 0; - } else if (sdata->type == IEEE80211_IF_TYPE_WDS) { + } else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) { if (memcmp(sdata->u.wds.remote_addr, (u8 *) &ap_addr->sa_data, ETH_ALEN) == 0) return 0; @@ -494,12 +497,12 @@ static int ieee80211_ioctl_giwap(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { ap_addr->sa_family = ARPHRD_ETHER; memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN); return 0; - } else if (sdata->type == IEEE80211_IF_TYPE_WDS) { + } else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) { ap_addr->sa_family = ARPHRD_ETHER; memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN); return 0; @@ -513,7 +516,6 @@ static int ieee80211_ioctl_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct iw_scan_req *req = NULL; u8 *ssid = NULL; @@ -522,23 +524,10 @@ static int ieee80211_ioctl_siwscan(struct net_device *dev, if (!netif_running(dev)) return -ENETDOWN; - switch (sdata->type) { - case IEEE80211_IF_TYPE_STA: - case IEEE80211_IF_TYPE_IBSS: - if (local->scan_flags & IEEE80211_SCAN_MATCH_SSID) { - ssid = sdata->u.sta.ssid; - ssid_len = sdata->u.sta.ssid_len; - } - break; - case IEEE80211_IF_TYPE_AP: - if (local->scan_flags & IEEE80211_SCAN_MATCH_SSID) { - ssid = sdata->u.ap.ssid; - ssid_len = sdata->u.ap.ssid_len; - } - break; - default: + if (sdata->vif.type != IEEE80211_IF_TYPE_STA && + sdata->vif.type != IEEE80211_IF_TYPE_IBSS && + sdata->vif.type != IEEE80211_IF_TYPE_AP) return -EOPNOTSUPP; - } /* if SSID was specified explicitly then use that */ if (wrqu->data.length == sizeof(struct iw_scan_req) && @@ -558,8 +547,10 @@ static int ieee80211_ioctl_giwscan(struct net_device *dev, { int res; struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - if (local->sta_scanning) + + if (local->sta_sw_scanning || local->sta_hw_scanning) return -EAGAIN; + res = ieee80211_sta_scan_results(dev, extra, data->length); if (res >= 0) { data->length = res; @@ -614,7 +605,7 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev, struct ieee80211_sub_if_data *sdata; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_STA) + if (sdata->vif.type == IEEE80211_IF_TYPE_STA) sta = sta_info_get(local, sdata->u.sta.bssid); else return -EOPNOTSUPP; @@ -634,22 +625,36 @@ static int ieee80211_ioctl_siwtxpower(struct net_device *dev, { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); bool need_reconfig = 0; + u8 new_power_level; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; if (data->txpower.flags & IW_TXPOW_RANGE) return -EINVAL; - if (!data->txpower.fixed) - return -EINVAL; - if (local->hw.conf.power_level != data->txpower.value) { - local->hw.conf.power_level = data->txpower.value; + if (data->txpower.fixed) { + new_power_level = data->txpower.value; + } else { + /* Automatic power level. Get the px power from the current + * channel. */ + struct ieee80211_channel* chan = local->oper_channel; + if (!chan) + return -EINVAL; + + new_power_level = chan->power_level; + } + + if (local->hw.conf.power_level != new_power_level) { + local->hw.conf.power_level = new_power_level; need_reconfig = 1; } + if (local->hw.conf.radio_enabled != !(data->txpower.disabled)) { local->hw.conf.radio_enabled = !(data->txpower.disabled); need_reconfig = 1; + ieee80211_led_radio(local, local->hw.conf.radio_enabled); } + if (need_reconfig) { ieee80211_hw_config(local); /* The return value of hw_config is not of big interest here, @@ -814,8 +819,8 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev, struct iw_mlme *mlme = (struct iw_mlme *) extra; sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type != IEEE80211_IF_TYPE_STA && - sdata->type != IEEE80211_IF_TYPE_IBSS) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA && + sdata->vif.type != IEEE80211_IF_TYPE_IBSS) return -EINVAL; switch (mlme->cmd) { @@ -928,8 +933,11 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, case IW_AUTH_RX_UNENCRYPTED_EAPOL: case IW_AUTH_KEY_MGMT: break; + case IW_AUTH_DROP_UNENCRYPTED: + sdata->drop_unencrypted = !!data->value; + break; case IW_AUTH_PRIVACY_INVOKED: - if (sdata->type != IEEE80211_IF_TYPE_STA) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA) ret = -EINVAL; else { sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; @@ -944,8 +952,8 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev, } break; case IW_AUTH_80211_AUTH_ALG: - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) sdata->u.sta.auth_algs = data->value; else ret = -EOPNOTSUPP; @@ -965,8 +973,8 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta = NULL; - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) sta = sta_info_get(local, sdata->u.sta.bssid); if (!sta) { wstats->discard.fragment = 0; @@ -994,8 +1002,8 @@ static int ieee80211_ioctl_giwauth(struct net_device *dev, switch (data->flags & IW_AUTH_INDEX) { case IW_AUTH_80211_AUTH_ALG: - if (sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) + if (sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) data->value = sdata->u.sta.auth_algs; else ret = -EOPNOTSUPP; diff --git a/net/mac80211/ieee80211_led.c b/net/mac80211/ieee80211_led.c index 4cf89af9d10..f401484ab6d 100644 --- a/net/mac80211/ieee80211_led.c +++ b/net/mac80211/ieee80211_led.c @@ -43,6 +43,16 @@ void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) led_trigger_event(local->assoc_led, LED_OFF); } +void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) +{ + if (unlikely(!local->radio_led)) + return; + if (enabled) + led_trigger_event(local->radio_led, LED_FULL); + else + led_trigger_event(local->radio_led, LED_OFF); +} + void ieee80211_led_init(struct ieee80211_local *local) { local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); @@ -77,10 +87,25 @@ void ieee80211_led_init(struct ieee80211_local *local) local->assoc_led = NULL; } } + + local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); + if (local->radio_led) { + snprintf(local->radio_led_name, sizeof(local->radio_led_name), + "%sradio", wiphy_name(local->hw.wiphy)); + local->radio_led->name = local->radio_led_name; + if (led_trigger_register(local->radio_led)) { + kfree(local->radio_led); + local->radio_led = NULL; + } + } } void ieee80211_led_exit(struct ieee80211_local *local) { + if (local->radio_led) { + led_trigger_unregister(local->radio_led); + kfree(local->radio_led); + } if (local->assoc_led) { led_trigger_unregister(local->assoc_led); kfree(local->assoc_led); @@ -95,6 +120,16 @@ void ieee80211_led_exit(struct ieee80211_local *local) } } +char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + if (local->radio_led) + return local->radio_led_name; + return NULL; +} +EXPORT_SYMBOL(__ieee80211_get_radio_led_name); + char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); diff --git a/net/mac80211/ieee80211_led.h b/net/mac80211/ieee80211_led.h index 0feb2261983..77b1e1ba603 100644 --- a/net/mac80211/ieee80211_led.h +++ b/net/mac80211/ieee80211_led.h @@ -16,6 +16,8 @@ extern void ieee80211_led_rx(struct ieee80211_local *local); extern void ieee80211_led_tx(struct ieee80211_local *local, int q); extern void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); +extern void ieee80211_led_radio(struct ieee80211_local *local, + bool enabled); extern void ieee80211_led_init(struct ieee80211_local *local); extern void ieee80211_led_exit(struct ieee80211_local *local); #else @@ -29,6 +31,10 @@ static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { } +static inline void ieee80211_led_radio(struct ieee80211_local *local, + bool enabled) +{ +} static inline void ieee80211_led_init(struct ieee80211_local *local) { } diff --git a/net/mac80211/ieee80211_rate.c b/net/mac80211/ieee80211_rate.c index c3f27839374..b957e67c5fb 100644 --- a/net/mac80211/ieee80211_rate.c +++ b/net/mac80211/ieee80211_rate.c @@ -21,6 +21,11 @@ struct rate_control_alg { static LIST_HEAD(rate_ctrl_algs); static DEFINE_MUTEX(rate_ctrl_mutex); +static char *ieee80211_default_rc_algo = CONFIG_MAC80211_RC_DEFAULT; +module_param(ieee80211_default_rc_algo, charp, 0644); +MODULE_PARM_DESC(ieee80211_default_rc_algo, + "Default rate control algorithm for mac80211 to use"); + int ieee80211_rate_control_register(struct rate_control_ops *ops) { struct rate_control_alg *alg; @@ -89,21 +94,31 @@ ieee80211_try_rate_control_ops_get(const char *name) return ops; } -/* Get the rate control algorithm. If `name' is NULL, get the first - * available algorithm. */ +/* Get the rate control algorithm. */ static struct rate_control_ops * ieee80211_rate_control_ops_get(const char *name) { struct rate_control_ops *ops; + const char *alg_name; if (!name) - name = "simple"; + alg_name = ieee80211_default_rc_algo; + else + alg_name = name; - ops = ieee80211_try_rate_control_ops_get(name); + ops = ieee80211_try_rate_control_ops_get(alg_name); if (!ops) { - request_module("rc80211_%s", name); - ops = ieee80211_try_rate_control_ops_get(name); + request_module("rc80211_%s", alg_name); + ops = ieee80211_try_rate_control_ops_get(alg_name); } + if (!ops && name) + /* try default if specific alg requested but not found */ + ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo); + + /* try built-in one if specific alg requested but not found */ + if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT)) + ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); + return ops; } @@ -147,6 +162,37 @@ static void rate_control_release(struct kref *kref) kfree(ctrl_ref); } +void rate_control_get_rate(struct net_device *dev, + struct ieee80211_hw_mode *mode, struct sk_buff *skb, + struct rate_selection *sel) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct rate_control_ref *ref = local->rate_ctrl; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct sta_info *sta = sta_info_get(local, hdr->addr1); + int i; + + memset(sel, 0, sizeof(struct rate_selection)); + + ref->ops->get_rate(ref->priv, dev, mode, skb, sel); + + /* Select a non-ERP backup rate. */ + if (!sel->nonerp) { + for (i = 0; i < mode->num_rates - 1; i++) { + struct ieee80211_rate *rate = &mode->rates[i]; + if (sel->rate->rate < rate->rate) + break; + + if (rate_supported(sta, mode, i) && + !(rate->flags & IEEE80211_RATE_ERP)) + sel->nonerp = rate; + } + } + + if (sta) + sta_info_put(sta); +} + struct rate_control_ref *rate_control_get(struct rate_control_ref *ref) { kref_get(&ref->kref); @@ -197,3 +243,4 @@ void rate_control_deinitialize(struct ieee80211_local *local) local->rate_ctrl = NULL; rate_control_put(ref); } + diff --git a/net/mac80211/ieee80211_rate.h b/net/mac80211/ieee80211_rate.h index 23688139ffb..73f19e8aa51 100644 --- a/net/mac80211/ieee80211_rate.h +++ b/net/mac80211/ieee80211_rate.h @@ -18,31 +18,24 @@ #include "ieee80211_i.h" #include "sta_info.h" -#define RATE_CONTROL_NUM_DOWN 20 -#define RATE_CONTROL_NUM_UP 15 - - -struct rate_control_extra { - /* values from rate_control_get_rate() to the caller: */ - struct ieee80211_rate *probe; /* probe with this rate, or NULL for no - * probing */ +struct rate_selection { + /* Selected transmission rate */ + struct ieee80211_rate *rate; + /* Non-ERP rate to use if mac80211 decides it cannot use an ERP rate */ struct ieee80211_rate *nonerp; - - /* parameters from the caller to rate_control_get_rate(): */ - struct ieee80211_hw_mode *mode; - u16 ethertype; + /* probe with this rate, or NULL for no probing */ + struct ieee80211_rate *probe; }; - struct rate_control_ops { struct module *module; const char *name; void (*tx_status)(void *priv, struct net_device *dev, struct sk_buff *skb, struct ieee80211_tx_status *status); - struct ieee80211_rate *(*get_rate)(void *priv, struct net_device *dev, - struct sk_buff *skb, - struct rate_control_extra *extra); + void (*get_rate)(void *priv, struct net_device *dev, + struct ieee80211_hw_mode *mode, struct sk_buff *skb, + struct rate_selection *sel); void (*rate_init)(void *priv, void *priv_sta, struct ieee80211_local *local, struct sta_info *sta); void (*clear)(void *priv); @@ -65,9 +58,6 @@ struct rate_control_ref { struct kref kref; }; -/* default 'simple' algorithm */ -extern struct rate_control_ops mac80211_rcsimple; - int ieee80211_rate_control_register(struct rate_control_ops *ops); void ieee80211_rate_control_unregister(struct rate_control_ops *ops); @@ -75,25 +65,20 @@ void ieee80211_rate_control_unregister(struct rate_control_ops *ops); * first available algorithm. */ struct rate_control_ref *rate_control_alloc(const char *name, struct ieee80211_local *local); +void rate_control_get_rate(struct net_device *dev, + struct ieee80211_hw_mode *mode, struct sk_buff *skb, + struct rate_selection *sel); struct rate_control_ref *rate_control_get(struct rate_control_ref *ref); void rate_control_put(struct rate_control_ref *ref); -static inline void rate_control_tx_status(struct ieee80211_local *local, - struct net_device *dev, +static inline void rate_control_tx_status(struct net_device *dev, struct sk_buff *skb, struct ieee80211_tx_status *status) { + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct rate_control_ref *ref = local->rate_ctrl; - ref->ops->tx_status(ref->priv, dev, skb, status); -} - -static inline struct ieee80211_rate * -rate_control_get_rate(struct ieee80211_local *local, struct net_device *dev, - struct sk_buff *skb, struct rate_control_extra *extra) -{ - struct rate_control_ref *ref = local->rate_ctrl; - return ref->ops->get_rate(ref->priv, dev, skb, extra); + ref->ops->tx_status(ref->priv, dev, skb, status); } @@ -142,10 +127,73 @@ static inline void rate_control_remove_sta_debugfs(struct sta_info *sta) #endif } +static inline int +rate_supported(struct sta_info *sta, struct ieee80211_hw_mode *mode, int index) +{ + return (sta == NULL || sta->supp_rates & BIT(index)) && + (mode->rates[index].flags & IEEE80211_RATE_SUPPORTED); +} + +static inline int +rate_lowest_index(struct ieee80211_local *local, struct ieee80211_hw_mode *mode, + struct sta_info *sta) +{ + int i; + + for (i = 0; i < mode->num_rates; i++) { + if (rate_supported(sta, mode, i)) + return i; + } + + /* warn when we cannot find a rate. */ + WARN_ON(1); + + return 0; +} + +static inline struct ieee80211_rate * +rate_lowest(struct ieee80211_local *local, struct ieee80211_hw_mode *mode, + struct sta_info *sta) +{ + return &mode->rates[rate_lowest_index(local, mode, sta)]; +} + /* functions for rate control related to a device */ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name); void rate_control_deinitialize(struct ieee80211_local *local); + +/* Rate control algorithms */ +#if defined(RC80211_SIMPLE_COMPILE) || \ + (defined(CONFIG_MAC80211_RC_SIMPLE) && \ + !defined(CONFIG_MAC80211_RC_SIMPLE_MODULE)) +extern int rc80211_simple_init(void); +extern void rc80211_simple_exit(void); +#else +static inline int rc80211_simple_init(void) +{ + return 0; +} +static inline void rc80211_simple_exit(void) +{ +} +#endif + +#if defined(RC80211_PID_COMPILE) || \ + (defined(CONFIG_MAC80211_RC_PID) && \ + !defined(CONFIG_MAC80211_RC_PID_MODULE)) +extern int rc80211_pid_init(void); +extern void rc80211_pid_exit(void); +#else +static inline int rc80211_pid_init(void) +{ + return 0; +} +static inline void rc80211_pid_exit(void) +{ +} +#endif + #endif /* IEEE80211_RATE_H */ diff --git a/net/mac80211/ieee80211_sta.c b/net/mac80211/ieee80211_sta.c index bee8080f224..2019b4f0528 100644 --- a/net/mac80211/ieee80211_sta.c +++ b/net/mac80211/ieee80211_sta.c @@ -57,6 +57,20 @@ #define ERP_INFO_USE_PROTECTION BIT(1) +/* mgmt header + 1 byte action code */ +#define IEEE80211_MIN_ACTION_SIZE (24 + 1) + +#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 +#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C +#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFA0 +#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000 +#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 + +/* next values represent the buffer size for A-MPDU frame. + * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */ +#define IEEE80211_MIN_AMPDU_BUF 0x8 +#define IEEE80211_MAX_AMPDU_BUF 0x40 + static void ieee80211_send_probe_req(struct net_device *dev, u8 *dst, u8 *ssid, size_t ssid_len); static struct ieee80211_sta_bss * @@ -90,7 +104,8 @@ struct ieee802_11_elems { u8 *ext_supp_rates; u8 *wmm_info; u8 *wmm_param; - + u8 *ht_cap_elem; + u8 *ht_info_elem; /* length of them, respectively */ u8 ssid_len; u8 supp_rates_len; @@ -106,6 +121,8 @@ struct ieee802_11_elems { u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; + u8 ht_cap_elem_len; + u8 ht_info_elem_len; }; static void ieee802_11_parse_elems(u8 *start, size_t len, @@ -190,6 +207,14 @@ static void ieee802_11_parse_elems(u8 *start, size_t len, elems->ext_supp_rates = pos; elems->ext_supp_rates_len = elen; break; + case WLAN_EID_HT_CAPABILITY: + elems->ht_cap_elem = pos; + elems->ht_cap_elem_len = elen; + break; + case WLAN_EID_HT_EXTRA_INFO: + elems->ht_info_elem = pos; + elems->ht_info_elem_len = elen; + break; default: break; } @@ -288,50 +313,89 @@ static void ieee80211_sta_wmm_params(struct net_device *dev, } -static void ieee80211_handle_erp_ie(struct net_device *dev, u8 erp_value) +static u32 ieee80211_handle_erp_ie(struct ieee80211_sub_if_data *sdata, + u8 erp_value) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_bss_conf *bss_conf = &sdata->bss_conf; struct ieee80211_if_sta *ifsta = &sdata->u.sta; - int use_protection = (erp_value & WLAN_ERP_USE_PROTECTION) != 0; - int preamble_mode = (erp_value & WLAN_ERP_BARKER_PREAMBLE) != 0; - u8 changes = 0; + bool use_protection = (erp_value & WLAN_ERP_USE_PROTECTION) != 0; + bool preamble_mode = (erp_value & WLAN_ERP_BARKER_PREAMBLE) != 0; DECLARE_MAC_BUF(mac); + u32 changed = 0; - if (use_protection != !!(sdata->flags & IEEE80211_SDATA_USE_PROTECTION)) { + if (use_protection != bss_conf->use_cts_prot) { if (net_ratelimit()) { printk(KERN_DEBUG "%s: CTS protection %s (BSSID=" "%s)\n", - dev->name, + sdata->dev->name, use_protection ? "enabled" : "disabled", print_mac(mac, ifsta->bssid)); } - if (use_protection) - sdata->flags |= IEEE80211_SDATA_USE_PROTECTION; - else - sdata->flags &= ~IEEE80211_SDATA_USE_PROTECTION; - changes |= IEEE80211_ERP_CHANGE_PROTECTION; + bss_conf->use_cts_prot = use_protection; + changed |= BSS_CHANGED_ERP_CTS_PROT; } - if (preamble_mode != !(sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE)) { + if (preamble_mode != bss_conf->use_short_preamble) { if (net_ratelimit()) { printk(KERN_DEBUG "%s: switched to %s barker preamble" " (BSSID=%s)\n", - dev->name, + sdata->dev->name, (preamble_mode == WLAN_ERP_PREAMBLE_SHORT) ? "short" : "long", print_mac(mac, ifsta->bssid)); } - if (preamble_mode) - sdata->flags &= ~IEEE80211_SDATA_SHORT_PREAMBLE; - else - sdata->flags |= IEEE80211_SDATA_SHORT_PREAMBLE; - changes |= IEEE80211_ERP_CHANGE_PREAMBLE; + bss_conf->use_short_preamble = preamble_mode; + changed |= BSS_CHANGED_ERP_PREAMBLE; } - if (changes) - ieee80211_erp_info_change_notify(dev, changes); + return changed; } +int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie, + struct ieee80211_ht_info *ht_info) +{ + + if (ht_info == NULL) + return -EINVAL; + + memset(ht_info, 0, sizeof(*ht_info)); + + if (ht_cap_ie) { + u8 ampdu_info = ht_cap_ie->ampdu_params_info; + + ht_info->ht_supported = 1; + ht_info->cap = le16_to_cpu(ht_cap_ie->cap_info); + ht_info->ampdu_factor = + ampdu_info & IEEE80211_HT_CAP_AMPDU_FACTOR; + ht_info->ampdu_density = + (ampdu_info & IEEE80211_HT_CAP_AMPDU_DENSITY) >> 2; + memcpy(ht_info->supp_mcs_set, ht_cap_ie->supp_mcs_set, 16); + } else + ht_info->ht_supported = 0; + + return 0; +} + +int ieee80211_ht_addt_info_ie_to_ht_bss_info( + struct ieee80211_ht_addt_info *ht_add_info_ie, + struct ieee80211_ht_bss_info *bss_info) +{ + if (bss_info == NULL) + return -EINVAL; + + memset(bss_info, 0, sizeof(*bss_info)); + + if (ht_add_info_ie) { + u16 op_mode; + op_mode = le16_to_cpu(ht_add_info_ie->operation_mode); + + bss_info->primary_channel = ht_add_info_ie->control_chan; + bss_info->bss_cap = ht_add_info_ie->ht_param; + bss_info->bss_op_mode = (u8)(op_mode & 0xff); + } + + return 0; +} static void ieee80211_sta_send_associnfo(struct net_device *dev, struct ieee80211_if_sta *ifsta) @@ -388,20 +452,17 @@ static void ieee80211_set_associated(struct net_device *dev, struct ieee80211_if_sta *ifsta, bool assoc) { - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; union iwreq_data wrqu; - - if (!!(ifsta->flags & IEEE80211_STA_ASSOCIATED) == assoc) - return; + u32 changed = BSS_CHANGED_ASSOC; if (assoc) { - struct ieee80211_sub_if_data *sdata; struct ieee80211_sta_bss *bss; ifsta->flags |= IEEE80211_STA_ASSOCIATED; - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type != IEEE80211_IF_TYPE_STA) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA) return; bss = ieee80211_rx_bss_get(dev, ifsta->bssid, @@ -409,7 +470,8 @@ static void ieee80211_set_associated(struct net_device *dev, ifsta->ssid, ifsta->ssid_len); if (bss) { if (bss->has_erp_value) - ieee80211_handle_erp_ie(dev, bss->erp_value); + changed |= ieee80211_handle_erp_ie( + sdata, bss->erp_value); ieee80211_rx_bss_put(dev, bss); } @@ -429,6 +491,8 @@ static void ieee80211_set_associated(struct net_device *dev, wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); ifsta->last_probe = jiffies; ieee80211_led_assoc(local, assoc); + + ieee80211_bss_info_change_notify(sdata, changed); } static void ieee80211_set_disassoc(struct net_device *dev, @@ -630,6 +694,19 @@ static void ieee80211_send_assoc(struct net_device *dev, *pos++ = 1; /* WME ver */ *pos++ = 0; } + /* wmm support is a must to HT */ + if (wmm && mode->ht_info.ht_supported) { + __le16 tmp = cpu_to_le16(mode->ht_info.cap); + pos = skb_put(skb, sizeof(struct ieee80211_ht_cap)+2); + *pos++ = WLAN_EID_HT_CAPABILITY; + *pos++ = sizeof(struct ieee80211_ht_cap); + memset(pos, 0, sizeof(struct ieee80211_ht_cap)); + memcpy(pos, &tmp, sizeof(u16)); + pos += sizeof(u16); + *pos++ = (mode->ht_info.ampdu_factor | + (mode->ht_info.ampdu_density << 2)); + memcpy(pos, mode->ht_info.supp_mcs_set, 16); + } kfree(ifsta->assocreq_ies); ifsta->assocreq_ies_len = (skb->data + skb->len) - ies; @@ -918,6 +995,320 @@ static void ieee80211_auth_challenge(struct net_device *dev, elems.challenge_len + 2, 1); } +static void ieee80211_send_addba_resp(struct net_device *dev, u8 *da, u16 tid, + u8 dialog_token, u16 status, u16 policy, + u16 buf_size, u16 timeout) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u16 capab; + + skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom + 1 + + sizeof(mgmt->u.action.u.addba_resp)); + if (!skb) { + printk(KERN_DEBUG "%s: failed to allocate buffer " + "for addba resp frame\n", dev->name); + return; + } + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN); + if (sdata->vif.type == IEEE80211_IF_TYPE_AP) + memcpy(mgmt->bssid, dev->dev_addr, ETH_ALEN); + else + memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT, + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp)); + mgmt->u.action.category = WLAN_CATEGORY_BACK; + mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; + mgmt->u.action.u.addba_resp.dialog_token = dialog_token; + + capab = (u16)(policy << 1); /* bit 1 aggregation policy */ + capab |= (u16)(tid << 2); /* bit 5:2 TID number */ + capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */ + + mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab); + mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); + mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); + + ieee80211_sta_tx(dev, skb, 0); + + return; +} + +static void ieee80211_sta_process_addba_request(struct net_device *dev, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_hw *hw = &local->hw; + struct ieee80211_conf *conf = &hw->conf; + struct sta_info *sta; + struct tid_ampdu_rx *tid_agg_rx; + u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status; + u8 dialog_token; + int ret = -EOPNOTSUPP; + DECLARE_MAC_BUF(mac); + + sta = sta_info_get(local, mgmt->sa); + if (!sta) + return; + + /* extract session parameters from addba request frame */ + dialog_token = mgmt->u.action.u.addba_req.dialog_token; + timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + start_seq_num = + le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; + + capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; + tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; + buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + + status = WLAN_STATUS_REQUEST_DECLINED; + + /* sanity check for incoming parameters: + * check if configuration can support the BA policy + * and if buffer size does not exceeds max value */ + if (((ba_policy != 1) + && (!(conf->ht_conf.cap & IEEE80211_HT_CAP_DELAY_BA))) + || (buf_size > IEEE80211_MAX_AMPDU_BUF)) { + status = WLAN_STATUS_INVALID_QOS_PARAM; +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "Block Ack Req with bad params from " + "%s on tid %u. policy %d, buffer size %d\n", + print_mac(mac, mgmt->sa), tid, ba_policy, + buf_size); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + goto end_no_lock; + } + /* determine default buffer size */ + if (buf_size == 0) { + struct ieee80211_hw_mode *mode = conf->mode; + buf_size = IEEE80211_MIN_AMPDU_BUF; + buf_size = buf_size << mode->ht_info.ampdu_factor; + } + + tid_agg_rx = &sta->ampdu_mlme.tid_rx[tid]; + + /* examine state machine */ + spin_lock_bh(&sta->ampdu_mlme.ampdu_rx); + + if (tid_agg_rx->state != HT_AGG_STATE_IDLE) { +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "unexpected Block Ack Req from " + "%s on tid %u\n", + print_mac(mac, mgmt->sa), tid); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + goto end; + } + + /* prepare reordering buffer */ + tid_agg_rx->reorder_buf = + kmalloc(buf_size * sizeof(struct sk_buf *), GFP_ATOMIC); + if ((!tid_agg_rx->reorder_buf) && net_ratelimit()) { + printk(KERN_ERR "can not allocate reordering buffer " + "to tid %d\n", tid); + goto end; + } + memset(tid_agg_rx->reorder_buf, 0, + buf_size * sizeof(struct sk_buf *)); + + if (local->ops->ampdu_action) + ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START, + sta->addr, tid, start_seq_num); +#ifdef CONFIG_MAC80211_HT_DEBUG + printk(KERN_DEBUG "Rx A-MPDU on tid %d result %d", tid, ret); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (ret) { + kfree(tid_agg_rx->reorder_buf); + goto end; + } + + /* change state and send addba resp */ + tid_agg_rx->state = HT_AGG_STATE_OPERATIONAL; + tid_agg_rx->dialog_token = dialog_token; + tid_agg_rx->ssn = start_seq_num; + tid_agg_rx->head_seq_num = start_seq_num; + tid_agg_rx->buf_size = buf_size; + tid_agg_rx->timeout = timeout; + tid_agg_rx->stored_mpdu_num = 0; + status = WLAN_STATUS_SUCCESS; +end: + spin_unlock_bh(&sta->ampdu_mlme.ampdu_rx); + +end_no_lock: + ieee80211_send_addba_resp(sta->dev, sta->addr, tid, dialog_token, + status, 1, buf_size, timeout); + sta_info_put(sta); +} + +static void ieee80211_send_delba(struct net_device *dev, const u8 *da, u16 tid, + u16 initiator, u16 reason_code) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_if_sta *ifsta = &sdata->u.sta; + struct sk_buff *skb; + struct ieee80211_mgmt *mgmt; + u16 params; + + skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom + 1 + + sizeof(mgmt->u.action.u.delba)); + + if (!skb) { + printk(KERN_ERR "%s: failed to allocate buffer " + "for delba frame\n", dev->name); + return; + } + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); + memset(mgmt, 0, 24); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, dev->dev_addr, ETH_ALEN); + if (sdata->vif.type == IEEE80211_IF_TYPE_AP) + memcpy(mgmt->bssid, dev->dev_addr, ETH_ALEN); + else + memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); + mgmt->frame_control = IEEE80211_FC(IEEE80211_FTYPE_MGMT, + IEEE80211_STYPE_ACTION); + + skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); + + mgmt->u.action.category = WLAN_CATEGORY_BACK; + mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA; + params = (u16)(initiator << 11); /* bit 11 initiator */ + params |= (u16)(tid << 12); /* bit 15:12 TID number */ + + mgmt->u.action.u.delba.params = cpu_to_le16(params); + mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); + + ieee80211_sta_tx(dev, skb, 0); +} + +void ieee80211_sta_stop_rx_ba_session(struct net_device *dev, u8 *ra, u16 tid, + u16 initiator, u16 reason) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_hw *hw = &local->hw; + struct sta_info *sta; + int ret, i; + + sta = sta_info_get(local, ra); + if (!sta) + return; + + /* check if TID is in operational state */ + spin_lock_bh(&sta->ampdu_mlme.ampdu_rx); + if (sta->ampdu_mlme.tid_rx[tid].state + != HT_AGG_STATE_OPERATIONAL) { + spin_unlock_bh(&sta->ampdu_mlme.ampdu_rx); + sta_info_put(sta); + return; + } + sta->ampdu_mlme.tid_rx[tid].state = + HT_AGG_STATE_REQ_STOP_BA_MSK | + (initiator << HT_AGG_STATE_INITIATOR_SHIFT); + spin_unlock_bh(&sta->ampdu_mlme.ampdu_rx); + + /* stop HW Rx aggregation. ampdu_action existence + * already verified in session init so we add the BUG_ON */ + BUG_ON(!local->ops->ampdu_action); + + ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_STOP, + ra, tid, EINVAL); + if (ret) + printk(KERN_DEBUG "HW problem - can not stop rx " + "aggergation for tid %d\n", tid); + + /* shutdown timer has not expired */ + if (initiator != WLAN_BACK_TIMER) + del_timer_sync(&sta->ampdu_mlme.tid_rx[tid]. + session_timer); + + /* check if this is a self generated aggregation halt */ + if (initiator == WLAN_BACK_RECIPIENT || initiator == WLAN_BACK_TIMER) + ieee80211_send_delba(dev, ra, tid, 0, reason); + + /* free the reordering buffer */ + for (i = 0; i < sta->ampdu_mlme.tid_rx[tid].buf_size; i++) { + if (sta->ampdu_mlme.tid_rx[tid].reorder_buf[i]) { + /* release the reordered frames */ + dev_kfree_skb(sta->ampdu_mlme.tid_rx[tid].reorder_buf[i]); + sta->ampdu_mlme.tid_rx[tid].stored_mpdu_num--; + sta->ampdu_mlme.tid_rx[tid].reorder_buf[i] = NULL; + } + } + kfree(sta->ampdu_mlme.tid_rx[tid].reorder_buf); + + sta->ampdu_mlme.tid_rx[tid].state = HT_AGG_STATE_IDLE; + sta_info_put(sta); +} + +static void ieee80211_sta_process_delba(struct net_device *dev, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct sta_info *sta; + u16 tid, params; + u16 initiator; + DECLARE_MAC_BUF(mac); + + sta = sta_info_get(local, mgmt->sa); + if (!sta) + return; + + params = le16_to_cpu(mgmt->u.action.u.delba.params); + tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; + initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11; + +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "delba from %s on tid %d reason code %d\n", + print_mac(mac, mgmt->sa), tid, + mgmt->u.action.u.delba.reason_code); +#endif /* CONFIG_MAC80211_HT_DEBUG */ + + if (initiator == WLAN_BACK_INITIATOR) + ieee80211_sta_stop_rx_ba_session(dev, sta->addr, tid, + WLAN_BACK_INITIATOR, 0); + sta_info_put(sta); +} + +/* + * After receiving Block Ack Request (BAR) we activated a + * timer after each frame arrives from the originator. + * if this timer expires ieee80211_sta_stop_rx_ba_session will be executed. + */ +void sta_rx_agg_session_timer_expired(unsigned long data) +{ + /* not an elegant detour, but there is no choice as the timer passes + * only one argument, and verious sta_info are needed here, so init + * flow in sta_info_add gives the TID as data, while the timer_to_id + * array gives the sta through container_of */ + u8 *ptid = (u8 *)data; + u8 *timer_to_id = ptid - *ptid; + struct sta_info *sta = container_of(timer_to_id, struct sta_info, + timer_to_tid[0]); + + printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid); + ieee80211_sta_stop_rx_ba_session(sta->dev, sta->addr, (u16)*ptid, + WLAN_BACK_TIMER, + WLAN_REASON_QSTA_TIMEOUT); +} + static void ieee80211_rx_mgmt_auth(struct net_device *dev, struct ieee80211_if_sta *ifsta, @@ -929,7 +1320,7 @@ static void ieee80211_rx_mgmt_auth(struct net_device *dev, DECLARE_MAC_BUF(mac); if (ifsta->state != IEEE80211_AUTHENTICATE && - sdata->type != IEEE80211_IF_TYPE_IBSS) { + sdata->vif.type != IEEE80211_IF_TYPE_IBSS) { printk(KERN_DEBUG "%s: authentication frame received from " "%s, but not in authenticate state - ignored\n", dev->name, print_mac(mac, mgmt->sa)); @@ -943,7 +1334,7 @@ static void ieee80211_rx_mgmt_auth(struct net_device *dev, return; } - if (sdata->type != IEEE80211_IF_TYPE_IBSS && + if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS && memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0) { printk(KERN_DEBUG "%s: authentication frame received from " "unknown AP (SA=%s BSSID=%s) - " @@ -952,7 +1343,7 @@ static void ieee80211_rx_mgmt_auth(struct net_device *dev, return; } - if (sdata->type != IEEE80211_IF_TYPE_IBSS && + if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS && memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0) { printk(KERN_DEBUG "%s: authentication frame received from " "unknown BSSID (SA=%s BSSID=%s) - " @@ -970,7 +1361,7 @@ static void ieee80211_rx_mgmt_auth(struct net_device *dev, dev->name, print_mac(mac, mgmt->sa), auth_alg, auth_transaction, status_code); - if (sdata->type == IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { /* IEEE 802.11 standard does not require authentication in IBSS * networks and most implementations do not seem to use it. * However, try to reply to authentication attempts if someone @@ -1136,18 +1527,20 @@ static void ieee80211_rx_mgmt_disassoc(struct net_device *dev, } -static void ieee80211_rx_mgmt_assoc_resp(struct net_device *dev, +static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_sta *ifsta, struct ieee80211_mgmt *mgmt, size_t len, int reassoc) { - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_local *local = sdata->local; + struct net_device *dev = sdata->dev; struct ieee80211_hw_mode *mode; struct sta_info *sta; u32 rates; u16 capab_info, status_code, aid; struct ieee802_11_elems elems; + struct ieee80211_bss_conf *bss_conf = &sdata->bss_conf; u8 *pos; int i, j; DECLARE_MAC_BUF(mac); @@ -1210,20 +1603,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct net_device *dev, return; } - /* it probably doesn't, but if the frame includes an ERP value then - * update our stored copy */ - if (elems.erp_info && elems.erp_info_len >= 1) { - struct ieee80211_sta_bss *bss - = ieee80211_rx_bss_get(dev, ifsta->bssid, - local->hw.conf.channel, - ifsta->ssid, ifsta->ssid_len); - if (bss) { - bss->erp_value = elems.erp_info[0]; - bss->has_erp_value = 1; - ieee80211_rx_bss_put(dev, bss); - } - } - printk(KERN_DEBUG "%s: associated\n", dev->name); ifsta->aid = aid; ifsta->ap_capab = capab_info; @@ -1234,6 +1613,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct net_device *dev, if (ifsta->assocresp_ies) memcpy(ifsta->assocresp_ies, pos, ifsta->assocresp_ies_len); + /* set AID, ieee80211_set_associated() will tell the driver */ + bss_conf->aid = aid; ieee80211_set_associated(dev, ifsta, 1); /* Add STA entry for the AP */ @@ -1276,6 +1657,19 @@ static void ieee80211_rx_mgmt_assoc_resp(struct net_device *dev, } sta->supp_rates = rates; + if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param && + local->ops->conf_ht) { + struct ieee80211_ht_bss_info bss_info; + + ieee80211_ht_cap_ie_to_ht_info( + (struct ieee80211_ht_cap *) + elems.ht_cap_elem, &sta->ht_info); + ieee80211_ht_addt_info_ie_to_ht_bss_info( + (struct ieee80211_ht_addt_info *) + elems.ht_info_elem, &bss_info); + ieee80211_hw_config_ht(local, 1, &sta->ht_info, &bss_info); + } + rate_control_rate_init(sta, local); if (elems.wmm_param && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) { @@ -1380,6 +1774,7 @@ static void ieee80211_rx_bss_free(struct ieee80211_sta_bss *bss) kfree(bss->wpa_ie); kfree(bss->rsn_ie); kfree(bss->wmm_ie); + kfree(bss->ht_ie); kfree(bss); } @@ -1449,7 +1844,7 @@ static void ieee80211_rx_bss_info(struct net_device *dev, timestamp = le64_to_cpu(mgmt->u.beacon.timestamp); - if (sdata->type == IEEE80211_IF_TYPE_IBSS && beacon && + if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && beacon && memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) { #ifdef CONFIG_MAC80211_IBSS_DEBUG static unsigned long last_tsf_debug = 0; @@ -1474,7 +1869,7 @@ static void ieee80211_rx_bss_info(struct net_device *dev, ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); - if (sdata->type == IEEE80211_IF_TYPE_IBSS && elems.supp_rates && + if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && elems.supp_rates && memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0 && (sta = sta_info_get(local, mgmt->sa))) { struct ieee80211_hw_mode *mode; @@ -1483,8 +1878,18 @@ static void ieee80211_rx_bss_info(struct net_device *dev, u32 supp_rates, prev_rates; int i, j; - mode = local->sta_scanning ? + mode = local->sta_sw_scanning ? local->scan_hw_mode : local->oper_hw_mode; + + if (local->sta_hw_scanning) { + /* search for the correct mode matches the beacon */ + list_for_each_entry(mode, &local->modes_list, list) + if (mode->mode == rx_status->phymode) + break; + + if (mode == NULL) + mode = local->oper_hw_mode; + } rates = mode->rates; num_rates = mode->num_rates; @@ -1627,7 +2032,22 @@ static void ieee80211_rx_bss_info(struct net_device *dev, bss->wmm_ie = NULL; bss->wmm_ie_len = 0; } - + if (elems.ht_cap_elem && + (!bss->ht_ie || bss->ht_ie_len != elems.ht_cap_elem_len || + memcmp(bss->ht_ie, elems.ht_cap_elem, elems.ht_cap_elem_len))) { + kfree(bss->ht_ie); + bss->ht_ie = kmalloc(elems.ht_cap_elem_len + 2, GFP_ATOMIC); + if (bss->ht_ie) { + memcpy(bss->ht_ie, elems.ht_cap_elem - 2, + elems.ht_cap_elem_len + 2); + bss->ht_ie_len = elems.ht_cap_elem_len + 2; + } else + bss->ht_ie_len = 0; + } else if (!elems.ht_cap_elem && bss->ht_ie) { + kfree(bss->ht_ie); + bss->ht_ie = NULL; + bss->ht_ie_len = 0; + } bss->hw_mode = rx_status->phymode; bss->freq = rx_status->freq; @@ -1672,11 +2092,14 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev, struct ieee80211_if_sta *ifsta; size_t baselen; struct ieee802_11_elems elems; + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_conf *conf = &local->hw.conf; + u32 changed = 0; ieee80211_rx_bss_info(dev, mgmt, len, rx_status, 1); sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type != IEEE80211_IF_TYPE_STA) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA) return; ifsta = &sdata->u.sta; @@ -1692,12 +2115,31 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev, ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems); if (elems.erp_info && elems.erp_info_len >= 1) - ieee80211_handle_erp_ie(dev, elems.erp_info[0]); + changed |= ieee80211_handle_erp_ie(sdata, elems.erp_info[0]); + + if (elems.ht_cap_elem && elems.ht_info_elem && + elems.wmm_param && local->ops->conf_ht && + conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) { + struct ieee80211_ht_bss_info bss_info; + + ieee80211_ht_addt_info_ie_to_ht_bss_info( + (struct ieee80211_ht_addt_info *) + elems.ht_info_elem, &bss_info); + /* check if AP changed bss inforamation */ + if ((conf->ht_bss_conf.primary_channel != + bss_info.primary_channel) || + (conf->ht_bss_conf.bss_cap != bss_info.bss_cap) || + (conf->ht_bss_conf.bss_op_mode != bss_info.bss_op_mode)) + ieee80211_hw_config_ht(local, 1, &conf->ht_conf, + &bss_info); + } if (elems.wmm_param && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) { ieee80211_sta_wmm_params(dev, ifsta, elems.wmm_param, elems.wmm_param_len); } + + ieee80211_bss_info_change_notify(sdata, changed); } @@ -1719,7 +2161,7 @@ static void ieee80211_rx_mgmt_probe_req(struct net_device *dev, DECLARE_MAC_BUF(mac3); #endif - if (sdata->type != IEEE80211_IF_TYPE_IBSS || + if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS || ifsta->state != IEEE80211_IBSS_JOINED || len < 24 + 2 || !ifsta->probe_resp) return; @@ -1775,6 +2217,40 @@ static void ieee80211_rx_mgmt_probe_req(struct net_device *dev, ieee80211_sta_tx(dev, skb, 0); } +static void ieee80211_rx_mgmt_action(struct net_device *dev, + struct ieee80211_if_sta *ifsta, + struct ieee80211_mgmt *mgmt, + size_t len) +{ + if (len < IEEE80211_MIN_ACTION_SIZE) + return; + + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_BACK: + switch (mgmt->u.action.u.addba_req.action_code) { + case WLAN_ACTION_ADDBA_REQ: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.addba_req))) + break; + ieee80211_sta_process_addba_request(dev, mgmt, len); + break; + case WLAN_ACTION_DELBA: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.delba))) + break; + ieee80211_sta_process_delba(dev, mgmt, len); + break; + default: + if (net_ratelimit()) + printk(KERN_DEBUG "%s: Rx unknown A-MPDU action\n", + dev->name); + break; + } + break; + default: + break; + } +} void ieee80211_sta_rx_mgmt(struct net_device *dev, struct sk_buff *skb, struct ieee80211_rx_status *rx_status) @@ -1804,6 +2280,7 @@ void ieee80211_sta_rx_mgmt(struct net_device *dev, struct sk_buff *skb, case IEEE80211_STYPE_REASSOC_RESP: case IEEE80211_STYPE_DEAUTH: case IEEE80211_STYPE_DISASSOC: + case IEEE80211_STYPE_ACTION: skb_queue_tail(&ifsta->skb_queue, skb); queue_work(local->hw.workqueue, &ifsta->work); return; @@ -1850,10 +2327,10 @@ static void ieee80211_sta_rx_queued_mgmt(struct net_device *dev, ieee80211_rx_mgmt_auth(dev, ifsta, mgmt, skb->len); break; case IEEE80211_STYPE_ASSOC_RESP: - ieee80211_rx_mgmt_assoc_resp(dev, ifsta, mgmt, skb->len, 0); + ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, skb->len, 0); break; case IEEE80211_STYPE_REASSOC_RESP: - ieee80211_rx_mgmt_assoc_resp(dev, ifsta, mgmt, skb->len, 1); + ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt, skb->len, 1); break; case IEEE80211_STYPE_DEAUTH: ieee80211_rx_mgmt_deauth(dev, ifsta, mgmt, skb->len); @@ -1861,37 +2338,48 @@ static void ieee80211_sta_rx_queued_mgmt(struct net_device *dev, case IEEE80211_STYPE_DISASSOC: ieee80211_rx_mgmt_disassoc(dev, ifsta, mgmt, skb->len); break; + case IEEE80211_STYPE_ACTION: + ieee80211_rx_mgmt_action(dev, ifsta, mgmt, skb->len); + break; } kfree_skb(skb); } -void ieee80211_sta_rx_scan(struct net_device *dev, struct sk_buff *skb, - struct ieee80211_rx_status *rx_status) +ieee80211_txrx_result +ieee80211_sta_rx_scan(struct net_device *dev, struct sk_buff *skb, + struct ieee80211_rx_status *rx_status) { struct ieee80211_mgmt *mgmt; u16 fc; - if (skb->len < 24) { - dev_kfree_skb(skb); - return; - } + if (skb->len < 2) + return TXRX_DROP; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); + if ((fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_CTL) + return TXRX_CONTINUE; + + if (skb->len < 24) + return TXRX_DROP; + if ((fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_MGMT) { if ((fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_PROBE_RESP) { ieee80211_rx_mgmt_probe_resp(dev, mgmt, skb->len, rx_status); + dev_kfree_skb(skb); + return TXRX_QUEUED; } else if ((fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_BEACON) { ieee80211_rx_mgmt_beacon(dev, mgmt, skb->len, rx_status); + dev_kfree_skb(skb); + return TXRX_QUEUED; } } - - dev_kfree_skb(skb); + return TXRX_CONTINUE; } @@ -1981,13 +2469,13 @@ void ieee80211_sta_work(struct work_struct *work) if (!netif_running(dev)) return; - if (local->sta_scanning) + if (local->sta_sw_scanning || local->sta_hw_scanning) return; - if (sdata->type != IEEE80211_IF_TYPE_STA && - sdata->type != IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type != IEEE80211_IF_TYPE_STA && + sdata->vif.type != IEEE80211_IF_TYPE_IBSS) { printk(KERN_DEBUG "%s: ieee80211_sta_work: non-STA interface " - "(type=%d)\n", dev->name, sdata->type); + "(type=%d)\n", dev->name, sdata->vif.type); return; } ifsta = &sdata->u.sta; @@ -2082,7 +2570,7 @@ void ieee80211_sta_req_auth(struct net_device *dev, struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type != IEEE80211_IF_TYPE_STA) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA) return; if ((ifsta->flags & (IEEE80211_STA_BSSID_SET | @@ -2204,9 +2692,8 @@ static int ieee80211_sta_join_ibss(struct net_device *dev, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_tx_control control; - struct ieee80211_rate *rate; struct ieee80211_hw_mode *mode; - struct rate_control_extra extra; + struct rate_selection ratesel; u8 *pos; struct ieee80211_sub_if_data *sdata; @@ -2291,18 +2778,17 @@ static int ieee80211_sta_join_ibss(struct net_device *dev, } memset(&control, 0, sizeof(control)); - memset(&extra, 0, sizeof(extra)); - extra.mode = local->oper_hw_mode; - rate = rate_control_get_rate(local, dev, skb, &extra); - if (!rate) { + rate_control_get_rate(dev, local->oper_hw_mode, skb, &ratesel); + if (!ratesel.rate) { printk(KERN_DEBUG "%s: Failed to determine TX rate " "for IBSS beacon\n", dev->name); break; } + control.vif = &sdata->vif; control.tx_rate = - ((sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE) && - (rate->flags & IEEE80211_RATE_PREAMBLE2)) ? - rate->val2 : rate->val; + (sdata->bss_conf.use_short_preamble && + (ratesel.rate->flags & IEEE80211_RATE_PREAMBLE2)) ? + ratesel.rate->val2 : ratesel.rate->val; control.antenna_sel_tx = local->hw.conf.antenna_sel_tx; control.power_level = local->hw.conf.power_level; control.flags |= IEEE80211_TXCTL_NO_ACK; @@ -2552,7 +3038,7 @@ int ieee80211_sta_set_ssid(struct net_device *dev, char *ssid, size_t len) ifsta->flags |= IEEE80211_STA_SSID_SET; else ifsta->flags &= ~IEEE80211_STA_SSID_SET; - if (sdata->type == IEEE80211_IF_TYPE_IBSS && + if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && !(ifsta->flags & IEEE80211_STA_BSSID_SET)) { ifsta->ibss_join_req = jiffies; ifsta->state = IEEE80211_IBSS_SEARCH; @@ -2639,9 +3125,15 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw) union iwreq_data wrqu; local->last_scan_completed = jiffies; - wmb(); - local->sta_scanning = 0; + memset(&wrqu, 0, sizeof(wrqu)); + wireless_send_event(dev, SIOCGIWSCAN, &wrqu, NULL); + + if (local->sta_hw_scanning) { + local->sta_hw_scanning = 0; + goto done; + } + local->sta_sw_scanning = 0; if (ieee80211_hw_config(local)) printk(KERN_DEBUG "%s: failed to restore operational " "channel after scan\n", dev->name); @@ -2657,9 +3149,6 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw) netif_tx_unlock_bh(local->mdev); - memset(&wrqu, 0, sizeof(wrqu)); - wireless_send_event(dev, SIOCGIWSCAN, &wrqu, NULL); - rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { @@ -2667,7 +3156,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw) if (sdata->dev == local->mdev) continue; - if (sdata->type == IEEE80211_IF_TYPE_STA) { + if (sdata->vif.type == IEEE80211_IF_TYPE_STA) { if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) ieee80211_send_nullfunc(local, sdata, 0); ieee80211_sta_timer((unsigned long)sdata); @@ -2677,8 +3166,9 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw) } rcu_read_unlock(); +done: sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->type == IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { struct ieee80211_if_sta *ifsta = &sdata->u.sta; if (!(ifsta->flags & IEEE80211_STA_BSSID_SET) || (!ifsta->state == IEEE80211_IBSS_JOINED && @@ -2699,7 +3189,7 @@ void ieee80211_sta_scan_work(struct work_struct *work) int skip; unsigned long next_delay = 0; - if (!local->sta_scanning) + if (!local->sta_sw_scanning) return; switch (local->scan_state) { @@ -2713,7 +3203,7 @@ void ieee80211_sta_scan_work(struct work_struct *work) skip = !(local->enabled_modes & (1 << mode->mode)); chan = &mode->channels[local->scan_channel_idx]; if (!(chan->flag & IEEE80211_CHAN_W_SCAN) || - (sdata->type == IEEE80211_IF_TYPE_IBSS && + (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && !(chan->flag & IEEE80211_CHAN_W_IBSS)) || (local->hw_modes & local->enabled_modes & (1 << MODE_IEEE80211G) && mode->mode == MODE_IEEE80211B)) @@ -2762,7 +3252,7 @@ void ieee80211_sta_scan_work(struct work_struct *work) break; } - if (local->sta_scanning) + if (local->sta_sw_scanning) queue_delayed_work(local->hw.workqueue, &local->scan_work, next_delay); } @@ -2794,7 +3284,7 @@ static int ieee80211_sta_start_scan(struct net_device *dev, * ResultCode: SUCCESS, INVALID_PARAMETERS */ - if (local->sta_scanning) { + if (local->sta_sw_scanning || local->sta_hw_scanning) { if (local->scan_dev == dev) return 0; return -EBUSY; @@ -2802,15 +3292,15 @@ static int ieee80211_sta_start_scan(struct net_device *dev, if (local->ops->hw_scan) { int rc = local->ops->hw_scan(local_to_hw(local), - ssid, ssid_len); + ssid, ssid_len); if (!rc) { - local->sta_scanning = 1; + local->sta_hw_scanning = 1; local->scan_dev = dev; } return rc; } - local->sta_scanning = 1; + local->sta_sw_scanning = 1; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { @@ -2821,7 +3311,7 @@ static int ieee80211_sta_start_scan(struct net_device *dev, continue; netif_stop_queue(sdata->dev); - if (sdata->type == IEEE80211_IF_TYPE_STA && + if (sdata->vif.type == IEEE80211_IF_TYPE_STA && (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED)) ieee80211_send_nullfunc(local, sdata, 1); } @@ -2862,10 +3352,10 @@ int ieee80211_sta_req_scan(struct net_device *dev, u8 *ssid, size_t ssid_len) struct ieee80211_if_sta *ifsta = &sdata->u.sta; struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - if (sdata->type != IEEE80211_IF_TYPE_STA) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA) return ieee80211_sta_start_scan(dev, ssid, ssid_len); - if (local->sta_scanning) { + if (local->sta_sw_scanning || local->sta_hw_scanning) { if (local->scan_dev == dev) return 0; return -EBUSY; @@ -2894,15 +3384,6 @@ ieee80211_sta_scan_result(struct net_device *dev, if (!(local->enabled_modes & (1 << bss->hw_mode))) return current_ev; - if (local->scan_flags & IEEE80211_SCAN_WPA_ONLY && - !bss->wpa_ie && !bss->rsn_ie) - return current_ev; - - if (local->scan_flags & IEEE80211_SCAN_MATCH_SSID && - (local->scan_ssid_len != bss->ssid_len || - memcmp(local->scan_ssid, bss->ssid, bss->ssid_len) != 0)) - return current_ev; - memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWAP; iwe.u.ap_addr.sa_family = ARPHRD_ETHER; @@ -3006,34 +3487,6 @@ ieee80211_sta_scan_result(struct net_device *dev, } } - do { - char *buf; - - if (!(local->scan_flags & IEEE80211_SCAN_EXTRA_INFO)) - break; - - buf = kmalloc(100, GFP_ATOMIC); - if (!buf) - break; - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, "bcn_int=%d", bss->beacon_int); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(current_ev, end_buf, &iwe, - buf); - - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, "capab=0x%04x", bss->capability); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(current_ev, end_buf, &iwe, - buf); - - kfree(buf); - break; - } while (0); - return current_ev; } @@ -3122,8 +3575,8 @@ int ieee80211_sta_deauthenticate(struct net_device *dev, u16 reason) printk(KERN_DEBUG "%s: deauthenticate(reason=%d)\n", dev->name, reason); - if (sdata->type != IEEE80211_IF_TYPE_STA && - sdata->type != IEEE80211_IF_TYPE_IBSS) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA && + sdata->vif.type != IEEE80211_IF_TYPE_IBSS) return -EINVAL; ieee80211_send_deauth(dev, ifsta, reason); @@ -3140,7 +3593,7 @@ int ieee80211_sta_disassociate(struct net_device *dev, u16 reason) printk(KERN_DEBUG "%s: disassociate(reason=%d)\n", dev->name, reason); - if (sdata->type != IEEE80211_IF_TYPE_STA) + if (sdata->vif.type != IEEE80211_IF_TYPE_STA) return -EINVAL; if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED)) diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 0b2328f7d67..ed57fb8e82f 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -49,8 +49,8 @@ static const u8 *get_mac_for_key(struct ieee80211_key *key) * address to indicate a transmit-only key. */ if (key->conf.alg != ALG_WEP && - (key->sdata->type == IEEE80211_IF_TYPE_AP || - key->sdata->type == IEEE80211_IF_TYPE_VLAN)) + (key->sdata->vif.type == IEEE80211_IF_TYPE_AP || + key->sdata->vif.type == IEEE80211_IF_TYPE_VLAN)) addr = zero_addr; if (key->sta) @@ -172,7 +172,7 @@ struct ieee80211_key *ieee80211_key_alloc(struct ieee80211_sub_if_data *sdata, if (sta->flags & WLAN_STA_WME) key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA; } else { - if (sdata->type == IEEE80211_IF_TYPE_STA) { + if (sdata->vif.type == IEEE80211_IF_TYPE_STA) { struct sta_info *ap; /* same here, the AP could be using QoS */ diff --git a/net/mac80211/rc80211_pid.h b/net/mac80211/rc80211_pid.h new file mode 100644 index 00000000000..04afc13ed82 --- /dev/null +++ b/net/mac80211/rc80211_pid.h @@ -0,0 +1,285 @@ +/* + * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de> + * Copyright 2007, Stefano Brivio <stefano.brivio@polimi.it> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef RC80211_PID_H +#define RC80211_PID_H + +/* Sampling period for measuring percentage of failed frames in ms. */ +#define RC_PID_INTERVAL 125 + +/* Exponential averaging smoothness (used for I part of PID controller) */ +#define RC_PID_SMOOTHING_SHIFT 3 +#define RC_PID_SMOOTHING (1 << RC_PID_SMOOTHING_SHIFT) + +/* Sharpening factor (used for D part of PID controller) */ +#define RC_PID_SHARPENING_FACTOR 0 +#define RC_PID_SHARPENING_DURATION 0 + +/* Fixed point arithmetic shifting amount. */ +#define RC_PID_ARITH_SHIFT 8 + +/* Fixed point arithmetic factor. */ +#define RC_PID_ARITH_FACTOR (1 << RC_PID_ARITH_SHIFT) + +/* Proportional PID component coefficient. */ +#define RC_PID_COEFF_P 15 +/* Integral PID component coefficient. */ +#define RC_PID_COEFF_I 9 +/* Derivative PID component coefficient. */ +#define RC_PID_COEFF_D 15 + +/* Target failed frames rate for the PID controller. NB: This effectively gives + * maximum failed frames percentage we're willing to accept. If the wireless + * link quality is good, the controller will fail to adjust failed frames + * percentage to the target. This is intentional. + */ +#define RC_PID_TARGET_PF 14 + +/* Rate behaviour normalization quantity over time. */ +#define RC_PID_NORM_OFFSET 3 + +/* Push high rates right after loading. */ +#define RC_PID_FAST_START 0 + +/* Arithmetic right shift for positive and negative values for ISO C. */ +#define RC_PID_DO_ARITH_RIGHT_SHIFT(x, y) \ + (x) < 0 ? -((-(x)) >> (y)) : (x) >> (y) + +enum rc_pid_event_type { + RC_PID_EVENT_TYPE_TX_STATUS, + RC_PID_EVENT_TYPE_RATE_CHANGE, + RC_PID_EVENT_TYPE_TX_RATE, + RC_PID_EVENT_TYPE_PF_SAMPLE, +}; + +union rc_pid_event_data { + /* RC_PID_EVENT_TX_STATUS */ + struct { + struct ieee80211_tx_status tx_status; + }; + /* RC_PID_EVENT_TYPE_RATE_CHANGE */ + /* RC_PID_EVENT_TYPE_TX_RATE */ + struct { + int index; + int rate; + }; + /* RC_PID_EVENT_TYPE_PF_SAMPLE */ + struct { + s32 pf_sample; + s32 prop_err; + s32 int_err; + s32 der_err; + }; +}; + +struct rc_pid_event { + /* The time when the event occured */ + unsigned long timestamp; + + /* Event ID number */ + unsigned int id; + + /* Type of event */ + enum rc_pid_event_type type; + + /* type specific data */ + union rc_pid_event_data data; +}; + +/* Size of the event ring buffer. */ +#define RC_PID_EVENT_RING_SIZE 32 + +struct rc_pid_event_buffer { + /* Counter that generates event IDs */ + unsigned int ev_count; + + /* Ring buffer of events */ + struct rc_pid_event ring[RC_PID_EVENT_RING_SIZE]; + + /* Index to the entry in events_buf to be reused */ + unsigned int next_entry; + + /* Lock that guards against concurrent access to this buffer struct */ + spinlock_t lock; + + /* Wait queue for poll/select and blocking I/O */ + wait_queue_head_t waitqueue; +}; + +struct rc_pid_events_file_info { + /* The event buffer we read */ + struct rc_pid_event_buffer *events; + + /* The entry we have should read next */ + unsigned int next_entry; +}; + +/** + * struct rc_pid_debugfs_entries - tunable parameters + * + * Algorithm parameters, tunable via debugfs. + * @dir: the debugfs directory for a specific phy + * @target: target percentage for failed frames + * @sampling_period: error sampling interval in milliseconds + * @coeff_p: absolute value of the proportional coefficient + * @coeff_i: absolute value of the integral coefficient + * @coeff_d: absolute value of the derivative coefficient + * @smoothing_shift: absolute value of the integral smoothing factor (i.e. + * amount of smoothing introduced by the exponential moving average) + * @sharpen_factor: absolute value of the derivative sharpening factor (i.e. + * amount of emphasis given to the derivative term after low activity + * events) + * @sharpen_duration: duration of the sharpening effect after the detected low + * activity event, relative to sampling_period + * @norm_offset: amount of normalization periodically performed on the learnt + * rate behaviour values (lower means we should trust more what we learnt + * about behaviour of rates, higher means we should trust more the natural + * ordering of rates) + * @fast_start: if Y, push high rates right after initialization + */ +struct rc_pid_debugfs_entries { + struct dentry *dir; + struct dentry *target; + struct dentry *sampling_period; + struct dentry *coeff_p; + struct dentry *coeff_i; + struct dentry *coeff_d; + struct dentry *smoothing_shift; + struct dentry *sharpen_factor; + struct dentry *sharpen_duration; + struct dentry *norm_offset; + struct dentry *fast_start; +}; + +void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf, + struct ieee80211_tx_status *stat); + +void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf, + int index, int rate); + +void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf, + int index, int rate); + +void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf, + s32 pf_sample, s32 prop_err, + s32 int_err, s32 der_err); + +void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, + struct dentry *dir); + +void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta); + +struct rc_pid_sta_info { + unsigned long last_change; + unsigned long last_sample; + + u32 tx_num_failed; + u32 tx_num_xmit; + + /* Average failed frames percentage error (i.e. actual vs. target + * percentage), scaled by RC_PID_SMOOTHING. This value is computed + * using using an exponential weighted average technique: + * + * (RC_PID_SMOOTHING - 1) * err_avg_old + err + * err_avg = ------------------------------------------ + * RC_PID_SMOOTHING + * + * where err_avg is the new approximation, err_avg_old the previous one + * and err is the error w.r.t. to the current failed frames percentage + * sample. Note that the bigger RC_PID_SMOOTHING the more weight is + * given to the previous estimate, resulting in smoother behavior (i.e. + * corresponding to a longer integration window). + * + * For computation, we actually don't use the above formula, but this + * one: + * + * err_avg_scaled = err_avg_old_scaled - err_avg_old + err + * + * where: + * err_avg_scaled = err * RC_PID_SMOOTHING + * err_avg_old_scaled = err_avg_old * RC_PID_SMOOTHING + * + * This avoids floating point numbers and the per_failed_old value can + * easily be obtained by shifting per_failed_old_scaled right by + * RC_PID_SMOOTHING_SHIFT. + */ + s32 err_avg_sc; + + /* Last framed failes percentage sample. */ + u32 last_pf; + + /* Sharpening needed. */ + u8 sharp_cnt; + +#ifdef CONFIG_MAC80211_DEBUGFS + /* Event buffer */ + struct rc_pid_event_buffer events; + + /* Events debugfs file entry */ + struct dentry *events_entry; +#endif +}; + +/* Algorithm parameters. We keep them on a per-algorithm approach, so they can + * be tuned individually for each interface. + */ +struct rc_pid_rateinfo { + + /* Map sorted rates to rates in ieee80211_hw_mode. */ + int index; + + /* Map rates in ieee80211_hw_mode to sorted rates. */ + int rev_index; + + /* Did we do any measurement on this rate? */ + bool valid; + + /* Comparison with the lowest rate. */ + int diff; +}; + +struct rc_pid_info { + + /* The failed frames percentage target. */ + unsigned int target; + + /* Rate at which failed frames percentage is sampled in 0.001s. */ + unsigned int sampling_period; + + /* P, I and D coefficients. */ + int coeff_p; + int coeff_i; + int coeff_d; + + /* Exponential averaging shift. */ + unsigned int smoothing_shift; + + /* Sharpening factor and duration. */ + unsigned int sharpen_factor; + unsigned int sharpen_duration; + + /* Normalization offset. */ + unsigned int norm_offset; + + /* Fast starst parameter. */ + unsigned int fast_start; + + /* Rates information. */ + struct rc_pid_rateinfo *rinfo; + + /* Index of the last used rate. */ + int oldrate; + +#ifdef CONFIG_MAC80211_DEBUGFS + /* Debugfs entries created for the parameters above. */ + struct rc_pid_debugfs_entries dentries; +#endif +}; + +#endif /* RC80211_PID_H */ diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c new file mode 100644 index 00000000000..554c4baed6f --- /dev/null +++ b/net/mac80211/rc80211_pid_algo.c @@ -0,0 +1,549 @@ +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005, Devicescape Software, Inc. + * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de> + * Copyright 2007, Stefano Brivio <stefano.brivio@polimi.it> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/netdevice.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/debugfs.h> +#include <net/mac80211.h> +#include "ieee80211_rate.h" + +#include "rc80211_pid.h" + + +/* This is an implementation of a TX rate control algorithm that uses a PID + * controller. Given a target failed frames rate, the controller decides about + * TX rate changes to meet the target failed frames rate. + * + * The controller basically computes the following: + * + * adj = CP * err + CI * err_avg + CD * (err - last_err) * (1 + sharpening) + * + * where + * adj adjustment value that is used to switch TX rate (see below) + * err current error: target vs. current failed frames percentage + * last_err last error + * err_avg average (i.e. poor man's integral) of recent errors + * sharpening non-zero when fast response is needed (i.e. right after + * association or no frames sent for a long time), heading + * to zero over time + * CP Proportional coefficient + * CI Integral coefficient + * CD Derivative coefficient + * + * CP, CI, CD are subject to careful tuning. + * + * The integral component uses a exponential moving average approach instead of + * an actual sliding window. The advantage is that we don't need to keep an + * array of the last N error values and computation is easier. + * + * Once we have the adj value, we map it to a rate by means of a learning + * algorithm. This algorithm keeps the state of the percentual failed frames + * difference between rates. The behaviour of the lowest available rate is kept + * as a reference value, and every time we switch between two rates, we compute + * the difference between the failed frames each rate exhibited. By doing so, + * we compare behaviours which different rates exhibited in adjacent timeslices, + * thus the comparison is minimally affected by external conditions. This + * difference gets propagated to the whole set of measurements, so that the + * reference is always the same. Periodically, we normalize this set so that + * recent events weigh the most. By comparing the adj value with this set, we + * avoid pejorative switches to lower rates and allow for switches to higher + * rates if they behaved well. + * + * Note that for the computations we use a fixed-point representation to avoid + * floating point arithmetic. Hence, all values are shifted left by + * RC_PID_ARITH_SHIFT. + */ + + +/* Shift the adjustment so that we won't switch to a lower rate if it exhibited + * a worse failed frames behaviour and we'll choose the highest rate whose + * failed frames behaviour is not worse than the one of the original rate + * target. While at it, check that the adjustment is within the ranges. Then, + * provide the new rate index. */ +static int rate_control_pid_shift_adjust(struct rc_pid_rateinfo *r, + int adj, int cur, int l) +{ + int i, j, k, tmp; + + j = r[cur].rev_index; + i = j + adj; + + if (i < 0) + return r[0].index; + if (i >= l - 1) + return r[l - 1].index; + + tmp = i; + + if (adj < 0) { + for (k = j; k >= i; k--) + if (r[k].diff <= r[j].diff) + tmp = k; + } else { + for (k = i + 1; k + i < l; k++) + if (r[k].diff <= r[i].diff) + tmp = k; + } + + return r[tmp].index; +} + +static void rate_control_pid_adjust_rate(struct ieee80211_local *local, + struct sta_info *sta, int adj, + struct rc_pid_rateinfo *rinfo) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_hw_mode *mode; + int newidx; + int maxrate; + int back = (adj > 0) ? 1 : -1; + + sdata = IEEE80211_DEV_TO_SUB_IF(sta->dev); + + mode = local->oper_hw_mode; + maxrate = sdata->bss ? sdata->bss->max_ratectrl_rateidx : -1; + + newidx = rate_control_pid_shift_adjust(rinfo, adj, sta->txrate, + mode->num_rates); + + while (newidx != sta->txrate) { + if (rate_supported(sta, mode, newidx) && + (maxrate < 0 || newidx <= maxrate)) { + sta->txrate = newidx; + break; + } + + newidx += back; + } + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_rate_change( + &((struct rc_pid_sta_info *)sta->rate_ctrl_priv)->events, + newidx, mode->rates[newidx].rate); +#endif +} + +/* Normalize the failed frames per-rate differences. */ +static void rate_control_pid_normalize(struct rc_pid_info *pinfo, int l) +{ + int i, norm_offset = pinfo->norm_offset; + struct rc_pid_rateinfo *r = pinfo->rinfo; + + if (r[0].diff > norm_offset) + r[0].diff -= norm_offset; + else if (r[0].diff < -norm_offset) + r[0].diff += norm_offset; + for (i = 0; i < l - 1; i++) + if (r[i + 1].diff > r[i].diff + norm_offset) + r[i + 1].diff -= norm_offset; + else if (r[i + 1].diff <= r[i].diff) + r[i + 1].diff += norm_offset; +} + +static void rate_control_pid_sample(struct rc_pid_info *pinfo, + struct ieee80211_local *local, + struct sta_info *sta) +{ + struct rc_pid_sta_info *spinfo = sta->rate_ctrl_priv; + struct rc_pid_rateinfo *rinfo = pinfo->rinfo; + struct ieee80211_hw_mode *mode; + u32 pf; + s32 err_avg; + u32 err_prop; + u32 err_int; + u32 err_der; + int adj, i, j, tmp; + unsigned long period; + + mode = local->oper_hw_mode; + spinfo = sta->rate_ctrl_priv; + + /* In case nothing happened during the previous control interval, turn + * the sharpening factor on. */ + period = (HZ * pinfo->sampling_period + 500) / 1000; + if (!period) + period = 1; + if (jiffies - spinfo->last_sample > 2 * period) + spinfo->sharp_cnt = pinfo->sharpen_duration; + + spinfo->last_sample = jiffies; + + /* This should never happen, but in case, we assume the old sample is + * still a good measurement and copy it. */ + if (unlikely(spinfo->tx_num_xmit == 0)) + pf = spinfo->last_pf; + else { + pf = spinfo->tx_num_failed * 100 / spinfo->tx_num_xmit; + pf <<= RC_PID_ARITH_SHIFT; + } + + spinfo->tx_num_xmit = 0; + spinfo->tx_num_failed = 0; + + /* If we just switched rate, update the rate behaviour info. */ + if (pinfo->oldrate != sta->txrate) { + + i = rinfo[pinfo->oldrate].rev_index; + j = rinfo[sta->txrate].rev_index; + + tmp = (pf - spinfo->last_pf); + tmp = RC_PID_DO_ARITH_RIGHT_SHIFT(tmp, RC_PID_ARITH_SHIFT); + + rinfo[j].diff = rinfo[i].diff + tmp; + pinfo->oldrate = sta->txrate; + } + rate_control_pid_normalize(pinfo, mode->num_rates); + + /* Compute the proportional, integral and derivative errors. */ + err_prop = (pinfo->target << RC_PID_ARITH_SHIFT) - pf; + + err_avg = spinfo->err_avg_sc >> pinfo->smoothing_shift; + spinfo->err_avg_sc = spinfo->err_avg_sc - err_avg + err_prop; + err_int = spinfo->err_avg_sc >> pinfo->smoothing_shift; + + err_der = (pf - spinfo->last_pf) * + (1 + pinfo->sharpen_factor * spinfo->sharp_cnt); + spinfo->last_pf = pf; + if (spinfo->sharp_cnt) + spinfo->sharp_cnt--; + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_pf_sample(&spinfo->events, pf, err_prop, err_int, + err_der); +#endif + + /* Compute the controller output. */ + adj = (err_prop * pinfo->coeff_p + err_int * pinfo->coeff_i + + err_der * pinfo->coeff_d); + adj = RC_PID_DO_ARITH_RIGHT_SHIFT(adj, 2 * RC_PID_ARITH_SHIFT); + + /* Change rate. */ + if (adj) + rate_control_pid_adjust_rate(local, sta, adj, rinfo); +} + +static void rate_control_pid_tx_status(void *priv, struct net_device *dev, + struct sk_buff *skb, + struct ieee80211_tx_status *status) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_sub_if_data *sdata; + struct rc_pid_info *pinfo = priv; + struct sta_info *sta; + struct rc_pid_sta_info *spinfo; + unsigned long period; + + sta = sta_info_get(local, hdr->addr1); + + if (!sta) + return; + + /* Don't update the state if we're not controlling the rate. */ + sdata = IEEE80211_DEV_TO_SUB_IF(sta->dev); + if (sdata->bss && sdata->bss->force_unicast_rateidx > -1) { + sta->txrate = sdata->bss->max_ratectrl_rateidx; + return; + } + + /* Ignore all frames that were sent with a different rate than the rate + * we currently advise mac80211 to use. */ + if (status->control.rate != &local->oper_hw_mode->rates[sta->txrate]) + goto ignore; + + spinfo = sta->rate_ctrl_priv; + spinfo->tx_num_xmit++; + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_tx_status(&spinfo->events, status); +#endif + + /* We count frames that totally failed to be transmitted as two bad + * frames, those that made it out but had some retries as one good and + * one bad frame. */ + if (status->excessive_retries) { + spinfo->tx_num_failed += 2; + spinfo->tx_num_xmit++; + } else if (status->retry_count) { + spinfo->tx_num_failed++; + spinfo->tx_num_xmit++; + } + + if (status->excessive_retries) { + sta->tx_retry_failed++; + sta->tx_num_consecutive_failures++; + sta->tx_num_mpdu_fail++; + } else { + sta->last_ack_rssi[0] = sta->last_ack_rssi[1]; + sta->last_ack_rssi[1] = sta->last_ack_rssi[2]; + sta->last_ack_rssi[2] = status->ack_signal; + sta->tx_num_consecutive_failures = 0; + sta->tx_num_mpdu_ok++; + } + sta->tx_retry_count += status->retry_count; + sta->tx_num_mpdu_fail += status->retry_count; + + /* Update PID controller state. */ + period = (HZ * pinfo->sampling_period + 500) / 1000; + if (!period) + period = 1; + if (time_after(jiffies, spinfo->last_sample + period)) + rate_control_pid_sample(pinfo, local, sta); + +ignore: + sta_info_put(sta); +} + +static void rate_control_pid_get_rate(void *priv, struct net_device *dev, + struct ieee80211_hw_mode *mode, + struct sk_buff *skb, + struct rate_selection *sel) +{ + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_sub_if_data *sdata; + struct sta_info *sta; + int rateidx; + u16 fc; + + sta = sta_info_get(local, hdr->addr1); + + /* Send management frames and broadcast/multicast data using lowest + * rate. */ + fc = le16_to_cpu(hdr->frame_control); + if ((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA || + is_multicast_ether_addr(hdr->addr1) || !sta) { + sel->rate = rate_lowest(local, mode, sta); + if (sta) + sta_info_put(sta); + return; + } + + /* If a forced rate is in effect, select it. */ + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + if (sdata->bss && sdata->bss->force_unicast_rateidx > -1) + sta->txrate = sdata->bss->force_unicast_rateidx; + + rateidx = sta->txrate; + + if (rateidx >= mode->num_rates) + rateidx = mode->num_rates - 1; + + sta->last_txrate = rateidx; + + sta_info_put(sta); + + sel->rate = &mode->rates[rateidx]; + +#ifdef CONFIG_MAC80211_DEBUGFS + rate_control_pid_event_tx_rate( + &((struct rc_pid_sta_info *) sta->rate_ctrl_priv)->events, + rateidx, mode->rates[rateidx].rate); +#endif +} + +static void rate_control_pid_rate_init(void *priv, void *priv_sta, + struct ieee80211_local *local, + struct sta_info *sta) +{ + /* TODO: This routine should consider using RSSI from previous packets + * as we need to have IEEE 802.1X auth succeed immediately after assoc.. + * Until that method is implemented, we will use the lowest supported + * rate as a workaround. */ + sta->txrate = rate_lowest_index(local, local->oper_hw_mode, sta); +} + +static void *rate_control_pid_alloc(struct ieee80211_local *local) +{ + struct rc_pid_info *pinfo; + struct rc_pid_rateinfo *rinfo; + struct ieee80211_hw_mode *mode; + int i, j, tmp; + bool s; +#ifdef CONFIG_MAC80211_DEBUGFS + struct rc_pid_debugfs_entries *de; +#endif + + pinfo = kmalloc(sizeof(*pinfo), GFP_ATOMIC); + if (!pinfo) + return NULL; + + /* We can safely assume that oper_hw_mode won't change unless we get + * reinitialized. */ + mode = local->oper_hw_mode; + rinfo = kmalloc(sizeof(*rinfo) * mode->num_rates, GFP_ATOMIC); + if (!rinfo) { + kfree(pinfo); + return NULL; + } + + /* Sort the rates. This is optimized for the most common case (i.e. + * almost-sorted CCK+OFDM rates). Kind of bubble-sort with reversed + * mapping too. */ + for (i = 0; i < mode->num_rates; i++) { + rinfo[i].index = i; + rinfo[i].rev_index = i; + if (pinfo->fast_start) + rinfo[i].diff = 0; + else + rinfo[i].diff = i * pinfo->norm_offset; + } + for (i = 1; i < mode->num_rates; i++) { + s = 0; + for (j = 0; j < mode->num_rates - i; j++) + if (unlikely(mode->rates[rinfo[j].index].rate > + mode->rates[rinfo[j + 1].index].rate)) { + tmp = rinfo[j].index; + rinfo[j].index = rinfo[j + 1].index; + rinfo[j + 1].index = tmp; + rinfo[rinfo[j].index].rev_index = j; + rinfo[rinfo[j + 1].index].rev_index = j + 1; + s = 1; + } + if (!s) + break; + } + + pinfo->target = RC_PID_TARGET_PF; + pinfo->sampling_period = RC_PID_INTERVAL; + pinfo->coeff_p = RC_PID_COEFF_P; + pinfo->coeff_i = RC_PID_COEFF_I; + pinfo->coeff_d = RC_PID_COEFF_D; + pinfo->smoothing_shift = RC_PID_SMOOTHING_SHIFT; + pinfo->sharpen_factor = RC_PID_SHARPENING_FACTOR; + pinfo->sharpen_duration = RC_PID_SHARPENING_DURATION; + pinfo->norm_offset = RC_PID_NORM_OFFSET; + pinfo->fast_start = RC_PID_FAST_START; + pinfo->rinfo = rinfo; + pinfo->oldrate = 0; + +#ifdef CONFIG_MAC80211_DEBUGFS + de = &pinfo->dentries; + de->dir = debugfs_create_dir("rc80211_pid", + local->hw.wiphy->debugfsdir); + de->target = debugfs_create_u32("target_pf", S_IRUSR | S_IWUSR, + de->dir, &pinfo->target); + de->sampling_period = debugfs_create_u32("sampling_period", + S_IRUSR | S_IWUSR, de->dir, + &pinfo->sampling_period); + de->coeff_p = debugfs_create_u32("coeff_p", S_IRUSR | S_IWUSR, + de->dir, &pinfo->coeff_p); + de->coeff_i = debugfs_create_u32("coeff_i", S_IRUSR | S_IWUSR, + de->dir, &pinfo->coeff_i); + de->coeff_d = debugfs_create_u32("coeff_d", S_IRUSR | S_IWUSR, + de->dir, &pinfo->coeff_d); + de->smoothing_shift = debugfs_create_u32("smoothing_shift", + S_IRUSR | S_IWUSR, de->dir, + &pinfo->smoothing_shift); + de->sharpen_factor = debugfs_create_u32("sharpen_factor", + S_IRUSR | S_IWUSR, de->dir, + &pinfo->sharpen_factor); + de->sharpen_duration = debugfs_create_u32("sharpen_duration", + S_IRUSR | S_IWUSR, de->dir, + &pinfo->sharpen_duration); + de->norm_offset = debugfs_create_u32("norm_offset", + S_IRUSR | S_IWUSR, de->dir, + &pinfo->norm_offset); + de->fast_start = debugfs_create_bool("fast_start", + S_IRUSR | S_IWUSR, de->dir, + &pinfo->fast_start); +#endif + + return pinfo; +} + +static void rate_control_pid_free(void *priv) +{ + struct rc_pid_info *pinfo = priv; +#ifdef CONFIG_MAC80211_DEBUGFS + struct rc_pid_debugfs_entries *de = &pinfo->dentries; + + debugfs_remove(de->fast_start); + debugfs_remove(de->norm_offset); + debugfs_remove(de->sharpen_duration); + debugfs_remove(de->sharpen_factor); + debugfs_remove(de->smoothing_shift); + debugfs_remove(de->coeff_d); + debugfs_remove(de->coeff_i); + debugfs_remove(de->coeff_p); + debugfs_remove(de->sampling_period); + debugfs_remove(de->target); + debugfs_remove(de->dir); +#endif + + kfree(pinfo->rinfo); + kfree(pinfo); +} + +static void rate_control_pid_clear(void *priv) +{ +} + +static void *rate_control_pid_alloc_sta(void *priv, gfp_t gfp) +{ + struct rc_pid_sta_info *spinfo; + + spinfo = kzalloc(sizeof(*spinfo), gfp); + if (spinfo == NULL) + return NULL; + + spinfo->last_sample = jiffies; + +#ifdef CONFIG_MAC80211_DEBUGFS + spin_lock_init(&spinfo->events.lock); + init_waitqueue_head(&spinfo->events.waitqueue); +#endif + + return spinfo; +} + +static void rate_control_pid_free_sta(void *priv, void *priv_sta) +{ + struct rc_pid_sta_info *spinfo = priv_sta; + kfree(spinfo); +} + +static struct rate_control_ops mac80211_rcpid = { + .name = "pid", + .tx_status = rate_control_pid_tx_status, + .get_rate = rate_control_pid_get_rate, + .rate_init = rate_control_pid_rate_init, + .clear = rate_control_pid_clear, + .alloc = rate_control_pid_alloc, + .free = rate_control_pid_free, + .alloc_sta = rate_control_pid_alloc_sta, + .free_sta = rate_control_pid_free_sta, +#ifdef CONFIG_MAC80211_DEBUGFS + .add_sta_debugfs = rate_control_pid_add_sta_debugfs, + .remove_sta_debugfs = rate_control_pid_remove_sta_debugfs, +#endif +}; + +MODULE_DESCRIPTION("PID controller based rate control algorithm"); +MODULE_AUTHOR("Stefano Brivio"); +MODULE_AUTHOR("Mattias Nissler"); +MODULE_LICENSE("GPL"); + +int __init rc80211_pid_init(void) +{ + return ieee80211_rate_control_register(&mac80211_rcpid); +} + +void __exit rc80211_pid_exit(void) +{ + ieee80211_rate_control_unregister(&mac80211_rcpid); +} + +#ifdef CONFIG_MAC80211_RC_PID_MODULE +module_init(rc80211_pid_init); +module_exit(rc80211_pid_exit); +#endif diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c new file mode 100644 index 00000000000..88b8dc9999b --- /dev/null +++ b/net/mac80211/rc80211_pid_debugfs.c @@ -0,0 +1,223 @@ +/* + * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/spinlock.h> +#include <linux/poll.h> +#include <linux/netdevice.h> +#include <linux/types.h> +#include <linux/skbuff.h> + +#include <net/mac80211.h> +#include "ieee80211_rate.h" + +#include "rc80211_pid.h" + +static void rate_control_pid_event(struct rc_pid_event_buffer *buf, + enum rc_pid_event_type type, + union rc_pid_event_data *data) +{ + struct rc_pid_event *ev; + unsigned long status; + + spin_lock_irqsave(&buf->lock, status); + ev = &(buf->ring[buf->next_entry]); + buf->next_entry = (buf->next_entry + 1) % RC_PID_EVENT_RING_SIZE; + + ev->timestamp = jiffies; + ev->id = buf->ev_count++; + ev->type = type; + ev->data = *data; + + spin_unlock_irqrestore(&buf->lock, status); + + wake_up_all(&buf->waitqueue); +} + +void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf, + struct ieee80211_tx_status *stat) +{ + union rc_pid_event_data evd; + + memcpy(&evd.tx_status, stat, sizeof(struct ieee80211_tx_status)); + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_STATUS, &evd); +} + +void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf, + int index, int rate) +{ + union rc_pid_event_data evd; + + evd.index = index; + evd.rate = rate; + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_RATE_CHANGE, &evd); +} + +void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf, + int index, int rate) +{ + union rc_pid_event_data evd; + + evd.index = index; + evd.rate = rate; + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_RATE, &evd); +} + +void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf, + s32 pf_sample, s32 prop_err, + s32 int_err, s32 der_err) +{ + union rc_pid_event_data evd; + + evd.pf_sample = pf_sample; + evd.prop_err = prop_err; + evd.int_err = int_err; + evd.der_err = der_err; + rate_control_pid_event(buf, RC_PID_EVENT_TYPE_PF_SAMPLE, &evd); +} + +static int rate_control_pid_events_open(struct inode *inode, struct file *file) +{ + struct rc_pid_sta_info *sinfo = inode->i_private; + struct rc_pid_event_buffer *events = &sinfo->events; + struct rc_pid_events_file_info *file_info; + unsigned int status; + + /* Allocate a state struct */ + file_info = kmalloc(sizeof(*file_info), GFP_KERNEL); + if (file_info == NULL) + return -ENOMEM; + + spin_lock_irqsave(&events->lock, status); + + file_info->next_entry = events->next_entry; + file_info->events = events; + + spin_unlock_irqrestore(&events->lock, status); + + file->private_data = file_info; + + return 0; +} + +static int rate_control_pid_events_release(struct inode *inode, + struct file *file) +{ + struct rc_pid_events_file_info *file_info = file->private_data; + + kfree(file_info); + + return 0; +} + +static unsigned int rate_control_pid_events_poll(struct file *file, + poll_table *wait) +{ + struct rc_pid_events_file_info *file_info = file->private_data; + + poll_wait(file, &file_info->events->waitqueue, wait); + + return POLLIN | POLLRDNORM; +} + +#define RC_PID_PRINT_BUF_SIZE 64 + +static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf, + size_t length, loff_t *offset) +{ + struct rc_pid_events_file_info *file_info = file->private_data; + struct rc_pid_event_buffer *events = file_info->events; + struct rc_pid_event *ev; + char pb[RC_PID_PRINT_BUF_SIZE]; + int ret; + int p; + unsigned int status; + + /* Check if there is something to read. */ + if (events->next_entry == file_info->next_entry) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + /* Wait */ + ret = wait_event_interruptible(events->waitqueue, + events->next_entry != file_info->next_entry); + + if (ret) + return ret; + } + + /* Write out one event per call. I don't care whether it's a little + * inefficient, this is debugging code anyway. */ + spin_lock_irqsave(&events->lock, status); + + /* Get an event */ + ev = &(events->ring[file_info->next_entry]); + file_info->next_entry = (file_info->next_entry + 1) % + RC_PID_EVENT_RING_SIZE; + + /* Print information about the event. Note that userpace needs to + * provide large enough buffers. */ + length = length < RC_PID_PRINT_BUF_SIZE ? + length : RC_PID_PRINT_BUF_SIZE; + p = snprintf(pb, length, "%u %lu ", ev->id, ev->timestamp); + switch (ev->type) { + case RC_PID_EVENT_TYPE_TX_STATUS: + p += snprintf(pb + p, length - p, "tx_status %u %u", + ev->data.tx_status.excessive_retries, + ev->data.tx_status.retry_count); + break; + case RC_PID_EVENT_TYPE_RATE_CHANGE: + p += snprintf(pb + p, length - p, "rate_change %d %d", + ev->data.index, ev->data.rate); + break; + case RC_PID_EVENT_TYPE_TX_RATE: + p += snprintf(pb + p, length - p, "tx_rate %d %d", + ev->data.index, ev->data.rate); + break; + case RC_PID_EVENT_TYPE_PF_SAMPLE: + p += snprintf(pb + p, length - p, + "pf_sample %d %d %d %d", + ev->data.pf_sample, ev->data.prop_err, + ev->data.int_err, ev->data.der_err); + break; + } + p += snprintf(pb + p, length - p, "\n"); + + spin_unlock_irqrestore(&events->lock, status); + + if (copy_to_user(buf, pb, p)) + return -EFAULT; + + return p; +} + +#undef RC_PID_PRINT_BUF_SIZE + +static struct file_operations rc_pid_fop_events = { + .owner = THIS_MODULE, + .read = rate_control_pid_events_read, + .poll = rate_control_pid_events_poll, + .open = rate_control_pid_events_open, + .release = rate_control_pid_events_release, +}; + +void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, + struct dentry *dir) +{ + struct rc_pid_sta_info *spinfo = priv_sta; + + spinfo->events_entry = debugfs_create_file("rc_pid_events", S_IRUGO, + dir, spinfo, + &rc_pid_fop_events); +} + +void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta) +{ + struct rc_pid_sta_info *spinfo = priv_sta; + + debugfs_remove(spinfo->events_entry); +} diff --git a/net/mac80211/rc80211_simple.c b/net/mac80211/rc80211_simple.c index da72737364e..934676d687d 100644 --- a/net/mac80211/rc80211_simple.c +++ b/net/mac80211/rc80211_simple.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/compiler.h> +#include <linux/module.h> #include <net/mac80211.h> #include "ieee80211_i.h" @@ -23,6 +24,8 @@ /* This is a minimal implementation of TX rate controlling that can be used * as the default when no improved mechanisms are available. */ +#define RATE_CONTROL_NUM_DOWN 20 +#define RATE_CONTROL_NUM_UP 15 #define RATE_CONTROL_EMERG_DEC 2 #define RATE_CONTROL_INTERVAL (HZ / 20) @@ -87,26 +90,6 @@ static void rate_control_rate_dec(struct ieee80211_local *local, } } - -static struct ieee80211_rate * -rate_control_lowest_rate(struct ieee80211_local *local, - struct ieee80211_hw_mode *mode) -{ - int i; - - for (i = 0; i < mode->num_rates; i++) { - struct ieee80211_rate *rate = &mode->rates[i]; - - if (rate->flags & IEEE80211_RATE_SUPPORTED) - return rate; - } - - printk(KERN_DEBUG "rate_control_lowest_rate - no supported rates " - "found\n"); - return &mode->rates[0]; -} - - struct global_rate_control { int dummy; }; @@ -216,35 +199,33 @@ static void rate_control_simple_tx_status(void *priv, struct net_device *dev, } -static struct ieee80211_rate * +static void rate_control_simple_get_rate(void *priv, struct net_device *dev, + struct ieee80211_hw_mode *mode, struct sk_buff *skb, - struct rate_control_extra *extra) + struct rate_selection *sel) { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); - struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_hw_mode *mode = extra->mode; + struct ieee80211_sub_if_data *sdata; struct sta_info *sta; - int rateidx, nonerp_idx; + int rateidx; u16 fc; - memset(extra, 0, sizeof(*extra)); + sta = sta_info_get(local, hdr->addr1); + /* Send management frames and broadcast/multicast data using lowest + * rate. */ fc = le16_to_cpu(hdr->frame_control); if ((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA || - (hdr->addr1[0] & 0x01)) { - /* Send management frames and broadcast/multicast data using - * lowest rate. */ - /* TODO: this could probably be improved.. */ - return rate_control_lowest_rate(local, mode); + is_multicast_ether_addr(hdr->addr1) || !sta) { + sel->rate = rate_lowest(local, mode, sta); + if (sta) + sta_info_put(sta); + return; } - sta = sta_info_get(local, hdr->addr1); - - if (!sta) - return rate_control_lowest_rate(local, mode); - + /* If a forced rate is in effect, select it. */ sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (sdata->bss && sdata->bss->force_unicast_rateidx > -1) sta->txrate = sdata->bss->force_unicast_rateidx; @@ -255,17 +236,10 @@ rate_control_simple_get_rate(void *priv, struct net_device *dev, rateidx = mode->num_rates - 1; sta->last_txrate = rateidx; - nonerp_idx = rateidx; - while (nonerp_idx > 0 && - ((mode->rates[nonerp_idx].flags & IEEE80211_RATE_ERP) || - !(mode->rates[nonerp_idx].flags & IEEE80211_RATE_SUPPORTED) || - !(sta->supp_rates & BIT(nonerp_idx)))) - nonerp_idx--; - extra->nonerp = &mode->rates[nonerp_idx]; sta_info_put(sta); - return &mode->rates[rateidx]; + sel->rate = &mode->rates[rateidx]; } @@ -391,7 +365,7 @@ static void rate_control_simple_remove_sta_debugfs(void *priv, void *priv_sta) } #endif -struct rate_control_ops mac80211_rcsimple = { +static struct rate_control_ops mac80211_rcsimple = { .name = "simple", .tx_status = rate_control_simple_tx_status, .get_rate = rate_control_simple_get_rate, @@ -406,3 +380,21 @@ struct rate_control_ops mac80211_rcsimple = { .remove_sta_debugfs = rate_control_simple_remove_sta_debugfs, #endif }; + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Simple rate control algorithm"); + +int __init rc80211_simple_init(void) +{ + return ieee80211_rate_control_register(&mac80211_rcsimple); +} + +void __exit rc80211_simple_exit(void) +{ + ieee80211_rate_control_unregister(&mac80211_rcsimple); +} + +#ifdef CONFIG_MAC80211_RC_SIMPLE_MODULE +module_init(rc80211_simple_init); +module_exit(rc80211_simple_exit); +#endif diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 00f908d9275..89e1e3070ec 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -24,6 +24,10 @@ #include "tkip.h" #include "wme.h" +u8 ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx, + struct sk_buff *skb, u16 mpdu_seq_num, + int bar_req); /* * monitor mode reception * @@ -61,8 +65,12 @@ static inline int should_drop_frame(struct ieee80211_rx_status *status, return 1; if (unlikely(skb->len < 16 + present_fcs_len + radiotap_len)) return 1; - if ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FTYPE)) == - cpu_to_le16(IEEE80211_FTYPE_CTL)) + if (((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FTYPE)) == + cpu_to_le16(IEEE80211_FTYPE_CTL)) && + ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE)) != + cpu_to_le16(IEEE80211_STYPE_PSPOLL)) && + ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE)) != + cpu_to_le16(IEEE80211_STYPE_BACK_REQ))) return 1; return 0; } @@ -79,8 +87,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, struct ieee80211_sub_if_data *sdata; struct ieee80211_rate *rate; int needed_headroom = 0; - struct ieee80211_rtap_hdr { - struct ieee80211_radiotap_header hdr; + struct ieee80211_radiotap_header *rthdr; + __le64 *rttsft = NULL; + struct ieee80211_rtap_fixed_data { u8 flags; u8 rate; __le16 chan_freq; @@ -88,7 +97,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, u8 antsignal; u8 padding_for_rxflags; __le16 rx_flags; - } __attribute__ ((packed)) *rthdr; + } __attribute__ ((packed)) *rtfixed; struct sk_buff *skb, *skb2; struct net_device *prev_dev = NULL; int present_fcs_len = 0; @@ -105,7 +114,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (status->flag & RX_FLAG_RADIOTAP) rtap_len = ieee80211_get_radiotap_len(origskb->data); else - needed_headroom = sizeof(*rthdr); + /* room for radiotap header, always present fields and TSFT */ + needed_headroom = sizeof(*rthdr) + sizeof(*rtfixed) + 8; if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) present_fcs_len = FCS_LEN; @@ -133,7 +143,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, * them allocate enough headroom to start with. */ if (skb_headroom(skb) < needed_headroom && - pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC)) { + pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) { dev_kfree_skb(skb); return NULL; } @@ -152,45 +162,59 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, /* if necessary, prepend radiotap information */ if (!(status->flag & RX_FLAG_RADIOTAP)) { + rtfixed = (void *) skb_push(skb, sizeof(*rtfixed)); + rtap_len = sizeof(*rthdr) + sizeof(*rtfixed); + if (status->flag & RX_FLAG_TSFT) { + rttsft = (void *) skb_push(skb, sizeof(*rttsft)); + rtap_len += 8; + } rthdr = (void *) skb_push(skb, sizeof(*rthdr)); memset(rthdr, 0, sizeof(*rthdr)); - rthdr->hdr.it_len = cpu_to_le16(sizeof(*rthdr)); - rthdr->hdr.it_present = + memset(rtfixed, 0, sizeof(*rtfixed)); + rthdr->it_present = cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) | (1 << IEEE80211_RADIOTAP_RATE) | (1 << IEEE80211_RADIOTAP_CHANNEL) | (1 << IEEE80211_RADIOTAP_DB_ANTSIGNAL) | (1 << IEEE80211_RADIOTAP_RX_FLAGS)); - rthdr->flags = local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS ? - IEEE80211_RADIOTAP_F_FCS : 0; + rtfixed->flags = 0; + if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + rtfixed->flags |= IEEE80211_RADIOTAP_F_FCS; + + if (rttsft) { + *rttsft = cpu_to_le64(status->mactime); + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_TSFT); + } /* FIXME: when radiotap gets a 'bad PLCP' flag use it here */ - rthdr->rx_flags = 0; + rtfixed->rx_flags = 0; if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) - rthdr->rx_flags |= + rtfixed->rx_flags |= cpu_to_le16(IEEE80211_RADIOTAP_F_RX_BADFCS); rate = ieee80211_get_rate(local, status->phymode, status->rate); if (rate) - rthdr->rate = rate->rate / 5; + rtfixed->rate = rate->rate / 5; - rthdr->chan_freq = cpu_to_le16(status->freq); + rtfixed->chan_freq = cpu_to_le16(status->freq); if (status->phymode == MODE_IEEE80211A) - rthdr->chan_flags = + rtfixed->chan_flags = cpu_to_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ); else - rthdr->chan_flags = + rtfixed->chan_flags = cpu_to_le16(IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ); - rthdr->antsignal = status->ssi; + rtfixed->antsignal = status->ssi; + rthdr->it_len = cpu_to_le16(rtap_len); } - skb_set_mac_header(skb, 0); + skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); @@ -199,7 +223,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!netif_running(sdata->dev)) continue; - if (sdata->type != IEEE80211_IF_TYPE_MNTR) + if (sdata->vif.type != IEEE80211_IF_TYPE_MNTR) continue; if (prev_dev) { @@ -243,6 +267,10 @@ ieee80211_rx_h_parse_qos(struct ieee80211_txrx_data *rx) u8 *qc = data + ieee80211_get_hdrlen(rx->fc) - QOS_CONTROL_LEN; /* frame has qos control */ tid = qc[0] & QOS_CONTROL_TID_MASK; + if (qc[0] & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT) + rx->flags |= IEEE80211_TXRXD_RX_AMSDU; + else + rx->flags &= ~IEEE80211_TXRXD_RX_AMSDU; } else { if (unlikely((rx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_MGMT)) { /* Separate TID for management frames */ @@ -266,11 +294,11 @@ ieee80211_rx_h_parse_qos(struct ieee80211_txrx_data *rx) return TXRX_CONTINUE; } -static ieee80211_txrx_result -ieee80211_rx_h_load_stats(struct ieee80211_txrx_data *rx) + +static u32 ieee80211_rx_load_stats(struct ieee80211_local *local, + struct sk_buff *skb, + struct ieee80211_rx_status *status) { - struct ieee80211_local *local = rx->local; - struct sk_buff *skb = rx->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u32 load = 0, hdrtime; struct ieee80211_rate *rate; @@ -284,7 +312,7 @@ ieee80211_rx_h_load_stats(struct ieee80211_txrx_data *rx) rate = &mode->rates[0]; for (i = 0; i < mode->num_rates; i++) { - if (mode->rates[i].val == rx->u.rx.status->rate) { + if (mode->rates[i].val == status->rate) { rate = &mode->rates[i]; break; } @@ -308,16 +336,13 @@ ieee80211_rx_h_load_stats(struct ieee80211_txrx_data *rx) /* Divide channel_use by 8 to avoid wrapping around the counter */ load >>= CHAN_UTIL_SHIFT; - local->channel_use_raw += load; - rx->u.rx.load = load; - return TXRX_CONTINUE; + return load; } ieee80211_rx_handler ieee80211_rx_pre_handlers[] = { ieee80211_rx_h_parse_qos, - ieee80211_rx_h_load_stats, NULL }; @@ -338,8 +363,14 @@ ieee80211_rx_h_passive_scan(struct ieee80211_txrx_data *rx) struct ieee80211_local *local = rx->local; struct sk_buff *skb = rx->skb; - if (unlikely(local->sta_scanning != 0)) { - ieee80211_sta_rx_scan(rx->dev, skb, rx->u.rx.status); + if (unlikely(local->sta_hw_scanning)) + return ieee80211_sta_rx_scan(rx->dev, skb, rx->u.rx.status); + + if (unlikely(local->sta_sw_scanning)) { + /* drop all the other packets during a software scan anyway */ + if (ieee80211_sta_rx_scan(rx->dev, skb, rx->u.rx.status) + != TXRX_QUEUED) + dev_kfree_skb(skb); return TXRX_QUEUED; } @@ -377,18 +408,6 @@ ieee80211_rx_h_check(struct ieee80211_txrx_data *rx) return TXRX_DROP; } - if (!(rx->flags & IEEE80211_TXRXD_RXRA_MATCH)) - rx->skb->pkt_type = PACKET_OTHERHOST; - else if (compare_ether_addr(rx->dev->dev_addr, hdr->addr1) == 0) - rx->skb->pkt_type = PACKET_HOST; - else if (is_multicast_ether_addr(hdr->addr1)) { - if (is_broadcast_ether_addr(hdr->addr1)) - rx->skb->pkt_type = PACKET_BROADCAST; - else - rx->skb->pkt_type = PACKET_MULTICAST; - } else - rx->skb->pkt_type = PACKET_OTHERHOST; - /* Drop disallowed frame classes based on STA auth/assoc state; * IEEE 802.11, Chap 5.5. * @@ -400,7 +419,7 @@ ieee80211_rx_h_check(struct ieee80211_txrx_data *rx) if (unlikely(((rx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA || ((rx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_CTL && (rx->fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_PSPOLL)) && - rx->sdata->type != IEEE80211_IF_TYPE_IBSS && + rx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS && (!rx->sta || !(rx->sta->flags & WLAN_STA_ASSOC)))) { if ((!(rx->fc & IEEE80211_FCTL_FROMDS) && !(rx->fc & IEEE80211_FCTL_TODS) && @@ -620,13 +639,14 @@ ieee80211_rx_h_sta_process(struct ieee80211_txrx_data *rx) /* Update last_rx only for IBSS packets which are for the current * BSSID to avoid keeping the current IBSS network alive in cases where * other STAs are using different BSSID. */ - if (rx->sdata->type == IEEE80211_IF_TYPE_IBSS) { - u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len); + if (rx->sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { + u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, + IEEE80211_IF_TYPE_IBSS); if (compare_ether_addr(bssid, rx->sdata->u.sta.bssid) == 0) sta->last_rx = jiffies; } else if (!is_multicast_ether_addr(hdr->addr1) || - rx->sdata->type == IEEE80211_IF_TYPE_STA) { + rx->sdata->vif.type == IEEE80211_IF_TYPE_STA) { /* Update last_rx only for unicast frames in order to prevent * the Probe Request frames (the only broadcast frames from a * STA in infrastructure mode) from keeping a connection alive. @@ -870,6 +890,7 @@ ieee80211_rx_h_defragment(struct ieee80211_txrx_data *rx) static ieee80211_txrx_result ieee80211_rx_h_ps_poll(struct ieee80211_txrx_data *rx) { + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev); struct sk_buff *skb; int no_pending_pkts; DECLARE_MAC_BUF(mac); @@ -880,6 +901,10 @@ ieee80211_rx_h_ps_poll(struct ieee80211_txrx_data *rx) !(rx->flags & IEEE80211_TXRXD_RXRA_MATCH))) return TXRX_CONTINUE; + if ((sdata->vif.type != IEEE80211_IF_TYPE_AP) && + (sdata->vif.type != IEEE80211_IF_TYPE_VLAN)) + return TXRX_DROP; + skb = skb_dequeue(&rx->sta->tx_filtered); if (!skb) { skb = skb_dequeue(&rx->sta->ps_tx_buf); @@ -956,68 +981,54 @@ ieee80211_rx_h_remove_qos_control(struct ieee80211_txrx_data *rx) return TXRX_CONTINUE; } -static ieee80211_txrx_result -ieee80211_rx_h_802_1x_pae(struct ieee80211_txrx_data *rx) +static int +ieee80211_802_1x_port_control(struct ieee80211_txrx_data *rx) { - if (rx->sdata->eapol && ieee80211_is_eapol(rx->skb) && - rx->sdata->type != IEEE80211_IF_TYPE_STA && - (rx->flags & IEEE80211_TXRXD_RXRA_MATCH)) - return TXRX_CONTINUE; - - if (unlikely(rx->sdata->ieee802_1x && - (rx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA && - (rx->fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_NULLFUNC && - (!rx->sta || !(rx->sta->flags & WLAN_STA_AUTHORIZED)) && - !ieee80211_is_eapol(rx->skb))) { + if (unlikely(rx->sdata->ieee802_1x_pac && + (!rx->sta || !(rx->sta->flags & WLAN_STA_AUTHORIZED)))) { #ifdef CONFIG_MAC80211_DEBUG - struct ieee80211_hdr *hdr = - (struct ieee80211_hdr *) rx->skb->data; - DECLARE_MAC_BUF(mac); - printk(KERN_DEBUG "%s: dropped frame from %s" - " (unauthorized port)\n", rx->dev->name, - print_mac(mac, hdr->addr2)); + printk(KERN_DEBUG "%s: dropped frame " + "(unauthorized port)\n", rx->dev->name); #endif /* CONFIG_MAC80211_DEBUG */ - return TXRX_DROP; + return -EACCES; } - return TXRX_CONTINUE; + return 0; } -static ieee80211_txrx_result -ieee80211_rx_h_drop_unencrypted(struct ieee80211_txrx_data *rx) +static int +ieee80211_drop_unencrypted(struct ieee80211_txrx_data *rx) { /* * Pass through unencrypted frames if the hardware has * decrypted them already. */ if (rx->u.rx.status->flag & RX_FLAG_DECRYPTED) - return TXRX_CONTINUE; + return 0; /* Drop unencrypted frames if key is set. */ if (unlikely(!(rx->fc & IEEE80211_FCTL_PROTECTED) && (rx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA && (rx->fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_NULLFUNC && - (rx->key || rx->sdata->drop_unencrypted) && - (rx->sdata->eapol == 0 || !ieee80211_is_eapol(rx->skb)))) { + (rx->key || rx->sdata->drop_unencrypted))) { if (net_ratelimit()) printk(KERN_DEBUG "%s: RX non-WEP frame, but expected " "encryption\n", rx->dev->name); - return TXRX_DROP; + return -EACCES; } - return TXRX_CONTINUE; + return 0; } -static ieee80211_txrx_result -ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) +static int +ieee80211_data_to_8023(struct ieee80211_txrx_data *rx) { struct net_device *dev = rx->dev; - struct ieee80211_local *local = rx->local; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data; u16 fc, hdrlen, ethertype; u8 *payload; u8 dst[ETH_ALEN]; u8 src[ETH_ALEN]; - struct sk_buff *skb = rx->skb, *skb2; + struct sk_buff *skb = rx->skb; struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); DECLARE_MAC_BUF(mac); DECLARE_MAC_BUF(mac2); @@ -1025,11 +1036,9 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) DECLARE_MAC_BUF(mac4); fc = rx->fc; - if (unlikely((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA)) - return TXRX_CONTINUE; if (unlikely(!WLAN_FC_DATA_PRESENT(fc))) - return TXRX_DROP; + return -1; hdrlen = ieee80211_get_hdrlen(fc); @@ -1049,8 +1058,8 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) memcpy(dst, hdr->addr3, ETH_ALEN); memcpy(src, hdr->addr2, ETH_ALEN); - if (unlikely(sdata->type != IEEE80211_IF_TYPE_AP && - sdata->type != IEEE80211_IF_TYPE_VLAN)) { + if (unlikely(sdata->vif.type != IEEE80211_IF_TYPE_AP && + sdata->vif.type != IEEE80211_IF_TYPE_VLAN)) { if (net_ratelimit()) printk(KERN_DEBUG "%s: dropped ToDS frame " "(BSSID=%s SA=%s DA=%s)\n", @@ -1058,7 +1067,7 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) print_mac(mac, hdr->addr1), print_mac(mac2, hdr->addr2), print_mac(mac3, hdr->addr3)); - return TXRX_DROP; + return -1; } break; case (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): @@ -1066,7 +1075,7 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) memcpy(dst, hdr->addr3, ETH_ALEN); memcpy(src, hdr->addr4, ETH_ALEN); - if (unlikely(sdata->type != IEEE80211_IF_TYPE_WDS)) { + if (unlikely(sdata->vif.type != IEEE80211_IF_TYPE_WDS)) { if (net_ratelimit()) printk(KERN_DEBUG "%s: dropped FromDS&ToDS " "frame (RA=%s TA=%s DA=%s SA=%s)\n", @@ -1075,7 +1084,7 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) print_mac(mac2, hdr->addr2), print_mac(mac3, hdr->addr3), print_mac(mac4, hdr->addr4)); - return TXRX_DROP; + return -1; } break; case IEEE80211_FCTL_FROMDS: @@ -1083,17 +1092,17 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) memcpy(dst, hdr->addr1, ETH_ALEN); memcpy(src, hdr->addr3, ETH_ALEN); - if (sdata->type != IEEE80211_IF_TYPE_STA || + if (sdata->vif.type != IEEE80211_IF_TYPE_STA || (is_multicast_ether_addr(dst) && !compare_ether_addr(src, dev->dev_addr))) - return TXRX_DROP; + return -1; break; case 0: /* DA SA BSSID */ memcpy(dst, hdr->addr1, ETH_ALEN); memcpy(src, hdr->addr2, ETH_ALEN); - if (sdata->type != IEEE80211_IF_TYPE_IBSS) { + if (sdata->vif.type != IEEE80211_IF_TYPE_IBSS) { if (net_ratelimit()) { printk(KERN_DEBUG "%s: dropped IBSS frame " "(DA=%s SA=%s BSSID=%s)\n", @@ -1102,21 +1111,20 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) print_mac(mac2, hdr->addr2), print_mac(mac3, hdr->addr3)); } - return TXRX_DROP; + return -1; } break; } - payload = skb->data + hdrlen; - if (unlikely(skb->len - hdrlen < 8)) { if (net_ratelimit()) { printk(KERN_DEBUG "%s: RX too short data frame " "payload\n", dev->name); } - return TXRX_DROP; + return -1; } + payload = skb->data + hdrlen; ethertype = (payload[6] << 8) | payload[7]; if (likely((compare_ether_addr(payload, rfc1042_header) == 0 && @@ -1130,6 +1138,7 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) } else { struct ethhdr *ehdr; __be16 len; + skb_pull(skb, hdrlen); len = htons(skb->len); ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr)); @@ -1137,36 +1146,72 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) memcpy(ehdr->h_source, src, ETH_ALEN); ehdr->h_proto = len; } - skb->dev = dev; + return 0; +} - skb2 = NULL; +/* + * requires that rx->skb is a frame with ethernet header + */ +static bool ieee80211_frame_allowed(struct ieee80211_txrx_data *rx) +{ + static const u8 pae_group_addr[ETH_ALEN] + = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 }; + struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; - dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + /* + * Allow EAPOL frames to us/the PAE group address regardless + * of whether the frame was encrypted or not. + */ + if (ehdr->h_proto == htons(ETH_P_PAE) && + (compare_ether_addr(ehdr->h_dest, rx->dev->dev_addr) == 0 || + compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0)) + return true; + + if (ieee80211_802_1x_port_control(rx) || + ieee80211_drop_unencrypted(rx)) + return false; - if (local->bridge_packets && (sdata->type == IEEE80211_IF_TYPE_AP - || sdata->type == IEEE80211_IF_TYPE_VLAN) && + return true; +} + +/* + * requires that rx->skb is a frame with ethernet header + */ +static void +ieee80211_deliver_skb(struct ieee80211_txrx_data *rx) +{ + struct net_device *dev = rx->dev; + struct ieee80211_local *local = rx->local; + struct sk_buff *skb, *xmit_skb; + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; + struct sta_info *dsta; + + skb = rx->skb; + xmit_skb = NULL; + + if (local->bridge_packets && (sdata->vif.type == IEEE80211_IF_TYPE_AP || + sdata->vif.type == IEEE80211_IF_TYPE_VLAN) && (rx->flags & IEEE80211_TXRXD_RXRA_MATCH)) { - if (is_multicast_ether_addr(skb->data)) { - /* send multicast frames both to higher layers in - * local net stack and back to the wireless media */ - skb2 = skb_copy(skb, GFP_ATOMIC); - if (!skb2 && net_ratelimit()) + if (is_multicast_ether_addr(ehdr->h_dest)) { + /* + * send multicast frames both to higher layers in + * local net stack and back to the wireless medium + */ + xmit_skb = skb_copy(skb, GFP_ATOMIC); + if (!xmit_skb && net_ratelimit()) printk(KERN_DEBUG "%s: failed to clone " "multicast frame\n", dev->name); } else { - struct sta_info *dsta; dsta = sta_info_get(local, skb->data); - if (dsta && !dsta->dev) { - if (net_ratelimit()) - printk(KERN_DEBUG "Station with null " - "dev structure!\n"); - } else if (dsta && dsta->dev == dev) { - /* Destination station is associated to this - * AP, so send the frame directly to it and - * do not pass the frame to local net stack. + if (dsta && dsta->dev == dev) { + /* + * The destination station is associated to + * this AP (in this VLAN), so send the frame + * directly to it and do not pass it to local + * net stack. */ - skb2 = skb; + xmit_skb = skb; skb = NULL; } if (dsta) @@ -1181,18 +1226,207 @@ ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) netif_rx(skb); } - if (skb2) { + if (xmit_skb) { /* send to wireless media */ - skb2->protocol = __constant_htons(ETH_P_802_3); - skb_set_network_header(skb2, 0); - skb_set_mac_header(skb2, 0); - dev_queue_xmit(skb2); + xmit_skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(xmit_skb); + skb_reset_mac_header(xmit_skb); + dev_queue_xmit(xmit_skb); } +} + +static ieee80211_txrx_result +ieee80211_rx_h_amsdu(struct ieee80211_txrx_data *rx) +{ + struct net_device *dev = rx->dev; + struct ieee80211_local *local = rx->local; + u16 fc, ethertype; + u8 *payload; + struct sk_buff *skb = rx->skb, *frame = NULL; + const struct ethhdr *eth; + int remaining, err; + u8 dst[ETH_ALEN]; + u8 src[ETH_ALEN]; + DECLARE_MAC_BUF(mac); + + fc = rx->fc; + if (unlikely((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA)) + return TXRX_CONTINUE; + + if (unlikely(!WLAN_FC_DATA_PRESENT(fc))) + return TXRX_DROP; + + if (!(rx->flags & IEEE80211_TXRXD_RX_AMSDU)) + return TXRX_CONTINUE; + + err = ieee80211_data_to_8023(rx); + if (unlikely(err)) + return TXRX_DROP; + + skb->dev = dev; + + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; + + /* skip the wrapping header */ + eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr)); + if (!eth) + return TXRX_DROP; + + while (skb != frame) { + u8 padding; + __be16 len = eth->h_proto; + unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len); + + remaining = skb->len; + memcpy(dst, eth->h_dest, ETH_ALEN); + memcpy(src, eth->h_source, ETH_ALEN); + + padding = ((4 - subframe_len) & 0x3); + /* the last MSDU has no padding */ + if (subframe_len > remaining) { + printk(KERN_DEBUG "%s: wrong buffer size", dev->name); + return TXRX_DROP; + } + + skb_pull(skb, sizeof(struct ethhdr)); + /* if last subframe reuse skb */ + if (remaining <= subframe_len + padding) + frame = skb; + else { + frame = dev_alloc_skb(local->hw.extra_tx_headroom + + subframe_len); + + if (frame == NULL) + return TXRX_DROP; + + skb_reserve(frame, local->hw.extra_tx_headroom + + sizeof(struct ethhdr)); + memcpy(skb_put(frame, ntohs(len)), skb->data, + ntohs(len)); + + eth = (struct ethhdr *) skb_pull(skb, ntohs(len) + + padding); + if (!eth) { + printk(KERN_DEBUG "%s: wrong buffer size ", + dev->name); + dev_kfree_skb(frame); + return TXRX_DROP; + } + } + + skb_reset_network_header(frame); + frame->dev = dev; + frame->priority = skb->priority; + rx->skb = frame; + + payload = frame->data; + ethertype = (payload[6] << 8) | payload[7]; + + if (likely((compare_ether_addr(payload, rfc1042_header) == 0 && + ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) || + compare_ether_addr(payload, + bridge_tunnel_header) == 0)) { + /* remove RFC1042 or Bridge-Tunnel + * encapsulation and replace EtherType */ + skb_pull(frame, 6); + memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); + memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); + } else { + memcpy(skb_push(frame, sizeof(__be16)), + &len, sizeof(__be16)); + memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN); + memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN); + } + + if (!ieee80211_frame_allowed(rx)) { + if (skb == frame) /* last frame */ + return TXRX_DROP; + dev_kfree_skb(frame); + continue; + } + + ieee80211_deliver_skb(rx); + } + + return TXRX_QUEUED; +} + +static ieee80211_txrx_result +ieee80211_rx_h_data(struct ieee80211_txrx_data *rx) +{ + struct net_device *dev = rx->dev; + u16 fc; + int err; + + fc = rx->fc; + if (unlikely((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA)) + return TXRX_CONTINUE; + + if (unlikely(!WLAN_FC_DATA_PRESENT(fc))) + return TXRX_DROP; + + err = ieee80211_data_to_8023(rx); + if (unlikely(err)) + return TXRX_DROP; + + if (!ieee80211_frame_allowed(rx)) + return TXRX_DROP; + + rx->skb->dev = dev; + + dev->stats.rx_packets++; + dev->stats.rx_bytes += rx->skb->len; + + ieee80211_deliver_skb(rx); return TXRX_QUEUED; } static ieee80211_txrx_result +ieee80211_rx_h_ctrl(struct ieee80211_txrx_data *rx) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_hw *hw = &local->hw; + struct sk_buff *skb = rx->skb; + struct ieee80211_bar *bar = (struct ieee80211_bar *) skb->data; + struct tid_ampdu_rx *tid_agg_rx; + u16 start_seq_num; + u16 tid; + + if (likely((rx->fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_CTL)) + return TXRX_CONTINUE; + + if ((rx->fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_BACK_REQ) { + if (!rx->sta) + return TXRX_CONTINUE; + tid = le16_to_cpu(bar->control) >> 12; + tid_agg_rx = &(rx->sta->ampdu_mlme.tid_rx[tid]); + if (tid_agg_rx->state != HT_AGG_STATE_OPERATIONAL) + return TXRX_CONTINUE; + + start_seq_num = le16_to_cpu(bar->start_seq_num) >> 4; + + /* reset session timer */ + if (tid_agg_rx->timeout) { + unsigned long expires = + jiffies + (tid_agg_rx->timeout / 1000) * HZ; + mod_timer(&tid_agg_rx->session_timer, expires); + } + + /* manage reordering buffer according to requested */ + /* sequence number */ + rcu_read_lock(); + ieee80211_sta_manage_reorder_buf(hw, tid_agg_rx, NULL, + start_seq_num, 1); + rcu_read_unlock(); + return TXRX_DROP; + } + + return TXRX_CONTINUE; +} + +static ieee80211_txrx_result ieee80211_rx_h_mgmt(struct ieee80211_txrx_data *rx) { struct ieee80211_sub_if_data *sdata; @@ -1201,8 +1435,8 @@ ieee80211_rx_h_mgmt(struct ieee80211_txrx_data *rx) return TXRX_DROP; sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev); - if ((sdata->type == IEEE80211_IF_TYPE_STA || - sdata->type == IEEE80211_IF_TYPE_IBSS) && + if ((sdata->vif.type == IEEE80211_IF_TYPE_STA || + sdata->vif.type == IEEE80211_IF_TYPE_IBSS) && !(sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)) ieee80211_sta_rx_mgmt(rx->dev, rx->skb, rx->u.rx.status); else @@ -1294,7 +1528,7 @@ static void ieee80211_rx_michael_mic_report(struct net_device *dev, goto ignore; } - if (rx->sdata->type == IEEE80211_IF_TYPE_AP && keyidx) { + if (rx->sdata->vif.type == IEEE80211_IF_TYPE_AP && keyidx) { /* * APs with pairwise keys should never receive Michael MIC * errors for non-zero keyidx because these are reserved for @@ -1341,9 +1575,9 @@ ieee80211_rx_handler ieee80211_rx_handlers[] = * are not passed to user space by these functions */ ieee80211_rx_h_remove_qos_control, - ieee80211_rx_h_802_1x_pae, - ieee80211_rx_h_drop_unencrypted, + ieee80211_rx_h_amsdu, ieee80211_rx_h_data, + ieee80211_rx_h_ctrl, ieee80211_rx_h_mgmt, NULL }; @@ -1356,7 +1590,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, { int multicast = is_multicast_ether_addr(hdr->addr1); - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_STA: if (!bssid) return 0; @@ -1427,11 +1661,13 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, } /* - * This is the receive path handler. It is called by a low level driver when an - * 802.11 MPDU is received from the hardware. + * This is the actual Rx frames handler. as it blongs to Rx path it must + * be called with rcu_read_lock protection. */ -void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, - struct ieee80211_rx_status *status) +static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, + struct sk_buff *skb, + struct ieee80211_rx_status *status, + u32 load) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; @@ -1439,29 +1675,11 @@ void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, struct ieee80211_hdr *hdr; struct ieee80211_txrx_data rx; u16 type; - int prepres; + int prepares; struct ieee80211_sub_if_data *prev = NULL; struct sk_buff *skb_new; u8 *bssid; - - /* - * key references and virtual interfaces are protected using RCU - * and this requires that we are in a read-side RCU section during - * receive processing - */ - rcu_read_lock(); - - /* - * Frames with failed FCS/PLCP checksum are not returned, - * all other frames are returned without radiotap header - * if it was previously present. - * Also, frames with less than 16 bytes are dropped. - */ - skb = ieee80211_rx_monitor(local, skb, status); - if (!skb) { - rcu_read_unlock(); - return; - } + int hdrlen; hdr = (struct ieee80211_hdr *) skb->data; memset(&rx, 0, sizeof(rx)); @@ -1469,9 +1687,22 @@ void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, rx.local = local; rx.u.rx.status = status; + rx.u.rx.load = load; rx.fc = le16_to_cpu(hdr->frame_control); type = rx.fc & IEEE80211_FCTL_FTYPE; + /* + * Drivers are required to align the payload data to a four-byte + * boundary, so the last two bits of the address where it starts + * may not be set. The header is required to be directly before + * the payload data, padding like atheros hardware adds which is + * inbetween the 802.11 header and the payload is not supported, + * the driver is required to move the 802.11 header further back + * in that case. + */ + hdrlen = ieee80211_get_hdrlen(rx.fc); + WARN_ON_ONCE(((unsigned long)(skb->data + hdrlen)) & 3); + if (type == IEEE80211_FTYPE_DATA || type == IEEE80211_FTYPE_MGMT) local->dot11ReceivedFragmentCount++; @@ -1486,7 +1717,7 @@ void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, goto end; } - if (unlikely(local->sta_scanning)) + if (unlikely(local->sta_sw_scanning || local->sta_hw_scanning)) rx.flags |= IEEE80211_TXRXD_RXIN_SCAN; if (__ieee80211_invoke_rx_handlers(local, local->rx_pre_handlers, &rx, @@ -1501,25 +1732,23 @@ void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, ieee80211_invoke_rx_handlers(local, local->rx_handlers, &rx, rx.sta); sta_info_put(sta); - rcu_read_unlock(); return; } - bssid = ieee80211_get_bssid(hdr, skb->len); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!netif_running(sdata->dev)) continue; - if (sdata->type == IEEE80211_IF_TYPE_MNTR) + if (sdata->vif.type == IEEE80211_IF_TYPE_MNTR) continue; + bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); rx.flags |= IEEE80211_TXRXD_RXRA_MATCH; - prepres = prepare_for_handlers(sdata, bssid, &rx, hdr); + prepares = prepare_for_handlers(sdata, bssid, &rx, hdr); /* prepare_for_handlers can change sta */ sta = rx.sta; - if (!prepres) + if (!prepares) continue; /* @@ -1547,6 +1776,7 @@ void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, prev->dev->name); continue; } + rx.fc = le16_to_cpu(hdr->frame_control); rx.skb = skb_new; rx.dev = prev->dev; rx.sdata = prev; @@ -1555,6 +1785,7 @@ void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, prev = sdata; } if (prev) { + rx.fc = le16_to_cpu(hdr->frame_control); rx.skb = skb; rx.dev = prev->dev; rx.sdata = prev; @@ -1564,10 +1795,230 @@ void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, dev_kfree_skb(skb); end: - rcu_read_unlock(); + if (sta) + sta_info_put(sta); +} + +#define SEQ_MODULO 0x1000 +#define SEQ_MASK 0xfff + +static inline int seq_less(u16 sq1, u16 sq2) +{ + return (((sq1 - sq2) & SEQ_MASK) > (SEQ_MODULO >> 1)); +} + +static inline u16 seq_inc(u16 sq) +{ + return ((sq + 1) & SEQ_MASK); +} + +static inline u16 seq_sub(u16 sq1, u16 sq2) +{ + return ((sq1 - sq2) & SEQ_MASK); +} + + +/* + * As it function blongs to Rx path it must be called with + * the proper rcu_read_lock protection for its flow. + */ +u8 ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx, + struct sk_buff *skb, u16 mpdu_seq_num, + int bar_req) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_rx_status status; + u16 head_seq_num, buf_size; + int index; + u32 pkt_load; + + buf_size = tid_agg_rx->buf_size; + head_seq_num = tid_agg_rx->head_seq_num; + + /* frame with out of date sequence number */ + if (seq_less(mpdu_seq_num, head_seq_num)) { + dev_kfree_skb(skb); + return 1; + } + + /* if frame sequence number exceeds our buffering window size or + * block Ack Request arrived - release stored frames */ + if ((!seq_less(mpdu_seq_num, head_seq_num + buf_size)) || (bar_req)) { + /* new head to the ordering buffer */ + if (bar_req) + head_seq_num = mpdu_seq_num; + else + head_seq_num = + seq_inc(seq_sub(mpdu_seq_num, buf_size)); + /* release stored frames up to new head to stack */ + while (seq_less(tid_agg_rx->head_seq_num, head_seq_num)) { + index = seq_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) + % tid_agg_rx->buf_size; + + if (tid_agg_rx->reorder_buf[index]) { + /* release the reordered frames to stack */ + memcpy(&status, + tid_agg_rx->reorder_buf[index]->cb, + sizeof(status)); + pkt_load = ieee80211_rx_load_stats(local, + tid_agg_rx->reorder_buf[index], + &status); + __ieee80211_rx_handle_packet(hw, + tid_agg_rx->reorder_buf[index], + &status, pkt_load); + tid_agg_rx->stored_mpdu_num--; + tid_agg_rx->reorder_buf[index] = NULL; + } + tid_agg_rx->head_seq_num = + seq_inc(tid_agg_rx->head_seq_num); + } + if (bar_req) + return 1; + } + + /* now the new frame is always in the range of the reordering */ + /* buffer window */ + index = seq_sub(mpdu_seq_num, tid_agg_rx->ssn) + % tid_agg_rx->buf_size; + /* check if we already stored this frame */ + if (tid_agg_rx->reorder_buf[index]) { + dev_kfree_skb(skb); + return 1; + } + /* if arrived mpdu is in the right order and nothing else stored */ + /* release it immediately */ + if (mpdu_seq_num == tid_agg_rx->head_seq_num && + tid_agg_rx->stored_mpdu_num == 0) { + tid_agg_rx->head_seq_num = + seq_inc(tid_agg_rx->head_seq_num); + return 0; + } + + /* put the frame in the reordering buffer */ + tid_agg_rx->reorder_buf[index] = skb; + tid_agg_rx->stored_mpdu_num++; + /* release the buffer until next missing frame */ + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) + % tid_agg_rx->buf_size; + while (tid_agg_rx->reorder_buf[index]) { + /* release the reordered frame back to stack */ + memcpy(&status, tid_agg_rx->reorder_buf[index]->cb, + sizeof(status)); + pkt_load = ieee80211_rx_load_stats(local, + tid_agg_rx->reorder_buf[index], + &status); + __ieee80211_rx_handle_packet(hw, tid_agg_rx->reorder_buf[index], + &status, pkt_load); + tid_agg_rx->stored_mpdu_num--; + tid_agg_rx->reorder_buf[index] = NULL; + tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); + index = seq_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; + } + return 1; +} + +static u8 ieee80211_rx_reorder_ampdu(struct ieee80211_local *local, + struct sk_buff *skb) +{ + struct ieee80211_hw *hw = &local->hw; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct sta_info *sta; + struct tid_ampdu_rx *tid_agg_rx; + u16 fc, sc; + u16 mpdu_seq_num; + u8 ret = 0, *qc; + int tid; + + sta = sta_info_get(local, hdr->addr2); + if (!sta) + return ret; + + fc = le16_to_cpu(hdr->frame_control); + + /* filter the QoS data rx stream according to + * STA/TID and check if this STA/TID is on aggregation */ + if (!WLAN_FC_IS_QOS_DATA(fc)) + goto end_reorder; + + qc = skb->data + ieee80211_get_hdrlen(fc) - QOS_CONTROL_LEN; + tid = qc[0] & QOS_CONTROL_TID_MASK; + tid_agg_rx = &(sta->ampdu_mlme.tid_rx[tid]); + + if (tid_agg_rx->state != HT_AGG_STATE_OPERATIONAL) + goto end_reorder; + + /* null data frames are excluded */ + if (unlikely(fc & IEEE80211_STYPE_QOS_NULLFUNC)) + goto end_reorder; + + /* new un-ordered ampdu frame - process it */ + + /* reset session timer */ + if (tid_agg_rx->timeout) { + unsigned long expires = + jiffies + (tid_agg_rx->timeout / 1000) * HZ; + mod_timer(&tid_agg_rx->session_timer, expires); + } + + /* if this mpdu is fragmented - terminate rx aggregation session */ + sc = le16_to_cpu(hdr->seq_ctrl); + if (sc & IEEE80211_SCTL_FRAG) { + ieee80211_sta_stop_rx_ba_session(sta->dev, sta->addr, + tid, 0, WLAN_REASON_QSTA_REQUIRE_SETUP); + ret = 1; + goto end_reorder; + } + + /* according to mpdu sequence number deal with reordering buffer */ + mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4; + ret = ieee80211_sta_manage_reorder_buf(hw, tid_agg_rx, skb, + mpdu_seq_num, 0); +end_reorder: if (sta) sta_info_put(sta); + return ret; +} + +/* + * This is the receive path handler. It is called by a low level driver when an + * 802.11 MPDU is received from the hardware. + */ +void __ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb, + struct ieee80211_rx_status *status) +{ + struct ieee80211_local *local = hw_to_local(hw); + u32 pkt_load; + + /* + * key references and virtual interfaces are protected using RCU + * and this requires that we are in a read-side RCU section during + * receive processing + */ + rcu_read_lock(); + + /* + * Frames with failed FCS/PLCP checksum are not returned, + * all other frames are returned without radiotap header + * if it was previously present. + * Also, frames with less than 16 bytes are dropped. + */ + skb = ieee80211_rx_monitor(local, skb, status); + if (!skb) { + rcu_read_unlock(); + return; + } + + pkt_load = ieee80211_rx_load_stats(local, skb, status); + local->channel_use_raw += pkt_load; + + if (!ieee80211_rx_reorder_ampdu(local, skb)) + __ieee80211_rx_handle_packet(hw, skb, status, pkt_load); + + rcu_read_unlock(); } EXPORT_SYMBOL(__ieee80211_rx); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cfd8ee9adad..1f74bd29635 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -104,6 +104,7 @@ static void sta_info_release(struct kref *kref) struct sta_info *sta = container_of(kref, struct sta_info, kref); struct ieee80211_local *local = sta->local; struct sk_buff *skb; + int i; /* free sta structure; it has already been removed from * hash table etc. external structures. Make sure that all @@ -116,6 +117,8 @@ static void sta_info_release(struct kref *kref) while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) { dev_kfree_skb_any(skb); } + for (i = 0; i < STA_TID_NUM; i++) + del_timer_sync(&sta->ampdu_mlme.tid_rx[i].session_timer); rate_control_free_sta(sta->rate_ctrl, sta->rate_ctrl_priv); rate_control_put(sta->rate_ctrl); kfree(sta); @@ -133,6 +136,7 @@ struct sta_info * sta_info_add(struct ieee80211_local *local, struct net_device *dev, u8 *addr, gfp_t gfp) { struct sta_info *sta; + int i; DECLARE_MAC_BUF(mac); sta = kzalloc(sizeof(*sta), gfp); @@ -152,6 +156,19 @@ struct sta_info * sta_info_add(struct ieee80211_local *local, memcpy(sta->addr, addr, ETH_ALEN); sta->local = local; sta->dev = dev; + spin_lock_init(&sta->ampdu_mlme.ampdu_rx); + for (i = 0; i < STA_TID_NUM; i++) { + /* timer_to_tid must be initialized with identity mapping to + * enable session_timer's data differentiation. refer to + * sta_rx_agg_session_timer_expired for useage */ + sta->timer_to_tid[i] = i; + /* rx timers */ + sta->ampdu_mlme.tid_rx[i].session_timer.function = + sta_rx_agg_session_timer_expired; + sta->ampdu_mlme.tid_rx[i].session_timer.data = + (unsigned long)&sta->timer_to_tid[i]; + init_timer(&sta->ampdu_mlme.tid_rx[i].session_timer); + } skb_queue_head_init(&sta->ps_tx_buf); skb_queue_head_init(&sta->tx_filtered); __sta_info_get(sta); /* sta used by caller, decremented by @@ -160,9 +177,16 @@ struct sta_info * sta_info_add(struct ieee80211_local *local, list_add(&sta->list, &local->sta_list); local->num_sta++; sta_info_hash_add(local, sta); - if (local->ops->sta_notify) - local->ops->sta_notify(local_to_hw(local), dev->ifindex, - STA_NOTIFY_ADD, addr); + if (local->ops->sta_notify) { + struct ieee80211_sub_if_data *sdata; + + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN) + sdata = sdata->u.vlan.ap; + + local->ops->sta_notify(local_to_hw(local), &sdata->vif, + STA_NOTIFY_ADD, addr); + } write_unlock_bh(&local->sta_lock); #ifdef CONFIG_MAC80211_VERBOSE_DEBUG @@ -230,9 +254,17 @@ void sta_info_free(struct sta_info *sta) ieee80211_key_free(sta->key); sta->key = NULL; - if (local->ops->sta_notify) - local->ops->sta_notify(local_to_hw(local), sta->dev->ifindex, - STA_NOTIFY_REMOVE, sta->addr); + if (local->ops->sta_notify) { + struct ieee80211_sub_if_data *sdata; + + sdata = IEEE80211_DEV_TO_SUB_IF(sta->dev); + + if (sdata->vif.type == IEEE80211_IF_TYPE_VLAN) + sdata = sdata->u.vlan.ap; + + local->ops->sta_notify(local_to_hw(local), &sdata->vif, + STA_NOTIFY_REMOVE, sta->addr); + } rate_control_remove_sta_debugfs(sta); ieee80211_sta_debugfs_remove(sta); @@ -346,11 +378,10 @@ void sta_info_init(struct ieee80211_local *local) rwlock_init(&local->sta_lock); INIT_LIST_HEAD(&local->sta_list); - init_timer(&local->sta_cleanup); + setup_timer(&local->sta_cleanup, sta_info_cleanup, + (unsigned long)local); local->sta_cleanup.expires = round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL); - local->sta_cleanup.data = (unsigned long) local; - local->sta_cleanup.function = sta_info_cleanup; #ifdef CONFIG_MAC80211_DEBUGFS INIT_WORK(&local->sta_debugfs_add, sta_info_debugfs_add_task); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 8f7ebe41c02..96fe3ed9503 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -31,6 +31,51 @@ #define WLAN_STA_WME BIT(9) #define WLAN_STA_WDS BIT(27) +#define STA_TID_NUM 16 +#define ADDBA_RESP_INTERVAL HZ + +#define HT_AGG_STATE_INITIATOR_SHIFT (4) + +#define HT_AGG_STATE_REQ_STOP_BA_MSK BIT(3) + +#define HT_AGG_STATE_IDLE (0x0) +#define HT_AGG_STATE_OPERATIONAL (0x7) + +/** + * struct tid_ampdu_rx - TID aggregation information (Rx). + * + * @state: TID's state in session state machine. + * @dialog_token: dialog token for aggregation session + * @ssn: Starting Sequence Number expected to be aggregated. + * @buf_size: buffer size for incoming A-MPDUs + * @timeout: reset timer value. + * @head_seq_num: head sequence number in reordering buffer. + * @stored_mpdu_num: number of MPDUs in reordering buffer + * @reorder_buf: buffer to reorder incoming aggregated MPDUs + * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) + */ +struct tid_ampdu_rx { + u8 state; + u8 dialog_token; + u16 ssn; + u16 buf_size; + u16 timeout; + u16 head_seq_num; + u16 stored_mpdu_num; + struct sk_buff **reorder_buf; + struct timer_list session_timer; +}; + +/** + * struct sta_ampdu_mlme - STA aggregation information. + * + * @tid_agg_info_rx: aggregation info for Rx per TID + * @ampdu_rx: for locking sections in aggregation Rx flow + */ +struct sta_ampdu_mlme { + struct tid_ampdu_rx tid_rx[STA_TID_NUM]; + spinlock_t ampdu_rx; +}; struct sta_info { struct kref kref; @@ -99,6 +144,11 @@ struct sta_info { u16 listen_interval; + struct ieee80211_ht_info ht_info; /* 802.11n HT capabilities + of this STA */ + struct sta_ampdu_mlme ampdu_mlme; + u8 timer_to_tid[STA_TID_NUM]; /* convert timer id to tid */ + #ifdef CONFIG_MAC80211_DEBUGFS struct sta_info_debugfsdentries { struct dentry *dir; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1a531543bcc..67b509edd43 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -176,7 +176,7 @@ static u16 ieee80211_duration(struct ieee80211_txrx_data *tx, int group_addr, * to closest integer */ dur = ieee80211_frame_duration(local, 10, rate, erp, - tx->sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE); + tx->sdata->bss_conf.use_short_preamble); if (next_frag_len) { /* Frame is fragmented: duration increases with time needed to @@ -185,8 +185,7 @@ static u16 ieee80211_duration(struct ieee80211_txrx_data *tx, int group_addr, /* next fragment */ dur += ieee80211_frame_duration(local, next_frag_len, txrate->rate, erp, - tx->sdata->flags & - IEEE80211_SDATA_SHORT_PREAMBLE); + tx->sdata->bss_conf.use_short_preamble); } return dur; @@ -225,7 +224,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_txrx_data *tx) if (unlikely(tx->flags & IEEE80211_TXRXD_TX_INJECTED)) return TXRX_CONTINUE; - if (unlikely(tx->local->sta_scanning != 0) && + if (unlikely(tx->local->sta_sw_scanning) && ((tx->fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT || (tx->fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_PROBE_REQ)) return TXRX_DROP; @@ -237,7 +236,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_txrx_data *tx) if (likely(tx->flags & IEEE80211_TXRXD_TXUNICAST)) { if (unlikely(!(sta_flags & WLAN_STA_ASSOC) && - tx->sdata->type != IEEE80211_IF_TYPE_IBSS && + tx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS && (tx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA)) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG DECLARE_MAC_BUF(mac); @@ -251,7 +250,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_txrx_data *tx) } else { if (unlikely((tx->fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA && tx->local->num_sta == 0 && - tx->sdata->type != IEEE80211_IF_TYPE_IBSS)) { + tx->sdata->vif.type != IEEE80211_IF_TYPE_IBSS)) { /* * No associated STAs - no need to send multicast * frames. @@ -261,18 +260,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_txrx_data *tx) return TXRX_CONTINUE; } - if (unlikely(/* !injected && */ tx->sdata->ieee802_1x && - !(sta_flags & WLAN_STA_AUTHORIZED))) { -#ifdef CONFIG_MAC80211_VERBOSE_DEBUG - DECLARE_MAC_BUF(mac); - printk(KERN_DEBUG "%s: dropped frame to %s" - " (unauthorized port)\n", tx->dev->name, - print_mac(mac, hdr->addr1)); -#endif - I802_DEBUG_INC(tx->local->tx_handlers_drop_unauth_port); - return TXRX_DROP; - } - return TXRX_CONTINUE; } @@ -306,7 +293,7 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) list_for_each_entry_rcu(sdata, &local->interfaces, list) { struct ieee80211_if_ap *ap; if (sdata->dev == local->mdev || - sdata->type != IEEE80211_IF_TYPE_AP) + sdata->vif.type != IEEE80211_IF_TYPE_AP) continue; ap = &sdata->u.ap; skb = skb_dequeue(&ap->ps_bc_buf); @@ -334,16 +321,27 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) wiphy_name(local->hw.wiphy), purged); } -static inline ieee80211_txrx_result +static ieee80211_txrx_result ieee80211_tx_h_multicast_ps_buf(struct ieee80211_txrx_data *tx) { - /* broadcast/multicast frame */ - /* If any of the associated stations is in power save mode, - * the frame is buffered to be sent after DTIM beacon frame */ - if ((tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING) && - tx->sdata->type != IEEE80211_IF_TYPE_WDS && - tx->sdata->bss && atomic_read(&tx->sdata->bss->num_sta_ps) && - !(tx->fc & IEEE80211_FCTL_ORDER)) { + /* + * broadcast/multicast frame + * + * If any of the associated stations is in power save mode, + * the frame is buffered to be sent after DTIM beacon frame. + * This is done either by the hardware or us. + */ + + /* not AP/IBSS or ordered frame */ + if (!tx->sdata->bss || (tx->fc & IEEE80211_FCTL_ORDER)) + return TXRX_CONTINUE; + + /* no stations in PS mode */ + if (!atomic_read(&tx->sdata->bss->num_sta_ps)) + return TXRX_CONTINUE; + + /* buffered in mac80211 */ + if (tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING) { if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER) purge_old_ps_buffers(tx->local); if (skb_queue_len(&tx->sdata->bss->ps_bc_buf) >= @@ -360,10 +358,13 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_txrx_data *tx) return TXRX_QUEUED; } + /* buffered in hardware */ + tx->u.tx.control->flags |= IEEE80211_TXCTL_SEND_AFTER_DTIM; + return TXRX_CONTINUE; } -static inline ieee80211_txrx_result +static ieee80211_txrx_result ieee80211_tx_h_unicast_ps_buf(struct ieee80211_txrx_data *tx) { struct sta_info *sta = tx->sta; @@ -420,7 +421,6 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_txrx_data *tx) return TXRX_CONTINUE; } - static ieee80211_txrx_result ieee80211_tx_h_ps_buf(struct ieee80211_txrx_data *tx) { @@ -433,13 +433,11 @@ ieee80211_tx_h_ps_buf(struct ieee80211_txrx_data *tx) return ieee80211_tx_h_multicast_ps_buf(tx); } - - - static ieee80211_txrx_result ieee80211_tx_h_select_key(struct ieee80211_txrx_data *tx) { struct ieee80211_key *key; + u16 fc = tx->fc; if (unlikely(tx->u.tx.control->flags & IEEE80211_TXCTL_DO_NOT_ENCRYPT)) tx->key = NULL; @@ -448,19 +446,38 @@ ieee80211_tx_h_select_key(struct ieee80211_txrx_data *tx) else if ((key = rcu_dereference(tx->sdata->default_key))) tx->key = key; else if (tx->sdata->drop_unencrypted && - !(tx->sdata->eapol && ieee80211_is_eapol(tx->skb))) { + !(tx->u.tx.control->flags & IEEE80211_TXCTL_EAPOL_FRAME) && + !(tx->flags & IEEE80211_TXRXD_TX_INJECTED)) { I802_DEBUG_INC(tx->local->tx_handlers_drop_unencrypted); return TXRX_DROP; - } else { + } else tx->key = NULL; - tx->u.tx.control->flags |= IEEE80211_TXCTL_DO_NOT_ENCRYPT; - } if (tx->key) { + u16 ftype, stype; + tx->key->tx_rx_count++; /* TODO: add threshold stuff again */ + + switch (tx->key->conf.alg) { + case ALG_WEP: + ftype = fc & IEEE80211_FCTL_FTYPE; + stype = fc & IEEE80211_FCTL_STYPE; + + if (ftype == IEEE80211_FTYPE_MGMT && + stype == IEEE80211_STYPE_AUTH) + break; + case ALG_TKIP: + case ALG_CCMP: + if (!WLAN_FC_DATA_PRESENT(fc)) + tx->key = NULL; + break; + } } + if (!tx->key || !(tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + tx->u.tx.control->flags |= IEEE80211_TXCTL_DO_NOT_ENCRYPT; + return TXRX_CONTINUE; } @@ -567,21 +584,17 @@ ieee80211_tx_h_encrypt(struct ieee80211_txrx_data *tx) static ieee80211_txrx_result ieee80211_tx_h_rate_ctrl(struct ieee80211_txrx_data *tx) { - struct rate_control_extra extra; + struct rate_selection rsel; if (likely(!tx->u.tx.rate)) { - memset(&extra, 0, sizeof(extra)); - extra.mode = tx->u.tx.mode; - extra.ethertype = tx->ethertype; - - tx->u.tx.rate = rate_control_get_rate(tx->local, tx->dev, - tx->skb, &extra); - if (unlikely(extra.probe != NULL)) { + rate_control_get_rate(tx->dev, tx->u.tx.mode, tx->skb, &rsel); + tx->u.tx.rate = rsel.rate; + if (unlikely(rsel.probe != NULL)) { tx->u.tx.control->flags |= IEEE80211_TXCTL_RATE_CTRL_PROBE; tx->flags |= IEEE80211_TXRXD_TXPROBE_LAST_FRAG; tx->u.tx.control->alt_retry_rate = tx->u.tx.rate->val; - tx->u.tx.rate = extra.probe; + tx->u.tx.rate = rsel.probe; } else tx->u.tx.control->alt_retry_rate = -1; @@ -591,15 +604,15 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_txrx_data *tx) tx->u.tx.control->alt_retry_rate = -1; if (tx->u.tx.mode->mode == MODE_IEEE80211G && - (tx->sdata->flags & IEEE80211_SDATA_USE_PROTECTION) && - (tx->flags & IEEE80211_TXRXD_FRAGMENTED) && extra.nonerp) { + tx->sdata->bss_conf.use_cts_prot && + (tx->flags & IEEE80211_TXRXD_FRAGMENTED) && rsel.nonerp) { tx->u.tx.last_frag_rate = tx->u.tx.rate; - if (extra.probe) + if (rsel.probe) tx->flags &= ~IEEE80211_TXRXD_TXPROBE_LAST_FRAG; else tx->flags |= IEEE80211_TXRXD_TXPROBE_LAST_FRAG; - tx->u.tx.rate = extra.nonerp; - tx->u.tx.control->rate = extra.nonerp; + tx->u.tx.rate = rsel.nonerp; + tx->u.tx.control->rate = rsel.nonerp; tx->u.tx.control->flags &= ~IEEE80211_TXCTL_RATE_CTRL_PROBE; } else { tx->u.tx.last_frag_rate = tx->u.tx.rate; @@ -653,7 +666,7 @@ ieee80211_tx_h_misc(struct ieee80211_txrx_data *tx) if (mode->mode == MODE_IEEE80211G && (tx->u.tx.rate->flags & IEEE80211_RATE_ERP) && (tx->flags & IEEE80211_TXRXD_TXUNICAST) && - (tx->sdata->flags & IEEE80211_SDATA_USE_PROTECTION) && + tx->sdata->bss_conf.use_cts_prot && !(control->flags & IEEE80211_TXCTL_USE_RTS_CTS)) control->flags |= IEEE80211_TXCTL_USE_CTS_PROTECT; @@ -662,7 +675,7 @@ ieee80211_tx_h_misc(struct ieee80211_txrx_data *tx) * available on the network at the current point in time. */ if (((fc & IEEE80211_FCTL_FTYPE) == IEEE80211_FTYPE_DATA) && (tx->u.tx.rate->flags & IEEE80211_RATE_PREAMBLE2) && - (tx->sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE) && + tx->sdata->bss_conf.use_short_preamble && (!tx->sta || (tx->sta->flags & WLAN_STA_SHORT_PREAMBLE))) { tx->u.tx.control->tx_rate = tx->u.tx.rate->val2; } @@ -706,15 +719,6 @@ ieee80211_tx_h_misc(struct ieee80211_txrx_data *tx) } } - /* - * Tell hardware to not encrypt when we had sw crypto. - * Because we use the same flag to internally indicate that - * no (software) encryption should be done, we have to set it - * after all crypto handlers. - */ - if (tx->key && !(tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) - tx->u.tx.control->flags |= IEEE80211_TXCTL_DO_NOT_ENCRYPT; - return TXRX_CONTINUE; } @@ -927,7 +931,6 @@ __ieee80211_tx_prepare(struct ieee80211_txrx_data *tx, struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *sdata; - ieee80211_txrx_result res = TXRX_CONTINUE; int hdrlen; @@ -945,7 +948,7 @@ __ieee80211_tx_prepare(struct ieee80211_txrx_data *tx, /* process and remove the injection radiotap header */ sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (unlikely(sdata->type == IEEE80211_IF_TYPE_MNTR)) { + if (unlikely(sdata->vif.type == IEEE80211_IF_TYPE_MNTR)) { if (__ieee80211_parse_tx_radiotap(tx, skb) == TXRX_DROP) return TXRX_DROP; @@ -992,12 +995,10 @@ __ieee80211_tx_prepare(struct ieee80211_txrx_data *tx, } control->flags |= IEEE80211_TXCTL_FIRST_FRAGMENT; - return res; + return TXRX_CONTINUE; } -/* Device in tx->dev has a reference added; use dev_put(tx->dev) when - * finished with it. - * +/* * NB: @tx is uninitialised when passed in here */ static int ieee80211_tx_prepare(struct ieee80211_txrx_data *tx, @@ -1018,6 +1019,7 @@ static int ieee80211_tx_prepare(struct ieee80211_txrx_data *tx, return -ENODEV; /* initialises tx with control */ __ieee80211_tx_prepare(tx, skb, dev, control); + dev_put(dev); return 0; } @@ -1248,14 +1250,16 @@ int ieee80211_master_start_xmit(struct sk_buff *skb, } } - control.ifindex = odev->ifindex; - control.type = osdata->type; + control.vif = &osdata->vif; + control.type = osdata->vif.type; if (pkt_data->flags & IEEE80211_TXPD_REQ_TX_STATUS) control.flags |= IEEE80211_TXCTL_REQ_TX_STATUS; if (pkt_data->flags & IEEE80211_TXPD_DO_NOT_ENCRYPT) control.flags |= IEEE80211_TXCTL_DO_NOT_ENCRYPT; if (pkt_data->flags & IEEE80211_TXPD_REQUEUE) control.flags |= IEEE80211_TXCTL_REQUEUE; + if (pkt_data->flags & IEEE80211_TXPD_EAPOL_FRAME) + control.flags |= IEEE80211_TXCTL_EAPOL_FRAME; control.queue = pkt_data->queue; ret = ieee80211_tx(odev, skb, &control); @@ -1348,6 +1352,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, int encaps_len, skip_header_bytes; int nh_pos, h_pos; struct sta_info *sta; + u32 sta_flags = 0; sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (unlikely(skb->len < ETH_HLEN)) { @@ -1363,10 +1368,9 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, /* convert Ethernet header to proper 802.11 header (based on * operation mode) */ ethertype = (skb->data[12] << 8) | skb->data[13]; - /* TODO: handling for 802.1x authorized/unauthorized port */ fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA; - switch (sdata->type) { + switch (sdata->vif.type) { case IEEE80211_IF_TYPE_AP: case IEEE80211_IF_TYPE_VLAN: fc |= IEEE80211_FCTL_FROMDS; @@ -1405,16 +1409,42 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, goto fail; } - /* receiver is QoS enabled, use a QoS type frame */ sta = sta_info_get(local, hdr.addr1); if (sta) { - if (sta->flags & WLAN_STA_WME) { - fc |= IEEE80211_STYPE_QOS_DATA; - hdrlen += 2; - } + sta_flags = sta->flags; sta_info_put(sta); } + /* receiver is QoS enabled, use a QoS type frame */ + if (sta_flags & WLAN_STA_WME) { + fc |= IEEE80211_STYPE_QOS_DATA; + hdrlen += 2; + } + + /* + * If port access control is enabled, drop frames to unauthorised + * stations unless they are EAPOL frames from the local station. + */ + if (unlikely(sdata->ieee802_1x_pac && + !(sta_flags & WLAN_STA_AUTHORIZED) && + !(ethertype == ETH_P_PAE && + compare_ether_addr(dev->dev_addr, + skb->data + ETH_ALEN) == 0))) { +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG + DECLARE_MAC_BUF(mac); + + if (net_ratelimit()) + printk(KERN_DEBUG "%s: dropped frame to %s" + " (unauthorized port)\n", dev->name, + print_mac(mac, hdr.addr1)); +#endif + + I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); + + ret = 0; + goto fail; + } + hdr.frame_control = cpu_to_le16(fc); hdr.duration_id = 0; hdr.seq_ctrl = 0; @@ -1503,6 +1533,8 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, pkt_data = (struct ieee80211_tx_packet_data *)skb->cb; memset(pkt_data, 0, sizeof(struct ieee80211_tx_packet_data)); pkt_data->ifindex = dev->ifindex; + if (ethertype == ETH_P_PAE) + pkt_data->flags |= IEEE80211_TXPD_EAPOL_FRAME; skb->dev = local->mdev; dev->stats.tx_packets++; @@ -1527,64 +1559,6 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb, return ret; } -/* - * This is the transmit routine for the 802.11 type interfaces - * called by upper layers of the linux networking - * stack when it has a frame to transmit - */ -int ieee80211_mgmt_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ieee80211_sub_if_data *sdata; - struct ieee80211_tx_packet_data *pkt_data; - struct ieee80211_hdr *hdr; - u16 fc; - - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - - if (skb->len < 10) { - dev_kfree_skb(skb); - return 0; - } - - if (skb_headroom(skb) < sdata->local->tx_headroom) { - if (pskb_expand_head(skb, sdata->local->tx_headroom, - 0, GFP_ATOMIC)) { - dev_kfree_skb(skb); - return 0; - } - } - - hdr = (struct ieee80211_hdr *) skb->data; - fc = le16_to_cpu(hdr->frame_control); - - pkt_data = (struct ieee80211_tx_packet_data *) skb->cb; - memset(pkt_data, 0, sizeof(struct ieee80211_tx_packet_data)); - pkt_data->ifindex = sdata->dev->ifindex; - - skb->priority = 20; /* use hardcoded priority for mgmt TX queue */ - skb->dev = sdata->local->mdev; - - /* - * We're using the protocol field of the the frame control header - * to request TX callback for hostapd. BIT(1) is checked. - */ - if ((fc & BIT(1)) == BIT(1)) { - pkt_data->flags |= IEEE80211_TXPD_REQ_TX_STATUS; - fc &= ~BIT(1); - hdr->frame_control = cpu_to_le16(fc); - } - - if (!(fc & IEEE80211_FCTL_PROTECTED)) - pkt_data->flags |= IEEE80211_TXPD_DO_NOT_ENCRYPT; - - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - - dev_queue_xmit(skb); - - return 0; -} - /* helper functions for pending packets for when queues are stopped */ void ieee80211_clear_tx_pending(struct ieee80211_local *local) @@ -1653,7 +1627,8 @@ void ieee80211_tx_pending(unsigned long data) static void ieee80211_beacon_add_tim(struct ieee80211_local *local, struct ieee80211_if_ap *bss, - struct sk_buff *skb) + struct sk_buff *skb, + struct beacon_data *beacon) { u8 *pos, *tim; int aid0 = 0; @@ -1669,7 +1644,7 @@ static void ieee80211_beacon_add_tim(struct ieee80211_local *local, IEEE80211_MAX_AID+1); if (bss->dtim_count == 0) - bss->dtim_count = bss->dtim_period - 1; + bss->dtim_count = beacon->dtim_period - 1; else bss->dtim_count--; @@ -1677,7 +1652,7 @@ static void ieee80211_beacon_add_tim(struct ieee80211_local *local, *pos++ = WLAN_EID_TIM; *pos++ = 4; *pos++ = bss->dtim_count; - *pos++ = bss->dtim_period; + *pos++ = beacon->dtim_period; if (bss->dtim_count == 0 && !skb_queue_empty(&bss->ps_bc_buf)) aid0 = 1; @@ -1715,7 +1690,8 @@ static void ieee80211_beacon_add_tim(struct ieee80211_local *local, read_unlock_bh(&local->sta_lock); } -struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, int if_id, +struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, struct ieee80211_tx_control *control) { struct ieee80211_local *local = hw_to_local(hw); @@ -1723,68 +1699,64 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, int if_id, struct net_device *bdev; struct ieee80211_sub_if_data *sdata = NULL; struct ieee80211_if_ap *ap = NULL; - struct ieee80211_rate *rate; - struct rate_control_extra extra; - u8 *b_head, *b_tail; - int bh_len, bt_len; - - bdev = dev_get_by_index(&init_net, if_id); - if (bdev) { - sdata = IEEE80211_DEV_TO_SUB_IF(bdev); - ap = &sdata->u.ap; - dev_put(bdev); - } + struct rate_selection rsel; + struct beacon_data *beacon; + + rcu_read_lock(); - if (!ap || sdata->type != IEEE80211_IF_TYPE_AP || - !ap->beacon_head) { + sdata = vif_to_sdata(vif); + bdev = sdata->dev; + ap = &sdata->u.ap; + + beacon = rcu_dereference(ap->beacon); + + if (!ap || sdata->vif.type != IEEE80211_IF_TYPE_AP || !beacon) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (net_ratelimit()) - printk(KERN_DEBUG "no beacon data avail for idx=%d " - "(%s)\n", if_id, bdev ? bdev->name : "N/A"); + printk(KERN_DEBUG "no beacon data avail for %s\n", + bdev->name); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ - return NULL; + skb = NULL; + goto out; } - /* Assume we are generating the normal beacon locally */ - b_head = ap->beacon_head; - b_tail = ap->beacon_tail; - bh_len = ap->beacon_head_len; - bt_len = ap->beacon_tail_len; - - skb = dev_alloc_skb(local->tx_headroom + - bh_len + bt_len + 256 /* maximum TIM len */); + /* headroom, head length, tail length and maximum TIM length */ + skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + + beacon->tail_len + 256); if (!skb) - return NULL; + goto out; skb_reserve(skb, local->tx_headroom); - memcpy(skb_put(skb, bh_len), b_head, bh_len); + memcpy(skb_put(skb, beacon->head_len), beacon->head, + beacon->head_len); ieee80211_include_sequence(sdata, (struct ieee80211_hdr *)skb->data); - ieee80211_beacon_add_tim(local, ap, skb); + ieee80211_beacon_add_tim(local, ap, skb, beacon); - if (b_tail) { - memcpy(skb_put(skb, bt_len), b_tail, bt_len); - } + if (beacon->tail) + memcpy(skb_put(skb, beacon->tail_len), beacon->tail, + beacon->tail_len); if (control) { - memset(&extra, 0, sizeof(extra)); - extra.mode = local->oper_hw_mode; - - rate = rate_control_get_rate(local, local->mdev, skb, &extra); - if (!rate) { + rate_control_get_rate(local->mdev, local->oper_hw_mode, skb, + &rsel); + if (!rsel.rate) { if (net_ratelimit()) { - printk(KERN_DEBUG "%s: ieee80211_beacon_get: no rate " - "found\n", wiphy_name(local->hw.wiphy)); + printk(KERN_DEBUG "%s: ieee80211_beacon_get: " + "no rate found\n", + wiphy_name(local->hw.wiphy)); } dev_kfree_skb(skb); - return NULL; + skb = NULL; + goto out; } + control->vif = vif; control->tx_rate = - ((sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE) && - (rate->flags & IEEE80211_RATE_PREAMBLE2)) ? - rate->val2 : rate->val; + (sdata->bss_conf.use_short_preamble && + (rsel.rate->flags & IEEE80211_RATE_PREAMBLE2)) ? + rsel.rate->val2 : rsel.rate->val; control->antenna_sel_tx = local->hw.conf.antenna_sel_tx; control->power_level = local->hw.conf.power_level; control->flags |= IEEE80211_TXCTL_NO_ACK; @@ -1793,11 +1765,14 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, int if_id, } ap->num_beacons++; + + out: + rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_beacon_get); -void ieee80211_rts_get(struct ieee80211_hw *hw, int if_id, +void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_control *frame_txctl, struct ieee80211_rts *rts) @@ -1807,13 +1782,14 @@ void ieee80211_rts_get(struct ieee80211_hw *hw, int if_id, fctl = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS; rts->frame_control = cpu_to_le16(fctl); - rts->duration = ieee80211_rts_duration(hw, if_id, frame_len, frame_txctl); + rts->duration = ieee80211_rts_duration(hw, vif, frame_len, + frame_txctl); memcpy(rts->ra, hdr->addr1, sizeof(rts->ra)); memcpy(rts->ta, hdr->addr2, sizeof(rts->ta)); } EXPORT_SYMBOL(ieee80211_rts_get); -void ieee80211_ctstoself_get(struct ieee80211_hw *hw, int if_id, +void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, const void *frame, size_t frame_len, const struct ieee80211_tx_control *frame_txctl, struct ieee80211_cts *cts) @@ -1823,13 +1799,15 @@ void ieee80211_ctstoself_get(struct ieee80211_hw *hw, int if_id, fctl = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS; cts->frame_control = cpu_to_le16(fctl); - cts->duration = ieee80211_ctstoself_duration(hw, if_id, frame_len, frame_txctl); + cts->duration = ieee80211_ctstoself_duration(hw, vif, + frame_len, frame_txctl); memcpy(cts->ra, hdr->addr1, sizeof(cts->ra)); } EXPORT_SYMBOL(ieee80211_ctstoself_get); struct sk_buff * -ieee80211_get_buffered_bc(struct ieee80211_hw *hw, int if_id, +ieee80211_get_buffered_bc(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, struct ieee80211_tx_control *control) { struct ieee80211_local *local = hw_to_local(hw); @@ -1841,16 +1819,25 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw, int if_id, struct net_device *bdev; struct ieee80211_sub_if_data *sdata; struct ieee80211_if_ap *bss = NULL; + struct beacon_data *beacon; - bdev = dev_get_by_index(&init_net, if_id); - if (bdev) { - sdata = IEEE80211_DEV_TO_SUB_IF(bdev); - bss = &sdata->u.ap; - dev_put(bdev); - } - if (!bss || sdata->type != IEEE80211_IF_TYPE_AP || !bss->beacon_head) + sdata = vif_to_sdata(vif); + bdev = sdata->dev; + + + if (!bss) return NULL; + rcu_read_lock(); + beacon = rcu_dereference(bss->beacon); + + if (sdata->vif.type != IEEE80211_IF_TYPE_AP || !beacon || + !beacon->head) { + rcu_read_unlock(); + return NULL; + } + rcu_read_unlock(); + if (bss->dtim_count != 0) return NULL; /* send buffered bc/mc only after DTIM beacon */ memset(control, 0, sizeof(*control)); @@ -1883,7 +1870,6 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw, int if_id, if (res == TXRX_DROP || res == TXRX_QUEUED) break; } - dev_put(tx.dev); skb = tx.skb; /* handlers are allowed to change skb */ if (res == TXRX_DROP) { diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5a0564e1dbd..5e631ce98d7 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -22,6 +22,7 @@ #include <linux/bitmap.h> #include <net/net_namespace.h> #include <net/cfg80211.h> +#include <net/rtnetlink.h> #include "ieee80211_i.h" #include "ieee80211_rate.h" @@ -39,10 +40,6 @@ const unsigned char rfc1042_header[] = const unsigned char bridge_tunnel_header[] = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; -/* No encapsulation header if EtherType < 0x600 (=length) */ -static const unsigned char eapol_header[] = - { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00, 0x88, 0x8e }; - static int rate_list_match(const int *rate_list, int rate) { @@ -130,17 +127,21 @@ void ieee80211_prepare_rates(struct ieee80211_local *local, } } -u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len) +u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, + enum ieee80211_if_types type) { u16 fc; - if (len < 24) + /* drop ACK/CTS frames and incorrect hdr len (ctrl) */ + if (len < 16) return NULL; fc = le16_to_cpu(hdr->frame_control); switch (fc & IEEE80211_FCTL_FTYPE) { case IEEE80211_FTYPE_DATA: + if (len < 24) /* drop incorrect hdr len (data) */ + return NULL; switch (fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case IEEE80211_FCTL_TODS: return hdr->addr1; @@ -153,10 +154,24 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len) } break; case IEEE80211_FTYPE_MGMT: + if (len < 24) /* drop incorrect hdr len (mgmt) */ + return NULL; return hdr->addr3; case IEEE80211_FTYPE_CTL: if ((fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_PSPOLL) return hdr->addr1; + else if ((fc & IEEE80211_FCTL_STYPE) == + IEEE80211_STYPE_BACK_REQ) { + switch (type) { + case IEEE80211_IF_TYPE_STA: + return hdr->addr2; + case IEEE80211_IF_TYPE_AP: + case IEEE80211_IF_TYPE_VLAN: + return hdr->addr1; + default: + return NULL; + } + } else return NULL; } @@ -217,31 +232,6 @@ int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); -int ieee80211_is_eapol(const struct sk_buff *skb) -{ - const struct ieee80211_hdr *hdr; - u16 fc; - int hdrlen; - - if (unlikely(skb->len < 10)) - return 0; - - hdr = (const struct ieee80211_hdr *) skb->data; - fc = le16_to_cpu(hdr->frame_control); - - if (unlikely(!WLAN_FC_DATA_PRESENT(fc))) - return 0; - - hdrlen = ieee80211_get_hdrlen(fc); - - if (unlikely(skb->len >= hdrlen + sizeof(eapol_header) && - memcmp(skb->data + hdrlen, eapol_header, - sizeof(eapol_header)) == 0)) - return 1; - - return 0; -} - void ieee80211_tx_set_iswep(struct ieee80211_txrx_data *tx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) tx->skb->data; @@ -312,45 +302,35 @@ int ieee80211_frame_duration(struct ieee80211_local *local, size_t len, } /* Exported duration function for driver use */ -__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, int if_id, +__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, size_t frame_len, int rate) { struct ieee80211_local *local = hw_to_local(hw); - struct net_device *bdev = dev_get_by_index(&init_net, if_id); - struct ieee80211_sub_if_data *sdata; + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); u16 dur; int erp; - if (unlikely(!bdev)) - return 0; - - sdata = IEEE80211_DEV_TO_SUB_IF(bdev); erp = ieee80211_is_erp_rate(hw->conf.phymode, rate); - dur = ieee80211_frame_duration(local, frame_len, rate, - erp, sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE); + dur = ieee80211_frame_duration(local, frame_len, rate, erp, + sdata->bss_conf.use_short_preamble); - dev_put(bdev); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_generic_frame_duration); -__le16 ieee80211_rts_duration(struct ieee80211_hw *hw, int if_id, - size_t frame_len, +__le16 ieee80211_rts_duration(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_control *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; - struct net_device *bdev = dev_get_by_index(&init_net, if_id); - struct ieee80211_sub_if_data *sdata; - int short_preamble; + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + bool short_preamble; int erp; u16 dur; - if (unlikely(!bdev)) - return 0; - - sdata = IEEE80211_DEV_TO_SUB_IF(bdev); - short_preamble = sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE; + short_preamble = sdata->bss_conf.use_short_preamble; rate = frame_txctl->rts_rate; erp = !!(rate->flags & IEEE80211_RATE_ERP); @@ -365,28 +345,23 @@ __le16 ieee80211_rts_duration(struct ieee80211_hw *hw, int if_id, dur += ieee80211_frame_duration(local, 10, rate->rate, erp, short_preamble); - dev_put(bdev); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_rts_duration); -__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, int if_id, +__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, size_t frame_len, const struct ieee80211_tx_control *frame_txctl) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate; - struct net_device *bdev = dev_get_by_index(&init_net, if_id); - struct ieee80211_sub_if_data *sdata; - int short_preamble; + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + bool short_preamble; int erp; u16 dur; - if (unlikely(!bdev)) - return 0; - - sdata = IEEE80211_DEV_TO_SUB_IF(bdev); - short_preamble = sdata->flags & IEEE80211_SDATA_SHORT_PREAMBLE; + short_preamble = sdata->bss_conf.use_short_preamble; rate = frame_txctl->rts_rate; erp = !!(rate->flags & IEEE80211_RATE_ERP); @@ -400,7 +375,6 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, int if_id, erp, short_preamble); } - dev_put(bdev); return cpu_to_le16(dur); } EXPORT_SYMBOL(ieee80211_ctstoself_duration); @@ -484,3 +458,37 @@ void ieee80211_wake_queues(struct ieee80211_hw *hw) ieee80211_wake_queue(hw, i); } EXPORT_SYMBOL(ieee80211_wake_queues); + +void ieee80211_iterate_active_interfaces( + struct ieee80211_hw *hw, + void (*iterator)(void *data, u8 *mac, + struct ieee80211_vif *vif), + void *data) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + + rcu_read_lock(); + + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case IEEE80211_IF_TYPE_INVALID: + case IEEE80211_IF_TYPE_MNTR: + case IEEE80211_IF_TYPE_VLAN: + continue; + case IEEE80211_IF_TYPE_AP: + case IEEE80211_IF_TYPE_STA: + case IEEE80211_IF_TYPE_IBSS: + case IEEE80211_IF_TYPE_WDS: + break; + } + if (sdata->dev == local->mdev) + continue; + if (netif_running(sdata->dev)) + iterator(data, sdata->dev->dev_addr, + &sdata->vif); + } + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces); diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index b5f3413403b..a0cff72a580 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -349,16 +349,6 @@ static int wep_encrypt_skb(struct ieee80211_txrx_data *tx, struct sk_buff *skb) ieee80211_txrx_result ieee80211_crypto_wep_encrypt(struct ieee80211_txrx_data *tx) { - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) tx->skb->data; - u16 fc; - - fc = le16_to_cpu(hdr->frame_control); - - if (((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_DATA && - ((fc & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT || - (fc & IEEE80211_FCTL_STYPE) != IEEE80211_STYPE_AUTH))) - return TXRX_CONTINUE; - tx->u.tx.control->iv_len = WEP_IV_LEN; tx->u.tx.control->icv_len = WEP_ICV_LEN; ieee80211_tx_set_iswep(tx); diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 5b8a157975a..4e236599dd3 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -28,6 +28,7 @@ struct ieee80211_sched_data struct sk_buff_head requeued[TC_80211_MAX_QUEUES]; }; +static const char llc_ip_hdr[8] = {0xAA, 0xAA, 0x3, 0, 0, 0, 0x08, 0}; /* given a data frame determine the 802.1p/1d tag to use */ static inline unsigned classify_1d(struct sk_buff *skb, struct Qdisc *qd) @@ -54,12 +55,12 @@ static inline unsigned classify_1d(struct sk_buff *skb, struct Qdisc *qd) return skb->priority - 256; /* check there is a valid IP header present */ - offset = ieee80211_get_hdrlen_from_skb(skb) + 8 /* LLC + proto */; - if (skb->protocol != __constant_htons(ETH_P_IP) || - skb->len < offset + sizeof(*ip)) + offset = ieee80211_get_hdrlen_from_skb(skb); + if (skb->len < offset + sizeof(llc_ip_hdr) + sizeof(*ip) || + memcmp(skb->data + offset, llc_ip_hdr, sizeof(llc_ip_hdr))) return 0; - ip = (struct iphdr *) (skb->data + offset); + ip = (struct iphdr *) (skb->data + offset + sizeof(llc_ip_hdr)); dscp = ip->tos & 0xfc; if (dscp & 0x1c) @@ -296,16 +297,16 @@ static void wme_qdiscop_destroy(struct Qdisc* qd) /* called whenever parameters are updated on existing qdisc */ -static int wme_qdiscop_tune(struct Qdisc *qd, struct rtattr *opt) +static int wme_qdiscop_tune(struct Qdisc *qd, struct nlattr *opt) { /* struct ieee80211_sched_data *q = qdisc_priv(qd); */ /* check our options block is the right size */ /* copy any options to our local structure */ /* Ignore options block for now - always use static mapping - struct tc_ieee80211_qopt *qopt = RTA_DATA(opt); + struct tc_ieee80211_qopt *qopt = nla_data(opt); - if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + if (opt->nla_len < nla_attr_size(sizeof(*qopt))) return -EINVAL; memcpy(q->tag2queue, qopt->tag2queue, sizeof(qopt->tag2queue)); */ @@ -314,7 +315,7 @@ static int wme_qdiscop_tune(struct Qdisc *qd, struct rtattr *opt) /* called during initial creation of qdisc on device */ -static int wme_qdiscop_init(struct Qdisc *qd, struct rtattr *opt) +static int wme_qdiscop_init(struct Qdisc *qd, struct nlattr *opt) { struct ieee80211_sched_data *q = qdisc_priv(qd); struct net_device *dev = qd->dev; @@ -369,10 +370,10 @@ static int wme_qdiscop_dump(struct Qdisc *qd, struct sk_buff *skb) struct tc_ieee80211_qopt opt; memcpy(&opt.tag2queue, q->tag2queue, TC_80211_MAX_TAG + 1); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); */ return skb->len; /* -rtattr_failure: +nla_put_failure: skb_trim(skb, p - skb->data);*/ return -1; } @@ -443,7 +444,7 @@ static void wme_classop_put(struct Qdisc *q, unsigned long cl) static int wme_classop_change(struct Qdisc *qd, u32 handle, u32 parent, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { unsigned long cl = *arg; struct ieee80211_local *local = wdev_priv(qd->dev->ieee80211_ptr); @@ -527,7 +528,7 @@ static struct tcf_proto ** wme_classop_find_tcf(struct Qdisc *qd, /* this qdisc is classful (i.e. has classes, some of which may have leaf qdiscs attached) * - these are the operations on the classes */ -static struct Qdisc_class_ops class_ops = +static const struct Qdisc_class_ops class_ops = { .graft = wme_classop_graft, .leaf = wme_classop_leaf, @@ -547,7 +548,7 @@ static struct Qdisc_class_ops class_ops = /* queueing discipline operations */ -static struct Qdisc_ops wme_qdisc_ops = +static struct Qdisc_ops wme_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &class_ops, diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 20cec1cb956..6f04311cf0a 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -245,16 +245,9 @@ static int tkip_encrypt_skb(struct ieee80211_txrx_data *tx, ieee80211_txrx_result ieee80211_crypto_tkip_encrypt(struct ieee80211_txrx_data *tx) { - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) tx->skb->data; - u16 fc; struct sk_buff *skb = tx->skb; int wpa_test = 0, test = 0; - fc = le16_to_cpu(hdr->frame_control); - - if (!WLAN_FC_DATA_PRESENT(fc)) - return TXRX_CONTINUE; - tx->u.tx.control->icv_len = TKIP_ICV_LEN; tx->u.tx.control->iv_len = TKIP_IV_LEN; ieee80211_tx_set_iswep(tx); @@ -501,16 +494,9 @@ static int ccmp_encrypt_skb(struct ieee80211_txrx_data *tx, ieee80211_txrx_result ieee80211_crypto_ccmp_encrypt(struct ieee80211_txrx_data *tx) { - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) tx->skb->data; - u16 fc; struct sk_buff *skb = tx->skb; int test = 0; - fc = le16_to_cpu(hdr->frame_control); - - if (!WLAN_FC_DATA_PRESENT(fc)) - return TXRX_CONTINUE; - tx->u.tx.control->icv_len = CCMP_MIC_LEN; tx->u.tx.control->iv_len = CCMP_HDR_LEN; ieee80211_tx_set_iswep(tx); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 21a9fcc0379..daf5b881064 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -2,21 +2,20 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER config NETFILTER_NETLINK - tristate "Netfilter netlink interface" - help - If this option is enabled, the kernel will include support - for the new netfilter netlink interface. + tristate config NETFILTER_NETLINK_QUEUE tristate "Netfilter NFQUEUE over NFNETLINK interface" - depends on NETFILTER_NETLINK + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK help If this option is enabled, the kernel will include support for queueing packets via NFNETLINK. config NETFILTER_NETLINK_LOG tristate "Netfilter LOG over NFNETLINK interface" - depends on NETFILTER_NETLINK + default m if NETFILTER_ADVANCED=n + select NETFILTER_NETLINK help If this option is enabled, the kernel will include support for logging packets via NFNETLINK. @@ -25,9 +24,9 @@ config NETFILTER_NETLINK_LOG and is also scheduled to replace the old syslog-based ipt_LOG and ip6t_LOG modules. -# Rename this to NF_CONNTRACK in a 2.6.25 -config NF_CONNTRACK_ENABLED +config NF_CONNTRACK tristate "Netfilter connection tracking support" + default m if NETFILTER_ADVANCED=n help Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -40,12 +39,9 @@ config NF_CONNTRACK_ENABLED To compile it as a module, choose M here. If unsure, say N. -config NF_CONNTRACK - tristate - default NF_CONNTRACK_ENABLED - config NF_CT_ACCT bool "Connection tracking flow accounting" + depends on NETFILTER_ADVANCED depends on NF_CONNTRACK help If this option is enabled, the connection tracking code will @@ -58,6 +54,7 @@ config NF_CT_ACCT config NF_CONNTRACK_MARK bool 'Connection mark tracking support' + depends on NETFILTER_ADVANCED depends on NF_CONNTRACK help This option enables support for connection marks, used by the @@ -68,6 +65,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' depends on NF_CONNTRACK && NETWORK_SECMARK + default m if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from @@ -78,8 +76,9 @@ config NF_CONNTRACK_SECMARK If unsure, say 'N'. config NF_CONNTRACK_EVENTS - bool "Connection tracking events (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK + bool "Connection tracking events" + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code @@ -94,7 +93,7 @@ config NF_CT_PROTO_GRE config NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' depends on EXPERIMENTAL && NF_CONNTRACK - default n + depends on NETFILTER_ADVANCED help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -103,8 +102,9 @@ config NF_CT_PROTO_SCTP <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. config NF_CT_PROTO_UDPLITE - tristate 'UDP-Lite protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK + tristate 'UDP-Lite protocol connection tracking support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on UDP-Lite @@ -115,6 +115,7 @@ config NF_CT_PROTO_UDPLITE config NF_CONNTRACK_AMANDA tristate "Amanda backup protocol support" depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP help @@ -130,6 +131,7 @@ config NF_CONNTRACK_AMANDA config NF_CONNTRACK_FTP tristate "FTP protocol support" depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n help Tracking FTP connections is problematic: special helpers are required for tracking them, and doing masquerading and other forms @@ -142,8 +144,9 @@ config NF_CONNTRACK_FTP To compile it as a module, choose M here. If unsure, say N. config NF_CONNTRACK_H323 - tristate "H.323 protocol support (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK && (IPV6 || IPV6=n) + tristate "H.323 protocol support" + depends on NF_CONNTRACK && (IPV6 || IPV6=n) + depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most important VoIP protocols, it is widely used by voice hardware and @@ -163,6 +166,7 @@ config NF_CONNTRACK_H323 config NF_CONNTRACK_IRC tristate "IRC protocol support" depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n help There is a commonly-used extension to IRC called Direct Client-to-Client Protocol (DCC). This enables users to send @@ -176,8 +180,9 @@ config NF_CONNTRACK_IRC To compile it as a module, choose M here. If unsure, say N. config NF_CONNTRACK_NETBIOS_NS - tristate "NetBIOS name service protocol support (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK + tristate "NetBIOS name service protocol support" + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help NetBIOS name service requests are sent as broadcast messages from an unprivileged port and responded to with unicast messages to the @@ -197,6 +202,7 @@ config NF_CONNTRACK_NETBIOS_NS config NF_CONNTRACK_PPTP tristate "PPtP protocol support" depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED select NF_CT_PROTO_GRE help This module adds support for PPTP (Point to Point Tunnelling @@ -216,6 +222,7 @@ config NF_CONNTRACK_PPTP config NF_CONNTRACK_SANE tristate "SANE protocol support (EXPERIMENTAL)" depends on EXPERIMENTAL && NF_CONNTRACK + depends on NETFILTER_ADVANCED help SANE is a protocol for remote access to scanners as implemented by the 'saned' daemon. Like FTP, it uses separate control and @@ -227,8 +234,9 @@ config NF_CONNTRACK_SANE To compile it as a module, choose M here. If unsure, say N. config NF_CONNTRACK_SIP - tristate "SIP protocol support (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK + tristate "SIP protocol support" + depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n help SIP is an application-layer control protocol that can establish, modify, and terminate multimedia sessions (conferences) such as @@ -241,6 +249,7 @@ config NF_CONNTRACK_SIP config NF_CONNTRACK_TFTP tristate "TFTP protocol support" depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help TFTP connection tracking helper, this is required depending on how restrictive your ruleset is. @@ -250,15 +259,17 @@ config NF_CONNTRACK_TFTP To compile it as a module, choose M here. If unsure, say N. config NF_CT_NETLINK - tristate 'Connection tracking netlink interface (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK && NETFILTER_NETLINK - depends on NF_CONNTRACK!=y || NETFILTER_NETLINK!=m + tristate 'Connection tracking netlink interface' + depends on NF_CONNTRACK + select NETFILTER_NETLINK depends on NF_NAT=n || NF_NAT + default m if NETFILTER_ADVANCED=n help This option enables support for a netlink-based userspace interface config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" + default m if NETFILTER_ADVANCED=n help This is required if you intend to use any of ip_tables, ip6_tables or arp_tables. @@ -268,6 +279,7 @@ config NETFILTER_XTABLES config NETFILTER_XT_TARGET_CLASSIFY tristate '"CLASSIFY" target support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `CLASSIFY' target, which enables the user to set the priority of a packet. Some qdiscs can use this value for @@ -282,31 +294,38 @@ config NETFILTER_XT_TARGET_CONNMARK depends on NETFILTER_XTABLES depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. Similar to the MARK target, but affects the connection mark value rather than the packet mark value. - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. The module will be called ipt_CONNMARK.ko. If unsure, say `N'. config NETFILTER_XT_TARGET_DSCP - tristate '"DSCP" target support' + tristate '"DSCP" and "TOS" target support' depends on NETFILTER_XTABLES depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED help This option adds a `DSCP' target, which allows you to manipulate the IPv4/IPv6 header DSCP field (differentiated services codepoint). The DSCP field can have any value between 0x0 and 0x3f inclusive. + It also adds the "TOS" target, which allows you to create rules in + the "mangle" table which alter the Type Of Service field of an IPv4 + or the Priority field of an IPv6 packet, prior to routing. + To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' depends on NETFILTER_XTABLES + default m if NETFILTER_ADVANCED=n help This option adds a `MARK' target, which allows you to create rules in the `mangle' table which alter the netfilter mark (nfmark) field @@ -320,6 +339,7 @@ config NETFILTER_XT_TARGET_MARK config NETFILTER_XT_TARGET_NFQUEUE tristate '"NFQUEUE" target Support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This target replaced the old obsolete QUEUE target. @@ -331,6 +351,7 @@ config NETFILTER_XT_TARGET_NFQUEUE config NETFILTER_XT_TARGET_NFLOG tristate '"NFLOG" target support' depends on NETFILTER_XTABLES + default m if NETFILTER_ADVANCED=n help This option enables the NFLOG target, which allows to LOG messages through the netfilter logging API, which can use @@ -344,19 +365,32 @@ config NETFILTER_XT_TARGET_NOTRACK depends on NETFILTER_XTABLES depends on IP_NF_RAW || IP6_NF_RAW depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help The NOTRACK target allows a select rule to specify which packets *not* to enter the conntrack/NAT subsystem with all the consequences (no ICMP error tracking, no protocol helpers for the selected packets). - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_TARGET_RATEEST + tristate '"RATEEST" target support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + help + This option adds a `RATEEST' target, which allows to measure + rates similar to TC estimators. The `rateest' match can be + used to match on the measured rates. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_TRACE tristate '"TRACE" target support' depends on NETFILTER_XTABLES depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED help The TRACE target allows you to mark packets so that the kernel will log every rule which match the packets as those traverse @@ -368,6 +402,7 @@ config NETFILTER_XT_TARGET_TRACE config NETFILTER_XT_TARGET_SECMARK tristate '"SECMARK" target support' depends on NETFILTER_XTABLES && NETWORK_SECMARK + default m if NETFILTER_ADVANCED=n help The SECMARK target allows security marking of network packets, for use with security subsystems. @@ -377,6 +412,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_CONNSECMARK tristate '"CONNSECMARK" target support' depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK + default m if NETFILTER_ADVANCED=n help The CONNSECMARK target copies security markings from packets to connections, and restores security markings from connections @@ -388,6 +424,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' depends on NETFILTER_XTABLES && (IPV6 || IPV6=n) + default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the MSS value of TCP SYN packets, to control the maximum size for that @@ -411,9 +448,19 @@ config NETFILTER_XT_TARGET_TCPMSS To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_TCPOPTSTRIP + tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' + depends on EXPERIMENTAL && NETFILTER_XTABLES + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option adds a "TCPOPTSTRIP" target, which allows you to strip + TCP options from TCP packets. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `comment' dummy-match, which allows you to put comments in your iptables ruleset. @@ -425,6 +472,7 @@ config NETFILTER_XT_MATCH_CONNBYTES tristate '"connbytes" per-connection counter match support' depends on NETFILTER_XTABLES depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED select NF_CT_ACCT help This option adds a `connbytes' match, which allows you to match the @@ -437,6 +485,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT tristate '"connlimit" match support"' depends on NETFILTER_XTABLES depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED ---help--- This match allows you to match against the number of parallel connections to a server per client IP address (or address block). @@ -445,11 +494,12 @@ config NETFILTER_XT_MATCH_CONNMARK tristate '"connmark" connection mark match support' depends on NETFILTER_XTABLES depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. The module will be called ipt_connmark.ko. If unsure, say `N'. @@ -458,6 +508,7 @@ config NETFILTER_XT_MATCH_CONNTRACK tristate '"conntrack" connection tracking match support' depends on NETFILTER_XTABLES depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n help This is a general conntrack match module, a superset of the state match. @@ -468,8 +519,9 @@ config NETFILTER_XT_MATCH_CONNTRACK To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_MATCH_DCCP - tristate '"DCCP" protocol match support' + tristate '"dccp" protocol match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help With this option enabled, you will be able to use the iptables `dccp' match in order to match on DCCP source/destination ports @@ -479,19 +531,25 @@ config NETFILTER_XT_MATCH_DCCP <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. config NETFILTER_XT_MATCH_DSCP - tristate '"DSCP" match support' + tristate '"dscp" and "tos" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `DSCP' match, which allows you to match against the IPv4/IPv6 header DSCP field (differentiated services codepoint). The DSCP field can have any value between 0x0 and 0x3f inclusive. + It will also add a "tos" match, which allows you to match packets + based on the Type Of Service fields of the IPv4 packet (which share + the same bits as DSCP). + To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_MATCH_ESP - tristate '"ESP" match support' + tristate '"esp" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This match extension allows you to match a range of SPIs inside ESP header of IPSec packets. @@ -502,15 +560,28 @@ config NETFILTER_XT_MATCH_HELPER tristate '"helper" match support' depends on NETFILTER_XTABLES depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED help Helper matching allows you to match packets in dynamic connections tracked by a conntrack-helper, ie. ip_conntrack_ftp To compile it as a module, choose M here. If unsure, say Y. +config NETFILTER_XT_MATCH_IPRANGE + tristate '"iprange" address range match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + ---help--- + This option adds a "iprange" match, which allows you to match based on + an IP address range. (Normal iptables only matches on single addresses + with an optional mask.) + + If unsure, say M. + config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option allows you to match the length of a packet against a specific value or range of values. @@ -520,6 +591,7 @@ config NETFILTER_XT_MATCH_LENGTH config NETFILTER_XT_MATCH_LIMIT tristate '"limit" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help limit matching allows you to control the rate at which a rule can be matched: mainly useful in combination with the LOG target ("LOG @@ -530,6 +602,7 @@ config NETFILTER_XT_MATCH_LIMIT config NETFILTER_XT_MATCH_MAC tristate '"mac" address match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help MAC matching allows you to match packets based on the source Ethernet address of the packet. @@ -539,6 +612,7 @@ config NETFILTER_XT_MATCH_MAC config NETFILTER_XT_MATCH_MARK tristate '"mark" match support' depends on NETFILTER_XTABLES + default m if NETFILTER_ADVANCED=n help Netfilter mark matching allows you to match packets based on the `nfmark' value in the packet. This can be set by the MARK target @@ -546,9 +620,19 @@ config NETFILTER_XT_MATCH_MARK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_OWNER + tristate '"owner" match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + ---help--- + Socket owner matching allows you to match locally-generated packets + based on who created the socket: the user or group. It is also + possible to check whether a socket actually exists. + config NETFILTER_XT_MATCH_POLICY tristate 'IPsec "policy" match support' depends on NETFILTER_XTABLES && XFRM + default m if NETFILTER_ADVANCED=n help Policy matching allows you to match packets based on the IPsec policy that was used during decapsulation/will @@ -557,8 +641,9 @@ config NETFILTER_XT_MATCH_POLICY To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_MATCH_MULTIPORT - tristate "Multiple port match support" + tristate '"multiport" Multiple port match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help Multiport matching allows you to match TCP or UDP packets based on a series of source or destination ports: normally a rule can only @@ -569,6 +654,7 @@ config NETFILTER_XT_MATCH_MULTIPORT config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER + depends on NETFILTER_ADVANCED help Physdev packet matching matches against the physical bridge ports the IP packet arrived on or will leave by. @@ -578,6 +664,7 @@ config NETFILTER_XT_MATCH_PHYSDEV config NETFILTER_XT_MATCH_PKTTYPE tristate '"pkttype" packet type match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help Packet type matching allows you to match a packet by its "class", eg. BROADCAST, MULTICAST, ... @@ -590,6 +677,7 @@ config NETFILTER_XT_MATCH_PKTTYPE config NETFILTER_XT_MATCH_QUOTA tristate '"quota" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `quota' match, which allows to match on a byte counter. @@ -597,23 +685,36 @@ config NETFILTER_XT_MATCH_QUOTA If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +config NETFILTER_XT_MATCH_RATEEST + tristate '"rateest" match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_RATEEST + help + This option adds a `rateest' match, which allows to match on the + rate estimated by the RATEEST target. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED select NET_CLS_ROUTE help This option adds a `realm' match, which allows you to use the realm key from the routing subsystem inside iptables. - + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option in tc world. - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' depends on NETFILTER_XTABLES && EXPERIMENTAL + depends on NETFILTER_ADVANCED help With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports @@ -626,6 +727,7 @@ config NETFILTER_XT_MATCH_STATE tristate '"state" match support' depends on NETFILTER_XTABLES depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n help Connection state matching allows you to match packets based on their relationship to a tracked connection (ie. previous packets). This @@ -636,6 +738,7 @@ config NETFILTER_XT_MATCH_STATE config NETFILTER_XT_MATCH_STATISTIC tristate '"statistic" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `statistic' match, which allows you to match on packets periodically or randomly with a given percentage. @@ -645,6 +748,7 @@ config NETFILTER_XT_MATCH_STATISTIC config NETFILTER_XT_MATCH_STRING tristate '"string" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP select TEXTSEARCH_BM @@ -658,6 +762,7 @@ config NETFILTER_XT_MATCH_STRING config NETFILTER_XT_MATCH_TCPMSS tristate '"tcpmss" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help This option adds a `tcpmss' match, which allows you to examine the MSS value of TCP SYN packets, which control the maximum packet size @@ -668,6 +773,7 @@ config NETFILTER_XT_MATCH_TCPMSS config NETFILTER_XT_MATCH_TIME tristate '"time" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED ---help--- This option adds a "time" match, which allows you to match based on the packet arrival time (at the machine which netfilter is running) @@ -682,6 +788,7 @@ config NETFILTER_XT_MATCH_TIME config NETFILTER_XT_MATCH_U32 tristate '"u32" match support' depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED ---help--- u32 allows you to extract quantities of up to 4 bytes from a packet, AND them with specified masks, shift them by specified amounts and @@ -695,6 +802,7 @@ config NETFILTER_XT_MATCH_U32 config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index ad0e36ebea3..ea7508387f9 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -4,7 +4,6 @@ nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_exp nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o obj-$(CONFIG_NETFILTER) = netfilter.o -obj-$(CONFIG_SYSCTL) += nf_sysctl.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o @@ -46,8 +45,10 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o # matches @@ -61,15 +62,18 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index bed9ba01e8e..c4065b8f9a9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -26,10 +26,10 @@ static DEFINE_MUTEX(afinfo_mutex); -struct nf_afinfo *nf_afinfo[NPROTO] __read_mostly; +const struct nf_afinfo *nf_afinfo[NPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); -int nf_register_afinfo(struct nf_afinfo *afinfo) +int nf_register_afinfo(const struct nf_afinfo *afinfo) { int err; @@ -42,7 +42,7 @@ int nf_register_afinfo(struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_register_afinfo); -void nf_unregister_afinfo(struct nf_afinfo *afinfo) +void nf_unregister_afinfo(const struct nf_afinfo *afinfo) { mutex_lock(&afinfo_mutex); rcu_assign_pointer(nf_afinfo[afinfo->family], NULL); @@ -51,28 +51,23 @@ void nf_unregister_afinfo(struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -/* In this code, we can be waiting indefinitely for userspace to - * service a packet if a hook returns NF_QUEUE. We could keep a count - * of skbuffs queued for userspace, and not deregister a hook unless - * this is zero, but that sucks. Now, we simply check when the - * packets come back: if the hook is gone, the packet is discarded. */ struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { - struct list_head *i; + struct nf_hook_ops *elem; int err; err = mutex_lock_interruptible(&nf_hook_mutex); if (err < 0) return err; - list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { - if (reg->priority < ((struct nf_hook_ops *)i)->priority) + list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + if (reg->priority < elem->priority) break; } - list_add_rcu(®->list, i->prev); + list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); return 0; } @@ -183,8 +178,7 @@ next_hook: } else if (verdict == NF_DROP) { kfree_skb(skb); ret = -EPERM; - } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - NFDEBUG("nf_hook: Verdict = QUEUE.\n"); + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, verdict >> NF_VERDICT_BITS)) goto next_hook; @@ -217,22 +211,6 @@ int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) } EXPORT_SYMBOL(skb_make_writable); -void nf_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr) -{ - __be32 diff[] = { ~from, to }; - if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_partial(diff, sizeof(diff), - ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); - } else if (pseudohdr) - *sum = ~csum_fold(csum_partial(diff, sizeof(diff), - csum_unfold(*sum))); -} -EXPORT_SYMBOL(nf_proto_csum_replace4); - #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence @@ -294,3 +272,12 @@ void __init netfilter_init(void) if (netfilter_log_init() < 0) panic("cannot initialize nf_log"); } + +#ifdef CONFIG_SYSCTL +struct ctl_path nf_net_netfilter_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "netfilter", .ctl_name = NET_NETFILTER, }, + { } +}; +EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path); +#endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a4d5cdeb011..078fff0335a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -81,7 +81,7 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, ((__force __u16)tuple->src.u.all << 16) | (__force __u16)tuple->dst.u.all); - return jhash_2words(a, b, rnd) % size; + return ((u64)jhash_2words(a, b, rnd) * size) >> 32; } static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple) @@ -831,10 +831,8 @@ EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NLA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), - &tuple->src.u.tcp.port); - NLA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), - &tuple->dst.u.tcp.port); + NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port); + NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port); return 0; nla_put_failure: @@ -854,8 +852,8 @@ int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) return -EINVAL; - t->src.u.tcp.port = *(__be16 *)nla_data(tb[CTA_PROTO_SRC_PORT]); - t->dst.u.tcp.port = *(__be16 *)nla_data(tb[CTA_PROTO_DST_PORT]); + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); return 0; } @@ -863,7 +861,7 @@ EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); #endif /* Used by ipt_REJECT and ip6t_REJECT. */ -void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -880,7 +878,6 @@ void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) nskb->nfctinfo = ctinfo; nf_conntrack_get(nskb->nfct); } -EXPORT_SYMBOL_GPL(__nf_conntrack_attach); static inline int do_iter(const struct nf_conntrack_tuple_hash *i, @@ -1124,7 +1121,7 @@ int __init nf_conntrack_init(void) goto out_fini_expect; /* For use by REJECT target */ - rcu_assign_pointer(ip_ct_attach, __nf_conntrack_attach); + rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); /* Set up fake conntrack: diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 175c8d1a199..e0cd9d00aa6 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -73,15 +73,17 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) { + unsigned int hash; + if (unlikely(!nf_ct_expect_hash_rnd_initted)) { get_random_bytes(&nf_ct_expect_hash_rnd, 4); nf_ct_expect_hash_rnd_initted = 1; } - return jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), + hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hash_rnd) % - nf_ct_expect_hsize; + (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hash_rnd); + return ((u64)hash * nf_ct_expect_hsize) >> 32; } struct nf_conntrack_expect * @@ -226,8 +228,8 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, int family, - union nf_conntrack_address *saddr, - union nf_conntrack_address *daddr, + union nf_inet_addr *saddr, + union nf_inet_addr *daddr, u_int8_t proto, __be16 *src, __be16 *dst) { int len; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 6df259067f7..6770baf2e84 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -358,7 +358,7 @@ static int help(struct sk_buff *skb, unsigned int matchlen, matchoff; struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info; struct nf_conntrack_expect *exp; - union nf_conntrack_address *daddr; + union nf_inet_addr *daddr; struct nf_conntrack_man cmd = {}; unsigned int i; int found = 0, ends_in_nl; diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index a869403b229..ff66fba514f 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -100,10 +100,10 @@ typedef struct { } bitstr_t; /* Tool Functions */ -#define INC_BIT(bs) if((++bs->bit)>7){bs->cur++;bs->bit=0;} -#define INC_BITS(bs,b) if((bs->bit+=b)>7){bs->cur+=bs->bit>>3;bs->bit&=7;} -#define BYTE_ALIGN(bs) if(bs->bit){bs->cur++;bs->bit=0;} -#define CHECK_BOUND(bs,n) if(bs->cur+(n)>bs->end)return(H323_ERROR_BOUND) +#define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} +#define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} +#define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} +#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) static unsigned get_len(bitstr_t * bs); static unsigned get_bit(bitstr_t * bs); static unsigned get_bits(bitstr_t * bs, unsigned b); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index f23fd9598e1..872c1aa3124 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -50,12 +50,12 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned char **data, int dataoff, H245_TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) __read_mostly; int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned char **data, int dataoff, TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) __read_mostly; int (*set_sig_addr_hook) (struct sk_buff *skb, struct nf_conn *ct, @@ -214,7 +214,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, /****************************************************************************/ static int get_h245_addr(struct nf_conn *ct, unsigned char *data, H245_TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 *port) + union nf_inet_addr *addr, __be16 *port) { unsigned char *p; int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; @@ -257,7 +257,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, int ret = 0; __be16 port; __be16 rtp_port, rtcp_port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *rtp_exp; struct nf_conntrack_expect *rtcp_exp; typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp; @@ -330,7 +330,7 @@ static int expect_t120(struct sk_buff *skb, int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_t120_hook) nat_t120; @@ -623,7 +623,7 @@ static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = { /****************************************************************************/ int get_h225_addr(struct nf_conn *ct, unsigned char *data, TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 *port) + union nf_inet_addr *addr, __be16 *port) { unsigned char *p; int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; @@ -662,7 +662,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_h245_hook) nat_h245; @@ -704,13 +704,19 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, /* If the calling party is on the same side of the forward-to party, * we don't need to track the second call */ -static int callforward_do_filter(union nf_conntrack_address *src, - union nf_conntrack_address *dst, +static int callforward_do_filter(union nf_inet_addr *src, + union nf_inet_addr *dst, int family) { + const struct nf_afinfo *afinfo; struct flowi fl1, fl2; int ret = 0; + /* rcu_read_lock()ed by nf_hook_slow() */ + afinfo = nf_get_afinfo(family); + if (!afinfo) + return 0; + memset(&fl1, 0, sizeof(fl1)); memset(&fl2, 0, sizeof(fl2)); @@ -720,8 +726,8 @@ static int callforward_do_filter(union nf_conntrack_address *src, fl1.fl4_dst = src->ip; fl2.fl4_dst = dst->ip; - if (ip_route_output_key(&rt1, &fl1) == 0) { - if (ip_route_output_key(&rt2, &fl2) == 0) { + if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) { + if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { if (rt1->rt_gateway == rt2->rt_gateway && rt1->u.dst.dev == rt2->u.dst.dev) ret = 1; @@ -731,16 +737,15 @@ static int callforward_do_filter(union nf_conntrack_address *src, } break; } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if defined(CONFIG_NF_CONNTRACK_IPV6) || \ + defined(CONFIG_NF_CONNTRACK_IPV6_MODULE) case AF_INET6: { struct rt6_info *rt1, *rt2; memcpy(&fl1.fl6_dst, src, sizeof(fl1.fl6_dst)); memcpy(&fl2.fl6_dst, dst, sizeof(fl2.fl6_dst)); - rt1 = (struct rt6_info *)ip6_route_output(NULL, &fl1); - if (rt1) { - rt2 = (struct rt6_info *)ip6_route_output(NULL, &fl2); - if (rt2) { + if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) { + if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) { if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, sizeof(rt1->rt6i_gateway)) && rt1->u.dst.dev == rt2->u.dst.dev) @@ -767,7 +772,7 @@ static int expect_callforwarding(struct sk_buff *skb, int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_callforwarding_hook) nat_callforwarding; @@ -823,7 +828,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, int ret; int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; typeof(set_h225_addr_hook) set_h225_addr; pr_debug("nf_ct_q931: Setup\n"); @@ -1195,7 +1200,7 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, /****************************************************************************/ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, - union nf_conntrack_address *addr, + union nf_inet_addr *addr, __be16 port) { struct nf_conntrack_expect *exp; @@ -1237,7 +1242,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, int ret = 0; int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(nat_q931_hook) nat_q931; @@ -1306,7 +1311,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; pr_debug("nf_ct_ras: GCF\n"); @@ -1466,7 +1471,7 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct, struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; int dir = CTINFO2DIR(ctinfo); __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; typeof(set_h225_addr_hook) set_h225_addr; pr_debug("nf_ct_ras: ARQ\n"); @@ -1508,7 +1513,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; typeof(set_sig_addr_hook) set_sig_addr; @@ -1571,7 +1576,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int ret = 0; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; struct nf_conntrack_expect *exp; pr_debug("nf_ct_ras: LCF\n"); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index 991c52c9a28..8e914e5ffea 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -55,12 +55,6 @@ static int generic_print_tuple(struct seq_file *s, return 0; } -static int generic_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -75,7 +69,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .print_tuple = generic_print_tuple, - .print_conntrack = generic_print_conntrack, .get_l4proto = generic_get_l4proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7d231243754..38141f104db 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -59,7 +59,7 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; - NLA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + NLA_PUT_U8(skb, CTA_PROTO_NUM, tuple->dst.protonum); if (likely(l4proto->tuple_to_nlattr)) ret = l4proto->tuple_to_nlattr(skb, tuple); @@ -95,7 +95,7 @@ nla_put_failure: return -1; } -static inline int +static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { @@ -120,8 +120,7 @@ ctnetlink_dump_tuples(struct sk_buff *skb, static inline int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 status = htonl((u_int32_t) ct->status); - NLA_PUT(skb, CTA_STATUS, sizeof(status), &status); + NLA_PUT_BE32(skb, CTA_STATUS, htonl(ct->status)); return 0; nla_put_failure: @@ -131,15 +130,12 @@ nla_put_failure: static inline int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) { - long timeout_l = ct->timeout.expires - jiffies; - __be32 timeout; + long timeout = (ct->timeout.expires - jiffies) / HZ; - if (timeout_l < 0) + if (timeout < 0) timeout = 0; - else - timeout = htonl(timeout_l / HZ); - NLA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + NLA_PUT_BE32(skb, CTA_TIMEOUT, htonl(timeout)); return 0; nla_put_failure: @@ -193,7 +189,7 @@ ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED); if (!nest_helper) goto nla_put_failure; - NLA_PUT(skb, CTA_HELP_NAME, strlen(helper->name), helper->name); + NLA_PUT_STRING(skb, CTA_HELP_NAME, helper->name); if (helper->to_nlattr) helper->to_nlattr(skb, ct); @@ -209,23 +205,21 @@ nla_put_failure: } #ifdef CONFIG_NF_CT_ACCT -static inline int +static int ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir) { enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nlattr *nest_count; - __be32 tmp; nest_count = nla_nest_start(skb, type | NLA_F_NESTED); if (!nest_count) goto nla_put_failure; - tmp = htonl(ct->counters[dir].packets); - NLA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); - - tmp = htonl(ct->counters[dir].bytes); - NLA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp); + NLA_PUT_BE32(skb, CTA_COUNTERS32_PACKETS, + htonl(ct->counters[dir].packets)); + NLA_PUT_BE32(skb, CTA_COUNTERS32_BYTES, + htonl(ct->counters[dir].bytes)); nla_nest_end(skb, nest_count); @@ -242,9 +236,7 @@ nla_put_failure: static inline int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 mark = htonl(ct->mark); - - NLA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + NLA_PUT_BE32(skb, CTA_MARK, htonl(ct->mark)); return 0; nla_put_failure: @@ -254,11 +246,95 @@ nla_put_failure: #define ctnetlink_dump_mark(a, b) (0) #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static inline int +ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark)); + return 0; + +nla_put_failure: + return -1; +} +#else +#define ctnetlink_dump_secmark(a, b) (0) +#endif + +#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) + +static inline int +ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + if (!(ct->status & IPS_EXPECTED)) + return 0; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static int +dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_BE32(skb, CTA_NAT_SEQ_CORRECTION_POS, + htonl(natseq->correction_pos)); + NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, + htonl(natseq->offset_before)); + NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_AFTER, + htonl(natseq->offset_after)); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nf_nat_seq *natseq; + struct nf_conn_nat *nat = nfct_nat(ct); + + if (!(ct->status & IPS_SEQ_ADJUST) || !nat) + return 0; + + natseq = &nat->seq[IP_CT_DIR_ORIGINAL]; + if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) + return -1; + + natseq = &nat->seq[IP_CT_DIR_REPLY]; + if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) + return -1; + + return 0; +} +#else +#define ctnetlink_dump_nat_seq_adj(a, b) (0) +#endif + static inline int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 id = htonl((unsigned long)ct); - NLA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct)); return 0; nla_put_failure: @@ -268,9 +344,7 @@ nla_put_failure: static inline int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { - __be32 use = htonl(atomic_read(&ct->ct_general.use)); - - NLA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); + NLA_PUT_BE32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))); return 0; nla_put_failure: @@ -320,8 +394,11 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, ctnetlink_dump_protoinfo(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secmark(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || - ctnetlink_dump_use(skb, ct) < 0) + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0 || + ctnetlink_dump_nat_seq_adj(skb, ct) < 0) goto nla_put_failure; nlh->nlmsg_len = skb_tail_pointer(skb) - b; @@ -419,11 +496,24 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, && ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if ((events & IPCT_SECMARK || ct->secmark) + && ctnetlink_dump_secmark(skb, ct) < 0) + goto nla_put_failure; +#endif if (events & IPCT_COUNTER_FILLING && (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)) goto nla_put_failure; + + if (events & IPCT_RELATED && + ctnetlink_dump_master(skb, ct) < 0) + goto nla_put_failure; + + if (events & IPCT_NATSEQADJ && + ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + goto nla_put_failure; } nlh->nlmsg_len = skb->tail - b; @@ -444,7 +534,7 @@ static int ctnetlink_done(struct netlink_callback *cb) return 0; } -#define L3PROTO(ct) ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num +#define L3PROTO(ct) (ct)->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) @@ -542,7 +632,7 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, if (!tb[CTA_PROTO_NUM]) return -EINVAL; - tuple->dst.protonum = *(u_int8_t *)nla_data(tb[CTA_PROTO_NUM]); + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); @@ -558,7 +648,7 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, return ret; } -static inline int +static int ctnetlink_parse_tuple(struct nlattr *cda[], struct nf_conntrack_tuple *tuple, enum ctattr_tuple type, u_int8_t l3num) { @@ -605,7 +695,7 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr, struct nf_nat_range *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; - struct nf_nat_protocol *npt; + const struct nf_nat_protocol *npt; int err; err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); @@ -647,12 +737,12 @@ nfnetlink_parse_nat(struct nlattr *nat, return err; if (tb[CTA_NAT_MINIP]) - range->min_ip = *(__be32 *)nla_data(tb[CTA_NAT_MINIP]); + range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]); if (!tb[CTA_NAT_MAXIP]) range->max_ip = range->min_ip; else - range->max_ip = *(__be32 *)nla_data(tb[CTA_NAT_MAXIP]); + range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); if (range->min_ip) range->flags |= IP_NAT_RANGE_MAP_IPS; @@ -722,7 +812,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, ct = nf_ct_tuplehash_to_ctrack(h); if (cda[CTA_ID]) { - u_int32_t id = ntohl(*(__be32 *)nla_data(cda[CTA_ID])); + u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); if (id != (u32)(unsigned long)ct) { nf_ct_put(ct); return -ENOENT; @@ -798,11 +888,11 @@ out: return err; } -static inline int +static int ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[]) { unsigned long d; - unsigned int status = ntohl(*(__be32 *)nla_data(cda[CTA_STATUS])); + unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); d = ct->status ^ status; if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) @@ -828,19 +918,17 @@ ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[]) if (nfnetlink_parse_nat(cda[CTA_NAT_DST], ct, &range) < 0) return -EINVAL; - if (nf_nat_initialized(ct, - HOOK2MANIP(NF_IP_PRE_ROUTING))) + if (nf_nat_initialized(ct, IP_NAT_MANIP_DST)) return -EEXIST; - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); } if (cda[CTA_NAT_SRC]) { if (nfnetlink_parse_nat(cda[CTA_NAT_SRC], ct, &range) < 0) return -EINVAL; - if (nf_nat_initialized(ct, - HOOK2MANIP(NF_IP_POST_ROUTING))) + if (nf_nat_initialized(ct, IP_NAT_MANIP_SRC)) return -EEXIST; - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); } #endif } @@ -904,7 +992,7 @@ ctnetlink_change_helper(struct nf_conn *ct, struct nlattr *cda[]) static inline int ctnetlink_change_timeout(struct nf_conn *ct, struct nlattr *cda[]) { - u_int32_t timeout = ntohl(*(__be32 *)nla_data(cda[CTA_TIMEOUT])); + u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); if (!del_timer(&ct->timeout)) return -ETIME; @@ -935,6 +1023,66 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, struct nlattr *cda[]) return err; } +#ifdef CONFIG_NF_NAT_NEEDED +static inline int +change_nat_seq_adj(struct nf_nat_seq *natseq, struct nlattr *attr) +{ + struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; + + nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, NULL); + + if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) + return -EINVAL; + + natseq->correction_pos = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); + + if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) + return -EINVAL; + + natseq->offset_before = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); + + if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) + return -EINVAL; + + natseq->offset_after = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); + + return 0; +} + +static int +ctnetlink_change_nat_seq_adj(struct nf_conn *ct, struct nlattr *cda[]) +{ + int ret = 0; + struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + + if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { + ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_NAT_SEQ_ADJ_ORIG]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { + ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], + cda[CTA_NAT_SEQ_ADJ_REPLY]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + return 0; +} +#endif + static int ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[]) { @@ -966,7 +1114,15 @@ ctnetlink_change_conntrack(struct nf_conn *ct, struct nlattr *cda[]) #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) - ct->mark = ntohl(*(__be32 *)nla_data(cda[CTA_MARK])); + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); +#endif + +#ifdef CONFIG_NF_NAT_NEEDED + if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_nat_seq_adj(ct, cda); + if (err < 0) + return err; + } #endif return 0; @@ -989,7 +1145,7 @@ ctnetlink_create_conntrack(struct nlattr *cda[], if (!cda[CTA_TIMEOUT]) goto err; - ct->timeout.expires = ntohl(*(__be32 *)nla_data(cda[CTA_TIMEOUT])); + ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); ct->timeout.expires = jiffies + ct->timeout.expires * HZ; ct->status |= IPS_CONFIRMED; @@ -1008,7 +1164,7 @@ ctnetlink_create_conntrack(struct nlattr *cda[], #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) - ct->mark = ntohl(*(__be32 *)nla_data(cda[CTA_MARK])); + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); #endif helper = nf_ct_helper_find_get(rtuple); @@ -1193,13 +1349,15 @@ nla_put_failure: return -1; } -static inline int +static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) { struct nf_conn *master = exp->master; - __be32 timeout = htonl((exp->timeout.expires - jiffies) / HZ); - __be32 id = htonl((unsigned long)exp); + long timeout = (exp->timeout.expires - jiffies) / HZ; + + if (timeout < 0) + timeout = 0; if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) goto nla_put_failure; @@ -1210,8 +1368,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, CTA_EXPECT_MASTER) < 0) goto nla_put_failure; - NLA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); - NLA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); + NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); + NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); return 0; @@ -1384,7 +1542,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return -ENOENT; if (cda[CTA_EXPECT_ID]) { - __be32 id = *(__be32 *)nla_data(cda[CTA_EXPECT_ID]); + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (ntohl(id) != (u32)(unsigned long)exp) { nf_ct_expect_put(exp); return -ENOENT; @@ -1438,7 +1596,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, return -ENOENT; if (cda[CTA_EXPECT_ID]) { - __be32 id = *(__be32 *)nla_data(cda[CTA_EXPECT_ID]); + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (ntohl(id) != (u32)(unsigned long)exp) { nf_ct_expect_put(exp); return -ENOENT; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 6d947068c58..8595b5946ac 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -36,11 +36,11 @@ static DEFINE_MUTEX(nf_ct_proto_mutex); #ifdef CONFIG_SYSCTL static int -nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_table *path, +nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path, struct ctl_table *table, unsigned int *users) { if (*header == NULL) { - *header = nf_register_sysctl_table(path, table); + *header = register_sysctl_paths(path, table); if (*header == NULL) return -ENOMEM; } @@ -55,7 +55,8 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, { if (users != NULL && --*users > 0) return; - nf_unregister_sysctl_table(*header, table); + + unregister_sysctl_table(*header); *header = NULL; } #endif diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 13f81917964..22c5dcb6306 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -40,13 +40,6 @@ static int generic_print_tuple(struct seq_file *s, return 0; } -/* Print out the private part of the conntrack. */ -static int generic_print_conntrack(struct seq_file *s, - const struct nf_conn *state) -{ - return 0; -} - /* Returns verdict for packet, or -1 for invalid. */ static int packet(struct nf_conn *conntrack, const struct sk_buff *skb, @@ -104,7 +97,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .print_tuple = generic_print_tuple, - .print_conntrack = generic_print_conntrack, .packet = packet, .new = new, #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index cb046751059..21d29e782ba 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -49,24 +49,15 @@ static const char *sctp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int nf_ct_sctp_timeout_closed __read_mostly = 10 SECS; -static unsigned int nf_ct_sctp_timeout_cookie_wait __read_mostly = 3 SECS; -static unsigned int nf_ct_sctp_timeout_cookie_echoed __read_mostly = 3 SECS; -static unsigned int nf_ct_sctp_timeout_established __read_mostly = 5 DAYS; -static unsigned int nf_ct_sctp_timeout_shutdown_sent __read_mostly = 300 SECS / 1000; -static unsigned int nf_ct_sctp_timeout_shutdown_recd __read_mostly = 300 SECS / 1000; -static unsigned int nf_ct_sctp_timeout_shutdown_ack_sent __read_mostly = 3 SECS; - -static unsigned int * sctp_timeouts[] -= { NULL, /* SCTP_CONNTRACK_NONE */ - &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ - &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ - &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ - &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ - &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ - &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ - &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ - }; +static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { + [SCTP_CONNTRACK_CLOSED] = 10 SECS, + [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, + [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, + [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, +}; #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED @@ -110,7 +101,7 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ @@ -173,29 +164,28 @@ static int sctp_print_tuple(struct seq_file *s, } /* Print out the private part of the conntrack. */ -static int sctp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) +static int sctp_print_conntrack(struct seq_file *s, const struct nf_conn *ct) { enum sctp_conntrack state; read_lock_bh(&sctp_lock); - state = conntrack->proto.sctp.state; + state = ct->proto.sctp.state; read_unlock_bh(&sctp_lock); return seq_printf(s, "%s ", sctp_conntrack_names[state]); } #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ -for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \ - offset < skb->len && \ - (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ - offset += (ntohs(sch->length) + 3) & ~3, count++) +for ((offset) = (dataoff) + sizeof(sctp_sctphdr_t), (count) = 0; \ + (offset) < (skb)->len && \ + ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ + (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ -static int do_basic_checks(struct nf_conn *conntrack, +static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - char *map) + unsigned long *map) { u_int32_t offset, count; sctp_chunkhdr_t _sch, *sch; @@ -206,76 +196,83 @@ static int do_basic_checks(struct nf_conn *conntrack, for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK - || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK || + sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; - } /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ - if (((sch->type == SCTP_CID_COOKIE_ACK - || sch->type == SCTP_CID_COOKIE_ECHO - || flag) - && count !=0) || !sch->length) { + if (((sch->type == SCTP_CID_COOKIE_ACK || + sch->type == SCTP_CID_COOKIE_ECHO || + flag) && + count != 0) || !sch->length) { pr_debug("Basic checks failed\n"); return 1; } - if (map) { - set_bit(sch->type, (void *)map); - } + if (map) + set_bit(sch->type, map); } pr_debug("Basic checks passed\n"); return count == 0; } -static int new_state(enum ip_conntrack_dir dir, - enum sctp_conntrack cur_state, - int chunk_type) +static int sctp_new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) { int i; pr_debug("Chunk type: %d\n", chunk_type); switch (chunk_type) { - case SCTP_CID_INIT: - pr_debug("SCTP_CID_INIT\n"); - i = 0; break; - case SCTP_CID_INIT_ACK: - pr_debug("SCTP_CID_INIT_ACK\n"); - i = 1; break; - case SCTP_CID_ABORT: - pr_debug("SCTP_CID_ABORT\n"); - i = 2; break; - case SCTP_CID_SHUTDOWN: - pr_debug("SCTP_CID_SHUTDOWN\n"); - i = 3; break; - case SCTP_CID_SHUTDOWN_ACK: - pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); - i = 4; break; - case SCTP_CID_ERROR: - pr_debug("SCTP_CID_ERROR\n"); - i = 5; break; - case SCTP_CID_COOKIE_ECHO: - pr_debug("SCTP_CID_COOKIE_ECHO\n"); - i = 6; break; - case SCTP_CID_COOKIE_ACK: - pr_debug("SCTP_CID_COOKIE_ACK\n"); - i = 7; break; - case SCTP_CID_SHUTDOWN_COMPLETE: - pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); - i = 8; break; - default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ - pr_debug("Unknown chunk type, Will stay in %s\n", - sctp_conntrack_names[cur_state]); - return cur_state; + case SCTP_CID_INIT: + pr_debug("SCTP_CID_INIT\n"); + i = 0; + break; + case SCTP_CID_INIT_ACK: + pr_debug("SCTP_CID_INIT_ACK\n"); + i = 1; + break; + case SCTP_CID_ABORT: + pr_debug("SCTP_CID_ABORT\n"); + i = 2; + break; + case SCTP_CID_SHUTDOWN: + pr_debug("SCTP_CID_SHUTDOWN\n"); + i = 3; + break; + case SCTP_CID_SHUTDOWN_ACK: + pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; + break; + case SCTP_CID_ERROR: + pr_debug("SCTP_CID_ERROR\n"); + i = 5; + break; + case SCTP_CID_COOKIE_ECHO: + pr_debug("SCTP_CID_COOKIE_ECHO\n"); + i = 6; + break; + case SCTP_CID_COOKIE_ACK: + pr_debug("SCTP_CID_COOKIE_ACK\n"); + i = 7; + break; + case SCTP_CID_SHUTDOWN_COMPLETE: + pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; + break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + pr_debug("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; } pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", @@ -285,154 +282,145 @@ static int new_state(enum ip_conntrack_dir dir, return sctp_conntracks[dir][i][cur_state]; } -/* Returns verdict for packet, or -1 for invalid. */ -static int sctp_packet(struct nf_conn *conntrack, +/* Returns verdict for packet, or -NF_ACCEPT for invalid. */ +static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, int pf, unsigned int hooknum) { - enum sctp_conntrack newconntrack, oldsctpstate; + enum sctp_conntrack new_state, old_state; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); sctp_sctphdr_t _sctph, *sh; sctp_chunkhdr_t _sch, *sch; u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) - return -1; + goto out; - if (do_basic_checks(conntrack, skb, dataoff, map) != 0) - return -1; + if (do_basic_checks(ct, skb, dataoff, map) != 0) + goto out; /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) - && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) - && !test_bit(SCTP_CID_ABORT, (void *)map) - && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) - && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); - return -1; + goto out; } - oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + old_state = new_state = SCTP_CONNTRACK_MAX; + write_lock_bh(&sctp_lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - write_lock_bh(&sctp_lock); - /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* Sec 8.5.1 (A) */ - if (sh->vtag != 0) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != 0) + goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* Sec 8.5.1 (B) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir]) + goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* Sec 8.5.1 (C) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) - && !(sh->vtag == conntrack->proto.sctp.vtag - [1 - CTINFO2DIR(ctinfo)] - && (sch->flags & 1))) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir] && + sch->flags & SCTP_CHUNK_FLAG_T) + goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* Sec 8.5.1 (D) */ - if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (sh->vtag != ct->proto.sctp.vtag[dir]) + goto out_unlock; } - oldsctpstate = conntrack->proto.sctp.state; - newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); + old_state = ct->proto.sctp.state; + new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ - if (newconntrack == SCTP_CONNTRACK_MAX) { + if (new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " "conntrack=%u\n", - CTINFO2DIR(ctinfo), sch->type, oldsctpstate); - write_unlock_bh(&sctp_lock); - return -1; + dir, sch->type, old_state); + goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT - || sch->type == SCTP_CID_INIT_ACK) { + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK) { sctp_inithdr_t _inithdr, *ih; ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), sizeof(_inithdr), &_inithdr); - if (ih == NULL) { - write_unlock_bh(&sctp_lock); - return -1; - } + if (ih == NULL) + goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", - ih->init_tag, !CTINFO2DIR(ctinfo)); - conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; + ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; } - conntrack->proto.sctp.state = newconntrack; - if (oldsctpstate != newconntrack) + ct->proto.sctp.state = new_state; + if (old_state != new_state) nf_conntrack_event_cache(IPCT_PROTOINFO, skb); - write_unlock_bh(&sctp_lock); } + write_unlock_bh(&sctp_lock); - nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]); - if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED - && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY - && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { + if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && + dir == IP_CT_DIR_REPLY && + new_state == SCTP_CONNTRACK_ESTABLISHED) { pr_debug("Setting assured bit\n"); - set_bit(IPS_ASSURED_BIT, &conntrack->status); + set_bit(IPS_ASSURED_BIT, &ct->status); nf_conntrack_event_cache(IPCT_STATUS, skb); } return NF_ACCEPT; + +out_unlock: + write_unlock_bh(&sctp_lock); +out: + return -NF_ACCEPT; } /* Called when a new connection for this protocol found. */ -static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, +static int sctp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff) { - enum sctp_conntrack newconntrack; + enum sctp_conntrack new_state; sctp_sctphdr_t _sctph, *sh; sctp_chunkhdr_t _sch, *sch; u_int32_t offset, count; - char map[256 / sizeof (char)] = {0}; + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) return 0; - if (do_basic_checks(conntrack, skb, dataoff, map) != 0) + if (do_basic_checks(ct, skb, dataoff, map) != 0) return 0; /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ - if ((test_bit (SCTP_CID_ABORT, (void *)map)) - || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) - || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + if (test_bit(SCTP_CID_ABORT, map) || + test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || + test_bit(SCTP_CID_COOKIE_ACK, map)) return 0; - } - newconntrack = SCTP_CONNTRACK_MAX; + new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Don't need lock here: this conntrack not in circulation yet */ - newconntrack = new_state(IP_CT_DIR_ORIGINAL, - SCTP_CONNTRACK_NONE, sch->type); + new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ - if (newconntrack == SCTP_CONNTRACK_NONE || - newconntrack == SCTP_CONNTRACK_MAX) { + if (new_state == SCTP_CONNTRACK_NONE || + new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return 0; } @@ -450,7 +438,7 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, pr_debug("Setting vtag %x for new conn\n", ih->init_tag); - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else { /* Sec 8.5.1 (A) */ @@ -462,10 +450,10 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, else { pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); - conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } - conntrack->proto.sctp.state = newconntrack; + ct->proto.sctp.state = new_state; } return 1; @@ -477,49 +465,49 @@ static struct ctl_table_header *sctp_sysctl_header; static struct ctl_table sctp_sysctl_table[] = { { .procname = "nf_conntrack_sctp_timeout_closed", - .data = &nf_ct_sctp_timeout_closed, + .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_cookie_wait", - .data = &nf_ct_sctp_timeout_cookie_wait, + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_cookie_echoed", - .data = &nf_ct_sctp_timeout_cookie_echoed, + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_established", - .data = &nf_ct_sctp_timeout_established, + .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_shutdown_sent", - .data = &nf_ct_sctp_timeout_shutdown_sent, + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_shutdown_recd", - .data = &nf_ct_sctp_timeout_shutdown_recd, + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &nf_ct_sctp_timeout_shutdown_ack_sent, + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -533,49 +521,49 @@ static struct ctl_table sctp_sysctl_table[] = { static struct ctl_table sctp_compat_sysctl_table[] = { { .procname = "ip_conntrack_sctp_timeout_closed", - .data = &nf_ct_sctp_timeout_closed, + .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_cookie_wait", - .data = &nf_ct_sctp_timeout_cookie_wait, + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_cookie_echoed", - .data = &nf_ct_sctp_timeout_cookie_echoed, + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_established", - .data = &nf_ct_sctp_timeout_established, + .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_shutdown_sent", - .data = &nf_ct_sctp_timeout_shutdown_sent, + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_shutdown_recd", - .data = &nf_ct_sctp_timeout_shutdown_recd, + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", - .data = &nf_ct_sctp_timeout_shutdown_ack_sent, + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -598,6 +586,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .packet = sctp_packet, .new = sctp_new, .me = THIS_MODULE, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif #ifdef CONFIG_SYSCTL .ctl_table_users = &sctp_sysctl_table_users, .ctl_table_header = &sctp_sysctl_header, @@ -619,6 +612,11 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .packet = sctp_packet, .new = sctp_new, .me = THIS_MODULE, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif #ifdef CONFIG_SYSCTL .ctl_table_users = &sctp_sysctl_table_users, .ctl_table_header = &sctp_sysctl_header, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7a3f64c1aca..64c9b910419 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -24,6 +24,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> /* Protects conntrack->proto.tcp */ static DEFINE_RWLOCK(tcp_lock); @@ -63,32 +64,21 @@ static const char *tcp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int nf_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_established __read_mostly = 5 DAYS; -static unsigned int nf_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close_wait __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_last_ack __read_mostly = 30 SECS; -static unsigned int nf_ct_tcp_timeout_time_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close __read_mostly = 10 SECS; - /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; -static unsigned int * tcp_timeouts[] = { - NULL, /* TCP_CONNTRACK_NONE */ - &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ - &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ - &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ - &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ - &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ - &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ - &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ - &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ - NULL, /* TCP_CONNTRACK_LISTEN */ - }; +static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { + [TCP_CONNTRACK_SYN_SENT] = 2 MINS, + [TCP_CONNTRACK_SYN_RECV] = 60 SECS, + [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, + [TCP_CONNTRACK_LAST_ACK] = 30 SECS, + [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE] = 10 SECS, +}; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT @@ -148,7 +138,7 @@ enum tcp_bit_set { * if they are invalid * or we do not support the request (simultaneous open) */ -static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { +static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ @@ -783,9 +773,7 @@ static int tcp_error(struct sk_buff *skb, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && - ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, @@ -942,8 +930,8 @@ static int tcp_packet(struct nf_conn *conntrack, || new_state == TCP_CONNTRACK_CLOSE)) conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans - && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans - ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + && tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans + ? nf_ct_tcp_timeout_max_retrans : tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); @@ -1074,14 +1062,13 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; - NLA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), - &ct->proto.tcp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state); - NLA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, sizeof(u_int8_t), - &ct->proto.tcp.seen[0].td_scale); + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + ct->proto.tcp.seen[0].td_scale); - NLA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, sizeof(u_int8_t), - &ct->proto.tcp.seen[1].td_scale); + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, + ct->proto.tcp.seen[1].td_scale); tmp.flags = ct->proto.tcp.seen[0].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, @@ -1128,8 +1115,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) return -EINVAL; write_lock_bh(&tcp_lock); - ct->proto.tcp.state = - *(u_int8_t *)nla_data(tb[CTA_PROTOINFO_TCP_STATE]); + ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = @@ -1149,10 +1135,10 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { - ct->proto.tcp.seen[0].td_scale = *(u_int8_t *) - nla_data(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); - ct->proto.tcp.seen[1].td_scale = *(u_int8_t *) - nla_data(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); + ct->proto.tcp.seen[0].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); + ct->proto.tcp.seen[1].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } write_unlock_bh(&tcp_lock); @@ -1166,56 +1152,56 @@ static struct ctl_table_header *tcp_sysctl_header; static struct ctl_table tcp_sysctl_table[] = { { .procname = "nf_conntrack_tcp_timeout_syn_sent", - .data = &nf_ct_tcp_timeout_syn_sent, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_syn_recv", - .data = &nf_ct_tcp_timeout_syn_recv, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_established", - .data = &nf_ct_tcp_timeout_established, + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_fin_wait", - .data = &nf_ct_tcp_timeout_fin_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close_wait", - .data = &nf_ct_tcp_timeout_close_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_last_ack", - .data = &nf_ct_tcp_timeout_last_ack, + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_time_wait", - .data = &nf_ct_tcp_timeout_time_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close", - .data = &nf_ct_tcp_timeout_close, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -1260,56 +1246,56 @@ static struct ctl_table tcp_sysctl_table[] = { static struct ctl_table tcp_compat_sysctl_table[] = { { .procname = "ip_conntrack_tcp_timeout_syn_sent", - .data = &nf_ct_tcp_timeout_syn_sent, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_syn_recv", - .data = &nf_ct_tcp_timeout_syn_recv, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_established", - .data = &nf_ct_tcp_timeout_established, + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_fin_wait", - .data = &nf_ct_tcp_timeout_fin_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close_wait", - .data = &nf_ct_tcp_timeout_close_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_last_ack", - .data = &nf_ct_tcp_timeout_last_ack, + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_time_wait", - .data = &nf_ct_tcp_timeout_time_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close", - .data = &nf_ct_tcp_timeout_close, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index b3e7ecb080e..38487541108 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -21,6 +21,7 @@ #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ; static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ; @@ -59,13 +60,6 @@ static int udp_print_tuple(struct seq_file *s, ntohs(tuple->dst.u.udp.port)); } -/* Print out the private part of the conntrack. */ -static int udp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* Returns verdict for packet, and may modify conntracktype */ static int udp_packet(struct nf_conn *conntrack, const struct sk_buff *skb, @@ -128,9 +122,7 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff, * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. * FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && - ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, @@ -194,7 +186,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, - .print_conntrack = udp_print_conntrack, .packet = udp_packet, .new = udp_new, .error = udp_error, @@ -222,7 +213,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, - .print_conntrack = udp_print_conntrack, .packet = udp_packet, .new = udp_new, .error = udp_error, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index b8981dd922b..070056d9bcd 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -22,6 +22,7 @@ #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> static unsigned int nf_ct_udplite_timeout __read_mostly = 30*HZ; static unsigned int nf_ct_udplite_timeout_stream __read_mostly = 180*HZ; @@ -58,13 +59,6 @@ static int udplite_print_tuple(struct seq_file *s, ntohs(tuple->dst.u.udp.port)); } -/* Print out the private part of the conntrack. */ -static int udplite_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* Returns verdict for packet, and may modify conntracktype */ static int udplite_packet(struct nf_conn *conntrack, const struct sk_buff *skb, @@ -133,8 +127,7 @@ static int udplite_error(struct sk_buff *skb, unsigned int dataoff, /* Checksum invalid? Ignore. */ if (nf_conntrack_checksum && !skb_csum_unnecessary(skb) && - ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING))) { + hooknum == NF_INET_PRE_ROUTING) { if (pf == PF_INET) { struct iphdr *iph = ip_hdr(skb); @@ -198,7 +191,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, - .print_conntrack = udplite_print_conntrack, .packet = udplite_packet, .new = udplite_new, .error = udplite_error, @@ -222,7 +214,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, - .print_conntrack = udplite_print_conntrack, .packet = udplite_packet, .new = udplite_new, .error = udplite_error, diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 515abffc4a0..47d8947cf26 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -247,7 +247,7 @@ static int skp_digits_len(struct nf_conn *ct, const char *dptr, } static int parse_addr(struct nf_conn *ct, const char *cp, const char **endp, - union nf_conntrack_address *addr, const char *limit) + union nf_inet_addr *addr, const char *limit) { const char *end; int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; @@ -275,7 +275,7 @@ static int parse_addr(struct nf_conn *ct, const char *cp, const char **endp, static int epaddr_len(struct nf_conn *ct, const char *dptr, const char *limit, int *shift) { - union nf_conntrack_address addr; + union nf_inet_addr addr; const char *aux = dptr; if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { @@ -366,7 +366,7 @@ EXPORT_SYMBOL_GPL(ct_sip_get_info); static int set_expected_rtp(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - union nf_conntrack_address *addr, + union nf_inet_addr *addr, __be16 port, const char *dptr) { @@ -403,7 +403,7 @@ static int sip_help(struct sk_buff *skb, enum ip_conntrack_info ctinfo) { int family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - union nf_conntrack_address addr; + union nf_inet_addr addr; unsigned int dataoff, datalen; const char *dptr; int ret = NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9efdd37fc19..696074a037c 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -142,10 +142,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) return -ENOSPC; - if (l3proto->print_conntrack(s, conntrack)) - return -ENOSPC; - - if (l4proto->print_conntrack(s, conntrack)) + if (l4proto->print_conntrack && l4proto->print_conntrack(s, conntrack)) return -ENOSPC; if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, @@ -383,15 +380,11 @@ static ctl_table nf_ct_netfilter_table[] = { { .ctl_name = 0 } }; -static ctl_table nf_ct_net_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = nf_ct_netfilter_table, - }, - { .ctl_name = 0 } +struct ctl_path nf_ct_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { } }; + EXPORT_SYMBOL_GPL(nf_ct_log_invalid); #endif /* CONFIG_SYSCTL */ @@ -418,7 +411,8 @@ static int __init nf_conntrack_standalone_init(void) proc_stat->owner = THIS_MODULE; #endif #ifdef CONFIG_SYSCTL - nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table); + nf_ct_sysctl_header = register_sysctl_paths(nf_ct_path, + nf_ct_netfilter_table); if (nf_ct_sysctl_header == NULL) { printk("nf_conntrack: can't register to sysctl.\n"); ret = -ENOMEM; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index d67c4fbf603..4f5f2885fca 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -6,6 +6,7 @@ #include <linux/netfilter.h> #include <linux/seq_file.h> #include <net/protocol.h> +#include <net/netfilter/nf_log.h> #include "nf_internals.h" @@ -14,12 +15,12 @@ #define NF_LOG_PREFIXLEN 128 -static struct nf_logger *nf_loggers[NPROTO]; +static const struct nf_logger *nf_loggers[NPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); /* return EBUSY if somebody else is registered, EEXIST if the same logger * is registred, 0 on success. */ -int nf_log_register(int pf, struct nf_logger *logger) +int nf_log_register(int pf, const struct nf_logger *logger) { int ret; @@ -57,7 +58,7 @@ void nf_log_unregister_pf(int pf) } EXPORT_SYMBOL(nf_log_unregister_pf); -void nf_log_unregister(struct nf_logger *logger) +void nf_log_unregister(const struct nf_logger *logger) { int i; @@ -77,12 +78,12 @@ void nf_log_packet(int pf, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - struct nf_loginfo *loginfo, + const struct nf_loginfo *loginfo, const char *fmt, ...) { va_list args; char prefix[NF_LOG_PREFIXLEN]; - struct nf_logger *logger; + const struct nf_logger *logger; rcu_read_lock(); logger = rcu_dereference(nf_loggers[pf]); @@ -90,7 +91,6 @@ void nf_log_packet(int pf, va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); - /* We must read logging before nf_logfn[pf] */ logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); } else if (net_ratelimit()) { printk(KERN_WARNING "nf_log_packet: can\'t log since " diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 0cef1433d66..bfc2928c191 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -7,6 +7,7 @@ #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/protocol.h> +#include <net/netfilter/nf_queue.h> #include "nf_internals.h" @@ -15,13 +16,13 @@ * long term mutex. The handler must provide an an outfn() to accept packets * for queueing and must reinject all packets it receives, no matter what. */ -static struct nf_queue_handler *queue_handler[NPROTO]; +static const struct nf_queue_handler *queue_handler[NPROTO]; static DEFINE_MUTEX(queue_handler_mutex); /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. */ -int nf_register_queue_handler(int pf, struct nf_queue_handler *qh) +int nf_register_queue_handler(int pf, const struct nf_queue_handler *qh) { int ret; @@ -44,7 +45,7 @@ int nf_register_queue_handler(int pf, struct nf_queue_handler *qh) EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -int nf_unregister_queue_handler(int pf, struct nf_queue_handler *qh) +int nf_unregister_queue_handler(int pf, const struct nf_queue_handler *qh) { if (pf >= NPROTO) return -EINVAL; @@ -64,7 +65,7 @@ int nf_unregister_queue_handler(int pf, struct nf_queue_handler *qh) } EXPORT_SYMBOL(nf_unregister_queue_handler); -void nf_unregister_queue_handlers(struct nf_queue_handler *qh) +void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) { int pf; @@ -79,6 +80,27 @@ void nf_unregister_queue_handlers(struct nf_queue_handler *qh) } EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); +static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +{ + /* Release those devices we held, or Alexey will kill me. */ + if (entry->indev) + dev_put(entry->indev); + if (entry->outdev) + dev_put(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + + if (nf_bridge->physindev) + dev_put(nf_bridge->physindev); + if (nf_bridge->physoutdev) + dev_put(nf_bridge->physoutdev); + } +#endif + /* Drop reference to owner of hook which queued us. */ + module_put(entry->elem->owner); +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). @@ -92,84 +114,79 @@ static int __nf_queue(struct sk_buff *skb, unsigned int queuenum) { int status; - struct nf_info *info; + struct nf_queue_entry *entry = NULL; #ifdef CONFIG_BRIDGE_NETFILTER - struct net_device *physindev = NULL; - struct net_device *physoutdev = NULL; + struct net_device *physindev; + struct net_device *physoutdev; #endif - struct nf_afinfo *afinfo; - struct nf_queue_handler *qh; + const struct nf_afinfo *afinfo; + const struct nf_queue_handler *qh; /* QUEUE == DROP if noone is waiting, to be safe. */ rcu_read_lock(); qh = rcu_dereference(queue_handler[pf]); - if (!qh) { - rcu_read_unlock(); - kfree_skb(skb); - return 1; - } + if (!qh) + goto err_unlock; afinfo = nf_get_afinfo(pf); - if (!afinfo) { - rcu_read_unlock(); - kfree_skb(skb); - return 1; - } - - info = kmalloc(sizeof(*info) + afinfo->route_key_size, GFP_ATOMIC); - if (!info) { - if (net_ratelimit()) - printk(KERN_ERR "OOM queueing packet %p\n", - skb); - rcu_read_unlock(); - kfree_skb(skb); - return 1; - } - - *info = (struct nf_info) { - (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; + if (!afinfo) + goto err_unlock; + + entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); + if (!entry) + goto err_unlock; + + *entry = (struct nf_queue_entry) { + .skb = skb, + .elem = list_entry(elem, struct nf_hook_ops, list), + .pf = pf, + .hook = hook, + .indev = indev, + .outdev = outdev, + .okfn = okfn, + }; /* If it's going away, ignore hook. */ - if (!try_module_get(info->elem->owner)) { + if (!try_module_get(entry->elem->owner)) { rcu_read_unlock(); - kfree(info); + kfree(entry); return 0; } /* Bump dev refs so they don't vanish while packet is out */ - if (indev) dev_hold(indev); - if (outdev) dev_hold(outdev); - + if (indev) + dev_hold(indev); + if (outdev) + dev_hold(outdev); #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) { physindev = skb->nf_bridge->physindev; - if (physindev) dev_hold(physindev); + if (physindev) + dev_hold(physindev); physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev) dev_hold(physoutdev); + if (physoutdev) + dev_hold(physoutdev); } #endif - afinfo->saveroute(skb, info); - status = qh->outfn(skb, info, queuenum, qh->data); + afinfo->saveroute(skb, entry); + status = qh->outfn(entry, queuenum); rcu_read_unlock(); if (status < 0) { - /* James M doesn't say fuck enough. */ - if (indev) dev_put(indev); - if (outdev) dev_put(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (physindev) dev_put(physindev); - if (physoutdev) dev_put(physoutdev); -#endif - module_put(info->elem->owner); - kfree(info); - kfree_skb(skb); - - return 1; + nf_queue_entry_release_refs(entry); + goto err; } return 1; + +err_unlock: + rcu_read_unlock(); +err: + kfree_skb(skb); + kfree(entry); + return 1; } int nf_queue(struct sk_buff *skb, @@ -212,41 +229,15 @@ int nf_queue(struct sk_buff *skb, return 1; } -void nf_reinject(struct sk_buff *skb, struct nf_info *info, - unsigned int verdict) +void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { - struct list_head *elem = &info->elem->list; - struct list_head *i; - struct nf_afinfo *afinfo; + struct sk_buff *skb = entry->skb; + struct list_head *elem = &entry->elem->list; + const struct nf_afinfo *afinfo; rcu_read_lock(); - /* Release those devices we held, or Alexey will kill me. */ - if (info->indev) dev_put(info->indev); - if (info->outdev) dev_put(info->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - if (skb->nf_bridge->physindev) - dev_put(skb->nf_bridge->physindev); - if (skb->nf_bridge->physoutdev) - dev_put(skb->nf_bridge->physoutdev); - } -#endif - - /* Drop reference to owner of hook which queued us. */ - module_put(info->elem->owner); - - list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { - if (i == elem) - break; - } - - if (i == &nf_hooks[info->pf][info->hook]) { - /* The module which sent it to userspace is gone. */ - NFDEBUG("%s: module disappeared, dropping packet.\n", - __FUNCTION__); - verdict = NF_DROP; - } + nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) { @@ -255,28 +246,30 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, } if (verdict == NF_ACCEPT) { - afinfo = nf_get_afinfo(info->pf); - if (!afinfo || afinfo->reroute(skb, info) < 0) + afinfo = nf_get_afinfo(entry->pf); + if (!afinfo || afinfo->reroute(skb, entry) < 0) verdict = NF_DROP; } if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(&nf_hooks[info->pf][info->hook], - skb, info->hook, - info->indev, info->outdev, &elem, - info->okfn, INT_MIN); + verdict = nf_iterate(&nf_hooks[entry->pf][entry->hook], + skb, entry->hook, + entry->indev, entry->outdev, &elem, + entry->okfn, INT_MIN); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: - info->okfn(skb); + local_bh_disable(); + entry->okfn(skb); + local_bh_enable(); case NF_STOLEN: break; case NF_QUEUE: - if (!__nf_queue(skb, elem, info->pf, info->hook, - info->indev, info->outdev, info->okfn, + if (!__nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, verdict >> NF_VERDICT_BITS)) goto next_hook; break; @@ -284,7 +277,7 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, kfree_skb(skb); } rcu_read_unlock(); - kfree(info); + kfree(entry); return; } EXPORT_SYMBOL(nf_reinject); @@ -317,7 +310,7 @@ static int seq_show(struct seq_file *s, void *v) { int ret; loff_t *pos = v; - struct nf_queue_handler *qh; + const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(queue_handler[*pos]); diff --git a/net/netfilter/nf_sysctl.c b/net/netfilter/nf_sysctl.c deleted file mode 100644 index ee34589e48a..00000000000 --- a/net/netfilter/nf_sysctl.c +++ /dev/null @@ -1,134 +0,0 @@ -/* nf_sysctl.c netfilter sysctl registration/unregistation - * - * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> - */ -#include <linux/module.h> -#include <linux/sysctl.h> -#include <linux/string.h> -#include <linux/slab.h> - -static void -path_free(struct ctl_table *path, struct ctl_table *table) -{ - struct ctl_table *t, *next; - - for (t = path; t != NULL && t != table; t = next) { - next = t->child; - kfree(t); - } -} - -static struct ctl_table * -path_dup(struct ctl_table *path, struct ctl_table *table) -{ - struct ctl_table *t, *last = NULL, *tmp; - - for (t = path; t != NULL; t = t->child) { - /* twice the size since path elements are terminated by an - * empty element */ - tmp = kmemdup(t, 2 * sizeof(*t), GFP_KERNEL); - if (tmp == NULL) { - if (last != NULL) - path_free(path, table); - return NULL; - } - - if (last != NULL) - last->child = tmp; - else - path = tmp; - last = tmp; - } - - if (last != NULL) - last->child = table; - else - path = table; - - return path; -} - -struct ctl_table_header * -nf_register_sysctl_table(struct ctl_table *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - - path = path_dup(path, table); - if (path == NULL) - return NULL; - header = register_sysctl_table(path); - if (header == NULL) - path_free(path, table); - return header; -} -EXPORT_SYMBOL_GPL(nf_register_sysctl_table); - -void -nf_unregister_sysctl_table(struct ctl_table_header *header, - struct ctl_table *table) -{ - struct ctl_table *path = header->ctl_table; - - unregister_sysctl_table(header); - path_free(path, table); -} -EXPORT_SYMBOL_GPL(nf_unregister_sysctl_table); - -/* net/netfilter */ -static struct ctl_table nf_net_netfilter_table[] = { - { - .ctl_name = NET_NETFILTER, - .procname = "netfilter", - .mode = 0555, - }, - { - .ctl_name = 0 - } -}; -struct ctl_table nf_net_netfilter_sysctl_path[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = nf_net_netfilter_table, - }, - { - .ctl_name = 0 - } -}; -EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path); - -/* net/ipv4/netfilter */ -static struct ctl_table nf_net_ipv4_netfilter_table[] = { - { - .ctl_name = NET_IPV4_NETFILTER, - .procname = "netfilter", - .mode = 0555, - }, - { - .ctl_name = 0 - } -}; -static struct ctl_table nf_net_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = nf_net_ipv4_netfilter_table, - }, - { - .ctl_name = 0 - } -}; -struct ctl_table nf_net_ipv4_netfilter_sysctl_path[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = nf_net_ipv4_table, - }, - { - .ctl_name = 0 - } -}; -EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 2128542995f..b75c9c4a995 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -179,7 +179,7 @@ static void nfnetlink_rcv(struct sk_buff *skb) static void __exit nfnetlink_exit(void) { printk("Removing netfilter NETLINK layer.\n"); - sock_release(nfnl->sk_socket); + netlink_kernel_release(nfnl); return; } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 2c7bd2eb029..5013cb97ce2 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -29,6 +29,7 @@ #include <linux/jhash.h> #include <linux/random.h> #include <net/sock.h> +#include <net/netfilter/nf_log.h> #include <asm/atomic.h> @@ -44,14 +45,6 @@ #define PRINTR(x, args...) do { if (net_ratelimit()) \ printk(x, ## args); } while (0); -#if 0 -#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ - __FILE__, __LINE__, __FUNCTION__, \ - ## args) -#else -#define UDEBUG(x, ...) -#endif - struct nfulnl_instance { struct hlist_node hlist; /* global list of instances */ spinlock_t lock; @@ -92,8 +85,6 @@ __instance_lookup(u_int16_t group_num) struct hlist_node *pos; struct nfulnl_instance *inst; - UDEBUG("entering (group_num=%u)\n", group_num); - head = &instance_table[instance_hashfn(group_num)]; hlist_for_each_entry(inst, pos, head, hlist) { if (inst->group_num == group_num) @@ -126,7 +117,6 @@ static void instance_put(struct nfulnl_instance *inst) { if (inst && atomic_dec_and_test(&inst->use)) { - UDEBUG("kfree(inst=%p)\n", inst); kfree(inst); module_put(THIS_MODULE); } @@ -138,23 +128,23 @@ static struct nfulnl_instance * instance_create(u_int16_t group_num, int pid) { struct nfulnl_instance *inst; - - UDEBUG("entering (group_num=%u, pid=%d)\n", group_num, - pid); + int err; write_lock_bh(&instances_lock); if (__instance_lookup(group_num)) { - inst = NULL; - UDEBUG("aborting, instance already exists\n"); + err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) + if (!inst) { + err = -ENOMEM; goto out_unlock; + } if (!try_module_get(THIS_MODULE)) { kfree(inst); + err = -EAGAIN; goto out_unlock; } @@ -177,16 +167,13 @@ instance_create(u_int16_t group_num, int pid) hlist_add_head(&inst->hlist, &instance_table[instance_hashfn(group_num)]); - UDEBUG("newly added node: %p, next=%p\n", &inst->hlist, - inst->hlist.next); - write_unlock_bh(&instances_lock); return inst; out_unlock: write_unlock_bh(&instances_lock); - return NULL; + return ERR_PTR(err); } static void __nfulnl_flush(struct nfulnl_instance *inst); @@ -195,9 +182,6 @@ static void __instance_destroy(struct nfulnl_instance *inst) { /* first pull it out of the global list */ - UDEBUG("removing instance %p (queuenum=%u) from hash\n", - inst, inst->group_num); - hlist_del(&inst->hlist); /* then flush all pending packets from skb */ @@ -305,8 +289,6 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) struct sk_buff *skb; unsigned int n; - UDEBUG("entered (%u, %u)\n", inst_size, pkt_size); - /* alloc skb which should be big enough for a whole multipart * message. WARNING: has to be <= 128k due to slab restrictions */ @@ -341,10 +323,6 @@ __nfulnl_send(struct nfulnl_instance *inst) sizeof(struct nfgenmsg)); status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); - if (status < 0) { - UDEBUG("netlink_unicast() failed\n"); - /* FIXME: statistics */ - } inst->qlen = 0; inst->skb = NULL; @@ -368,8 +346,6 @@ nfulnl_timer(unsigned long data) { struct nfulnl_instance *inst = (struct nfulnl_instance *)data; - UDEBUG("timer function called, flushing buffer\n"); - spin_lock_bh(&inst->lock); if (inst->skb) __nfulnl_send(inst); @@ -396,8 +372,6 @@ __build_packet_message(struct nfulnl_instance *inst, __be32 tmp_uint; sk_buff_data_t old_tail = inst->skb->tail; - UDEBUG("entered\n"); - nlh = NLMSG_PUT(inst->skb, 0, 0, NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, sizeof(struct nfgenmsg)); @@ -415,32 +389,27 @@ __build_packet_message(struct nfulnl_instance *inst, NLA_PUT(inst->skb, NFULA_PREFIX, plen, prefix); if (indev) { - tmp_uint = htonl(indev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint), - &tmp_uint); + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex)); #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, - sizeof(tmp_uint), &tmp_uint); + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)); /* this is the bridge group "brX" */ - tmp_uint = htonl(indev->br_port->br->dev->ifindex); - NLA_PUT(inst->skb, NFULA_IFINDEX_INDEV, - sizeof(tmp_uint), &tmp_uint); + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->br_port->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ - NLA_PUT(inst->skb, NFULA_IFINDEX_INDEV, - sizeof(tmp_uint), &tmp_uint); - if (skb->nf_bridge && skb->nf_bridge->physindev) { - tmp_uint = - htonl(skb->nf_bridge->physindev->ifindex); - NLA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, - sizeof(tmp_uint), &tmp_uint); - } + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex)); + if (skb->nf_bridge && skb->nf_bridge->physindev) + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(skb->nf_bridge->physindev->ifindex)); } #endif } @@ -448,38 +417,32 @@ __build_packet_message(struct nfulnl_instance *inst, if (outdev) { tmp_uint = htonl(outdev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint), - &tmp_uint); + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); #else if (pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, - sizeof(tmp_uint), &tmp_uint); + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)); /* this is the bridge group "brX" */ - tmp_uint = htonl(outdev->br_port->br->dev->ifindex); - NLA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, - sizeof(tmp_uint), &tmp_uint); + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->br_port->br->dev->ifindex)); } else { /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ - NLA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, - sizeof(tmp_uint), &tmp_uint); - if (skb->nf_bridge && skb->nf_bridge->physoutdev) { - tmp_uint = - htonl(skb->nf_bridge->physoutdev->ifindex); - NLA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, - sizeof(tmp_uint), &tmp_uint); - } + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); + if (skb->nf_bridge && skb->nf_bridge->physoutdev) + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(skb->nf_bridge->physoutdev->ifindex)); } #endif } - if (skb->mark) { - tmp_uint = htonl(skb->mark); - NLA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint); - } + if (skb->mark) + NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark)); if (indev && skb->dev) { struct nfulnl_msg_packet_hw phw; @@ -504,23 +467,23 @@ __build_packet_message(struct nfulnl_instance *inst, read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) { __be32 uid = htonl(skb->sk->sk_socket->file->f_uid); + __be32 gid = htons(skb->sk->sk_socket->file->f_gid); /* need to unlock here since NLA_PUT may goto */ read_unlock_bh(&skb->sk->sk_callback_lock); - NLA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid); + NLA_PUT_BE32(inst->skb, NFULA_UID, uid); + NLA_PUT_BE32(inst->skb, NFULA_GID, gid); } else read_unlock_bh(&skb->sk->sk_callback_lock); } /* local sequence number */ - if (inst->flags & NFULNL_CFG_F_SEQ) { - tmp_uint = htonl(inst->seq++); - NLA_PUT(inst->skb, NFULA_SEQ, sizeof(tmp_uint), &tmp_uint); - } + if (inst->flags & NFULNL_CFG_F_SEQ) + NLA_PUT_BE32(inst->skb, NFULA_SEQ, htonl(inst->seq++)); + /* global sequence number */ - if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) { - tmp_uint = htonl(atomic_inc_return(&global_seq)); - NLA_PUT(inst->skb, NFULA_SEQ_GLOBAL, sizeof(tmp_uint), &tmp_uint); - } + if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) + NLA_PUT_BE32(inst->skb, NFULA_SEQ_GLOBAL, + htonl(atomic_inc_return(&global_seq))); if (data_len) { struct nlattr *nla; @@ -543,7 +506,6 @@ __build_packet_message(struct nfulnl_instance *inst, return 0; nlmsg_failure: - UDEBUG("nlmsg_failure\n"); nla_put_failure: PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); return -1; @@ -604,12 +566,11 @@ nfulnl_log_packet(unsigned int pf, #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t)) /* gid */ + nla_total_size(plen) /* prefix */ + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)); - UDEBUG("initial size=%u\n", size); - spin_lock_bh(&inst->lock); if (inst->flags & NFULNL_CFG_F_SEQ) @@ -636,7 +597,6 @@ nfulnl_log_packet(unsigned int pf, data_len = inst->copy_range; size += nla_total_size(data_len); - UDEBUG("copy_packet, therefore size now %u\n", size); break; default: @@ -647,8 +607,6 @@ nfulnl_log_packet(unsigned int pf, size > skb_tailroom(inst->skb) - sizeof(struct nfgenmsg)) { /* either the queue len is too high or we don't have * enough room in the skb left. flush to userspace. */ - UDEBUG("flushing old skb\n"); - __nfulnl_flush(inst); } @@ -658,7 +616,6 @@ nfulnl_log_packet(unsigned int pf, goto alloc_failure; } - UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold); inst->qlen++; __build_packet_message(inst, skb, data_len, pf, @@ -680,7 +637,6 @@ unlock_and_release: return; alloc_failure: - UDEBUG("error allocating skb\n"); /* FIXME: statistics */ goto unlock_and_release; } @@ -703,7 +659,6 @@ nfulnl_rcv_nl_event(struct notifier_block *this, struct hlist_head *head = &instance_table[i]; hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { - UDEBUG("node = %p\n", inst); if ((n->net == &init_net) && (n->pid == inst->peer_pid)) __instance_destroy(inst); @@ -725,7 +680,7 @@ nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, return -ENOTSUPP; } -static struct nf_logger nfulnl_logger = { +static const struct nf_logger nfulnl_logger = { .name = "nfnetlink_log", .logfn = &nfulnl_log_packet, .me = THIS_MODULE, @@ -749,14 +704,17 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct nfulnl_instance *inst; int ret = 0; - UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); - inst = instance_lookup_get(group_num); + if (inst && inst->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = nfmsg->nfgen_family; struct nfulnl_msg_config_cmd *cmd; + cmd = nla_data(nfula[NFULA_CFG_CMD]); - UDEBUG("found CFG_CMD for\n"); switch (cmd->command) { case NFULNL_CFG_CMD_BIND: @@ -767,8 +725,8 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, inst = instance_create(group_num, NETLINK_CB(skb).pid); - if (!inst) { - ret = -EINVAL; + if (IS_ERR(inst)) { + ret = PTR_ERR(inst); goto out; } break; @@ -778,78 +736,71 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out; } - if (inst->peer_pid != NETLINK_CB(skb).pid) { - ret = -EPERM; - goto out_put; - } - instance_destroy(inst); goto out; case NFULNL_CFG_CMD_PF_BIND: - UDEBUG("registering log handler for pf=%u\n", pf); ret = nf_log_register(pf, &nfulnl_logger); break; case NFULNL_CFG_CMD_PF_UNBIND: - UDEBUG("unregistering log handler for pf=%u\n", pf); /* This is a bug and a feature. We cannot unregister * other handlers, like nfnetlink_inst can */ nf_log_unregister_pf(pf); break; default: - ret = -EINVAL; + ret = -ENOTSUPP; break; } - - if (!inst) - goto out; - } else { - if (!inst) { - UDEBUG("no config command, and no instance for " - "group=%u pid=%u =>ENOENT\n", - group_num, NETLINK_CB(skb).pid); - ret = -ENOENT; - goto out; - } - - if (inst->peer_pid != NETLINK_CB(skb).pid) { - UDEBUG("no config command, and wrong pid\n"); - ret = -EPERM; - goto out_put; - } } if (nfula[NFULA_CFG_MODE]) { struct nfulnl_msg_config_mode *params; params = nla_data(nfula[NFULA_CFG_MODE]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_mode(inst, params->copy_mode, ntohl(params->copy_range)); } if (nfula[NFULA_CFG_TIMEOUT]) { - __be32 timeout = - *(__be32 *)nla_data(nfula[NFULA_CFG_TIMEOUT]); + __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_timeout(inst, ntohl(timeout)); } if (nfula[NFULA_CFG_NLBUFSIZ]) { - __be32 nlbufsiz = - *(__be32 *)nla_data(nfula[NFULA_CFG_NLBUFSIZ]); + __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); } if (nfula[NFULA_CFG_QTHRESH]) { - __be32 qthresh = - *(__be32 *)nla_data(nfula[NFULA_CFG_QTHRESH]); + __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_qthresh(inst, ntohl(qthresh)); } if (nfula[NFULA_CFG_FLAGS]) { - __be16 flags = - *(__be16 *)nla_data(nfula[NFULA_CFG_FLAGS]); + __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]); + + if (!inst) { + ret = -ENODEV; + goto out; + } nfulnl_set_flags(inst, ntohs(flags)); } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 3ceeffcf6f9..51476f82bb5 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -3,6 +3,7 @@ * userspace via nfetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2007 by Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ip_queue.c: * (C) 2000-2002 James Morris <jmorris@intercode.com.au> @@ -27,6 +28,7 @@ #include <linux/netfilter/nfnetlink_queue.h> #include <linux/list.h> #include <net/sock.h> +#include <net/netfilter/nf_queue.h> #include <asm/atomic.h> @@ -36,24 +38,9 @@ #define NFQNL_QMAX_DEFAULT 1024 -#if 0 -#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ - __FILE__, __LINE__, __FUNCTION__, \ - ## args) -#else -#define QDEBUG(x, ...) -#endif - -struct nfqnl_queue_entry { - struct list_head list; - struct nf_info *info; - struct sk_buff *skb; - unsigned int id; -}; - struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ - atomic_t use; + struct rcu_head rcu; int peer_pid; unsigned int queue_maxlen; @@ -62,7 +49,7 @@ struct nfqnl_instance { unsigned int queue_dropped; unsigned int queue_user_dropped; - atomic_t id_sequence; /* 'sequence' of pkt ids */ + unsigned int id_sequence; /* 'sequence' of pkt ids */ u_int16_t queue_num; /* number of this queue */ u_int8_t copy_mode; @@ -72,12 +59,12 @@ struct nfqnl_instance { struct list_head queue_list; /* packets in queue */ }; -typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long); +typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); -static DEFINE_RWLOCK(instances_lock); +static DEFINE_SPINLOCK(instances_lock); #define INSTANCE_BUCKETS 16 -static struct hlist_head instance_table[INSTANCE_BUCKETS]; +static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly; static inline u_int8_t instance_hashfn(u_int16_t queue_num) { @@ -85,14 +72,14 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num) } static struct nfqnl_instance * -__instance_lookup(u_int16_t queue_num) +instance_lookup(u_int16_t queue_num) { struct hlist_head *head; struct hlist_node *pos; struct nfqnl_instance *inst; head = &instance_table[instance_hashfn(queue_num)]; - hlist_for_each_entry(inst, pos, head, hlist) { + hlist_for_each_entry_rcu(inst, pos, head, hlist) { if (inst->queue_num == queue_num) return inst; } @@ -100,243 +87,131 @@ __instance_lookup(u_int16_t queue_num) } static struct nfqnl_instance * -instance_lookup_get(u_int16_t queue_num) -{ - struct nfqnl_instance *inst; - - read_lock_bh(&instances_lock); - inst = __instance_lookup(queue_num); - if (inst) - atomic_inc(&inst->use); - read_unlock_bh(&instances_lock); - - return inst; -} - -static void -instance_put(struct nfqnl_instance *inst) -{ - if (inst && atomic_dec_and_test(&inst->use)) { - QDEBUG("kfree(inst=%p)\n", inst); - kfree(inst); - } -} - -static struct nfqnl_instance * instance_create(u_int16_t queue_num, int pid) { struct nfqnl_instance *inst; + unsigned int h; + int err; - QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid); - - write_lock_bh(&instances_lock); - if (__instance_lookup(queue_num)) { - inst = NULL; - QDEBUG("aborting, instance already exists\n"); + spin_lock(&instances_lock); + if (instance_lookup(queue_num)) { + err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) + if (!inst) { + err = -ENOMEM; goto out_unlock; + } inst->queue_num = queue_num; inst->peer_pid = pid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; inst->copy_range = 0xfffff; inst->copy_mode = NFQNL_COPY_NONE; - atomic_set(&inst->id_sequence, 0); - /* needs to be two, since we _put() after creation */ - atomic_set(&inst->use, 2); spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); + INIT_RCU_HEAD(&inst->rcu); - if (!try_module_get(THIS_MODULE)) + if (!try_module_get(THIS_MODULE)) { + err = -EAGAIN; goto out_free; + } - hlist_add_head(&inst->hlist, - &instance_table[instance_hashfn(queue_num)]); - - write_unlock_bh(&instances_lock); + h = instance_hashfn(queue_num); + hlist_add_head_rcu(&inst->hlist, &instance_table[h]); - QDEBUG("successfully created new instance\n"); + spin_unlock(&instances_lock); return inst; out_free: kfree(inst); out_unlock: - write_unlock_bh(&instances_lock); - return NULL; + spin_unlock(&instances_lock); + return ERR_PTR(err); } -static void nfqnl_flush(struct nfqnl_instance *queue, int verdict); +static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data); static void -_instance_destroy2(struct nfqnl_instance *inst, int lock) +instance_destroy_rcu(struct rcu_head *head) { - /* first pull it out of the global list */ - if (lock) - write_lock_bh(&instances_lock); - - QDEBUG("removing instance %p (queuenum=%u) from hash\n", - inst, inst->queue_num); - hlist_del(&inst->hlist); - - if (lock) - write_unlock_bh(&instances_lock); - - /* then flush all pending skbs from the queue */ - nfqnl_flush(inst, NF_DROP); - - /* and finally put the refcount */ - instance_put(inst); + struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, + rcu); + nfqnl_flush(inst, NULL, 0); + kfree(inst); module_put(THIS_MODULE); } -static inline void +static void __instance_destroy(struct nfqnl_instance *inst) { - _instance_destroy2(inst, 0); + hlist_del_rcu(&inst->hlist); + call_rcu(&inst->rcu, instance_destroy_rcu); } -static inline void -instance_destroy(struct nfqnl_instance *inst) -{ - _instance_destroy2(inst, 1); -} - - - static void -issue_verdict(struct nfqnl_queue_entry *entry, int verdict) +instance_destroy(struct nfqnl_instance *inst) { - QDEBUG("entering for entry %p, verdict %u\n", entry, verdict); - - /* TCP input path (and probably other bits) assume to be called - * from softirq context, not from syscall, like issue_verdict is - * called. TCP input path deadlocks with locks taken from timer - * softirq, e.g. We therefore emulate this by local_bh_disable() */ - - local_bh_disable(); - nf_reinject(entry->skb, entry->info, verdict); - local_bh_enable(); - - kfree(entry); + spin_lock(&instances_lock); + __instance_destroy(inst); + spin_unlock(&instances_lock); } static inline void -__enqueue_entry(struct nfqnl_instance *queue, - struct nfqnl_queue_entry *entry) +__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { - list_add(&entry->list, &queue->queue_list); + list_add_tail(&entry->list, &queue->queue_list); queue->queue_total++; } -/* - * Find and return a queued entry matched by cmpfn, or return the last - * entry if cmpfn is NULL. - */ -static inline struct nfqnl_queue_entry * -__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, - unsigned long data) +static struct nf_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { - struct list_head *p; + struct nf_queue_entry *entry = NULL, *i; - list_for_each_prev(p, &queue->queue_list) { - struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p; + spin_lock_bh(&queue->lock); - if (!cmpfn || cmpfn(entry, data)) - return entry; + list_for_each_entry(i, &queue->queue_list, list) { + if (i->id == id) { + entry = i; + break; + } } - return NULL; -} - -static inline void -__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry) -{ - list_del(&entry->list); - q->queue_total--; -} - -static inline struct nfqnl_queue_entry * -__find_dequeue_entry(struct nfqnl_instance *queue, - nfqnl_cmpfn cmpfn, unsigned long data) -{ - struct nfqnl_queue_entry *entry; - - entry = __find_entry(queue, cmpfn, data); - if (entry == NULL) - return NULL; - - __dequeue_entry(queue, entry); - return entry; -} - - -static inline void -__nfqnl_flush(struct nfqnl_instance *queue, int verdict) -{ - struct nfqnl_queue_entry *entry; - - while ((entry = __find_dequeue_entry(queue, NULL, 0))) - issue_verdict(entry, verdict); -} - -static inline int -__nfqnl_set_mode(struct nfqnl_instance *queue, - unsigned char mode, unsigned int range) -{ - int status = 0; - - switch (mode) { - case NFQNL_COPY_NONE: - case NFQNL_COPY_META: - queue->copy_mode = mode; - queue->copy_range = 0; - break; - - case NFQNL_COPY_PACKET: - queue->copy_mode = mode; - /* we're using struct nlattr which has 16bit nla_len */ - if (range > 0xffff) - queue->copy_range = 0xffff; - else - queue->copy_range = range; - break; - - default: - status = -EINVAL; + if (entry) { + list_del(&entry->list); + queue->queue_total--; } - return status; -} -static struct nfqnl_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, - nfqnl_cmpfn cmpfn, unsigned long data) -{ - struct nfqnl_queue_entry *entry; - - spin_lock_bh(&queue->lock); - entry = __find_dequeue_entry(queue, cmpfn, data); spin_unlock_bh(&queue->lock); return entry; } static void -nfqnl_flush(struct nfqnl_instance *queue, int verdict) +nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) { + struct nf_queue_entry *entry, *next; + spin_lock_bh(&queue->lock); - __nfqnl_flush(queue, verdict); + list_for_each_entry_safe(entry, next, &queue->queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue->queue_total--; + nf_reinject(entry, NF_DROP); + } + } spin_unlock_bh(&queue->lock); } static struct sk_buff * nfqnl_build_packet_message(struct nfqnl_instance *queue, - struct nfqnl_queue_entry *entry, int *errp) + struct nf_queue_entry *entry) { sk_buff_data_t old_tail; size_t size; @@ -345,13 +220,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nfqnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - struct nf_info *entinf = entry->info; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; - __be32 tmp_uint; - - QDEBUG("entered\n"); size = NLMSG_ALIGN(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) @@ -365,11 +236,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); - outdev = entinf->outdev; + outdev = entry->outdev; spin_lock_bh(&queue->lock); - switch (queue->copy_mode) { + switch ((enum nfqnl_config_mode)queue->copy_mode) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: data_len = 0; @@ -378,7 +249,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, case NFQNL_COPY_PACKET: if ((entskb->ip_summed == CHECKSUM_PARTIAL || entskb->ip_summed == CHECKSUM_COMPLETE) && - (*errp = skb_checksum_help(entskb))) { + skb_checksum_help(entskb)) { spin_unlock_bh(&queue->lock); return NULL; } @@ -390,13 +261,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, size += nla_total_size(data_len); break; - - default: - *errp = -EINVAL; - spin_unlock_bh(&queue->lock); - return NULL; } + entry->id = queue->id_sequence++; + spin_unlock_bh(&queue->lock); skb = alloc_skb(size, GFP_ATOMIC); @@ -408,81 +276,69 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg)); nfmsg = NLMSG_DATA(nlh); - nfmsg->nfgen_family = entinf->pf; + nfmsg->nfgen_family = entry->pf; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(queue->queue_num); pmsg.packet_id = htonl(entry->id); pmsg.hw_protocol = entskb->protocol; - pmsg.hook = entinf->hook; + pmsg.hook = entry->hook; NLA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); - indev = entinf->indev; + indev = entry->indev; if (indev) { - tmp_uint = htonl(indev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)); #else - if (entinf->pf == PF_BRIDGE) { + if (entry->pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), - &tmp_uint); + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)); /* this is the bridge group "brX" */ - tmp_uint = htonl(indev->br_port->br->dev->ifindex); - NLA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), - &tmp_uint); + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->br_port->br->dev->ifindex)); } else { /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ - NLA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), - &tmp_uint); - if (entskb->nf_bridge - && entskb->nf_bridge->physindev) { - tmp_uint = htonl(entskb->nf_bridge->physindev->ifindex); - NLA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, - sizeof(tmp_uint), &tmp_uint); - } + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->ifindex)); + if (entskb->nf_bridge && entskb->nf_bridge->physindev) + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(entskb->nf_bridge->physindev->ifindex)); } #endif } if (outdev) { - tmp_uint = htonl(outdev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER - NLA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)); #else - if (entinf->pf == PF_BRIDGE) { + if (entry->pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ - NLA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), - &tmp_uint); + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)); /* this is the bridge group "brX" */ - tmp_uint = htonl(outdev->br_port->br->dev->ifindex); - NLA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), - &tmp_uint); + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->br_port->br->dev->ifindex)); } else { /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ - NLA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), - &tmp_uint); - if (entskb->nf_bridge - && entskb->nf_bridge->physoutdev) { - tmp_uint = htonl(entskb->nf_bridge->physoutdev->ifindex); - NLA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, - sizeof(tmp_uint), &tmp_uint); - } + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); + if (entskb->nf_bridge && entskb->nf_bridge->physoutdev) + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(entskb->nf_bridge->physoutdev->ifindex)); } #endif } - if (entskb->mark) { - tmp_uint = htonl(entskb->mark); - NLA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); - } + if (entskb->mark) + NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark)); if (indev && entskb->dev) { struct nfqnl_msg_packet_hw phw; @@ -526,51 +382,29 @@ nlmsg_failure: nla_put_failure: if (skb) kfree_skb(skb); - *errp = -EINVAL; if (net_ratelimit()) printk(KERN_ERR "nf_queue: error creating packet message\n"); return NULL; } static int -nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, - unsigned int queuenum, void *data) +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { - int status = -EINVAL; struct sk_buff *nskb; struct nfqnl_instance *queue; - struct nfqnl_queue_entry *entry; - - QDEBUG("entered\n"); - - queue = instance_lookup_get(queuenum); - if (!queue) { - QDEBUG("no queue instance matching\n"); - return -EINVAL; - } - - if (queue->copy_mode == NFQNL_COPY_NONE) { - QDEBUG("mode COPY_NONE, aborting\n"); - status = -EAGAIN; - goto err_out_put; - } + int err; - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) { - if (net_ratelimit()) - printk(KERN_ERR - "nf_queue: OOM in nfqnl_enqueue_packet()\n"); - status = -ENOMEM; - goto err_out_put; - } + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(queuenum); + if (!queue) + goto err_out; - entry->info = info; - entry->skb = skb; - entry->id = atomic_inc_return(&queue->id_sequence); + if (queue->copy_mode == NFQNL_COPY_NONE) + goto err_out; - nskb = nfqnl_build_packet_message(queue, entry, &status); + nskb = nfqnl_build_packet_message(queue, entry); if (nskb == NULL) - goto err_out_free; + goto err_out; spin_lock_bh(&queue->lock); @@ -579,7 +413,6 @@ nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, if (queue->queue_total >= queue->queue_maxlen) { queue->queue_dropped++; - status = -ENOSPC; if (net_ratelimit()) printk(KERN_WARNING "nf_queue: full at %d entries, " "dropping packets(s). Dropped: %d\n", @@ -588,8 +421,8 @@ nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, } /* nfnetlink_unicast will either free the nskb or add it to a socket */ - status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); - if (status < 0) { + err = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); + if (err < 0) { queue->queue_user_dropped++; goto err_out_unlock; } @@ -597,24 +430,18 @@ nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, __enqueue_entry(queue, entry); spin_unlock_bh(&queue->lock); - instance_put(queue); - return status; + return 0; err_out_free_nskb: kfree_skb(nskb); - err_out_unlock: spin_unlock_bh(&queue->lock); - -err_out_free: - kfree(entry); -err_out_put: - instance_put(queue); - return status; +err_out: + return -1; } static int -nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e) { int diff; int err; @@ -645,35 +472,46 @@ nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) return 0; } -static inline int -id_cmp(struct nfqnl_queue_entry *e, unsigned long id) -{ - return (id == e->id); -} - static int nfqnl_set_mode(struct nfqnl_instance *queue, unsigned char mode, unsigned int range) { - int status; + int status = 0; spin_lock_bh(&queue->lock); - status = __nfqnl_set_mode(queue, mode, range); + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + /* we're using struct nlattr which has 16bit nla_len */ + if (range > 0xffff) + queue->copy_range = 0xffff; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } spin_unlock_bh(&queue->lock); return status; } static int -dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { - struct nf_info *entinf = entry->info; - - if (entinf->indev) - if (entinf->indev->ifindex == ifindex) + if (entry->indev) + if (entry->indev->ifindex == ifindex) return 1; - if (entinf->outdev) - if (entinf->outdev->ifindex == ifindex) + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) return 1; #ifdef CONFIG_BRIDGE_NETFILTER if (entry->skb->nf_bridge) { @@ -695,27 +533,18 @@ nfqnl_dev_drop(int ifindex) { int i; - QDEBUG("entering for ifindex %u\n", ifindex); - - /* this only looks like we have to hold the readlock for a way too long - * time, issue_verdict(), nf_reinject(), ... - but we always only - * issue NF_DROP, which is processed directly in nf_reinject() */ - read_lock_bh(&instances_lock); + rcu_read_lock(); - for (i = 0; i < INSTANCE_BUCKETS; i++) { + for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *tmp; struct nfqnl_instance *inst; struct hlist_head *head = &instance_table[i]; - hlist_for_each_entry(inst, tmp, head, hlist) { - struct nfqnl_queue_entry *entry; - while ((entry = find_dequeue_entry(inst, dev_cmp, - ifindex)) != NULL) - issue_verdict(entry, NF_DROP); - } + hlist_for_each_entry_rcu(inst, tmp, head, hlist) + nfqnl_flush(inst, dev_cmp, ifindex); } - read_unlock_bh(&instances_lock); + rcu_read_unlock(); } #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) @@ -750,8 +579,8 @@ nfqnl_rcv_nl_event(struct notifier_block *this, int i; /* destroy all instances for this pid */ - write_lock_bh(&instances_lock); - for (i = 0; i < INSTANCE_BUCKETS; i++) { + spin_lock(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *tmp, *t2; struct nfqnl_instance *inst; struct hlist_head *head = &instance_table[i]; @@ -762,7 +591,7 @@ nfqnl_rcv_nl_event(struct notifier_block *this, __instance_destroy(inst); } } - write_unlock_bh(&instances_lock); + spin_unlock(&instances_lock); } return NOTIFY_DONE; } @@ -787,21 +616,24 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict; - struct nfqnl_queue_entry *entry; + struct nf_queue_entry *entry; int err; - queue = instance_lookup_get(queue_num); - if (!queue) - return -ENODEV; + rcu_read_lock(); + queue = instance_lookup(queue_num); + if (!queue) { + err = -ENODEV; + goto err_out_unlock; + } if (queue->peer_pid != NETLINK_CB(skb).pid) { err = -EPERM; - goto err_out_put; + goto err_out_unlock; } if (!nfqa[NFQA_VERDICT_HDR]) { err = -EINVAL; - goto err_out_put; + goto err_out_unlock; } vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); @@ -809,14 +641,15 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { err = -EINVAL; - goto err_out_put; + goto err_out_unlock; } - entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) { err = -ENOENT; - goto err_out_put; + goto err_out_unlock; } + rcu_read_unlock(); if (nfqa[NFQA_PAYLOAD]) { if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), @@ -825,15 +658,13 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, } if (nfqa[NFQA_MARK]) - entry->skb->mark = ntohl(*(__be32 *) - nla_data(nfqa[NFQA_MARK])); + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); - issue_verdict(entry, verdict); - instance_put(queue); + nf_reinject(entry, verdict); return 0; -err_out_put: - instance_put(queue); +err_out_unlock: + rcu_read_unlock(); return err; } @@ -849,7 +680,7 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, }; -static struct nf_queue_handler nfqh = { +static const struct nf_queue_handler nfqh = { .name = "nf_queue", .outfn = &nfqnl_enqueue_packet, }; @@ -861,70 +692,72 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_instance *queue; + struct nfqnl_msg_config_cmd *cmd = NULL; int ret = 0; - QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); - - queue = instance_lookup_get(queue_num); if (nfqa[NFQA_CFG_CMD]) { - struct nfqnl_msg_config_cmd *cmd; cmd = nla_data(nfqa[NFQA_CFG_CMD]); - QDEBUG("found CFG_CMD\n"); + /* Commands without queue context - might sleep */ switch (cmd->command) { - case NFQNL_CFG_CMD_BIND: - if (queue) - return -EBUSY; + case NFQNL_CFG_CMD_PF_BIND: + ret = nf_register_queue_handler(ntohs(cmd->pf), + &nfqh); + break; + case NFQNL_CFG_CMD_PF_UNBIND: + ret = nf_unregister_queue_handler(ntohs(cmd->pf), + &nfqh); + break; + default: + break; + } + + if (ret < 0) + return ret; + } + + rcu_read_lock(); + queue = instance_lookup(queue_num); + if (queue && queue->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto err_out_unlock; + } + if (cmd != NULL) { + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) { + ret = -EBUSY; + goto err_out_unlock; + } queue = instance_create(queue_num, NETLINK_CB(skb).pid); - if (!queue) - return -EINVAL; + if (IS_ERR(queue)) { + ret = PTR_ERR(queue); + goto err_out_unlock; + } break; case NFQNL_CFG_CMD_UNBIND: - if (!queue) - return -ENODEV; - - if (queue->peer_pid != NETLINK_CB(skb).pid) { - ret = -EPERM; - goto out_put; + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; } - instance_destroy(queue); break; case NFQNL_CFG_CMD_PF_BIND: - QDEBUG("registering queue handler for pf=%u\n", - ntohs(cmd->pf)); - ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh); - break; case NFQNL_CFG_CMD_PF_UNBIND: - QDEBUG("unregistering queue handler for pf=%u\n", - ntohs(cmd->pf)); - ret = nf_unregister_queue_handler(ntohs(cmd->pf), &nfqh); break; default: - ret = -EINVAL; + ret = -ENOTSUPP; break; } - } else { - if (!queue) { - QDEBUG("no config command, and no instance ENOENT\n"); - ret = -ENOENT; - goto out_put; - } - - if (queue->peer_pid != NETLINK_CB(skb).pid) { - QDEBUG("no config command, and wrong pid\n"); - ret = -EPERM; - goto out_put; - } } if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params; if (!queue) { - ret = -ENOENT; - goto out_put; + ret = -ENODEV; + goto err_out_unlock; } params = nla_data(nfqa[NFQA_CFG_PARAMS]); nfqnl_set_mode(queue, params->copy_mode, @@ -933,14 +766,19 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { __be32 *queue_maxlen; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); spin_lock_bh(&queue->lock); queue->queue_maxlen = ntohl(*queue_maxlen); spin_unlock_bh(&queue->lock); } -out_put: - instance_put(queue); +err_out_unlock: + rcu_read_unlock(); return ret; } @@ -1008,7 +846,7 @@ static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) static void *seq_start(struct seq_file *seq, loff_t *pos) { - read_lock_bh(&instances_lock); + spin_lock(&instances_lock); return get_idx(seq, *pos); } @@ -1020,7 +858,7 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) static void seq_stop(struct seq_file *s, void *v) { - read_unlock_bh(&instances_lock); + spin_unlock(&instances_lock); } static int seq_show(struct seq_file *s, void *v) @@ -1032,8 +870,7 @@ static int seq_show(struct seq_file *s, void *v) inst->peer_pid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, - atomic_read(&inst->id_sequence), - atomic_read(&inst->use)); + inst->id_sequence, 1); } static const struct seq_operations nfqnl_seq_ops = { diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index b6160e41eb1..8d4fca96a4a 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -34,12 +34,21 @@ MODULE_DESCRIPTION("[ip,ip6,arp]_tables backend module"); #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) +struct compat_delta { + struct compat_delta *next; + unsigned int offset; + short delta; +}; + struct xt_af { struct mutex mutex; struct list_head match; struct list_head target; struct list_head tables; +#ifdef CONFIG_COMPAT struct mutex compat_mutex; + struct compat_delta *compat_offsets; +#endif }; static struct xt_af *xt; @@ -335,6 +344,54 @@ int xt_check_match(const struct xt_match *match, unsigned short family, EXPORT_SYMBOL_GPL(xt_check_match); #ifdef CONFIG_COMPAT +int xt_compat_add_offset(int af, unsigned int offset, short delta) +{ + struct compat_delta *tmp; + + tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + tmp->offset = offset; + tmp->delta = delta; + + if (xt[af].compat_offsets) { + tmp->next = xt[af].compat_offsets->next; + xt[af].compat_offsets->next = tmp; + } else { + xt[af].compat_offsets = tmp; + tmp->next = NULL; + } + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_add_offset); + +void xt_compat_flush_offsets(int af) +{ + struct compat_delta *tmp, *next; + + if (xt[af].compat_offsets) { + for (tmp = xt[af].compat_offsets; tmp; tmp = next) { + next = tmp->next; + kfree(tmp); + } + xt[af].compat_offsets = NULL; + } +} +EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); + +short xt_compat_calc_jump(int af, unsigned int offset) +{ + struct compat_delta *tmp; + short delta; + + for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next) + if (tmp->offset < offset) + delta += tmp->delta; + return delta; +} +EXPORT_SYMBOL_GPL(xt_compat_calc_jump); + int xt_compat_match_offset(struct xt_match *match) { u_int16_t csize = match->compatsize ? : match->matchsize; @@ -342,8 +399,8 @@ int xt_compat_match_offset(struct xt_match *match) } EXPORT_SYMBOL_GPL(xt_compat_match_offset); -void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, - int *size) +int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, + int *size) { struct xt_match *match = m->u.kernel.match; struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; @@ -365,6 +422,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, *size += off; *dstptr += msize; + return 0; } EXPORT_SYMBOL_GPL(xt_compat_match_from_user); @@ -499,7 +557,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) return NULL; - newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL); + newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); if (!newinfo) return NULL; @@ -872,6 +930,7 @@ static int __init xt_init(void) mutex_init(&xt[i].mutex); #ifdef CONFIG_COMPAT mutex_init(&xt[i].compat_mutex); + xt[i].compat_offsets = NULL; #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 77eeae658d4..77a52bf8322 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -22,17 +22,14 @@ MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("iptables qdisc classification target module"); +MODULE_DESCRIPTION("Xtables: Qdisc classification"); MODULE_ALIAS("ipt_CLASSIFY"); MODULE_ALIAS("ip6t_CLASSIFY"); static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +classify_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_classify_target_info *clinfo = targinfo; @@ -40,42 +37,41 @@ target(struct sk_buff *skb, return XT_CONTINUE; } -static struct xt_target xt_classify_target[] __read_mostly = { +static struct xt_target classify_tg_reg[] __read_mostly = { { .family = AF_INET, .name = "CLASSIFY", - .target = target, + .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .table = "mangle", - .hooks = (1 << NF_IP_LOCAL_OUT) | - (1 << NF_IP_FORWARD) | - (1 << NF_IP_POST_ROUTING), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), .me = THIS_MODULE, }, { .name = "CLASSIFY", .family = AF_INET6, - .target = target, + .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .table = "mangle", - .hooks = (1 << NF_IP6_LOCAL_OUT) | - (1 << NF_IP6_FORWARD) | - (1 << NF_IP6_POST_ROUTING), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), .me = THIS_MODULE, }, }; -static int __init xt_classify_init(void) +static int __init classify_tg_init(void) { - return xt_register_targets(xt_classify_target, - ARRAY_SIZE(xt_classify_target)); + return xt_register_targets(classify_tg_reg, + ARRAY_SIZE(classify_tg_reg)); } -static void __exit xt_classify_fini(void) +static void __exit classify_tg_exit(void) { - xt_unregister_targets(xt_classify_target, - ARRAY_SIZE(xt_classify_target)); + xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); } -module_init(xt_classify_init); -module_exit(xt_classify_fini); +module_init(classify_tg_init); +module_exit(classify_tg_exit); diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 0621ca7de3b..5fecfb4794b 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -1,8 +1,10 @@ -/* This kernel module is used to modify the connection mark values, or - * to optionally restore the skb nfmark from the connection mark +/* + * xt_CONNMARK - Netfilter module to modify the connection mark values * - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> - * by Henrik Nordstrom <hno@marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@computergmbh.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,7 +26,7 @@ #include <net/checksum.h> MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>"); -MODULE_DESCRIPTION("IP tables CONNMARK matching module"); +MODULE_DESCRIPTION("Xtables: connection mark modification"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_CONNMARK"); MODULE_ALIAS("ip6t_CONNMARK"); @@ -34,12 +36,9 @@ MODULE_ALIAS("ip6t_CONNMARK"); #include <net/netfilter/nf_conntrack_ecache.h> static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_connmark_target_info *markinfo = targinfo; struct nf_conn *ct; @@ -77,12 +76,50 @@ target(struct sk_buff *skb, return XT_CONTINUE; } +static unsigned int +connmark_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) +{ + const struct xt_connmark_tginfo1 *info = targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u_int32_t newmark; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return XT_CONTINUE; + + switch (info->mode) { + case XT_CONNMARK_SET: + newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, skb); + } + break; + case XT_CONNMARK_SAVE: + newmark = (ct->mark & ~info->ctmask) ^ + (skb->mark & info->nfmask); + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, skb); + } + break; + case XT_CONNMARK_RESTORE: + newmark = (skb->mark & ~info->nfmask) ^ + (ct->mark & info->ctmask); + skb->mark = newmark; + break; + } + + return XT_CONTINUE; +} + static bool -checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +connmark_tg_check_v0(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct xt_connmark_target_info *matchinfo = targinfo; @@ -100,14 +137,27 @@ checkentry(const char *tablename, } if (nf_ct_l3proto_try_module_get(target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", target->family); + "proto=%u\n", target->family); + return false; + } + return true; +} + +static bool +connmark_tg_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) +{ + if (nf_ct_l3proto_try_module_get(target->family) < 0) { + printk(KERN_WARNING "cannot load conntrack support for " + "proto=%u\n", target->family); return false; } return true; } static void -destroy(const struct xt_target *target, void *targinfo) +connmark_tg_destroy(const struct xt_target *target, void *targinfo) { nf_ct_l3proto_module_put(target->family); } @@ -120,7 +170,7 @@ struct compat_xt_connmark_target_info { u_int16_t __pad2; }; -static void compat_from_user(void *dst, void *src) +static void connmark_tg_compat_from_user_v0(void *dst, void *src) { const struct compat_xt_connmark_target_info *cm = src; struct xt_connmark_target_info m = { @@ -131,7 +181,7 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &m, sizeof(m)); } -static int compat_to_user(void __user *dst, void *src) +static int connmark_tg_compat_to_user_v0(void __user *dst, void *src) { const struct xt_connmark_target_info *m = src; struct compat_xt_connmark_target_info cm = { @@ -143,43 +193,69 @@ static int compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_target xt_connmark_target[] __read_mostly = { +static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", + .revision = 0, .family = AF_INET, - .checkentry = checkentry, - .destroy = destroy, - .target = target, + .checkentry = connmark_tg_check_v0, + .destroy = connmark_tg_destroy, + .target = connmark_tg_v0, .targetsize = sizeof(struct xt_connmark_target_info), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_connmark_target_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = connmark_tg_compat_from_user_v0, + .compat_to_user = connmark_tg_compat_to_user_v0, #endif .me = THIS_MODULE }, { .name = "CONNMARK", + .revision = 0, .family = AF_INET6, - .checkentry = checkentry, - .destroy = destroy, - .target = target, + .checkentry = connmark_tg_check_v0, + .destroy = connmark_tg_destroy, + .target = connmark_tg_v0, .targetsize = sizeof(struct xt_connmark_target_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_connmark_target_info), + .compat_from_user = connmark_tg_compat_from_user_v0, + .compat_to_user = connmark_tg_compat_to_user_v0, +#endif .me = THIS_MODULE }, + { + .name = "CONNMARK", + .revision = 1, + .family = AF_INET, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "CONNMARK", + .revision = 1, + .family = AF_INET6, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, }; -static int __init xt_connmark_init(void) +static int __init connmark_tg_init(void) { - return xt_register_targets(xt_connmark_target, - ARRAY_SIZE(xt_connmark_target)); + return xt_register_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); } -static void __exit xt_connmark_fini(void) +static void __exit connmark_tg_exit(void) { - xt_unregister_targets(xt_connmark_target, - ARRAY_SIZE(xt_connmark_target)); + xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); } -module_init(xt_connmark_init); -module_exit(xt_connmark_fini); +module_init(connmark_tg_init); +module_exit(connmark_tg_exit); diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index d8feba9bdb4..1faa9136195 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -20,12 +20,13 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CONNSECMARK.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_ecache.h> #define PFX "CONNSECMARK: " MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); -MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module"); +MODULE_DESCRIPTION("Xtables: target for copying between connection and security mark"); MODULE_ALIAS("ipt_CONNSECMARK"); MODULE_ALIAS("ip6t_CONNSECMARK"); @@ -40,8 +41,10 @@ static void secmark_save(const struct sk_buff *skb) enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); - if (ct && !ct->secmark) + if (ct && !ct->secmark) { ct->secmark = skb->secmark; + nf_conntrack_event_cache(IPCT_SECMARK, skb); + } } } @@ -61,10 +64,10 @@ static void secmark_restore(struct sk_buff *skb) } } -static unsigned int target(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +connsecmark_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_connsecmark_target_info *info = targinfo; @@ -84,9 +87,10 @@ static unsigned int target(struct sk_buff *skb, const struct net_device *in, return XT_CONTINUE; } -static bool checkentry(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool +connsecmark_tg_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct xt_connsecmark_target_info *info = targinfo; @@ -102,25 +106,25 @@ static bool checkentry(const char *tablename, const void *entry, if (nf_ct_l3proto_try_module_get(target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", target->family); + "proto=%u\n", target->family); return false; } return true; } static void -destroy(const struct xt_target *target, void *targinfo) +connsecmark_tg_destroy(const struct xt_target *target, void *targinfo) { nf_ct_l3proto_module_put(target->family); } -static struct xt_target xt_connsecmark_target[] __read_mostly = { +static struct xt_target connsecmark_tg_reg[] __read_mostly = { { .name = "CONNSECMARK", .family = AF_INET, - .checkentry = checkentry, - .destroy = destroy, - .target = target, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, .targetsize = sizeof(struct xt_connsecmark_target_info), .table = "mangle", .me = THIS_MODULE, @@ -128,26 +132,26 @@ static struct xt_target xt_connsecmark_target[] __read_mostly = { { .name = "CONNSECMARK", .family = AF_INET6, - .checkentry = checkentry, - .destroy = destroy, - .target = target, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, .targetsize = sizeof(struct xt_connsecmark_target_info), .table = "mangle", .me = THIS_MODULE, }, }; -static int __init xt_connsecmark_init(void) +static int __init connsecmark_tg_init(void) { - return xt_register_targets(xt_connsecmark_target, - ARRAY_SIZE(xt_connsecmark_target)); + return xt_register_targets(connsecmark_tg_reg, + ARRAY_SIZE(connsecmark_tg_reg)); } -static void __exit xt_connsecmark_fini(void) +static void __exit connsecmark_tg_exit(void) { - xt_unregister_targets(xt_connsecmark_target, - ARRAY_SIZE(xt_connsecmark_target)); + xt_unregister_targets(connsecmark_tg_reg, + ARRAY_SIZE(connsecmark_tg_reg)); } -module_init(xt_connsecmark_init); -module_exit(xt_connsecmark_fini); +module_init(connsecmark_tg_init); +module_exit(connsecmark_tg_exit); diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 6322a933ab7..97efd74c04f 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -18,19 +18,20 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_DSCP.h> +#include <linux/netfilter_ipv4/ipt_TOS.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("x_tables DSCP modification module"); +MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_DSCP"); MODULE_ALIAS("ip6t_DSCP"); +MODULE_ALIAS("ipt_TOS"); +MODULE_ALIAS("ip6t_TOS"); -static unsigned int target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +dscp_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_DSCP_info *dinfo = targinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; @@ -46,12 +47,10 @@ static unsigned int target(struct sk_buff *skb, return XT_CONTINUE; } -static unsigned int target6(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +dscp_tg6(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_DSCP_info *dinfo = targinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; @@ -66,11 +65,10 @@ static unsigned int target6(struct sk_buff *skb, return XT_CONTINUE; } -static bool checkentry(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +dscp_tg_check(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp; @@ -81,12 +79,95 @@ static bool checkentry(const char *tablename, return true; } -static struct xt_target xt_dscp_target[] __read_mostly = { +static unsigned int +tos_tg_v0(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) +{ + const struct ipt_tos_target_info *info = targinfo; + struct iphdr *iph = ip_hdr(skb); + u_int8_t oldtos; + + if ((iph->tos & IPTOS_TOS_MASK) != info->tos) { + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return NF_DROP; + + iph = ip_hdr(skb); + oldtos = iph->tos; + iph->tos = (iph->tos & IPTOS_PREC_MASK) | info->tos; + csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); + } + + return XT_CONTINUE; +} + +static bool +tos_tg_check_v0(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) +{ + const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; + + if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT && + tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST && + tos != IPTOS_NORMALSVC) { + printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); + return false; + } + + return true; +} + +static unsigned int +tos_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) +{ + const struct xt_tos_target_info *info = targinfo; + struct iphdr *iph = ip_hdr(skb); + u_int8_t orig, nv; + + orig = ipv4_get_dsfield(iph); + nv = (orig & ~info->tos_mask) ^ info->tos_value; + + if (orig != nv) { + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return NF_DROP; + iph = ip_hdr(skb); + ipv4_change_dsfield(iph, 0, nv); + } + + return XT_CONTINUE; +} + +static unsigned int +tos_tg6(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) +{ + const struct xt_tos_target_info *info = targinfo; + struct ipv6hdr *iph = ipv6_hdr(skb); + u_int8_t orig, nv; + + orig = ipv6_get_dsfield(iph); + nv = (orig & info->tos_mask) ^ info->tos_value; + + if (orig != nv) { + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return NF_DROP; + iph = ipv6_hdr(skb); + ipv6_change_dsfield(iph, 0, nv); + } + + return XT_CONTINUE; +} + +static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "DSCP", .family = AF_INET, - .checkentry = checkentry, - .target = target, + .checkentry = dscp_tg_check, + .target = dscp_tg, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, @@ -94,23 +175,51 @@ static struct xt_target xt_dscp_target[] __read_mostly = { { .name = "DSCP", .family = AF_INET6, - .checkentry = checkentry, - .target = target6, + .checkentry = dscp_tg_check, + .target = dscp_tg6, .targetsize = sizeof(struct xt_DSCP_info), .table = "mangle", .me = THIS_MODULE, }, + { + .name = "TOS", + .revision = 0, + .family = AF_INET, + .table = "mangle", + .target = tos_tg_v0, + .targetsize = sizeof(struct ipt_tos_target_info), + .checkentry = tos_tg_check_v0, + .me = THIS_MODULE, + }, + { + .name = "TOS", + .revision = 1, + .family = AF_INET, + .table = "mangle", + .target = tos_tg, + .targetsize = sizeof(struct xt_tos_target_info), + .me = THIS_MODULE, + }, + { + .name = "TOS", + .revision = 1, + .family = AF_INET6, + .table = "mangle", + .target = tos_tg6, + .targetsize = sizeof(struct xt_tos_target_info), + .me = THIS_MODULE, + }, }; -static int __init xt_dscp_target_init(void) +static int __init dscp_tg_init(void) { - return xt_register_targets(xt_dscp_target, ARRAY_SIZE(xt_dscp_target)); + return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } -static void __exit xt_dscp_target_fini(void) +static void __exit dscp_tg_exit(void) { - xt_unregister_targets(xt_dscp_target, ARRAY_SIZE(xt_dscp_target)); + xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); } -module_init(xt_dscp_target_init); -module_exit(xt_dscp_target_fini); +module_init(dscp_tg_init); +module_exit(dscp_tg_exit); diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index bc6503d77d7..f9ce20b5898 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -1,10 +1,13 @@ -/* This is a module which is used for setting the NFMARK field of an skb. */ - -/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> +/* + * xt_MARK - Netfilter module to modify the NFMARK field of an skb + * + * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@computergmbh.de> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ #include <linux/module.h> @@ -17,17 +20,14 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("ip[6]tables MARK modification module"); +MODULE_DESCRIPTION("Xtables: packet mark modification"); MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); static unsigned int -target_v0(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +mark_tg_v0(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_mark_target_info *markinfo = targinfo; @@ -36,12 +36,9 @@ target_v0(struct sk_buff *skb, } static unsigned int -target_v1(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +mark_tg_v1(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_mark_target_info_v1 *markinfo = targinfo; int mark = 0; @@ -64,13 +61,21 @@ target_v1(struct sk_buff *skb, return XT_CONTINUE; } +static unsigned int +mark_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) +{ + const struct xt_mark_tginfo2 *info = targinfo; + + skb->mark = (skb->mark & ~info->mask) ^ info->mark; + return XT_CONTINUE; +} static bool -checkentry_v0(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +mark_tg_check_v0(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct xt_mark_target_info *markinfo = targinfo; @@ -82,11 +87,9 @@ checkentry_v0(const char *tablename, } static bool -checkentry_v1(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +mark_tg_check_v1(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct xt_mark_target_info_v1 *markinfo = targinfo; @@ -105,6 +108,28 @@ checkentry_v1(const char *tablename, } #ifdef CONFIG_COMPAT +struct compat_xt_mark_target_info { + compat_ulong_t mark; +}; + +static void mark_tg_compat_from_user_v0(void *dst, void *src) +{ + const struct compat_xt_mark_target_info *cm = src; + struct xt_mark_target_info m = { + .mark = cm->mark, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int mark_tg_compat_to_user_v0(void __user *dst, void *src) +{ + const struct xt_mark_target_info *m = src; + struct compat_xt_mark_target_info cm = { + .mark = m->mark, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; +} + struct compat_xt_mark_target_info_v1 { compat_ulong_t mark; u_int8_t mode; @@ -112,7 +137,7 @@ struct compat_xt_mark_target_info_v1 { u_int16_t __pad2; }; -static void compat_from_user_v1(void *dst, void *src) +static void mark_tg_compat_from_user_v1(void *dst, void *src) { const struct compat_xt_mark_target_info_v1 *cm = src; struct xt_mark_target_info_v1 m = { @@ -122,7 +147,7 @@ static void compat_from_user_v1(void *dst, void *src) memcpy(dst, &m, sizeof(m)); } -static int compat_to_user_v1(void __user *dst, void *src) +static int mark_tg_compat_to_user_v1(void __user *dst, void *src) { const struct xt_mark_target_info_v1 *m = src; struct compat_xt_mark_target_info_v1 cm = { @@ -133,14 +158,19 @@ static int compat_to_user_v1(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_target xt_mark_target[] __read_mostly = { +static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", .family = AF_INET, .revision = 0, - .checkentry = checkentry_v0, - .target = target_v0, + .checkentry = mark_tg_check_v0, + .target = mark_tg_v0, .targetsize = sizeof(struct xt_mark_target_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_mark_target_info), + .compat_from_user = mark_tg_compat_from_user_v0, + .compat_to_user = mark_tg_compat_to_user_v0, +#endif .table = "mangle", .me = THIS_MODULE, }, @@ -148,13 +178,13 @@ static struct xt_target xt_mark_target[] __read_mostly = { .name = "MARK", .family = AF_INET, .revision = 1, - .checkentry = checkentry_v1, - .target = target_v1, + .checkentry = mark_tg_check_v1, + .target = mark_tg_v1, .targetsize = sizeof(struct xt_mark_target_info_v1), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_mark_target_info_v1), - .compat_from_user = compat_from_user_v1, - .compat_to_user = compat_to_user_v1, + .compat_from_user = mark_tg_compat_from_user_v1, + .compat_to_user = mark_tg_compat_to_user_v1, #endif .table = "mangle", .me = THIS_MODULE, @@ -163,23 +193,59 @@ static struct xt_target xt_mark_target[] __read_mostly = { .name = "MARK", .family = AF_INET6, .revision = 0, - .checkentry = checkentry_v0, - .target = target_v0, + .checkentry = mark_tg_check_v0, + .target = mark_tg_v0, .targetsize = sizeof(struct xt_mark_target_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_mark_target_info), + .compat_from_user = mark_tg_compat_from_user_v0, + .compat_to_user = mark_tg_compat_to_user_v0, +#endif + .table = "mangle", + .me = THIS_MODULE, + }, + { + .name = "MARK", + .family = AF_INET6, + .revision = 1, + .checkentry = mark_tg_check_v1, + .target = mark_tg_v1, + .targetsize = sizeof(struct xt_mark_target_info_v1), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_mark_target_info_v1), + .compat_from_user = mark_tg_compat_from_user_v1, + .compat_to_user = mark_tg_compat_to_user_v1, +#endif .table = "mangle", .me = THIS_MODULE, }, + { + .name = "MARK", + .revision = 2, + .family = AF_INET, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, + { + .name = "MARK", + .revision = 2, + .family = AF_INET6, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, }; -static int __init xt_mark_init(void) +static int __init mark_tg_init(void) { - return xt_register_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); + return xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); } -static void __exit xt_mark_fini(void) +static void __exit mark_tg_exit(void) { - xt_unregister_targets(xt_mark_target, ARRAY_SIZE(xt_mark_target)); + xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); } -module_init(xt_mark_init); -module_exit(xt_mark_fini); +module_init(mark_tg_init); +module_exit(mark_tg_exit); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 9fb449ffbf8..19ae8efae65 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -12,18 +12,18 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFLOG.h> +#include <net/netfilter/nf_log.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("x_tables NFLOG target"); +MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFLOG"); MODULE_ALIAS("ip6t_NFLOG"); static unsigned int -nflog_target(struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - unsigned int hooknum, const struct xt_target *target, - const void *targinfo) +nflog_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_nflog_info *info = targinfo; struct nf_loginfo li; @@ -39,9 +39,9 @@ nflog_target(struct sk_buff *skb, } static bool -nflog_checkentry(const char *tablename, const void *entry, - const struct xt_target *target, void *targetinfo, - unsigned int hookmask) +nflog_tg_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targetinfo, + unsigned int hookmask) { const struct xt_nflog_info *info = targetinfo; @@ -52,35 +52,34 @@ nflog_checkentry(const char *tablename, const void *entry, return true; } -static struct xt_target xt_nflog_target[] __read_mostly = { +static struct xt_target nflog_tg_reg[] __read_mostly = { { .name = "NFLOG", .family = AF_INET, - .checkentry = nflog_checkentry, - .target = nflog_target, + .checkentry = nflog_tg_check, + .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, }, { .name = "NFLOG", .family = AF_INET6, - .checkentry = nflog_checkentry, - .target = nflog_target, + .checkentry = nflog_tg_check, + .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, }, }; -static int __init xt_nflog_init(void) +static int __init nflog_tg_init(void) { - return xt_register_targets(xt_nflog_target, - ARRAY_SIZE(xt_nflog_target)); + return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } -static void __exit xt_nflog_fini(void) +static void __exit nflog_tg_exit(void) { - xt_unregister_targets(xt_nflog_target, ARRAY_SIZE(xt_nflog_target)); + xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } -module_init(xt_nflog_init); -module_exit(xt_nflog_fini); +module_init(nflog_tg_init); +module_exit(nflog_tg_exit); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index c3984e9f766..beb24d19a56 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -17,59 +17,55 @@ #include <linux/netfilter/xt_NFQUEUE.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("[ip,ip6,arp]_tables NFQUEUE target"); +MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFQUEUE"); MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +nfqueue_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct xt_NFQ_info *tinfo = targinfo; return NF_QUEUE_NR(tinfo->queuenum); } -static struct xt_target xt_nfqueue_target[] __read_mostly = { +static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", .family = AF_INET, - .target = target, + .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", .family = AF_INET6, - .target = target, + .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", .family = NF_ARP, - .target = target, + .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, }; -static int __init xt_nfqueue_init(void) +static int __init nfqueue_tg_init(void) { - return xt_register_targets(xt_nfqueue_target, - ARRAY_SIZE(xt_nfqueue_target)); + return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } -static void __exit xt_nfqueue_fini(void) +static void __exit nfqueue_tg_exit(void) { - xt_unregister_targets(xt_nfqueue_target, ARRAY_SIZE(xt_nfqueue_target)); + xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); } -module_init(xt_nfqueue_init); -module_exit(xt_nfqueue_fini); +module_init(nfqueue_tg_init); +module_exit(nfqueue_tg_exit); diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index 4976ce18661..6c9de611eb8 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -7,17 +7,15 @@ #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_conntrack.h> +MODULE_DESCRIPTION("Xtables: Disabling connection tracking for packets"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NOTRACK"); MODULE_ALIAS("ip6t_NOTRACK"); static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +notrack_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { /* Previously seen (loopback)? Ignore. */ if (skb->nfct != NULL) @@ -34,33 +32,32 @@ target(struct sk_buff *skb, return XT_CONTINUE; } -static struct xt_target xt_notrack_target[] __read_mostly = { +static struct xt_target notrack_tg_reg[] __read_mostly = { { .name = "NOTRACK", .family = AF_INET, - .target = target, + .target = notrack_tg, .table = "raw", .me = THIS_MODULE, }, { .name = "NOTRACK", .family = AF_INET6, - .target = target, + .target = notrack_tg, .table = "raw", .me = THIS_MODULE, }, }; -static int __init xt_notrack_init(void) +static int __init notrack_tg_init(void) { - return xt_register_targets(xt_notrack_target, - ARRAY_SIZE(xt_notrack_target)); + return xt_register_targets(notrack_tg_reg, ARRAY_SIZE(notrack_tg_reg)); } -static void __exit xt_notrack_fini(void) +static void __exit notrack_tg_exit(void) { - xt_unregister_targets(xt_notrack_target, ARRAY_SIZE(xt_notrack_target)); + xt_unregister_targets(notrack_tg_reg, ARRAY_SIZE(notrack_tg_reg)); } -module_init(xt_notrack_init); -module_exit(xt_notrack_fini); +module_init(notrack_tg_init); +module_exit(notrack_tg_exit); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c new file mode 100644 index 00000000000..24c73ba31ea --- /dev/null +++ b/net/netfilter/xt_RATEEST.c @@ -0,0 +1,205 @@ +/* + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/gen_stats.h> +#include <linux/jhash.h> +#include <linux/rtnetlink.h> +#include <linux/random.h> +#include <net/gen_stats.h> +#include <net/netlink.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_RATEEST.h> +#include <net/netfilter/xt_rateest.h> + +static DEFINE_MUTEX(xt_rateest_mutex); + +#define RATEEST_HSIZE 16 +static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; +static unsigned int jhash_rnd __read_mostly; + +static unsigned int xt_rateest_hash(const char *name) +{ + return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) & + (RATEEST_HSIZE - 1); +} + +static void xt_rateest_hash_insert(struct xt_rateest *est) +{ + unsigned int h; + + h = xt_rateest_hash(est->name); + hlist_add_head(&est->list, &rateest_hash[h]); +} + +struct xt_rateest *xt_rateest_lookup(const char *name) +{ + struct xt_rateest *est; + struct hlist_node *n; + unsigned int h; + + h = xt_rateest_hash(name); + mutex_lock(&xt_rateest_mutex); + hlist_for_each_entry(est, n, &rateest_hash[h], list) { + if (strcmp(est->name, name) == 0) { + est->refcnt++; + mutex_unlock(&xt_rateest_mutex); + return est; + } + } + mutex_unlock(&xt_rateest_mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(xt_rateest_lookup); + +void xt_rateest_put(struct xt_rateest *est) +{ + mutex_lock(&xt_rateest_mutex); + if (--est->refcnt == 0) { + hlist_del(&est->list); + gen_kill_estimator(&est->bstats, &est->rstats); + kfree(est); + } + mutex_unlock(&xt_rateest_mutex); +} +EXPORT_SYMBOL_GPL(xt_rateest_put); + +static unsigned int +xt_rateest_tg(struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const struct xt_target *target, + const void *targinfo) +{ + const struct xt_rateest_target_info *info = targinfo; + struct gnet_stats_basic *stats = &info->est->bstats; + + spin_lock_bh(&info->est->lock); + stats->bytes += skb->len; + stats->packets++; + spin_unlock_bh(&info->est->lock); + + return XT_CONTINUE; +} + +static bool +xt_rateest_tg_checkentry(const char *tablename, + const void *entry, + const struct xt_target *target, + void *targinfo, + unsigned int hook_mask) +{ + struct xt_rateest_target_info *info = (void *)targinfo; + struct xt_rateest *est; + struct { + struct nlattr opt; + struct gnet_estimator est; + } cfg; + + est = xt_rateest_lookup(info->name); + if (est) { + /* + * If estimator parameters are specified, they must match the + * existing estimator. + */ + if ((!info->interval && !info->ewma_log) || + (info->interval != est->params.interval || + info->ewma_log != est->params.ewma_log)) { + xt_rateest_put(est); + return false; + } + info->est = est; + return true; + } + + est = kzalloc(sizeof(*est), GFP_KERNEL); + if (!est) + goto err1; + + strlcpy(est->name, info->name, sizeof(est->name)); + spin_lock_init(&est->lock); + est->refcnt = 1; + est->params.interval = info->interval; + est->params.ewma_log = info->ewma_log; + + cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); + cfg.opt.nla_type = TCA_STATS_RATE_EST; + cfg.est.interval = info->interval; + cfg.est.ewma_log = info->ewma_log; + + if (gen_new_estimator(&est->bstats, &est->rstats, &est->lock, + &cfg.opt) < 0) + goto err2; + + info->est = est; + xt_rateest_hash_insert(est); + + return true; + +err2: + kfree(est); +err1: + return false; +} + +static void xt_rateest_tg_destroy(const struct xt_target *target, + void *targinfo) +{ + struct xt_rateest_target_info *info = targinfo; + + xt_rateest_put(info->est); +} + +static struct xt_target xt_rateest_target[] __read_mostly = { + { + .family = AF_INET, + .name = "RATEEST", + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .me = THIS_MODULE, + }, + { + .family = AF_INET6, + .name = "RATEEST", + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .me = THIS_MODULE, + }, +}; + +static int __init xt_rateest_tg_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(rateest_hash); i++) + INIT_HLIST_HEAD(&rateest_hash[i]); + + get_random_bytes(&jhash_rnd, sizeof(jhash_rnd)); + return xt_register_targets(xt_rateest_target, + ARRAY_SIZE(xt_rateest_target)); +} + +static void __exit xt_rateest_tg_fini(void) +{ + xt_unregister_targets(xt_rateest_target, ARRAY_SIZE(xt_rateest_target)); +} + + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: packet rate estimator"); +MODULE_ALIAS("ipt_RATEEST"); +MODULE_ALIAS("ip6t_RATEEST"); +module_init(xt_rateest_tg_init); +module_exit(xt_rateest_tg_fini); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 235806eb6ec..b11b3ecbb39 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -20,7 +20,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); -MODULE_DESCRIPTION("ip[6]tables SECMARK modification module"); +MODULE_DESCRIPTION("Xtables: packet security mark modification"); MODULE_ALIAS("ipt_SECMARK"); MODULE_ALIAS("ip6t_SECMARK"); @@ -28,10 +28,10 @@ MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; -static unsigned int target(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +secmark_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { u32 secmark = 0; const struct xt_secmark_target_info *info = targinfo; @@ -81,9 +81,10 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info) return true; } -static bool checkentry(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool +secmark_tg_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { struct xt_secmark_target_info *info = targinfo; @@ -109,12 +110,12 @@ static bool checkentry(const char *tablename, const void *entry, return true; } -static struct xt_target xt_secmark_target[] __read_mostly = { +static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .family = AF_INET, - .checkentry = checkentry, - .target = target, + .checkentry = secmark_tg_check, + .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", .me = THIS_MODULE, @@ -122,24 +123,23 @@ static struct xt_target xt_secmark_target[] __read_mostly = { { .name = "SECMARK", .family = AF_INET6, - .checkentry = checkentry, - .target = target, + .checkentry = secmark_tg_check, + .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", .me = THIS_MODULE, }, }; -static int __init xt_secmark_init(void) +static int __init secmark_tg_init(void) { - return xt_register_targets(xt_secmark_target, - ARRAY_SIZE(xt_secmark_target)); + return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } -static void __exit xt_secmark_fini(void) +static void __exit secmark_tg_exit(void) { - xt_unregister_targets(xt_secmark_target, ARRAY_SIZE(xt_secmark_target)); + xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } -module_init(xt_secmark_init); -module_exit(xt_secmark_fini); +module_init(secmark_tg_init); +module_exit(secmark_tg_exit); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 8e76d1f52fb..60e3767cc71 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -24,7 +24,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("x_tables TCP MSS modification module"); +MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); MODULE_ALIAS("ipt_TCPMSS"); MODULE_ALIAS("ip6t_TCPMSS"); @@ -88,15 +88,19 @@ tcpmss_mangle_packet(struct sk_buff *skb, oldmss = (opt[i+2] << 8) | opt[i+3]; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - oldmss <= newmss) + /* Never increase MSS, even when setting it, as + * doing so results in problems for hosts that rely + * on MSS being set correctly. + */ + if (oldmss <= newmss) return 0; opt[i+2] = (newmss & 0xff00) >> 8; opt[i+3] = newmss & 0x00ff; - nf_proto_csum_replace2(&tcph->check, skb, - htons(oldmss), htons(newmss), 0); + inet_proto_csum_replace2(&tcph->check, skb, + htons(oldmss), htons(newmss), + 0); return 0; } } @@ -117,29 +121,26 @@ tcpmss_mangle_packet(struct sk_buff *skb, opt = (u_int8_t *)tcph + sizeof(struct tcphdr); memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); - nf_proto_csum_replace2(&tcph->check, skb, - htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1); + inet_proto_csum_replace2(&tcph->check, skb, + htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; - nf_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0); + inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; - nf_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + inet_proto_csum_replace2(&tcph->check, skb, + oldval, ((__be16 *)tcph)[6], 0); return TCPOLEN_MSS; } static unsigned int -xt_tcpmss_target4(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +tcpmss_tg4(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; @@ -152,7 +153,7 @@ xt_tcpmss_target4(struct sk_buff *skb, if (ret > 0) { iph = ip_hdr(skb); newlen = htons(ntohs(iph->tot_len) + ret); - nf_csum_replace2(&iph->check, iph->tot_len, newlen); + csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; } return XT_CONTINUE; @@ -160,12 +161,9 @@ xt_tcpmss_target4(struct sk_buff *skb, #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) static unsigned int -xt_tcpmss_target6(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +tcpmss_tg6(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; @@ -204,19 +202,17 @@ static inline bool find_syn_match(const struct xt_entry_match *m) } static bool -xt_tcpmss_checkentry4(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +tcpmss_tg4_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct xt_tcpmss_info *info = targinfo; const struct ipt_entry *e = entry; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT) | - (1 << NF_IP_POST_ROUTING))) != 0) { + (hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { printk("xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; @@ -229,19 +225,17 @@ xt_tcpmss_checkentry4(const char *tablename, #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) static bool -xt_tcpmss_checkentry6(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +tcpmss_tg6_check(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct xt_tcpmss_info *info = targinfo; const struct ip6t_entry *e = entry; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_IP6_FORWARD) | - (1 << NF_IP6_LOCAL_OUT) | - (1 << NF_IP6_POST_ROUTING))) != 0) { + (hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { printk("xt_TCPMSS: path-MTU clamping only supported in " "FORWARD, OUTPUT and POSTROUTING hooks\n"); return false; @@ -253,12 +247,12 @@ xt_tcpmss_checkentry6(const char *tablename, } #endif -static struct xt_target xt_tcpmss_reg[] __read_mostly = { +static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = AF_INET, .name = "TCPMSS", - .checkentry = xt_tcpmss_checkentry4, - .target = xt_tcpmss_target4, + .checkentry = tcpmss_tg4_check, + .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, @@ -267,8 +261,8 @@ static struct xt_target xt_tcpmss_reg[] __read_mostly = { { .family = AF_INET6, .name = "TCPMSS", - .checkentry = xt_tcpmss_checkentry6, - .target = xt_tcpmss_target6, + .checkentry = tcpmss_tg6_check, + .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, @@ -276,15 +270,15 @@ static struct xt_target xt_tcpmss_reg[] __read_mostly = { #endif }; -static int __init xt_tcpmss_init(void) +static int __init tcpmss_tg_init(void) { - return xt_register_targets(xt_tcpmss_reg, ARRAY_SIZE(xt_tcpmss_reg)); + return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } -static void __exit xt_tcpmss_fini(void) +static void __exit tcpmss_tg_exit(void) { - xt_unregister_targets(xt_tcpmss_reg, ARRAY_SIZE(xt_tcpmss_reg)); + xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } -module_init(xt_tcpmss_init); -module_exit(xt_tcpmss_fini); +module_init(tcpmss_tg_init); +module_exit(tcpmss_tg_exit); diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c new file mode 100644 index 00000000000..3b2aa56833b --- /dev/null +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -0,0 +1,147 @@ +/* + * A module for stripping a specific TCP option from TCP packets. + * + * Copyright (C) 2007 Sven Schnelle <svens@bitebene.org> + * Copyright © CC Computer Consultants GmbH, 2007 + * Contact: Jan Engelhardt <jengelh@computergmbh.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_TCPOPTSTRIP.h> + +static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) + return 1; + else + return opt[offset+1]; +} + +static unsigned int +tcpoptstrip_mangle_packet(struct sk_buff *skb, + const struct xt_tcpoptstrip_target_info *info, + unsigned int tcphoff, unsigned int minlen) +{ + unsigned int optl, i, j; + struct tcphdr *tcph; + u_int16_t n, o; + u_int8_t *opt; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + opt = (u_int8_t *)tcph; + + /* + * Walk through all TCP options - if we find some option to remove, + * set all octets to %TCPOPT_NOP and adjust checksum. + */ + for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) { + optl = optlen(opt, i); + + if (i + optl > tcp_hdrlen(skb)) + break; + + if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) + continue; + + for (j = 0; j < optl; ++j) { + o = opt[i+j]; + n = TCPOPT_NOP; + if ((i + j) % 2 == 0) { + o <<= 8; + n <<= 8; + } + inet_proto_csum_replace2(&tcph->check, skb, htons(o), + htons(n), 0); + } + memset(opt + i, TCPOPT_NOP, optl); + } + + return XT_CONTINUE; +} + +static unsigned int +tcpoptstrip_tg4(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) +{ + return tcpoptstrip_mangle_packet(skb, targinfo, ip_hdrlen(skb), + sizeof(struct iphdr) + sizeof(struct tcphdr)); +} + +#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) +static unsigned int +tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + unsigned int tcphoff; + u_int8_t nexthdr; + + nexthdr = ipv6h->nexthdr; + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); + if (tcphoff < 0) + return NF_DROP; + + return tcpoptstrip_mangle_packet(skb, targinfo, tcphoff, + sizeof(*ipv6h) + sizeof(struct tcphdr)); +} +#endif + +static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { + { + .name = "TCPOPTSTRIP", + .family = AF_INET, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg4, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) + { + .name = "TCPOPTSTRIP", + .family = AF_INET6, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg6, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tcpoptstrip_tg_init(void) +{ + return xt_register_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +static void __exit tcpoptstrip_tg_exit(void) +{ + xt_unregister_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +module_init(tcpoptstrip_tg_init); +module_exit(tcpoptstrip_tg_exit); +MODULE_AUTHOR("Sven Schnelle <svens@bitebene.org>, Jan Engelhardt <jengelh@computergmbh.de>"); +MODULE_DESCRIPTION("Xtables: TCP option stripping"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TCPOPTSTRIP"); +MODULE_ALIAS("ip6t_TCPOPTSTRIP"); diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 26c5d08ab2c..30dab79a343 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -5,49 +5,46 @@ #include <linux/netfilter/x_tables.h> +MODULE_DESCRIPTION("Xtables: packet flow tracing"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TRACE"); MODULE_ALIAS("ip6t_TRACE"); static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +trace_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { skb->nf_trace = 1; return XT_CONTINUE; } -static struct xt_target xt_trace_target[] __read_mostly = { +static struct xt_target trace_tg_reg[] __read_mostly = { { .name = "TRACE", .family = AF_INET, - .target = target, + .target = trace_tg, .table = "raw", .me = THIS_MODULE, }, { .name = "TRACE", .family = AF_INET6, - .target = target, + .target = trace_tg, .table = "raw", .me = THIS_MODULE, }, }; -static int __init xt_trace_init(void) +static int __init trace_tg_init(void) { - return xt_register_targets(xt_trace_target, - ARRAY_SIZE(xt_trace_target)); + return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } -static void __exit xt_trace_fini(void) +static void __exit trace_tg_exit(void) { - xt_unregister_targets(xt_trace_target, ARRAY_SIZE(xt_trace_target)); + xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } -module_init(xt_trace_init); -module_exit(xt_trace_fini); +module_init(trace_tg_init); +module_exit(trace_tg_exit); diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c index 64bcdb0fe1e..89f47364e84 100644 --- a/net/netfilter/xt_comment.c +++ b/net/netfilter/xt_comment.c @@ -10,52 +10,47 @@ #include <linux/netfilter/xt_comment.h> MODULE_AUTHOR("Brad Fisher <brad@info-link.net>"); -MODULE_DESCRIPTION("iptables comment match module"); +MODULE_DESCRIPTION("Xtables: No-op match which can be tagged with a comment"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_comment"); MODULE_ALIAS("ip6t_comment"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protooff, - bool *hotdrop) +comment_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protooff, + bool *hotdrop) { /* We always match */ return true; } -static struct xt_match xt_comment_match[] __read_mostly = { +static struct xt_match comment_mt_reg[] __read_mostly = { { .name = "comment", .family = AF_INET, - .match = match, + .match = comment_mt, .matchsize = sizeof(struct xt_comment_info), .me = THIS_MODULE }, { .name = "comment", .family = AF_INET6, - .match = match, + .match = comment_mt, .matchsize = sizeof(struct xt_comment_info), .me = THIS_MODULE }, }; -static int __init xt_comment_init(void) +static int __init comment_mt_init(void) { - return xt_register_matches(xt_comment_match, - ARRAY_SIZE(xt_comment_match)); + return xt_register_matches(comment_mt_reg, ARRAY_SIZE(comment_mt_reg)); } -static void __exit xt_comment_fini(void) +static void __exit comment_mt_exit(void) { - xt_unregister_matches(xt_comment_match, ARRAY_SIZE(xt_comment_match)); + xt_unregister_matches(comment_mt_reg, ARRAY_SIZE(comment_mt_reg)); } -module_init(xt_comment_init); -module_exit(xt_comment_fini); +module_init(comment_mt_init); +module_exit(comment_mt_exit); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 9ec50139b9a..b15e7e2fa14 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -12,19 +12,15 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); +MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +connbytes_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_connbytes_info *sinfo = matchinfo; const struct nf_conn *ct; @@ -96,11 +92,10 @@ match(const struct sk_buff *skb, return what >= sinfo->count.from; } -static bool check(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool +connbytes_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_connbytes_info *sinfo = matchinfo; @@ -116,7 +111,7 @@ static bool check(const char *tablename, if (nf_ct_l3proto_try_module_get(match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); + "proto=%u\n", match->family); return false; } @@ -124,43 +119,42 @@ static bool check(const char *tablename, } static void -destroy(const struct xt_match *match, void *matchinfo) +connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) { nf_ct_l3proto_module_put(match->family); } -static struct xt_match xt_connbytes_match[] __read_mostly = { +static struct xt_match connbytes_mt_reg[] __read_mostly = { { .name = "connbytes", .family = AF_INET, - .checkentry = check, - .match = match, - .destroy = destroy, + .checkentry = connbytes_mt_check, + .match = connbytes_mt, + .destroy = connbytes_mt_destroy, .matchsize = sizeof(struct xt_connbytes_info), .me = THIS_MODULE }, { .name = "connbytes", .family = AF_INET6, - .checkentry = check, - .match = match, - .destroy = destroy, + .checkentry = connbytes_mt_check, + .match = connbytes_mt, + .destroy = connbytes_mt_destroy, .matchsize = sizeof(struct xt_connbytes_info), .me = THIS_MODULE }, }; -static int __init xt_connbytes_init(void) +static int __init connbytes_mt_init(void) { - return xt_register_matches(xt_connbytes_match, - ARRAY_SIZE(xt_connbytes_match)); + return xt_register_matches(connbytes_mt_reg, + ARRAY_SIZE(connbytes_mt_reg)); } -static void __exit xt_connbytes_fini(void) +static void __exit connbytes_mt_exit(void) { - xt_unregister_matches(xt_connbytes_match, - ARRAY_SIZE(xt_connbytes_match)); + xt_unregister_matches(connbytes_mt_reg, ARRAY_SIZE(connbytes_mt_reg)); } -module_init(xt_connbytes_init); -module_exit(xt_connbytes_fini); +module_init(connbytes_mt_init); +module_exit(connbytes_mt_exit); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index d7becf08a93..e00ecd974fa 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -53,10 +53,10 @@ static inline unsigned int connlimit_iphash(__be32 addr) } static inline unsigned int -connlimit_iphash6(const union nf_conntrack_address *addr, - const union nf_conntrack_address *mask) +connlimit_iphash6(const union nf_inet_addr *addr, + const union nf_inet_addr *mask) { - union nf_conntrack_address res; + union nf_inet_addr res; unsigned int i; if (unlikely(!connlimit_rnd_inited)) { @@ -81,14 +81,14 @@ static inline bool already_closed(const struct nf_conn *conn) } static inline unsigned int -same_source_net(const union nf_conntrack_address *addr, - const union nf_conntrack_address *mask, - const union nf_conntrack_address *u3, unsigned int family) +same_source_net(const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + const union nf_inet_addr *u3, unsigned int family) { if (family == AF_INET) { return (addr->ip & mask->ip) == (u3->ip & mask->ip); } else { - union nf_conntrack_address lh, rh; + union nf_inet_addr lh, rh; unsigned int i; for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) { @@ -102,8 +102,8 @@ same_source_net(const union nf_conntrack_address *addr, static int count_them(struct xt_connlimit_data *data, const struct nf_conntrack_tuple *tuple, - const union nf_conntrack_address *addr, - const union nf_conntrack_address *mask, + const union nf_inet_addr *addr, + const union nf_inet_addr *mask, const struct xt_match *match) { struct nf_conntrack_tuple_hash *found; @@ -178,15 +178,14 @@ static int count_them(struct xt_connlimit_data *data, return matches; } -static bool connlimit_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, int offset, - unsigned int protoff, bool *hotdrop) +static bool +connlimit_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_connlimit_info *info = matchinfo; - union nf_conntrack_address addr, mask; + union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; enum ip_conntrack_info ctinfo; @@ -203,15 +202,14 @@ static bool connlimit_match(const struct sk_buff *skb, if (match->family == AF_INET6) { const struct ipv6hdr *iph = ipv6_hdr(skb); memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); - memcpy(&mask.ip6, info->v6_mask, sizeof(info->v6_mask)); } else { const struct iphdr *iph = ip_hdr(skb); addr.ip = iph->saddr; - mask.ip = info->v4_mask; } spin_lock_bh(&info->data->lock); - connections = count_them(info->data, tuple_ptr, &addr, &mask, match); + connections = count_them(info->data, tuple_ptr, &addr, + &info->mask, match); spin_unlock_bh(&info->data->lock); if (connections < 0) { @@ -227,9 +225,10 @@ static bool connlimit_match(const struct sk_buff *skb, return false; } -static bool connlimit_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool +connlimit_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_connlimit_info *info = matchinfo; unsigned int i; @@ -254,7 +253,8 @@ static bool connlimit_check(const char *tablename, const void *ip, return true; } -static void connlimit_destroy(const struct xt_match *match, void *matchinfo) +static void +connlimit_mt_destroy(const struct xt_match *match, void *matchinfo) { struct xt_connlimit_info *info = matchinfo; struct xt_connlimit_conn *conn; @@ -274,41 +274,42 @@ static void connlimit_destroy(const struct xt_match *match, void *matchinfo) kfree(info->data); } -static struct xt_match connlimit_reg[] __read_mostly = { +static struct xt_match connlimit_mt_reg[] __read_mostly = { { .name = "connlimit", .family = AF_INET, - .checkentry = connlimit_check, - .match = connlimit_match, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), - .destroy = connlimit_destroy, + .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }, { .name = "connlimit", .family = AF_INET6, - .checkentry = connlimit_check, - .match = connlimit_match, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), - .destroy = connlimit_destroy, + .destroy = connlimit_mt_destroy, .me = THIS_MODULE, }, }; -static int __init xt_connlimit_init(void) +static int __init connlimit_mt_init(void) { - return xt_register_matches(connlimit_reg, ARRAY_SIZE(connlimit_reg)); + return xt_register_matches(connlimit_mt_reg, + ARRAY_SIZE(connlimit_mt_reg)); } -static void __exit xt_connlimit_exit(void) +static void __exit connlimit_mt_exit(void) { - xt_unregister_matches(connlimit_reg, ARRAY_SIZE(connlimit_reg)); + xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } -module_init(xt_connlimit_init); -module_exit(xt_connlimit_exit); +module_init(connlimit_mt_init); +module_exit(connlimit_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@computergmbh.de>"); -MODULE_DESCRIPTION("netfilter xt_connlimit match module"); +MODULE_DESCRIPTION("Xtables: Number of connections matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_connlimit"); MODULE_ALIAS("ip6t_connlimit"); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 9f67920af41..aaa1b96691f 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -1,8 +1,10 @@ -/* This kernel module matches connection mark values set by the - * CONNMARK target +/* + * xt_connmark - Netfilter module to match connection mark values * - * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> - * by Henrik Nordstrom <hno@marasystems.com> + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@computergmbh.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,20 +28,33 @@ #include <linux/netfilter/xt_connmark.h> MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>"); -MODULE_DESCRIPTION("IP tables connmark match module"); +MODULE_DESCRIPTION("Xtables: connection mark match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +connmark_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_connmark_mtinfo1 *info = matchinfo; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static bool +connmark_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_connmark_info *info = matchinfo; const struct nf_conn *ct; @@ -53,11 +68,9 @@ match(const struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +connmark_mt_check_v0(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_connmark_info *cm = matchinfo; @@ -67,14 +80,27 @@ checkentry(const char *tablename, } if (nf_ct_l3proto_try_module_get(match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); + "proto=%u\n", match->family); + return false; + } + return true; +} + +static bool +connmark_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) +{ + if (nf_ct_l3proto_try_module_get(match->family) < 0) { + printk(KERN_WARNING "cannot load conntrack support for " + "proto=%u\n", match->family); return false; } return true; } static void -destroy(const struct xt_match *match, void *matchinfo) +connmark_mt_destroy(const struct xt_match *match, void *matchinfo) { nf_ct_l3proto_module_put(match->family); } @@ -87,7 +113,7 @@ struct compat_xt_connmark_info { u_int16_t __pad2; }; -static void compat_from_user(void *dst, void *src) +static void connmark_mt_compat_from_user_v0(void *dst, void *src) { const struct compat_xt_connmark_info *cm = src; struct xt_connmark_info m = { @@ -98,7 +124,7 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &m, sizeof(m)); } -static int compat_to_user(void __user *dst, void *src) +static int connmark_mt_compat_to_user_v0(void __user *dst, void *src) { const struct xt_connmark_info *m = src; struct compat_xt_connmark_info cm = { @@ -110,42 +136,69 @@ static int compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_match xt_connmark_match[] __read_mostly = { +static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", + .revision = 0, .family = AF_INET, - .checkentry = checkentry, - .match = match, - .destroy = destroy, + .checkentry = connmark_mt_check_v0, + .match = connmark_mt_v0, + .destroy = connmark_mt_destroy, .matchsize = sizeof(struct xt_connmark_info), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_connmark_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = connmark_mt_compat_from_user_v0, + .compat_to_user = connmark_mt_compat_to_user_v0, #endif .me = THIS_MODULE }, { .name = "connmark", + .revision = 0, .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .destroy = destroy, + .checkentry = connmark_mt_check_v0, + .match = connmark_mt_v0, + .destroy = connmark_mt_destroy, .matchsize = sizeof(struct xt_connmark_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_connmark_info), + .compat_from_user = connmark_mt_compat_from_user_v0, + .compat_to_user = connmark_mt_compat_to_user_v0, +#endif .me = THIS_MODULE }, + { + .name = "connmark", + .revision = 1, + .family = AF_INET, + .checkentry = connmark_mt_check, + .match = connmark_mt, + .matchsize = sizeof(struct xt_connmark_mtinfo1), + .destroy = connmark_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "connmark", + .revision = 1, + .family = AF_INET6, + .checkentry = connmark_mt_check, + .match = connmark_mt, + .matchsize = sizeof(struct xt_connmark_mtinfo1), + .destroy = connmark_mt_destroy, + .me = THIS_MODULE, + }, }; -static int __init xt_connmark_init(void) +static int __init connmark_mt_init(void) { - return xt_register_matches(xt_connmark_match, - ARRAY_SIZE(xt_connmark_match)); + return xt_register_matches(connmark_mt_reg, + ARRAY_SIZE(connmark_mt_reg)); } -static void __exit xt_connmark_fini(void) +static void __exit connmark_mt_exit(void) { - xt_unregister_matches(xt_connmark_match, ARRAY_SIZE(xt_connmark_match)); + xt_unregister_matches(connmark_mt_reg, ARRAY_SIZE(connmark_mt_reg)); } -module_init(xt_connmark_init); -module_exit(xt_connmark_fini); +module_init(connmark_mt_init); +module_exit(connmark_mt_exit); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index ca4b69f020a..e92190eafcc 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -1,33 +1,34 @@ -/* Kernel module to match connection tracking information. - * Superset of Rusty's minimalistic state match. +/* + * xt_conntrack - Netfilter module to match connection tracking + * information. (Superset of Rusty's minimalistic state match.) * - * (C) 2001 Marc Boucher (marc@mbsi.ca). + * (C) 2001 Marc Boucher (marc@mbsi.ca). + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@computergmbh.de> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ #include <linux/module.h> #include <linux/skbuff.h> +#include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_conntrack.h> #include <net/netfilter/nf_conntrack.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables connection tracking match module"); +MODULE_DESCRIPTION("Xtables: connection tracking state match"); MODULE_ALIAS("ipt_conntrack"); +MODULE_ALIAS("ip6t_conntrack"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +conntrack_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_conntrack_info *sinfo = matchinfo; const struct nf_conn *ct; @@ -36,7 +37,7 @@ match(const struct sk_buff *skb, ct = nf_ct_get(skb, &ctinfo); -#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) +#define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & (invflg))) if (ct == &nf_conntrack_untracked) statebit = XT_CONNTRACK_STATE_UNTRACKED; @@ -112,24 +113,152 @@ match(const struct sk_buff *skb, return false; } return true; +#undef FWINV } static bool -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +conntrack_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, unsigned int l3proto) +{ + if (l3proto == AF_INET) + return (kaddr->ip & umask->ip) == uaddr->ip; + else if (l3proto == AF_INET6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; + else + return false; +} + +static inline bool +conntrack_mt_origsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo1 *info, + unsigned int family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, + &info->origsrc_addr, &info->origsrc_mask, family); +} + +static inline bool +conntrack_mt_origdst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo1 *info, + unsigned int family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, + &info->origdst_addr, &info->origdst_mask, family); +} + +static inline bool +conntrack_mt_replsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo1 *info, + unsigned int family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, + &info->replsrc_addr, &info->replsrc_mask, family); +} + +static inline bool +conntrack_mt_repldst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo1 *info, + unsigned int family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, + &info->repldst_addr, &info->repldst_mask, family); +} + +static bool +conntrack_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_conntrack_mtinfo1 *info = matchinfo; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int statebit; + + ct = nf_ct_get(skb, &ctinfo); + + if (ct == &nf_conntrack_untracked) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else if (ct != NULL) + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = XT_CONNTRACK_STATE_INVALID; + + if (info->match_flags & XT_CONNTRACK_STATE) { + if (ct != NULL) { + if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_SNAT; + if (test_bit(IPS_DST_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_DNAT; + } + if ((info->state_mask & statebit) ^ + !(info->invert_flags & XT_CONNTRACK_STATE)) + return false; + } + + if (ct == NULL) + return info->match_flags & XT_CONNTRACK_STATE; + + if ((info->match_flags & XT_CONNTRACK_PROTO) && + ((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == + info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO))) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGSRC) + if (conntrack_mt_origsrc(ct, info, match->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGDST) + if (conntrack_mt_origdst(ct, info, match->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGDST)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLSRC) + if (conntrack_mt_replsrc(ct, info, match->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLDST) + if (conntrack_mt_repldst(ct, info, match->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLDST)) + return false; + + if ((info->match_flags & XT_CONNTRACK_STATUS) && + (!!(info->status_mask & ct->status) ^ + !(info->invert_flags & XT_CONNTRACK_STATUS))) + return false; + + if (info->match_flags & XT_CONNTRACK_EXPIRES) { + unsigned long expires = 0; + + if (timer_pending(&ct->timeout)) + expires = (ct->timeout.expires - jiffies) / HZ; + if ((expires >= info->expires_min && + expires <= info->expires_max) ^ + !(info->invert_flags & XT_CONNTRACK_EXPIRES)) + return false; + } + return true; +} + +static bool +conntrack_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { if (nf_ct_l3proto_try_module_get(match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); + "proto=%u\n", match->family); return false; } return true; } -static void destroy(const struct xt_match *match, void *matchinfo) +static void +conntrack_mt_destroy(const struct xt_match *match, void *matchinfo) { nf_ct_l3proto_module_put(match->family); } @@ -148,7 +277,7 @@ struct compat_xt_conntrack_info u_int8_t invflags; }; -static void compat_from_user(void *dst, void *src) +static void conntrack_mt_compat_from_user_v0(void *dst, void *src) { const struct compat_xt_conntrack_info *cm = src; struct xt_conntrack_info m = { @@ -165,7 +294,7 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &m, sizeof(m)); } -static int compat_to_user(void __user *dst, void *src) +static int conntrack_mt_compat_to_user_v0(void __user *dst, void *src) { const struct xt_conntrack_info *m = src; struct compat_xt_conntrack_info cm = { @@ -183,30 +312,54 @@ static int compat_to_user(void __user *dst, void *src) } #endif -static struct xt_match conntrack_match __read_mostly = { - .name = "conntrack", - .match = match, - .checkentry = checkentry, - .destroy = destroy, - .matchsize = sizeof(struct xt_conntrack_info), +static struct xt_match conntrack_mt_reg[] __read_mostly = { + { + .name = "conntrack", + .revision = 0, + .family = AF_INET, + .match = conntrack_mt_v0, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .matchsize = sizeof(struct xt_conntrack_info), + .me = THIS_MODULE, #ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_conntrack_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compatsize = sizeof(struct compat_xt_conntrack_info), + .compat_from_user = conntrack_mt_compat_from_user_v0, + .compat_to_user = conntrack_mt_compat_to_user_v0, #endif - .family = AF_INET, - .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 1, + .family = AF_INET, + .matchsize = sizeof(struct xt_conntrack_mtinfo1), + .match = conntrack_mt, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 1, + .family = AF_INET6, + .matchsize = sizeof(struct xt_conntrack_mtinfo1), + .match = conntrack_mt, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, }; -static int __init xt_conntrack_init(void) +static int __init conntrack_mt_init(void) { - return xt_register_match(&conntrack_match); + return xt_register_matches(conntrack_mt_reg, + ARRAY_SIZE(conntrack_mt_reg)); } -static void __exit xt_conntrack_fini(void) +static void __exit conntrack_mt_exit(void) { - xt_unregister_match(&conntrack_match); + xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } -module_init(xt_conntrack_init); -module_exit(xt_conntrack_fini); +module_init(conntrack_mt_init); +module_exit(conntrack_mt_exit); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index c2b1b24ee33..667f45e72cd 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -22,7 +22,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Match for DCCP protocol packets"); +MODULE_DESCRIPTION("Xtables: DCCP protocol packet match"); MODULE_ALIAS("ipt_dccp"); MODULE_ALIAS("ip6t_dccp"); @@ -93,14 +93,9 @@ match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +dccp_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct xt_dccp_info *info = matchinfo; struct dccp_hdr _dh, *dh; @@ -128,11 +123,9 @@ match(const struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +dccp_mt_check(const char *tablename, const void *inf, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_dccp_info *info = matchinfo; @@ -141,12 +134,12 @@ checkentry(const char *tablename, && !(info->invflags & ~info->flags); } -static struct xt_match xt_dccp_match[] __read_mostly = { +static struct xt_match dccp_mt_reg[] __read_mostly = { { .name = "dccp", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = dccp_mt_check, + .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, @@ -154,15 +147,15 @@ static struct xt_match xt_dccp_match[] __read_mostly = { { .name = "dccp", .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = dccp_mt_check, + .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), .proto = IPPROTO_DCCP, .me = THIS_MODULE, }, }; -static int __init xt_dccp_init(void) +static int __init dccp_mt_init(void) { int ret; @@ -172,7 +165,7 @@ static int __init xt_dccp_init(void) dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); if (!dccp_optbuf) return -ENOMEM; - ret = xt_register_matches(xt_dccp_match, ARRAY_SIZE(xt_dccp_match)); + ret = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); if (ret) goto out_kfree; return ret; @@ -182,11 +175,11 @@ out_kfree: return ret; } -static void __exit xt_dccp_fini(void) +static void __exit dccp_mt_exit(void) { - xt_unregister_matches(xt_dccp_match, ARRAY_SIZE(xt_dccp_match)); + xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); kfree(dccp_optbuf); } -module_init(xt_dccp_init); -module_exit(xt_dccp_fini); +module_init(dccp_mt_init); +module_exit(dccp_mt_exit); diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index dde6d66e0d3..26f4aab9c42 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -13,23 +13,22 @@ #include <linux/ipv6.h> #include <net/dsfield.h> -#include <linux/netfilter/xt_dscp.h> #include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_dscp.h> +#include <linux/netfilter_ipv4/ipt_tos.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("x_tables DSCP matching module"); +MODULE_DESCRIPTION("Xtables: DSCP/TOS field match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_dscp"); MODULE_ALIAS("ip6t_dscp"); +MODULE_ALIAS("ipt_tos"); +MODULE_ALIAS("ip6t_tos"); -static bool match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +static bool +dscp_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct xt_dscp_info *info = matchinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; @@ -37,14 +36,11 @@ static bool match(const struct sk_buff *skb, return (dscp == info->dscp) ^ !!info->invert; } -static bool match6(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +static bool +dscp_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_dscp_info *info = matchinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; @@ -52,11 +48,10 @@ static bool match6(const struct sk_buff *skb, return (dscp == info->dscp) ^ !!info->invert; } -static bool checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool +dscp_mt_check(const char *tablename, const void *info, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp; @@ -68,34 +63,83 @@ static bool checkentry(const char *tablename, return true; } -static struct xt_match xt_dscp_match[] __read_mostly = { +static bool tos_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, bool *hotdrop) +{ + const struct ipt_tos_info *info = matchinfo; + + return (ip_hdr(skb)->tos == info->tos) ^ info->invert; +} + +static bool tos_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_tos_match_info *info = matchinfo; + + if (match->family == AF_INET) + return ((ip_hdr(skb)->tos & info->tos_mask) == + info->tos_value) ^ !!info->invert; + else + return ((ipv6_get_dsfield(ipv6_hdr(skb)) & info->tos_mask) == + info->tos_value) ^ !!info->invert; +} + +static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "dscp", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = dscp_mt_check, + .match = dscp_mt, .matchsize = sizeof(struct xt_dscp_info), .me = THIS_MODULE, }, { .name = "dscp", .family = AF_INET6, - .checkentry = checkentry, - .match = match6, + .checkentry = dscp_mt_check, + .match = dscp_mt6, .matchsize = sizeof(struct xt_dscp_info), .me = THIS_MODULE, }, + { + .name = "tos", + .revision = 0, + .family = AF_INET, + .match = tos_mt_v0, + .matchsize = sizeof(struct ipt_tos_info), + .me = THIS_MODULE, + }, + { + .name = "tos", + .revision = 1, + .family = AF_INET, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, + { + .name = "tos", + .revision = 1, + .family = AF_INET6, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, }; -static int __init xt_dscp_match_init(void) +static int __init dscp_mt_init(void) { - return xt_register_matches(xt_dscp_match, ARRAY_SIZE(xt_dscp_match)); + return xt_register_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); } -static void __exit xt_dscp_match_fini(void) +static void __exit dscp_mt_exit(void) { - xt_unregister_matches(xt_dscp_match, ARRAY_SIZE(xt_dscp_match)); + xt_unregister_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); } -module_init(xt_dscp_match_init); -module_exit(xt_dscp_match_fini); +module_init(dscp_mt_init); +module_exit(dscp_mt_exit); diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index b11378e001b..71c7c378526 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -20,7 +20,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); -MODULE_DESCRIPTION("x_tables ESP SPI match module"); +MODULE_DESCRIPTION("Xtables: IPsec-ESP packet match"); MODULE_ALIAS("ipt_esp"); MODULE_ALIAS("ip6t_esp"); @@ -43,14 +43,9 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +esp_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct ip_esp_hdr _esp, *eh; const struct xt_esp *espinfo = matchinfo; @@ -75,11 +70,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *ip_void, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +esp_mt_check(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_esp *espinfo = matchinfo; @@ -91,12 +84,12 @@ checkentry(const char *tablename, return true; } -static struct xt_match xt_esp_match[] __read_mostly = { +static struct xt_match esp_mt_reg[] __read_mostly = { { .name = "esp", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = esp_mt_check, + .match = esp_mt, .matchsize = sizeof(struct xt_esp), .proto = IPPROTO_ESP, .me = THIS_MODULE, @@ -104,23 +97,23 @@ static struct xt_match xt_esp_match[] __read_mostly = { { .name = "esp", .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = esp_mt_check, + .match = esp_mt, .matchsize = sizeof(struct xt_esp), .proto = IPPROTO_ESP, .me = THIS_MODULE, }, }; -static int __init xt_esp_init(void) +static int __init esp_mt_init(void) { - return xt_register_matches(xt_esp_match, ARRAY_SIZE(xt_esp_match)); + return xt_register_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); } -static void __exit xt_esp_cleanup(void) +static void __exit esp_mt_exit(void) { - xt_unregister_matches(xt_esp_match, ARRAY_SIZE(xt_esp_match)); + xt_unregister_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); } -module_init(xt_esp_init); -module_exit(xt_esp_cleanup); +module_init(esp_mt_init); +module_exit(esp_mt_exit); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2ef44d8560c..d479ca98011 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -20,7 +20,11 @@ #include <linux/mm.h> #include <linux/in.h> #include <linux/ip.h> +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) #include <linux/ipv6.h> +#include <net/ipv6.h> +#endif + #include <net/net_namespace.h> #include <linux/netfilter/x_tables.h> @@ -31,7 +35,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables match for limiting per hash-bucket"); +MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match"); MODULE_ALIAS("ipt_hashlimit"); MODULE_ALIAS("ip6t_hashlimit"); @@ -47,10 +51,12 @@ struct dsthash_dst { __be32 src; __be32 dst; } ip; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) struct { __be32 src[4]; __be32 dst[4]; } ip6; +#endif } addr; __be16 src_port; __be16 dst_port; @@ -104,7 +110,16 @@ static inline bool dst_cmp(const struct dsthash_ent *ent, static u_int32_t hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) { - return jhash(dst, sizeof(*dst), ht->rnd) % ht->cfg.size; + u_int32_t hash = jhash2((const u32 *)dst, + sizeof(*dst)/sizeof(u32), + ht->rnd); + /* + * Instead of returning hash % ht->cfg.size (implying a divide) + * we return the high 32 bits of the (hash * ht->cfg.size) that will + * give results between [0 and cfg.size-1] and same hash distribution, + * but using a multiply, less expensive than a divide + */ + return ((u64)hash * ht->cfg.size) >> 32; } static struct dsthash_ent * @@ -379,7 +394,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, const struct sk_buff *skb, unsigned int protoff) { __be16 _ports[2], *ports; - int nexthdr; + u8 nexthdr; memset(dst, 0, sizeof(*dst)); @@ -407,8 +422,9 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; - nexthdr = ipv6_find_hdr(skb, &protoff, -1, NULL); - if (nexthdr < 0) + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + if ((int)protoff < 0) return -1; break; #endif @@ -441,14 +457,10 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, } static bool -hashlimit_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +hashlimit_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_hashlimit_info *r = ((const struct xt_hashlimit_info *)matchinfo)->u.master; @@ -500,11 +512,9 @@ hotdrop: } static bool -hashlimit_checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +hashlimit_mt_check(const char *tablename, const void *inf, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_hashlimit_info *r = matchinfo; @@ -548,7 +558,7 @@ hashlimit_checkentry(const char *tablename, } static void -hashlimit_destroy(const struct xt_match *match, void *matchinfo) +hashlimit_mt_destroy(const struct xt_match *match, void *matchinfo) { const struct xt_hashlimit_info *r = matchinfo; @@ -563,7 +573,7 @@ struct compat_xt_hashlimit_info { compat_uptr_t master; }; -static void compat_from_user(void *dst, void *src) +static void hashlimit_mt_compat_from_user(void *dst, void *src) { int off = offsetof(struct compat_xt_hashlimit_info, hinfo); @@ -571,7 +581,7 @@ static void compat_from_user(void *dst, void *src) memset(dst + off, 0, sizeof(struct compat_xt_hashlimit_info) - off); } -static int compat_to_user(void __user *dst, void *src) +static int hashlimit_mt_compat_to_user(void __user *dst, void *src) { int off = offsetof(struct compat_xt_hashlimit_info, hinfo); @@ -579,35 +589,37 @@ static int compat_to_user(void __user *dst, void *src) } #endif -static struct xt_match xt_hashlimit[] __read_mostly = { +static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .family = AF_INET, - .match = hashlimit_match, + .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_info), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_hashlimit_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = hashlimit_mt_compat_from_user, + .compat_to_user = hashlimit_mt_compat_to_user, #endif - .checkentry = hashlimit_checkentry, - .destroy = hashlimit_destroy, + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, .me = THIS_MODULE }, +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) { .name = "hashlimit", .family = AF_INET6, - .match = hashlimit_match, + .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_info), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_hashlimit_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = hashlimit_mt_compat_from_user, + .compat_to_user = hashlimit_mt_compat_to_user, #endif - .checkentry = hashlimit_checkentry, - .destroy = hashlimit_destroy, + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, .me = THIS_MODULE }, +#endif }; /* PROC stuff */ @@ -670,6 +682,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, int family, ntohs(ent->dst.dst_port), ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) case AF_INET6: return seq_printf(s, "%ld " NIP6_FMT ":%u->" NIP6_FMT ":%u %u %u %u\n", @@ -680,6 +693,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, int family, ntohs(ent->dst.dst_port), ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); +#endif default: BUG(); return 0; @@ -728,11 +742,12 @@ static const struct file_operations dl_file_ops = { .release = seq_release }; -static int __init xt_hashlimit_init(void) +static int __init hashlimit_mt_init(void) { int err; - err = xt_register_matches(xt_hashlimit, ARRAY_SIZE(xt_hashlimit)); + err = xt_register_matches(hashlimit_mt_reg, + ARRAY_SIZE(hashlimit_mt_reg)); if (err < 0) goto err1; @@ -750,31 +765,36 @@ static int __init xt_hashlimit_init(void) "entry\n"); goto err3; } + err = 0; +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", init_net.proc_net); if (!hashlimit_procdir6) { printk(KERN_ERR "xt_hashlimit: unable to create proc dir " "entry\n"); - goto err4; + err = -ENOMEM; } - return 0; -err4: +#endif + if (!err) + return 0; remove_proc_entry("ipt_hashlimit", init_net.proc_net); err3: kmem_cache_destroy(hashlimit_cachep); err2: - xt_unregister_matches(xt_hashlimit, ARRAY_SIZE(xt_hashlimit)); + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); err1: return err; } -static void __exit xt_hashlimit_fini(void) +static void __exit hashlimit_mt_exit(void) { remove_proc_entry("ipt_hashlimit", init_net.proc_net); +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) remove_proc_entry("ip6t_hashlimit", init_net.proc_net); +#endif kmem_cache_destroy(hashlimit_cachep); - xt_unregister_matches(xt_hashlimit, ARRAY_SIZE(xt_hashlimit)); + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); } -module_init(xt_hashlimit_init); -module_exit(xt_hashlimit_fini); +module_init(hashlimit_mt_init); +module_exit(hashlimit_mt_exit); diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index d842c4a6d63..dada2905d66 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -18,20 +18,16 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); -MODULE_DESCRIPTION("iptables helper match module"); +MODULE_DESCRIPTION("Xtables: Related connection matching"); MODULE_ALIAS("ipt_helper"); MODULE_ALIAS("ip6t_helper"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +helper_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_helper_info *info = matchinfo; const struct nf_conn *ct; @@ -61,61 +57,57 @@ match(const struct sk_buff *skb, return ret; } -static bool check(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool +helper_mt_check(const char *tablename, const void *inf, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_helper_info *info = matchinfo; if (nf_ct_l3proto_try_module_get(match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); + "proto=%u\n", match->family); return false; } info->name[29] = '\0'; return true; } -static void -destroy(const struct xt_match *match, void *matchinfo) +static void helper_mt_destroy(const struct xt_match *match, void *matchinfo) { nf_ct_l3proto_module_put(match->family); } -static struct xt_match xt_helper_match[] __read_mostly = { +static struct xt_match helper_mt_reg[] __read_mostly = { { .name = "helper", .family = AF_INET, - .checkentry = check, - .match = match, - .destroy = destroy, + .checkentry = helper_mt_check, + .match = helper_mt, + .destroy = helper_mt_destroy, .matchsize = sizeof(struct xt_helper_info), .me = THIS_MODULE, }, { .name = "helper", .family = AF_INET6, - .checkentry = check, - .match = match, - .destroy = destroy, + .checkentry = helper_mt_check, + .match = helper_mt, + .destroy = helper_mt_destroy, .matchsize = sizeof(struct xt_helper_info), .me = THIS_MODULE, }, }; -static int __init xt_helper_init(void) +static int __init helper_mt_init(void) { - return xt_register_matches(xt_helper_match, - ARRAY_SIZE(xt_helper_match)); + return xt_register_matches(helper_mt_reg, ARRAY_SIZE(helper_mt_reg)); } -static void __exit xt_helper_fini(void) +static void __exit helper_mt_exit(void) { - xt_unregister_matches(xt_helper_match, ARRAY_SIZE(xt_helper_match)); + xt_unregister_matches(helper_mt_reg, ARRAY_SIZE(helper_mt_reg)); } -module_init(xt_helper_init); -module_exit(xt_helper_fini); - +module_init(helper_mt_init); +module_exit(helper_mt_exit); diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c new file mode 100644 index 00000000000..dbea0e0893f --- /dev/null +++ b/net/netfilter/xt_iprange.c @@ -0,0 +1,180 @@ +/* + * xt_iprange - Netfilter module to match IP address ranges + * + * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) CC Computer Consultants GmbH, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ipt_iprange.h> + +static bool +iprange_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct ipt_iprange_info *info = matchinfo; + const struct iphdr *iph = ip_hdr(skb); + + if (info->flags & IPRANGE_SRC) { + if ((ntohl(iph->saddr) < ntohl(info->src.min_ip) + || ntohl(iph->saddr) > ntohl(info->src.max_ip)) + ^ !!(info->flags & IPRANGE_SRC_INV)) { + pr_debug("src IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + info->flags & IPRANGE_SRC_INV ? "(INV) " : "", + NIPQUAD(info->src.min_ip), + NIPQUAD(info->src.max_ip)); + return false; + } + } + if (info->flags & IPRANGE_DST) { + if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip) + || ntohl(iph->daddr) > ntohl(info->dst.max_ip)) + ^ !!(info->flags & IPRANGE_DST_INV)) { + pr_debug("dst IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->daddr), + info->flags & IPRANGE_DST_INV ? "(INV) " : "", + NIPQUAD(info->dst.min_ip), + NIPQUAD(info->dst.max_ip)); + return false; + } + } + return true; +} + +static bool +iprange_mt4(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_iprange_mtinfo *info = matchinfo; + const struct iphdr *iph = ip_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = ntohl(iph->saddr) < ntohl(info->src_min.ip); + m |= ntohl(iph->saddr) > ntohl(info->src_max.ip); + m ^= info->flags & IPRANGE_SRC_INV; + if (m) { + pr_debug("src IP " NIPQUAD_FMT " NOT in range %s" + NIPQUAD_FMT "-" NIPQUAD_FMT "\n", + NIPQUAD(iph->saddr), + (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "", + NIPQUAD(info->src_max.ip), + NIPQUAD(info->src_max.ip)); + return false; + } + } + if (info->flags & IPRANGE_DST) { + m = ntohl(iph->daddr) < ntohl(info->dst_min.ip); + m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip); + m ^= info->flags & IPRANGE_DST_INV; + if (m) { + pr_debug("dst IP " NIPQUAD_FMT " NOT in range %s" + NIPQUAD_FMT "-" NIPQUAD_FMT "\n", + NIPQUAD(iph->daddr), + (info->flags & IPRANGE_DST_INV) ? "(INV) " : "", + NIPQUAD(info->dst_min.ip), + NIPQUAD(info->dst_max.ip)); + return false; + } + } + return true; +} + +static inline int +iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b) +{ + unsigned int i; + int r; + + for (i = 0; i < 4; ++i) { + r = a->s6_addr32[i] - b->s6_addr32[i]; + if (r != 0) + return r; + } + + return 0; +} + +static bool +iprange_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_iprange_mtinfo *info = matchinfo; + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = iprange_ipv6_sub(&iph->saddr, &info->src_min.in6) < 0; + m |= iprange_ipv6_sub(&iph->saddr, &info->src_max.in6) > 0; + m ^= info->flags & IPRANGE_SRC_INV; + if (m) + return false; + } + if (info->flags & IPRANGE_DST) { + m = iprange_ipv6_sub(&iph->daddr, &info->dst_min.in6) < 0; + m |= iprange_ipv6_sub(&iph->daddr, &info->dst_max.in6) > 0; + m ^= info->flags & IPRANGE_DST_INV; + if (m) + return false; + } + return true; +} + +static struct xt_match iprange_mt_reg[] __read_mostly = { + { + .name = "iprange", + .revision = 0, + .family = AF_INET, + .match = iprange_mt_v0, + .matchsize = sizeof(struct ipt_iprange_info), + .me = THIS_MODULE, + }, + { + .name = "iprange", + .revision = 1, + .family = AF_INET6, + .match = iprange_mt4, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, + { + .name = "iprange", + .revision = 1, + .family = AF_INET6, + .match = iprange_mt6, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, +}; + +static int __init iprange_mt_init(void) +{ + return xt_register_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +static void __exit iprange_mt_exit(void) +{ + xt_unregister_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +module_init(iprange_mt_init); +module_exit(iprange_mt_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>, Jan Engelhardt <jengelh@computergmbh.de>"); +MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 3dad173d973..b8640f97295 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -15,20 +15,16 @@ #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("IP tables packet length matching module"); +MODULE_DESCRIPTION("Xtables: Packet length (Layer3,4,5) match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_length"); MODULE_ALIAS("ip6t_length"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +length_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_length_info *info = matchinfo; u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); @@ -37,14 +33,10 @@ match(const struct sk_buff *skb, } static bool -match6(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +length_mt6(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_length_info *info = matchinfo; const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + @@ -53,33 +45,32 @@ match6(const struct sk_buff *skb, return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } -static struct xt_match xt_length_match[] __read_mostly = { +static struct xt_match length_mt_reg[] __read_mostly = { { .name = "length", .family = AF_INET, - .match = match, + .match = length_mt, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, }, { .name = "length", .family = AF_INET6, - .match = match6, + .match = length_mt6, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, }, }; -static int __init xt_length_init(void) +static int __init length_mt_init(void) { - return xt_register_matches(xt_length_match, - ARRAY_SIZE(xt_length_match)); + return xt_register_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); } -static void __exit xt_length_fini(void) +static void __exit length_mt_exit(void) { - xt_unregister_matches(xt_length_match, ARRAY_SIZE(xt_length_match)); + xt_unregister_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); } -module_init(xt_length_init); -module_exit(xt_length_fini); +module_init(length_mt_init); +module_exit(length_mt_exit); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index f263a77e57b..aad9ab8d204 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -16,7 +16,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); -MODULE_DESCRIPTION("iptables rate limit match"); +MODULE_DESCRIPTION("Xtables: rate-limit match"); MODULE_ALIAS("ipt_limit"); MODULE_ALIAS("ip6t_limit"); @@ -58,14 +58,10 @@ static DEFINE_SPINLOCK(limit_lock); #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) static bool -ipt_limit_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +limit_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { struct xt_rateinfo *r = ((const struct xt_rateinfo *)matchinfo)->master; @@ -100,11 +96,9 @@ user2credits(u_int32_t user) } static bool -ipt_limit_checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +limit_mt_check(const char *tablename, const void *inf, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_rateinfo *r = matchinfo; @@ -143,7 +137,7 @@ struct compat_xt_rateinfo { /* To keep the full "prev" timestamp, the upper 32 bits are stored in the * master pointer, which does not need to be preserved. */ -static void compat_from_user(void *dst, void *src) +static void limit_mt_compat_from_user(void *dst, void *src) { const struct compat_xt_rateinfo *cm = src; struct xt_rateinfo m = { @@ -157,7 +151,7 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &m, sizeof(m)); } -static int compat_to_user(void __user *dst, void *src) +static int limit_mt_compat_to_user(void __user *dst, void *src) { const struct xt_rateinfo *m = src; struct compat_xt_rateinfo cm = { @@ -173,39 +167,44 @@ static int compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_match xt_limit_match[] __read_mostly = { +static struct xt_match limit_mt_reg[] __read_mostly = { { .name = "limit", .family = AF_INET, - .checkentry = ipt_limit_checkentry, - .match = ipt_limit_match, + .checkentry = limit_mt_check, + .match = limit_mt, .matchsize = sizeof(struct xt_rateinfo), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_rateinfo), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = limit_mt_compat_from_user, + .compat_to_user = limit_mt_compat_to_user, #endif .me = THIS_MODULE, }, { .name = "limit", .family = AF_INET6, - .checkentry = ipt_limit_checkentry, - .match = ipt_limit_match, + .checkentry = limit_mt_check, + .match = limit_mt, .matchsize = sizeof(struct xt_rateinfo), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_rateinfo), + .compat_from_user = limit_mt_compat_from_user, + .compat_to_user = limit_mt_compat_to_user, +#endif .me = THIS_MODULE, }, }; -static int __init xt_limit_init(void) +static int __init limit_mt_init(void) { - return xt_register_matches(xt_limit_match, ARRAY_SIZE(xt_limit_match)); + return xt_register_matches(limit_mt_reg, ARRAY_SIZE(limit_mt_reg)); } -static void __exit xt_limit_fini(void) +static void __exit limit_mt_exit(void) { - xt_unregister_matches(xt_limit_match, ARRAY_SIZE(xt_limit_match)); + xt_unregister_matches(limit_mt_reg, ARRAY_SIZE(limit_mt_reg)); } -module_init(xt_limit_init); -module_exit(xt_limit_fini); +module_init(limit_mt_init); +module_exit(limit_mt_exit); diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 00490d777a0..b3e96a0ec17 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -20,19 +20,14 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables mac matching module"); +MODULE_DESCRIPTION("Xtables: MAC address match"); MODULE_ALIAS("ipt_mac"); MODULE_ALIAS("ip6t_mac"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +mac_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct xt_mac_info *info = matchinfo; @@ -44,38 +39,38 @@ match(const struct sk_buff *skb, ^ info->invert); } -static struct xt_match xt_mac_match[] __read_mostly = { +static struct xt_match mac_mt_reg[] __read_mostly = { { .name = "mac", .family = AF_INET, - .match = match, + .match = mac_mt, .matchsize = sizeof(struct xt_mac_info), - .hooks = (1 << NF_IP_PRE_ROUTING) | - (1 << NF_IP_LOCAL_IN) | - (1 << NF_IP_FORWARD), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), .me = THIS_MODULE, }, { .name = "mac", .family = AF_INET6, - .match = match, + .match = mac_mt, .matchsize = sizeof(struct xt_mac_info), - .hooks = (1 << NF_IP6_PRE_ROUTING) | - (1 << NF_IP6_LOCAL_IN) | - (1 << NF_IP6_FORWARD), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), .me = THIS_MODULE, }, }; -static int __init xt_mac_init(void) +static int __init mac_mt_init(void) { - return xt_register_matches(xt_mac_match, ARRAY_SIZE(xt_mac_match)); + return xt_register_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg)); } -static void __exit xt_mac_fini(void) +static void __exit mac_mt_exit(void) { - xt_unregister_matches(xt_mac_match, ARRAY_SIZE(xt_mac_match)); + xt_unregister_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg)); } -module_init(xt_mac_init); -module_exit(xt_mac_fini); +module_init(mac_mt_init); +module_exit(mac_mt_exit); diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index c02a7f8f392..9f78f6120fb 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -1,10 +1,13 @@ -/* Kernel module to match NFMARK values. */ - -/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> +/* + * xt_mark - Netfilter module to match NFMARK value + * + * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@computergmbh.de> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ #include <linux/module.h> @@ -15,19 +18,15 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables mark matching module"); +MODULE_DESCRIPTION("Xtables: packet mark match"); MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +mark_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_mark_info *info = matchinfo; @@ -35,11 +34,19 @@ match(const struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +mark_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +{ + const struct xt_mark_mtinfo1 *info = matchinfo; + + return ((skb->mark & info->mask) == info->mark) ^ info->invert; +} + +static bool +mark_mt_check_v0(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_mark_info *minfo = matchinfo; @@ -58,7 +65,7 @@ struct compat_xt_mark_info { u_int16_t __pad2; }; -static void compat_from_user(void *dst, void *src) +static void mark_mt_compat_from_user_v0(void *dst, void *src) { const struct compat_xt_mark_info *cm = src; struct xt_mark_info m = { @@ -69,7 +76,7 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &m, sizeof(m)); } -static int compat_to_user(void __user *dst, void *src) +static int mark_mt_compat_to_user_v0(void __user *dst, void *src) { const struct xt_mark_info *m = src; struct compat_xt_mark_info cm = { @@ -81,39 +88,62 @@ static int compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_match xt_mark_match[] __read_mostly = { +static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", + .revision = 0, .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = mark_mt_check_v0, + .match = mark_mt_v0, .matchsize = sizeof(struct xt_mark_info), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_xt_mark_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = mark_mt_compat_from_user_v0, + .compat_to_user = mark_mt_compat_to_user_v0, #endif .me = THIS_MODULE, }, { .name = "mark", + .revision = 0, .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = mark_mt_check_v0, + .match = mark_mt_v0, .matchsize = sizeof(struct xt_mark_info), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_mark_info), + .compat_from_user = mark_mt_compat_from_user_v0, + .compat_to_user = mark_mt_compat_to_user_v0, +#endif .me = THIS_MODULE, }, + { + .name = "mark", + .revision = 1, + .family = AF_INET, + .match = mark_mt, + .matchsize = sizeof(struct xt_mark_mtinfo1), + .me = THIS_MODULE, + }, + { + .name = "mark", + .revision = 1, + .family = AF_INET6, + .match = mark_mt, + .matchsize = sizeof(struct xt_mark_mtinfo1), + .me = THIS_MODULE, + }, }; -static int __init xt_mark_init(void) +static int __init mark_mt_init(void) { - return xt_register_matches(xt_mark_match, ARRAY_SIZE(xt_mark_match)); + return xt_register_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg)); } -static void __exit xt_mark_fini(void) +static void __exit mark_mt_exit(void) { - xt_unregister_matches(xt_mark_match, ARRAY_SIZE(xt_mark_match)); + xt_unregister_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg)); } -module_init(xt_mark_init); -module_exit(xt_mark_fini); +module_init(mark_mt_init); +module_exit(mark_mt_exit); diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index e8ae10284ac..31daa819242 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -22,7 +22,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("x_tables multiple port match module"); +MODULE_DESCRIPTION("Xtables: multiple port matching for TCP, UDP, UDP-Lite, SCTP and DCCP"); MODULE_ALIAS("ipt_multiport"); MODULE_ALIAS("ip6t_multiport"); @@ -34,8 +34,8 @@ MODULE_ALIAS("ip6t_multiport"); /* Returns 1 if the port is matched by the test, 0 otherwise. */ static inline bool -ports_match(const u_int16_t *portlist, enum xt_multiport_flags flags, - u_int8_t count, u_int16_t src, u_int16_t dst) +ports_match_v0(const u_int16_t *portlist, enum xt_multiport_flags flags, + u_int8_t count, u_int16_t src, u_int16_t dst) { unsigned int i; for (i = 0; i < count; i++) { @@ -95,14 +95,10 @@ ports_match_v1(const struct xt_multiport_v1 *minfo, } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +multiport_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { __be16 _ports[2], *pptr; const struct xt_multiport *multiinfo = matchinfo; @@ -120,20 +116,15 @@ match(const struct sk_buff *skb, return false; } - return ports_match(multiinfo->ports, - multiinfo->flags, multiinfo->count, - ntohs(pptr[0]), ntohs(pptr[1])); + return ports_match_v0(multiinfo->ports, multiinfo->flags, + multiinfo->count, ntohs(pptr[0]), ntohs(pptr[1])); } static bool -match_v1(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +multiport_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { __be16 _ports[2], *pptr; const struct xt_multiport_v1 *multiinfo = matchinfo; @@ -173,11 +164,9 @@ check(u_int16_t proto, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +multiport_mt_check_v0(const char *tablename, const void *info, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_ip *ip = info; const struct xt_multiport *multiinfo = matchinfo; @@ -187,11 +176,9 @@ checkentry(const char *tablename, } static bool -checkentry_v1(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +multiport_mt_check(const char *tablename, const void *info, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_ip *ip = info; const struct xt_multiport_v1 *multiinfo = matchinfo; @@ -201,11 +188,9 @@ checkentry_v1(const char *tablename, } static bool -checkentry6(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +multiport_mt6_check_v0(const char *tablename, const void *info, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_ip6 *ip = info; const struct xt_multiport *multiinfo = matchinfo; @@ -215,11 +200,9 @@ checkentry6(const char *tablename, } static bool -checkentry6_v1(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +multiport_mt6_check(const char *tablename, const void *info, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ip6t_ip6 *ip = info; const struct xt_multiport_v1 *multiinfo = matchinfo; @@ -228,13 +211,13 @@ checkentry6_v1(const char *tablename, multiinfo->count); } -static struct xt_match xt_multiport_match[] __read_mostly = { +static struct xt_match multiport_mt_reg[] __read_mostly = { { .name = "multiport", .family = AF_INET, .revision = 0, - .checkentry = checkentry, - .match = match, + .checkentry = multiport_mt_check_v0, + .match = multiport_mt_v0, .matchsize = sizeof(struct xt_multiport), .me = THIS_MODULE, }, @@ -242,8 +225,8 @@ static struct xt_match xt_multiport_match[] __read_mostly = { .name = "multiport", .family = AF_INET, .revision = 1, - .checkentry = checkentry_v1, - .match = match_v1, + .checkentry = multiport_mt_check, + .match = multiport_mt, .matchsize = sizeof(struct xt_multiport_v1), .me = THIS_MODULE, }, @@ -251,8 +234,8 @@ static struct xt_match xt_multiport_match[] __read_mostly = { .name = "multiport", .family = AF_INET6, .revision = 0, - .checkentry = checkentry6, - .match = match, + .checkentry = multiport_mt6_check_v0, + .match = multiport_mt_v0, .matchsize = sizeof(struct xt_multiport), .me = THIS_MODULE, }, @@ -260,24 +243,23 @@ static struct xt_match xt_multiport_match[] __read_mostly = { .name = "multiport", .family = AF_INET6, .revision = 1, - .checkentry = checkentry6_v1, - .match = match_v1, + .checkentry = multiport_mt6_check, + .match = multiport_mt, .matchsize = sizeof(struct xt_multiport_v1), .me = THIS_MODULE, }, }; -static int __init xt_multiport_init(void) +static int __init multiport_mt_init(void) { - return xt_register_matches(xt_multiport_match, - ARRAY_SIZE(xt_multiport_match)); + return xt_register_matches(multiport_mt_reg, + ARRAY_SIZE(multiport_mt_reg)); } -static void __exit xt_multiport_fini(void) +static void __exit multiport_mt_exit(void) { - xt_unregister_matches(xt_multiport_match, - ARRAY_SIZE(xt_multiport_match)); + xt_unregister_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg)); } -module_init(xt_multiport_init); -module_exit(xt_multiport_fini); +module_init(multiport_mt_init); +module_exit(multiport_mt_exit); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c new file mode 100644 index 00000000000..d382f9cc38b --- /dev/null +++ b/net/netfilter/xt_owner.c @@ -0,0 +1,211 @@ +/* + * Kernel module to match various things tied to sockets associated with + * locally generated outgoing packets. + * + * (C) 2000 Marc Boucher <marc@mbsi.ca> + * + * Copyright © CC Computer Consultants GmbH, 2007 + * Contact: <jengelh@computergmbh.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/file.h> +#include <net/sock.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_owner.h> +#include <linux/netfilter_ipv4/ipt_owner.h> +#include <linux/netfilter_ipv6/ip6t_owner.h> + +static bool +owner_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct ipt_owner_info *info = matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return false; + + if (info->match & IPT_OWNER_UID) + if ((filp->f_uid != info->uid) ^ + !!(info->invert & IPT_OWNER_UID)) + return false; + + if (info->match & IPT_OWNER_GID) + if ((filp->f_gid != info->gid) ^ + !!(info->invert & IPT_OWNER_GID)) + return false; + + return true; +} + +static bool +owner_mt6_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct ip6t_owner_info *info = matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return false; + + if (info->match & IP6T_OWNER_UID) + if ((filp->f_uid != info->uid) ^ + !!(info->invert & IP6T_OWNER_UID)) + return false; + + if (info->match & IP6T_OWNER_GID) + if ((filp->f_gid != info->gid) ^ + !!(info->invert & IP6T_OWNER_GID)) + return false; + + return true; +} + +static bool +owner_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_owner_match_info *info = matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return (info->match ^ info->invert) == 0; + else if (info->match & info->invert & XT_OWNER_SOCKET) + /* + * Socket exists but user wanted ! --socket-exists. + * (Single ampersands intended.) + */ + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return ((info->match ^ info->invert) & + (XT_OWNER_UID | XT_OWNER_GID)) == 0; + + if (info->match & XT_OWNER_UID) + if ((filp->f_uid != info->uid) ^ + !!(info->invert & XT_OWNER_UID)) + return false; + + if (info->match & XT_OWNER_GID) + if ((filp->f_gid != info->gid) ^ + !!(info->invert & XT_OWNER_GID)) + return false; + + return true; +} + +static bool +owner_mt_check_v0(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) +{ + const struct ipt_owner_info *info = matchinfo; + + if (info->match & (IPT_OWNER_PID | IPT_OWNER_SID | IPT_OWNER_COMM)) { + printk(KERN_WARNING KBUILD_MODNAME + ": PID, SID and command matching is not " + "supported anymore\n"); + return false; + } + + return true; +} + +static bool +owner_mt6_check_v0(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) +{ + const struct ip6t_owner_info *info = matchinfo; + + if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) { + printk(KERN_WARNING KBUILD_MODNAME + ": PID and SID matching is not supported anymore\n"); + return false; + } + + return true; +} + +static struct xt_match owner_mt_reg[] __read_mostly = { + { + .name = "owner", + .revision = 0, + .family = AF_INET, + .match = owner_mt_v0, + .matchsize = sizeof(struct ipt_owner_info), + .checkentry = owner_mt_check_v0, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "owner", + .revision = 0, + .family = AF_INET6, + .match = owner_mt6_v0, + .matchsize = sizeof(struct ip6t_owner_info), + .checkentry = owner_mt6_check_v0, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "owner", + .revision = 1, + .family = AF_INET, + .match = owner_mt, + .matchsize = sizeof(struct xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "owner", + .revision = 1, + .family = AF_INET6, + .match = owner_mt, + .matchsize = sizeof(struct xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, +}; + +static int __init owner_mt_init(void) +{ + return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg)); +} + +static void __exit owner_mt_exit(void) +{ + xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg)); +} + +module_init(owner_mt_init); +module_exit(owner_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@computergmbh.de>"); +MODULE_DESCRIPTION("Xtables: socket owner matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_owner"); +MODULE_ALIAS("ip6t_owner"); diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index a4bab043a6d..4ec1094bda9 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -16,19 +16,15 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); -MODULE_DESCRIPTION("iptables bridge physical device match module"); +MODULE_DESCRIPTION("Xtables: Bridge physical device match"); MODULE_ALIAS("ipt_physdev"); MODULE_ALIAS("ip6t_physdev"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +physdev_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { int i; static const char nulldevname[IFNAMSIZ]; @@ -99,11 +95,9 @@ match_outdev: } static bool -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +physdev_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_physdev_info *info = matchinfo; @@ -113,46 +107,45 @@ checkentry(const char *tablename, if (info->bitmask & XT_PHYSDEV_OP_OUT && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && - hook_mask & ((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_POST_ROUTING))) { + hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING))) { printk(KERN_WARNING "physdev match: using --physdev-out in the " "OUTPUT, FORWARD and POSTROUTING chains for non-bridged " "traffic is not supported anymore.\n"); - if (hook_mask & (1 << NF_IP_LOCAL_OUT)) + if (hook_mask & (1 << NF_INET_LOCAL_OUT)) return false; } return true; } -static struct xt_match xt_physdev_match[] __read_mostly = { +static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = physdev_mt_check, + .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), .me = THIS_MODULE, }, { .name = "physdev", .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = physdev_mt_check, + .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), .me = THIS_MODULE, }, }; -static int __init xt_physdev_init(void) +static int __init physdev_mt_init(void) { - return xt_register_matches(xt_physdev_match, - ARRAY_SIZE(xt_physdev_match)); + return xt_register_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg)); } -static void __exit xt_physdev_fini(void) +static void __exit physdev_mt_exit(void) { - xt_unregister_matches(xt_physdev_match, ARRAY_SIZE(xt_physdev_match)); + xt_unregister_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg)); } -module_init(xt_physdev_init); -module_exit(xt_physdev_fini); +module_init(physdev_mt_init); +module_exit(physdev_mt_exit); diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index a52925f12f3..7936f7e2325 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -11,65 +11,66 @@ #include <linux/if_packet.h> #include <linux/in.h> #include <linux/ip.h> +#include <linux/ipv6.h> #include <linux/netfilter/xt_pkttype.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>"); -MODULE_DESCRIPTION("IP tables match to match on linklayer packet type"); +MODULE_DESCRIPTION("Xtables: link layer packet type match"); MODULE_ALIAS("ipt_pkttype"); MODULE_ALIAS("ip6t_pkttype"); -static bool match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +static bool +pkttype_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { - u_int8_t type; const struct xt_pkttype_info *info = matchinfo; + u_int8_t type; - if (skb->pkt_type == PACKET_LOOPBACK) - type = MULTICAST(ip_hdr(skb)->daddr) - ? PACKET_MULTICAST - : PACKET_BROADCAST; - else + if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; + else if (match->family == AF_INET && + ipv4_is_multicast(ip_hdr(skb)->daddr)) + type = PACKET_MULTICAST; + else if (match->family == AF_INET6 && + ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + type = PACKET_MULTICAST; + else + type = PACKET_BROADCAST; return (type == info->pkttype) ^ info->invert; } -static struct xt_match xt_pkttype_match[] __read_mostly = { +static struct xt_match pkttype_mt_reg[] __read_mostly = { { .name = "pkttype", .family = AF_INET, - .match = match, + .match = pkttype_mt, .matchsize = sizeof(struct xt_pkttype_info), .me = THIS_MODULE, }, { .name = "pkttype", .family = AF_INET6, - .match = match, + .match = pkttype_mt, .matchsize = sizeof(struct xt_pkttype_info), .me = THIS_MODULE, }, }; -static int __init xt_pkttype_init(void) +static int __init pkttype_mt_init(void) { - return xt_register_matches(xt_pkttype_match, - ARRAY_SIZE(xt_pkttype_match)); + return xt_register_matches(pkttype_mt_reg, ARRAY_SIZE(pkttype_mt_reg)); } -static void __exit xt_pkttype_fini(void) +static void __exit pkttype_mt_exit(void) { - xt_unregister_matches(xt_pkttype_match, ARRAY_SIZE(xt_pkttype_match)); + xt_unregister_matches(pkttype_mt_reg, ARRAY_SIZE(pkttype_mt_reg)); } -module_init(xt_pkttype_init); -module_exit(xt_pkttype_fini); +module_init(pkttype_mt_init); +module_exit(pkttype_mt_exit); diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 6d6d3b7fcbb..9e918add228 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -13,37 +13,38 @@ #include <linux/init.h> #include <net/xfrm.h> +#include <linux/netfilter.h> #include <linux/netfilter/xt_policy.h> #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables IPsec policy matching module"); +MODULE_DESCRIPTION("Xtables: IPsec policy match"); MODULE_LICENSE("GPL"); static inline bool -xt_addr_cmp(const union xt_policy_addr *a1, const union xt_policy_addr *m, - const union xt_policy_addr *a2, unsigned short family) +xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, + const union nf_inet_addr *a2, unsigned short family) { switch (family) { case AF_INET: - return !((a1->a4.s_addr ^ a2->a4.s_addr) & m->a4.s_addr); + return ((a1->ip ^ a2->ip) & m->ip) == 0; case AF_INET6: - return !ipv6_masked_addr_cmp(&a1->a6, &m->a6, &a2->a6); + return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; } return false; } -static inline bool +static bool match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e, unsigned short family) { #define MATCH_ADDR(x,y,z) (!e->match.x || \ - (xt_addr_cmp(&e->x, &e->y, z, family) \ + (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \ ^ e->invert.x)) #define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) - return MATCH_ADDR(saddr, smask, (union xt_policy_addr *)&x->props.saddr) && - MATCH_ADDR(daddr, dmask, (union xt_policy_addr *)&x->id.daddr) && + return MATCH_ADDR(saddr, smask, &x->props.saddr) && + MATCH_ADDR(daddr, dmask, &x->id.daddr) && MATCH(proto, x->id.proto) && MATCH(mode, x->props.mode) && MATCH(spi, x->id.spi) && @@ -108,14 +109,11 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, return strict ? i == info->len : 0; } -static bool match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +static bool +policy_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_policy_info *info = matchinfo; int ret; @@ -133,9 +131,10 @@ static bool match(const struct sk_buff *skb, return ret; } -static bool checkentry(const char *tablename, const void *ip_void, - const struct xt_match *match, - void *matchinfo, unsigned int hook_mask) +static bool +policy_mt_check(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_policy_info *info = matchinfo; @@ -144,14 +143,13 @@ static bool checkentry(const char *tablename, const void *ip_void, "outgoing policy selected\n"); return false; } - /* hook values are equal for IPv4 and IPv6 */ - if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN) + if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) && info->flags & XT_POLICY_MATCH_OUT) { printk(KERN_ERR "xt_policy: output policy not valid in " "PRE_ROUTING and INPUT\n"); return false; } - if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT) + if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) && info->flags & XT_POLICY_MATCH_IN) { printk(KERN_ERR "xt_policy: input policy not valid in " "POST_ROUTING and OUTPUT\n"); @@ -164,37 +162,36 @@ static bool checkentry(const char *tablename, const void *ip_void, return true; } -static struct xt_match xt_policy_match[] __read_mostly = { +static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = policy_mt_check, + .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, { .name = "policy", .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = policy_mt_check, + .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, }; -static int __init init(void) +static int __init policy_mt_init(void) { - return xt_register_matches(xt_policy_match, - ARRAY_SIZE(xt_policy_match)); + return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } -static void __exit fini(void) +static void __exit policy_mt_exit(void) { - xt_unregister_matches(xt_policy_match, ARRAY_SIZE(xt_policy_match)); + xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } -module_init(init); -module_exit(fini); +module_init(policy_mt_init); +module_exit(policy_mt_exit); MODULE_ALIAS("ipt_policy"); MODULE_ALIAS("ip6t_policy"); diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index dae97445b87..3b021d0c522 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -11,16 +11,17 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston <samj@samj.net>"); +MODULE_DESCRIPTION("Xtables: countdown quota match"); MODULE_ALIAS("ipt_quota"); MODULE_ALIAS("ip6t_quota"); static DEFINE_SPINLOCK(quota_lock); static bool -match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +quota_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { struct xt_quota_info *q = ((const struct xt_quota_info *)matchinfo)->master; @@ -40,9 +41,9 @@ match(const struct sk_buff *skb, } static bool -checkentry(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +quota_mt_check(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_quota_info *q = matchinfo; @@ -53,34 +54,34 @@ checkentry(const char *tablename, const void *entry, return true; } -static struct xt_match xt_quota_match[] __read_mostly = { +static struct xt_match quota_mt_reg[] __read_mostly = { { .name = "quota", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = quota_mt_check, + .match = quota_mt, .matchsize = sizeof(struct xt_quota_info), .me = THIS_MODULE }, { .name = "quota", .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = quota_mt_check, + .match = quota_mt, .matchsize = sizeof(struct xt_quota_info), .me = THIS_MODULE }, }; -static int __init xt_quota_init(void) +static int __init quota_mt_init(void) { - return xt_register_matches(xt_quota_match, ARRAY_SIZE(xt_quota_match)); + return xt_register_matches(quota_mt_reg, ARRAY_SIZE(quota_mt_reg)); } -static void __exit xt_quota_fini(void) +static void __exit quota_mt_exit(void) { - xt_unregister_matches(xt_quota_match, ARRAY_SIZE(xt_quota_match)); + xt_unregister_matches(quota_mt_reg, ARRAY_SIZE(quota_mt_reg)); } -module_init(xt_quota_init); -module_exit(xt_quota_fini); +module_init(quota_mt_init); +module_exit(quota_mt_exit); diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c new file mode 100644 index 00000000000..fdb86a51514 --- /dev/null +++ b/net/netfilter/xt_rateest.c @@ -0,0 +1,178 @@ +/* + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/gen_stats.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_rateest.h> +#include <net/netfilter/xt_rateest.h> + + +static bool xt_rateest_mt(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct xt_match *match, + const void *matchinfo, + int offset, + unsigned int protoff, + bool *hotdrop) +{ + const struct xt_rateest_match_info *info = matchinfo; + struct gnet_stats_rate_est *r; + u_int32_t bps1, bps2, pps1, pps2; + bool ret = true; + + spin_lock_bh(&info->est1->lock); + r = &info->est1->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0; + pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0; + } else { + bps1 = r->bps; + pps1 = r->pps; + } + spin_unlock_bh(&info->est1->lock); + + if (info->flags & XT_RATEEST_MATCH_ABS) { + bps2 = info->bps2; + pps2 = info->pps2; + } else { + spin_lock_bh(&info->est2->lock); + r = &info->est2->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0; + pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0; + } else { + bps2 = r->bps; + pps2 = r->pps; + } + spin_unlock_bh(&info->est2->lock); + } + + switch (info->mode) { + case XT_RATEEST_MATCH_LT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 < bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 < pps2; + break; + case XT_RATEEST_MATCH_GT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 > bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 > pps2; + break; + case XT_RATEEST_MATCH_EQ: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 == bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps2 == pps2; + break; + } + + ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false; + return ret; +} + +static bool xt_rateest_mt_checkentry(const char *tablename, + const void *ip, + const struct xt_match *match, + void *matchinfo, + unsigned int hook_mask) +{ + struct xt_rateest_match_info *info = (void *)matchinfo; + struct xt_rateest *est1, *est2; + + if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | + XT_RATEEST_MATCH_REL)) != 1) + goto err1; + + if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS))) + goto err1; + + switch (info->mode) { + case XT_RATEEST_MATCH_EQ: + case XT_RATEEST_MATCH_LT: + case XT_RATEEST_MATCH_GT: + break; + default: + goto err1; + } + + est1 = xt_rateest_lookup(info->name1); + if (!est1) + goto err1; + + if (info->flags & XT_RATEEST_MATCH_REL) { + est2 = xt_rateest_lookup(info->name2); + if (!est2) + goto err2; + } else + est2 = NULL; + + + info->est1 = est1; + info->est2 = est2; + return true; + +err2: + xt_rateest_put(est1); +err1: + return false; +} + +static void xt_rateest_mt_destroy(const struct xt_match *match, + void *matchinfo) +{ + struct xt_rateest_match_info *info = (void *)matchinfo; + + xt_rateest_put(info->est1); + if (info->est2) + xt_rateest_put(info->est2); +} + +static struct xt_match xt_rateest_match[] __read_mostly = { + { + .family = AF_INET, + .name = "rateest", + .match = xt_rateest_mt, + .checkentry = xt_rateest_mt_checkentry, + .destroy = xt_rateest_mt_destroy, + .matchsize = sizeof(struct xt_rateest_match_info), + .me = THIS_MODULE, + }, + { + .family = AF_INET6, + .name = "rateest", + .match = xt_rateest_mt, + .checkentry = xt_rateest_mt_checkentry, + .destroy = xt_rateest_mt_destroy, + .matchsize = sizeof(struct xt_rateest_match_info), + .me = THIS_MODULE, + }, +}; + +static int __init xt_rateest_mt_init(void) +{ + return xt_register_matches(xt_rateest_match, + ARRAY_SIZE(xt_rateest_match)); +} + +static void __exit xt_rateest_mt_fini(void) +{ + xt_unregister_matches(xt_rateest_match, ARRAY_SIZE(xt_rateest_match)); +} + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xtables rate estimator match"); +MODULE_ALIAS("ipt_rateest"); +MODULE_ALIAS("ip6t_rateest"); +module_init(xt_rateest_mt_init); +module_exit(xt_rateest_mt_fini); diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index cc3e76d77a9..7df1627c536 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -18,18 +18,14 @@ MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("X_tables realm match"); +MODULE_DESCRIPTION("Xtables: Routing realm match"); MODULE_ALIAS("ipt_realm"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +realm_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_realm_info *info = matchinfo; const struct dst_entry *dst = skb->dst; @@ -37,25 +33,25 @@ match(const struct sk_buff *skb, return (info->id == (dst->tclassid & info->mask)) ^ info->invert; } -static struct xt_match realm_match __read_mostly = { +static struct xt_match realm_mt_reg __read_mostly = { .name = "realm", - .match = match, + .match = realm_mt, .matchsize = sizeof(struct xt_realm_info), - .hooks = (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN), + .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .family = AF_INET, .me = THIS_MODULE }; -static int __init xt_realm_init(void) +static int __init realm_mt_init(void) { - return xt_register_match(&realm_match); + return xt_register_match(&realm_mt_reg); } -static void __exit xt_realm_fini(void) +static void __exit realm_mt_exit(void) { - xt_unregister_match(&realm_match); + xt_unregister_match(&realm_mt_reg); } -module_init(xt_realm_init); -module_exit(xt_realm_fini); +module_init(realm_mt_init); +module_exit(realm_mt_exit); diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 3358273a47b..b718ec64333 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -11,7 +11,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kiran Kumar Immidi"); -MODULE_DESCRIPTION("Match for SCTP protocol packets"); +MODULE_DESCRIPTION("Xtables: SCTP protocol packet match"); MODULE_ALIAS("ipt_sctp"); MODULE_ALIAS("ip6t_sctp"); @@ -116,14 +116,9 @@ match_packet(const struct sk_buff *skb, } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +sctp_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct xt_sctp_info *info = matchinfo; sctp_sctphdr_t _sh, *sh; @@ -153,11 +148,9 @@ match(const struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +sctp_mt_check(const char *tablename, const void *inf, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_sctp_info *info = matchinfo; @@ -171,12 +164,12 @@ checkentry(const char *tablename, | SCTP_CHUNK_MATCH_ONLY))); } -static struct xt_match xt_sctp_match[] __read_mostly = { +static struct xt_match sctp_mt_reg[] __read_mostly = { { .name = "sctp", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = sctp_mt_check, + .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), .proto = IPPROTO_SCTP, .me = THIS_MODULE @@ -184,23 +177,23 @@ static struct xt_match xt_sctp_match[] __read_mostly = { { .name = "sctp", .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = sctp_mt_check, + .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), .proto = IPPROTO_SCTP, .me = THIS_MODULE }, }; -static int __init xt_sctp_init(void) +static int __init sctp_mt_init(void) { - return xt_register_matches(xt_sctp_match, ARRAY_SIZE(xt_sctp_match)); + return xt_register_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); } -static void __exit xt_sctp_fini(void) +static void __exit sctp_mt_exit(void) { - xt_unregister_matches(xt_sctp_match, ARRAY_SIZE(xt_sctp_match)); + xt_unregister_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); } -module_init(xt_sctp_init); -module_exit(xt_sctp_fini); +module_init(sctp_mt_init); +module_exit(sctp_mt_exit); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index e0a528df19a..a776dc36a19 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -21,14 +21,10 @@ MODULE_ALIAS("ipt_state"); MODULE_ALIAS("ip6t_state"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +state_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_state_info *sinfo = matchinfo; enum ip_conntrack_info ctinfo; @@ -44,56 +40,54 @@ match(const struct sk_buff *skb, return (sinfo->statemask & statebit); } -static bool check(const char *tablename, - const void *inf, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool +state_mt_check(const char *tablename, const void *inf, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { if (nf_ct_l3proto_try_module_get(match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", match->family); + "proto=%u\n", match->family); return false; } return true; } -static void -destroy(const struct xt_match *match, void *matchinfo) +static void state_mt_destroy(const struct xt_match *match, void *matchinfo) { nf_ct_l3proto_module_put(match->family); } -static struct xt_match xt_state_match[] __read_mostly = { +static struct xt_match state_mt_reg[] __read_mostly = { { .name = "state", .family = AF_INET, - .checkentry = check, - .match = match, - .destroy = destroy, + .checkentry = state_mt_check, + .match = state_mt, + .destroy = state_mt_destroy, .matchsize = sizeof(struct xt_state_info), .me = THIS_MODULE, }, { .name = "state", .family = AF_INET6, - .checkentry = check, - .match = match, - .destroy = destroy, + .checkentry = state_mt_check, + .match = state_mt, + .destroy = state_mt_destroy, .matchsize = sizeof(struct xt_state_info), .me = THIS_MODULE, }, }; -static int __init xt_state_init(void) +static int __init state_mt_init(void) { - return xt_register_matches(xt_state_match, ARRAY_SIZE(xt_state_match)); + return xt_register_matches(state_mt_reg, ARRAY_SIZE(state_mt_reg)); } -static void __exit xt_state_fini(void) +static void __exit state_mt_exit(void) { - xt_unregister_matches(xt_state_match, ARRAY_SIZE(xt_state_match)); + xt_unregister_matches(state_mt_reg, ARRAY_SIZE(state_mt_reg)); } -module_init(xt_state_init); -module_exit(xt_state_fini); +module_init(state_mt_init); +module_exit(state_mt_exit); diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 4089dae4e28..43133080da7 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -18,17 +18,17 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("xtables statistical match module"); +MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); MODULE_ALIAS("ipt_statistic"); MODULE_ALIAS("ip6t_statistic"); static DEFINE_SPINLOCK(nth_lock); static bool -match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +statistic_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; @@ -53,9 +53,9 @@ match(const struct sk_buff *skb, } static bool -checkentry(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +statistic_mt_check(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_statistic_info *info = matchinfo; @@ -66,36 +66,36 @@ checkentry(const char *tablename, const void *entry, return true; } -static struct xt_match xt_statistic_match[] __read_mostly = { +static struct xt_match statistic_mt_reg[] __read_mostly = { { .name = "statistic", .family = AF_INET, - .checkentry = checkentry, - .match = match, + .checkentry = statistic_mt_check, + .match = statistic_mt, .matchsize = sizeof(struct xt_statistic_info), .me = THIS_MODULE, }, { .name = "statistic", .family = AF_INET6, - .checkentry = checkentry, - .match = match, + .checkentry = statistic_mt_check, + .match = statistic_mt, .matchsize = sizeof(struct xt_statistic_info), .me = THIS_MODULE, }, }; -static int __init xt_statistic_init(void) +static int __init statistic_mt_init(void) { - return xt_register_matches(xt_statistic_match, - ARRAY_SIZE(xt_statistic_match)); + return xt_register_matches(statistic_mt_reg, + ARRAY_SIZE(statistic_mt_reg)); } -static void __exit xt_statistic_fini(void) +static void __exit statistic_mt_exit(void) { - xt_unregister_matches(xt_statistic_match, - ARRAY_SIZE(xt_statistic_match)); + xt_unregister_matches(statistic_mt_reg, + ARRAY_SIZE(statistic_mt_reg)); } -module_init(xt_statistic_init); -module_exit(xt_statistic_fini); +module_init(statistic_mt_init); +module_exit(statistic_mt_exit); diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 864133442cd..72f694d947f 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -16,19 +16,16 @@ #include <linux/textsearch.h> MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); -MODULE_DESCRIPTION("IP tables string match module"); +MODULE_DESCRIPTION("Xtables: string-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); -static bool match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +static bool +string_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_string_info *conf = matchinfo; struct ts_state state; @@ -40,13 +37,12 @@ static bool match(const struct sk_buff *skb, != UINT_MAX) ^ conf->invert; } -#define STRING_TEXT_PRIV(m) ((struct xt_string_info *) m) +#define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) -static bool checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool +string_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_string_info *conf = matchinfo; struct ts_config *ts_conf; @@ -68,41 +64,41 @@ static bool checkentry(const char *tablename, return true; } -static void destroy(const struct xt_match *match, void *matchinfo) +static void string_mt_destroy(const struct xt_match *match, void *matchinfo) { textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); } -static struct xt_match xt_string_match[] __read_mostly = { +static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .family = AF_INET, - .checkentry = checkentry, - .match = match, - .destroy = destroy, + .checkentry = string_mt_check, + .match = string_mt, + .destroy = string_mt_destroy, .matchsize = sizeof(struct xt_string_info), .me = THIS_MODULE }, { .name = "string", .family = AF_INET6, - .checkentry = checkentry, - .match = match, - .destroy = destroy, + .checkentry = string_mt_check, + .match = string_mt, + .destroy = string_mt_destroy, .matchsize = sizeof(struct xt_string_info), .me = THIS_MODULE }, }; -static int __init xt_string_init(void) +static int __init string_mt_init(void) { - return xt_register_matches(xt_string_match, ARRAY_SIZE(xt_string_match)); + return xt_register_matches(string_mt_reg, ARRAY_SIZE(string_mt_reg)); } -static void __exit xt_string_fini(void) +static void __exit string_mt_exit(void) { - xt_unregister_matches(xt_string_match, ARRAY_SIZE(xt_string_match)); + xt_unregister_matches(string_mt_reg, ARRAY_SIZE(string_mt_reg)); } -module_init(xt_string_init); -module_exit(xt_string_fini); +module_init(string_mt_init); +module_exit(string_mt_exit); diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 84d401bfafa..d7a5b27fe81 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -20,19 +20,15 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables TCP MSS match module"); +MODULE_DESCRIPTION("Xtables: TCP MSS match"); MODULE_ALIAS("ipt_tcpmss"); MODULE_ALIAS("ip6t_tcpmss"); static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +tcpmss_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct xt_tcpmss_match_info *info = matchinfo; struct tcphdr _tcph, *th; @@ -82,11 +78,11 @@ dropit: return false; } -static struct xt_match xt_tcpmss_match[] __read_mostly = { +static struct xt_match tcpmss_mt_reg[] __read_mostly = { { .name = "tcpmss", .family = AF_INET, - .match = match, + .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, @@ -94,23 +90,22 @@ static struct xt_match xt_tcpmss_match[] __read_mostly = { { .name = "tcpmss", .family = AF_INET6, - .match = match, + .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, }; -static int __init xt_tcpmss_init(void) +static int __init tcpmss_mt_init(void) { - return xt_register_matches(xt_tcpmss_match, - ARRAY_SIZE(xt_tcpmss_match)); + return xt_register_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); } -static void __exit xt_tcpmss_fini(void) +static void __exit tcpmss_mt_exit(void) { - xt_unregister_matches(xt_tcpmss_match, ARRAY_SIZE(xt_tcpmss_match)); + xt_unregister_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); } -module_init(xt_tcpmss_init); -module_exit(xt_tcpmss_fini); +module_init(tcpmss_mt_init); +module_exit(tcpmss_mt_exit); diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 223f9bded67..4fa3b669f69 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -10,7 +10,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> -MODULE_DESCRIPTION("x_tables match for TCP and UDP(-Lite), supports IPv4 and IPv6"); +MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("xt_tcp"); MODULE_ALIAS("xt_udp"); @@ -68,14 +68,9 @@ tcp_find_option(u_int8_t option, } static bool -tcp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +tcp_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct tcphdr _tcph, *th; const struct xt_tcp *tcpinfo = matchinfo; @@ -134,11 +129,9 @@ tcp_match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -tcp_checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +tcp_mt_check(const char *tablename, const void *info, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_tcp *tcpinfo = matchinfo; @@ -147,14 +140,9 @@ tcp_checkentry(const char *tablename, } static bool -udp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +udp_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct udphdr _udph, *uh; const struct xt_udp *udpinfo = matchinfo; @@ -182,11 +170,9 @@ udp_match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -udp_checkentry(const char *tablename, - const void *info, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +udp_mt_check(const char *tablename, const void *info, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct xt_udp *udpinfo = matchinfo; @@ -194,12 +180,12 @@ udp_checkentry(const char *tablename, return !(udpinfo->invflags & ~XT_UDP_INV_MASK); } -static struct xt_match xt_tcpudp_match[] __read_mostly = { +static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", .family = AF_INET, - .checkentry = tcp_checkentry, - .match = tcp_match, + .checkentry = tcp_mt_check, + .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, @@ -207,8 +193,8 @@ static struct xt_match xt_tcpudp_match[] __read_mostly = { { .name = "tcp", .family = AF_INET6, - .checkentry = tcp_checkentry, - .match = tcp_match, + .checkentry = tcp_mt_check, + .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, @@ -216,8 +202,8 @@ static struct xt_match xt_tcpudp_match[] __read_mostly = { { .name = "udp", .family = AF_INET, - .checkentry = udp_checkentry, - .match = udp_match, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, @@ -225,8 +211,8 @@ static struct xt_match xt_tcpudp_match[] __read_mostly = { { .name = "udp", .family = AF_INET6, - .checkentry = udp_checkentry, - .match = udp_match, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, @@ -234,8 +220,8 @@ static struct xt_match xt_tcpudp_match[] __read_mostly = { { .name = "udplite", .family = AF_INET, - .checkentry = udp_checkentry, - .match = udp_match, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, @@ -243,24 +229,23 @@ static struct xt_match xt_tcpudp_match[] __read_mostly = { { .name = "udplite", .family = AF_INET6, - .checkentry = udp_checkentry, - .match = udp_match, + .checkentry = udp_mt_check, + .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, }; -static int __init xt_tcpudp_init(void) +static int __init tcpudp_mt_init(void) { - return xt_register_matches(xt_tcpudp_match, - ARRAY_SIZE(xt_tcpudp_match)); + return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } -static void __exit xt_tcpudp_fini(void) +static void __exit tcpudp_mt_exit(void) { - xt_unregister_matches(xt_tcpudp_match, ARRAY_SIZE(xt_tcpudp_match)); + xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } -module_init(xt_tcpudp_init); -module_exit(xt_tcpudp_fini); +module_init(tcpudp_mt_init); +module_exit(tcpudp_mt_exit); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index f9c55dcd894..e9a8794bc3a 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -147,11 +147,10 @@ static void localtime_3(struct xtm *r, time_t time) return; } -static bool xt_time_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +time_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct xt_time_info *info = matchinfo; unsigned int packet_time; @@ -216,9 +215,10 @@ static bool xt_time_match(const struct sk_buff *skb, return true; } -static bool xt_time_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool +time_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { struct xt_time_info *info = matchinfo; @@ -232,39 +232,39 @@ static bool xt_time_check(const char *tablename, const void *ip, return true; } -static struct xt_match xt_time_reg[] __read_mostly = { +static struct xt_match time_mt_reg[] __read_mostly = { { .name = "time", .family = AF_INET, - .match = xt_time_match, + .match = time_mt, .matchsize = sizeof(struct xt_time_info), - .checkentry = xt_time_check, + .checkentry = time_mt_check, .me = THIS_MODULE, }, { .name = "time", .family = AF_INET6, - .match = xt_time_match, + .match = time_mt, .matchsize = sizeof(struct xt_time_info), - .checkentry = xt_time_check, + .checkentry = time_mt_check, .me = THIS_MODULE, }, }; -static int __init xt_time_init(void) +static int __init time_mt_init(void) { - return xt_register_matches(xt_time_reg, ARRAY_SIZE(xt_time_reg)); + return xt_register_matches(time_mt_reg, ARRAY_SIZE(time_mt_reg)); } -static void __exit xt_time_exit(void) +static void __exit time_mt_exit(void) { - xt_unregister_matches(xt_time_reg, ARRAY_SIZE(xt_time_reg)); + xt_unregister_matches(time_mt_reg, ARRAY_SIZE(time_mt_reg)); } -module_init(xt_time_init); -module_exit(xt_time_exit); +module_init(time_mt_init); +module_exit(time_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@computergmbh.de>"); -MODULE_DESCRIPTION("netfilter time match"); +MODULE_DESCRIPTION("Xtables: time-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_time"); MODULE_ALIAS("ip6t_time"); diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index af75b8c3f20..9b8ed390a8e 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -88,11 +88,10 @@ static bool u32_match_it(const struct xt_u32 *data, return true; } -static bool u32_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +u32_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct xt_u32 *data = matchinfo; bool ret; @@ -101,37 +100,37 @@ static bool u32_match(const struct sk_buff *skb, return ret ^ data->invert; } -static struct xt_match u32_reg[] __read_mostly = { +static struct xt_match u32_mt_reg[] __read_mostly = { { .name = "u32", .family = AF_INET, - .match = u32_match, + .match = u32_mt, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }, { .name = "u32", .family = AF_INET6, - .match = u32_match, + .match = u32_mt, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }, }; -static int __init xt_u32_init(void) +static int __init u32_mt_init(void) { - return xt_register_matches(u32_reg, ARRAY_SIZE(u32_reg)); + return xt_register_matches(u32_mt_reg, ARRAY_SIZE(u32_mt_reg)); } -static void __exit xt_u32_exit(void) +static void __exit u32_mt_exit(void) { - xt_unregister_matches(u32_reg, ARRAY_SIZE(u32_reg)); + xt_unregister_matches(u32_mt_reg, ARRAY_SIZE(u32_mt_reg)); } -module_init(xt_u32_init); -module_exit(xt_u32_exit); +module_init(u32_mt_init); +module_exit(u32_mt_exit); MODULE_AUTHOR("Jan Engelhardt <jengelh@computergmbh.de>"); -MODULE_DESCRIPTION("netfilter u32 match module"); +MODULE_DESCRIPTION("Xtables: arbitrary byte matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_u32"); MODULE_ALIAS("ip6t_u32"); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index de3988ba1f4..6b178e1247b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -156,7 +156,7 @@ static void netlink_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { - printk("Freeing alive netlink socket %p\n", sk); + printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); @@ -164,13 +164,14 @@ static void netlink_sock_destruct(struct sock *sk) BUG_TRAP(!nlk_sk(sk)->groups); } -/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. - * Look, when several writers sleep and reader wakes them up, all but one +/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on + * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. */ static void netlink_table_grab(void) + __acquires(nl_table_lock) { write_lock_irq(&nl_table_lock); @@ -178,7 +179,7 @@ static void netlink_table_grab(void) DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); - for(;;) { + for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; @@ -192,13 +193,14 @@ static void netlink_table_grab(void) } } -static __inline__ void netlink_table_ungrab(void) +static void netlink_table_ungrab(void) + __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } -static __inline__ void +static inline void netlink_lock_table(void) { /* read_lock() synchronizes us to netlink_table_grab */ @@ -208,14 +210,15 @@ netlink_lock_table(void) read_unlock(&nl_table_lock); } -static __inline__ void +static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } -static __inline__ struct sock *netlink_lookup(struct net *net, int protocol, u32 pid) +static inline struct sock *netlink_lookup(struct net *net, int protocol, + u32 pid) { struct nl_pid_hash *hash = &nl_table[protocol].hash; struct hlist_head *head; @@ -236,13 +239,14 @@ found: return sk; } -static inline struct hlist_head *nl_pid_hash_alloc(size_t size) +static inline struct hlist_head *nl_pid_hash_zalloc(size_t size) { if (size <= PAGE_SIZE) - return kmalloc(size, GFP_ATOMIC); + return kzalloc(size, GFP_ATOMIC); else return (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, get_order(size)); + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); } static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) @@ -271,11 +275,10 @@ static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) size *= 2; } - table = nl_pid_hash_alloc(size); + table = nl_pid_hash_zalloc(size); if (!table) return 0; - memset(table, 0, size); otable = hash->table; hash->table = table; hash->mask = mask; @@ -428,7 +431,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol) if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; - if (protocol<0 || protocol >= MAX_LINKS) + if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; netlink_lock_table(); @@ -445,7 +448,8 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol) cb_mutex = nl_table[protocol].cb_mutex; netlink_unlock_table(); - if ((err = __netlink_create(net, sock, cb_mutex, protocol)) < 0) + err = __netlink_create(net, sock, cb_mutex, protocol); + if (err < 0) goto out_module; nlk = nlk_sk(sock->sk); @@ -494,9 +498,12 @@ static int netlink_release(struct socket *sock) netlink_table_grab(); if (netlink_is_kernel(sk)) { - kfree(nl_table[sk->sk_protocol].listeners); - nl_table[sk->sk_protocol].module = NULL; - nl_table[sk->sk_protocol].registered = 0; + BUG_ON(nl_table[sk->sk_protocol].registered == 0); + if (--nl_table[sk->sk_protocol].registered == 0) { + kfree(nl_table[sk->sk_protocol].listeners); + nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].registered = 0; + } } else if (nlk->subscriptions) netlink_update_listeners(sk); netlink_table_ungrab(); @@ -590,7 +597,7 @@ static int netlink_realloc_groups(struct sock *sk) err = -ENOMEM; goto out_unlock; } - memset((char*)new_groups + NLGRPSZ(nlk->ngroups), 0, + memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; @@ -600,7 +607,8 @@ static int netlink_realloc_groups(struct sock *sk) return err; } -static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int netlink_bind(struct socket *sock, struct sockaddr *addr, + int addr_len) { struct sock *sk = sock->sk; struct net *net = sk->sk_net; @@ -651,7 +659,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr; + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (addr->sa_family == AF_UNSPEC) { sk->sk_state = NETLINK_UNCONNECTED; @@ -678,11 +686,12 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, return err; } -static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) +static int netlink_getname(struct socket *sock, struct sockaddr *addr, + int *addr_len, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *nladdr=(struct sockaddr_nl *)addr; + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; @@ -885,6 +894,7 @@ retry: return netlink_sendskb(sk, skb); } +EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { @@ -905,7 +915,8 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) } EXPORT_SYMBOL_GPL(netlink_has_listeners); -static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) +static inline int netlink_broadcast_deliver(struct sock *sk, + struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); @@ -1026,6 +1037,7 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, return -ENOBUFS; return -ESRCH; } +EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; @@ -1182,7 +1194,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - struct sockaddr_nl *addr=msg->msg_name; + struct sockaddr_nl *addr = msg->msg_name; u32 dst_pid; u32 dst_group; struct sk_buff *skb; @@ -1221,7 +1233,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; err = -ENOBUFS; skb = alloc_skb(len, GFP_KERNEL); - if (skb==NULL) + if (skb == NULL) goto out; NETLINK_CB(skb).pid = nlk->pid; @@ -1237,7 +1249,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, */ err = -EFAULT; - if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) { + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { kfree_skb(skb); goto out; } @@ -1276,8 +1288,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, copied = 0; - skb = skb_recv_datagram(sk,flags,noblock,&err); - if (skb==NULL) + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (skb == NULL) goto out; msg->msg_namelen = 0; @@ -1292,7 +1304,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (msg->msg_name) { - struct sockaddr_nl *addr = (struct sockaddr_nl*)msg->msg_name; + struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name; addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).pid; @@ -1344,7 +1356,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, BUG_ON(!nl_table); - if (unit<0 || unit>=MAX_LINKS) + if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) @@ -1380,9 +1392,13 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups, nl_table[unit].registered = 1; } else { kfree(listeners); + nl_table[unit].registered++; } netlink_table_ungrab(); + /* Do not hold an extra referrence to a namespace as this socket is + * internal to a namespace and does not prevent it to stop. */ + put_net(net); return sk; out_sock_release: @@ -1390,6 +1406,30 @@ out_sock_release: sock_release(sock); return NULL; } +EXPORT_SYMBOL(netlink_kernel_create); + + +void +netlink_kernel_release(struct sock *sk) +{ + if (sk == NULL || sk->sk_socket == NULL) + return; + + /* + * Last sock_put should drop referrence to sk->sk_net. It has already + * been dropped in netlink_kernel_create. Taking referrence to stopping + * namespace is not an option. + * Take referrence to a socket to remove it from netlink lookup table + * _alive_ and after that destroy it in the context of init_net. + */ + sock_hold(sk); + sock_release(sk->sk_socket); + + sk->sk_net = get_net(&init_net); + sock_put(sk); +} +EXPORT_SYMBOL(netlink_kernel_release); + /** * netlink_change_ngroups - change number of multicast groups @@ -1461,6 +1501,7 @@ void netlink_set_nonroot(int protocol, unsigned int flags) if ((unsigned int)protocol < MAX_LINKS) nl_table[protocol].nl_nonroot = flags; } +EXPORT_SYMBOL(netlink_set_nonroot); static void netlink_destroy_callback(struct netlink_callback *cb) { @@ -1529,8 +1570,9 @@ errout: int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, struct nlmsghdr *nlh, - int (*dump)(struct sk_buff *skb, struct netlink_callback*), - int (*done)(struct netlink_callback*)) + int (*dump)(struct sk_buff *skb, + struct netlink_callback *), + int (*done)(struct netlink_callback *)) { struct netlink_callback *cb; struct sock *sk; @@ -1571,6 +1613,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, */ return -EINTR; } +EXPORT_SYMBOL(netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) { @@ -1605,6 +1648,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); } +EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *)) @@ -1638,7 +1682,7 @@ ack: netlink_ack(skb, nlh, err); skip: - msglen = NLMSG_ALIGN(nlh->nlmsg_len); + msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); @@ -1646,6 +1690,7 @@ skip: return 0; } +EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message @@ -1678,10 +1723,11 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid, return err; } +EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { - struct net *net; + struct seq_net_private p; int link; int hash_idx; }; @@ -1694,12 +1740,12 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) struct hlist_node *node; loff_t off = 0; - for (i=0; i<MAX_LINKS; i++) { + for (i = 0; i < MAX_LINKS; i++) { struct nl_pid_hash *hash = &nl_table[i].hash; for (j = 0; j <= hash->mask; j++) { sk_for_each(s, node, &hash->table[j]) { - if (iter->net != s->sk_net) + if (iter->p.net != s->sk_net) continue; if (off == pos) { iter->link = i; @@ -1714,6 +1760,7 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) } static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(nl_table_lock) { read_lock(&nl_table_lock); return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -1734,7 +1781,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) s = v; do { s = sk_next(s); - } while (s && (iter->net != s->sk_net)); + } while (s && (iter->p.net != s->sk_net)); if (s) return s; @@ -1746,7 +1793,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) for (; j <= hash->mask; j++) { s = sk_head(&hash->table[j]); - while (s && (iter->net != s->sk_net)) + while (s && (iter->p.net != s->sk_net)) s = sk_next(s); if (s) { iter->link = i; @@ -1762,6 +1809,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void netlink_seq_stop(struct seq_file *seq, void *v) + __releases(nl_table_lock) { read_unlock(&nl_table_lock); } @@ -1802,27 +1850,8 @@ static const struct seq_operations netlink_seq_ops = { static int netlink_seq_open(struct inode *inode, struct file *file) { - struct nl_seq_iter *iter; - - iter = __seq_open_private(file, &netlink_seq_ops, sizeof(*iter)); - if (!iter) - return -ENOMEM; - - iter->net = get_proc_net(inode); - if (!iter->net) { - seq_release_private(inode, file); - return -ENXIO; - } - - return 0; -} - -static int netlink_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct nl_seq_iter *iter = seq->private; - put_net(iter->net); - return seq_release_private(inode, file); + return seq_open_net(inode, file, &netlink_seq_ops, + sizeof(struct nl_seq_iter)); } static const struct file_operations netlink_seq_fops = { @@ -1830,7 +1859,7 @@ static const struct file_operations netlink_seq_fops = { .open = netlink_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = netlink_seq_release, + .release = seq_release_net, }; #endif @@ -1839,11 +1868,13 @@ int netlink_register_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&netlink_chain, nb); } +EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&netlink_chain, nb); } +EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, @@ -1922,7 +1953,7 @@ static int __init netlink_proto_init(void) for (i = 0; i < MAX_LINKS; i++) { struct nl_pid_hash *hash = &nl_table[i].hash; - hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table)); + hash->table = nl_pid_hash_zalloc(1 * sizeof(*hash->table)); if (!hash->table) { while (i-- > 0) nl_pid_hash_free(nl_table[i].hash.table, @@ -1930,7 +1961,6 @@ static int __init netlink_proto_init(void) kfree(nl_table); goto panic; } - memset(hash->table, 0, 1 * sizeof(*hash->table)); hash->max_shift = order; hash->shift = 0; hash->mask = 0; @@ -1948,14 +1978,3 @@ panic: } core_initcall(netlink_proto_init); - -EXPORT_SYMBOL(netlink_ack); -EXPORT_SYMBOL(netlink_rcv_skb); -EXPORT_SYMBOL(netlink_broadcast); -EXPORT_SYMBOL(netlink_dump_start); -EXPORT_SYMBOL(netlink_kernel_create); -EXPORT_SYMBOL(netlink_register_notifier); -EXPORT_SYMBOL(netlink_set_nonroot); -EXPORT_SYMBOL(netlink_unicast); -EXPORT_SYMBOL(netlink_unregister_notifier); -EXPORT_SYMBOL(nlmsg_notify); diff --git a/net/netlink/attr.c b/net/netlink/attr.c index ec39d12c242..feb326f4a75 100644 --- a/net/netlink/attr.c +++ b/net/netlink/attr.c @@ -430,6 +430,24 @@ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) return 0; } +/** + * nla_append - Add a netlink attribute without header or padding + * @skb: socket buffer to add attribute to + * @attrlen: length of attribute payload + * @data: head of attribute payload + * + * Returns -1 if the tailroom of the skb is insufficient to store + * the attribute payload. + */ +int nla_append(struct sk_buff *skb, int attrlen, const void *data) +{ + if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) + return -1; + + memcpy(skb_put(skb, attrlen), data, attrlen); + return 0; +} + EXPORT_SYMBOL(nla_validate); EXPORT_SYMBOL(nla_parse); EXPORT_SYMBOL(nla_find); @@ -445,3 +463,4 @@ EXPORT_SYMBOL(nla_put_nohdr); EXPORT_SYMBOL(nla_memcpy); EXPORT_SYMBOL(nla_memcmp); EXPORT_SYMBOL(nla_strcmp); +EXPORT_SYMBOL(nla_append); diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 6cfaad952c6..1cb98e88f5e 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -40,21 +40,10 @@ void nr_init_timers(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); - init_timer(&nr->t1timer); - nr->t1timer.data = (unsigned long)sk; - nr->t1timer.function = &nr_t1timer_expiry; - - init_timer(&nr->t2timer); - nr->t2timer.data = (unsigned long)sk; - nr->t2timer.function = &nr_t2timer_expiry; - - init_timer(&nr->t4timer); - nr->t4timer.data = (unsigned long)sk; - nr->t4timer.function = &nr_t4timer_expiry; - - init_timer(&nr->idletimer); - nr->idletimer.data = (unsigned long)sk; - nr->idletimer.function = &nr_idletimer_expiry; + setup_timer(&nr->t1timer, nr_t1timer_expiry, (unsigned long)sk); + setup_timer(&nr->t2timer, nr_t2timer_expiry, (unsigned long)sk); + setup_timer(&nr->t4timer, nr_t4timer_expiry, (unsigned long)sk); + setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk); /* initialized by sock_init_data */ sk->sk_timer.data = (unsigned long)sk; diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c index 2ea68da01fb..34c96c9674d 100644 --- a/net/netrom/sysctl_net_netrom.c +++ b/net/netrom/sysctl_net_netrom.c @@ -170,29 +170,15 @@ static ctl_table nr_table[] = { { .ctl_name = 0 } }; -static ctl_table nr_dir_table[] = { - { - .ctl_name = NET_NETROM, - .procname = "netrom", - .mode = 0555, - .child = nr_table - }, - { .ctl_name = 0 } -}; - -static ctl_table nr_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = nr_dir_table - }, - { .ctl_name = 0 } +static struct ctl_path nr_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "netrom", .ctl_name = NET_NETROM, }, + { } }; void __init nr_register_sysctl(void) { - nr_table_header = register_sysctl_table(nr_root_table); + nr_table_header = register_sysctl_paths(nr_path, nr_table); } void nr_unregister_sysctl(void) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8a7807dbba0..b8b827c7062 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -135,10 +135,6 @@ dev->hard_header == NULL (ll header is added by device, we cannot control it) packet classifier depends on it. */ -/* List of all packet sockets. */ -static HLIST_HEAD(packet_sklist); -static DEFINE_RWLOCK(packet_sklist_lock); - /* Private packet socket structures. */ struct packet_mclist @@ -246,9 +242,6 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct struct sock *sk; struct sockaddr_pkt *spkt; - if (dev->nd_net != &init_net) - goto out; - /* * When we registered the protocol we saved the socket in the data * field for just this event. @@ -270,6 +263,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct if (skb->pkt_type == PACKET_LOOPBACK) goto out; + if (dev->nd_net != sk->sk_net) + goto out; + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto oom; @@ -341,7 +337,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, */ saddr->spkt_device[13] = 0; - dev = dev_get_by_name(&init_net, saddr->spkt_device); + dev = dev_get_by_name(sk->sk_net, saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; @@ -449,15 +445,15 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet int skb_len = skb->len; unsigned int snaplen, res; - if (dev->nd_net != &init_net) - goto drop; - if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); + if (dev->nd_net != sk->sk_net) + goto drop; + skb->dev = dev; if (dev->header_ops) { @@ -566,15 +562,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe struct sk_buff *copy_skb = NULL; struct timeval tv; - if (dev->nd_net != &init_net) - goto drop; - if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); + if (dev->nd_net != sk->sk_net) + goto drop; + if (dev->header_ops) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); @@ -732,7 +728,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, } - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(sk->sk_net, ifindex); err = -ENXIO; if (dev == NULL) goto out_unlock; @@ -799,15 +795,17 @@ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; + struct net *net; if (!sk) return 0; + net = sk->sk_net; po = pkt_sk(sk); - write_lock_bh(&packet_sklist_lock); + write_lock_bh(&net->packet.sklist_lock); sk_del_node_init(sk); - write_unlock_bh(&packet_sklist_lock); + write_unlock_bh(&net->packet.sklist_lock); /* * Unhook packet receive handler. @@ -916,7 +914,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int add return -EINVAL; strlcpy(name,uaddr->sa_data,sizeof(name)); - dev = dev_get_by_name(&init_net, name); + dev = dev_get_by_name(sk->sk_net, name); if (dev) { err = packet_do_bind(sk, dev, pkt_sk(sk)->num); dev_put(dev); @@ -943,7 +941,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len if (sll->sll_ifindex) { err = -ENODEV; - dev = dev_get_by_index(&init_net, sll->sll_ifindex); + dev = dev_get_by_index(sk->sk_net, sll->sll_ifindex); if (dev == NULL) goto out; } @@ -972,9 +970,6 @@ static int packet_create(struct net *net, struct socket *sock, int protocol) __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; - if (net != &init_net) - return -EAFNOSUPPORT; - if (!capable(CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && @@ -1020,9 +1015,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol) po->running = 1; } - write_lock_bh(&packet_sklist_lock); - sk_add_node(sk, &packet_sklist); - write_unlock_bh(&packet_sklist_lock); + write_lock_bh(&net->packet.sklist_lock); + sk_add_node(sk, &net->packet.sklist); + write_unlock_bh(&net->packet.sklist_lock); return(0); out: return err; @@ -1140,7 +1135,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - dev = dev_get_by_index(&init_net, pkt_sk(sk)->ifindex); + dev = dev_get_by_index(sk->sk_net, pkt_sk(sk)->ifindex); if (dev) { strlcpy(uaddr->sa_data, dev->name, 15); dev_put(dev); @@ -1165,7 +1160,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; sll->sll_protocol = po->num; - dev = dev_get_by_index(&init_net, po->ifindex); + dev = dev_get_by_index(sk->sk_net, po->ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; @@ -1217,7 +1212,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) rtnl_lock(); err = -ENODEV; - dev = __dev_get_by_index(&init_net, mreq->mr_ifindex); + dev = __dev_get_by_index(sk->sk_net, mreq->mr_ifindex); if (!dev) goto done; @@ -1271,7 +1266,7 @@ static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; - dev = dev_get_by_index(&init_net, ml->ifindex); + dev = dev_get_by_index(sk->sk_net, ml->ifindex); if (dev) { packet_dev_mc(dev, ml, -1); dev_put(dev); @@ -1299,7 +1294,7 @@ static void packet_flush_mclist(struct sock *sk) struct net_device *dev; po->mclist = ml->next; - if ((dev = dev_get_by_index(&init_net, ml->ifindex)) != NULL) { + if ((dev = dev_get_by_index(sk->sk_net, ml->ifindex)) != NULL) { packet_dev_mc(dev, ml, -1); dev_put(dev); } @@ -1455,12 +1450,10 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void struct sock *sk; struct hlist_node *node; struct net_device *dev = data; + struct net *net = dev->nd_net; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - - read_lock(&packet_sklist_lock); - sk_for_each(sk, node, &packet_sklist) { + read_lock(&net->packet.sklist_lock); + sk_for_each(sk, node, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { @@ -1499,7 +1492,7 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void break; } } - read_unlock(&packet_sklist_lock); + read_unlock(&net->packet.sklist_lock); return NOTIFY_DONE; } @@ -1547,6 +1540,8 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: + if (sk->sk_net != &init_net) + return -ENOIOCTLCMD; return inet_dgram_ops.ioctl(sock, cmd, arg); #endif @@ -1862,12 +1857,12 @@ static struct notifier_block packet_netdev_notifier = { }; #ifdef CONFIG_PROC_FS -static inline struct sock *packet_seq_idx(loff_t off) +static inline struct sock *packet_seq_idx(struct net *net, loff_t off) { struct sock *s; struct hlist_node *node; - sk_for_each(s, node, &packet_sklist) { + sk_for_each(s, node, &net->packet.sklist) { if (!off--) return s; } @@ -1875,22 +1870,27 @@ static inline struct sock *packet_seq_idx(loff_t off) } static void *packet_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(seq_file_net(seq)->packet.sklist_lock) { - read_lock(&packet_sklist_lock); - return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN; + struct net *net = seq_file_net(seq); + read_lock(&net->packet.sklist_lock); + return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN; } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); ++*pos; return (v == SEQ_START_TOKEN) - ? sk_head(&packet_sklist) + ? sk_head(&net->packet.sklist) : sk_next((struct sock*)v) ; } static void packet_seq_stop(struct seq_file *seq, void *v) + __releases(seq_file_net(seq)->packet.sklist_lock) { - read_unlock(&packet_sklist_lock); + struct net *net = seq_file_net(seq); + read_unlock(&net->packet.sklist_lock); } static int packet_seq_show(struct seq_file *seq, void *v) @@ -1926,7 +1926,8 @@ static const struct seq_operations packet_seq_ops = { static int packet_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &packet_seq_ops); + return seq_open_net(inode, file, &packet_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations packet_seq_fops = { @@ -1934,15 +1935,37 @@ static const struct file_operations packet_seq_fops = { .open = packet_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; #endif +static int packet_net_init(struct net *net) +{ + rwlock_init(&net->packet.sklist_lock); + INIT_HLIST_HEAD(&net->packet.sklist); + + if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) + return -ENOMEM; + + return 0; +} + +static void packet_net_exit(struct net *net) +{ + proc_net_remove(net, "packet"); +} + +static struct pernet_operations packet_net_ops = { + .init = packet_net_init, + .exit = packet_net_exit, +}; + + static void __exit packet_exit(void) { - proc_net_remove(&init_net, "packet"); unregister_netdevice_notifier(&packet_netdev_notifier); + unregister_pernet_subsys(&packet_net_ops); sock_unregister(PF_PACKET); proto_unregister(&packet_proto); } @@ -1955,8 +1978,8 @@ static int __init packet_init(void) goto out; sock_register(&packet_family_ops); + register_pernet_subsys(&packet_net_ops); register_netdevice_notifier(&packet_netdev_notifier); - proc_net_fops_create(&init_net, "packet", 0, &packet_seq_fops); out: return rc; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ed2d65cd801..4a31a81059a 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -116,7 +116,7 @@ int rosecmp(rose_address *addr1, rose_address *addr2) */ int rosecmpm(rose_address *addr1, rose_address *addr2, unsigned short mask) { - int i, j; + unsigned int i, j; if (mask > 10) return 1; @@ -345,10 +345,9 @@ void rose_destroy_socket(struct sock *sk) if (atomic_read(&sk->sk_wmem_alloc) || atomic_read(&sk->sk_rmem_alloc)) { /* Defer: outstanding buffers */ - init_timer(&sk->sk_timer); + setup_timer(&sk->sk_timer, rose_destroy_timer, + (unsigned long)sk); sk->sk_timer.expires = jiffies + 10 * HZ; - sk->sk_timer.function = rose_destroy_timer; - sk->sk_timer.data = (unsigned long)sk; add_timer(&sk->sk_timer); } else sock_put(sk); @@ -974,8 +973,8 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros */ memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); - len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; - len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; + len = (((skb->data[3] >> 4) & 0x0F) + 1) >> 1; + len += (((skb->data[3] >> 0) & 0x0F) + 1) >> 1; if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); return 0; @@ -1378,6 +1377,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #ifdef CONFIG_PROC_FS static void *rose_info_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_list_lock) { int i; struct sock *s; @@ -1405,6 +1405,7 @@ static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos) } static void rose_info_stop(struct seq_file *seq, void *v) + __releases(rose_list_lock) { spin_unlock_bh(&rose_list_lock); } diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 4ee0879d354..7f7fcb46b4f 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -182,7 +182,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety break; } if (atomic_read(&sk->sk_rmem_alloc) > - (sk->sk_rcvbuf / 2)) + (sk->sk_rcvbuf >> 1)) rose->condition |= ROSE_COND_OWN_RX_BUSY; } /* diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 540c0f26ffe..fb9359fb235 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -994,8 +994,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) goto out; } - len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; - len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; + len = (((skb->data[3] >> 4) & 0x0F) + 1) >> 1; + len += (((skb->data[3] >> 0) & 0x0F) + 1) >> 1; memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); @@ -1068,6 +1068,7 @@ out: #ifdef CONFIG_PROC_FS static void *rose_node_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_neigh_list_lock) { struct rose_node *rose_node; int i = 1; @@ -1091,6 +1092,7 @@ static void *rose_node_next(struct seq_file *seq, void *v, loff_t *pos) } static void rose_node_stop(struct seq_file *seq, void *v) + __releases(rose_neigh_list_lock) { spin_unlock_bh(&rose_neigh_list_lock); } @@ -1144,6 +1146,7 @@ const struct file_operations rose_nodes_fops = { }; static void *rose_neigh_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_neigh_list_lock) { struct rose_neigh *rose_neigh; int i = 1; @@ -1167,6 +1170,7 @@ static void *rose_neigh_next(struct seq_file *seq, void *v, loff_t *pos) } static void rose_neigh_stop(struct seq_file *seq, void *v) + __releases(rose_neigh_list_lock) { spin_unlock_bh(&rose_neigh_list_lock); } @@ -1227,6 +1231,7 @@ const struct file_operations rose_neigh_fops = { static void *rose_route_start(struct seq_file *seq, loff_t *pos) + __acquires(rose_route_list_lock) { struct rose_route *rose_route; int i = 1; @@ -1250,6 +1255,7 @@ static void *rose_route_next(struct seq_file *seq, void *v, loff_t *pos) } static void rose_route_stop(struct seq_file *seq, void *v) + __releases(rose_route_list_lock) { spin_unlock_bh(&rose_route_list_lock); } diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c index 455b0555a66..20be3485a97 100644 --- a/net/rose/sysctl_net_rose.c +++ b/net/rose/sysctl_net_rose.c @@ -138,29 +138,15 @@ static ctl_table rose_table[] = { { .ctl_name = 0 } }; -static ctl_table rose_dir_table[] = { - { - .ctl_name = NET_ROSE, - .procname = "rose", - .mode = 0555, - .child = rose_table - }, - { .ctl_name = 0 } -}; - -static ctl_table rose_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = rose_dir_table - }, - { .ctl_name = 0 } +static struct ctl_path rose_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "rose", .ctl_name = NET_ROSE, }, + { } }; void __init rose_register_sysctl(void) { - rose_table_header = register_sysctl_table(rose_root_table); + rose_table_header = register_sysctl_paths(rose_path, rose_table); } void rose_unregister_sysctl(void) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index d6389450c4b..5e82f1c0afb 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (rxrpc_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); - sk_wake_async(sk, 2, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } read_unlock(&sk->sk_callback_lock); } diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index d6667f7bc85..3869a586675 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -651,7 +651,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, candidate->trans = trans; candidate->epoch = hdr->epoch; - candidate->cid = hdr->cid & __constant_cpu_to_be32(RXRPC_CIDMASK); + candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK); candidate->service_id = hdr->serviceId; candidate->security_ix = hdr->securityIndex; candidate->in_clientflag = RXRPC_CLIENT_INITIATED; diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 91b5bbb003e..f8a699e9296 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -20,6 +20,7 @@ #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> +#include <net/udp.h> #include "ar-internal.h" unsigned long rxrpc_ack_timeout = 1; @@ -594,7 +595,7 @@ dead_call: read_unlock_bh(&conn->lock); if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.seq == __constant_cpu_to_be32(1)) { + sp->hdr.seq == cpu_to_be32(1)) { _debug("incoming call"); skb_queue_tail(&conn->trans->local->accept_queue, skb); rxrpc_queue_work(&conn->trans->local->acceptor); @@ -707,10 +708,13 @@ void rxrpc_data_ready(struct sock *sk, int count) if (skb_checksum_complete(skb)) { rxrpc_free_skb(skb); rxrpc_put_local(local); + UDP_INC_STATS_BH(UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; } + UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, 0); + /* the socket buffer we have is owned by UDP, with UDP's data all over * it, but we really want our own */ skb_orphan(skb); @@ -770,7 +774,7 @@ cant_route_call: _debug("can't route call"); if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == __constant_cpu_to_be32(1)) { + if (sp->hdr.seq == cpu_to_be32(1)) { _debug("first packet"); skb_queue_tail(&local->accept_queue, skb); rxrpc_queue_work(&local->acceptor); diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c index 90fa107a8af..2abe2081a5e 100644 --- a/net/rxrpc/ar-peer.c +++ b/net/rxrpc/ar-peer.c @@ -57,7 +57,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) BUG(); } - ret = ip_route_output_key(&rt, &fl); + ret = ip_route_output_key(&init_net, &rt, &fl); if (ret < 0) { _leave(" [route err %d]", ret); return; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 8e69d699383..f48434adb7c 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -284,7 +284,7 @@ static int rxkad_secure_packet(const struct rxrpc_call *call, /* calculate the security checksum */ x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); - x |= sp->hdr.seq & __constant_cpu_to_be32(0x3fffffff); + x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); tmpbuf.x[0] = sp->hdr.callNumber; tmpbuf.x[1] = x; @@ -518,7 +518,7 @@ static int rxkad_verify_packet(const struct rxrpc_call *call, /* validate the security checksum */ x = htonl(call->channel << (32 - RXRPC_CIDSHIFT)); - x |= sp->hdr.seq & __constant_cpu_to_be32(0x3fffffff); + x |= sp->hdr.seq & cpu_to_be32(0x3fffffff); tmpbuf.x[0] = call->call_id; tmpbuf.x[1] = x; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 9c15c4888d1..87af7c913d8 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -198,6 +198,7 @@ config NET_SCH_NETEM config NET_SCH_INGRESS tristate "Ingress Qdisc" + depends on NET_CLS_ACT || NETFILTER ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -445,7 +446,6 @@ config NET_ACT_IPT config NET_ACT_NAT tristate "Stateless NAT" depends on NET_CLS_ACT - select NETFILTER ---help--- Say Y here to do stateless NAT on IPv4 packets. You should use netfilter for NAT unless you know what you are doing. @@ -476,15 +476,6 @@ config NET_ACT_SIMP To compile this code as a module, choose M here: the module will be called simple. -config NET_CLS_POLICE - bool "Traffic Policing (obsolete)" - select NET_CLS_ACT - select NET_ACT_POLICE - ---help--- - Say Y here if you want to do traffic policing, i.e. strict - bandwidth limiting. This option is obsolete and just selects - the option replacing it. It will be removed in the future. - config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 72cdb0fade2..0b8eb235bc1 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -18,6 +18,9 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> +#include <linux/err.h> +#include <net/net_namespace.h> +#include <net/sock.h> #include <net/sch_generic.h> #include <net/act_api.h> #include <net/netlink.h> @@ -66,7 +69,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, { struct tcf_common *p; int err = 0, index = -1,i = 0, s_i = 0, n_i = 0; - struct rtattr *r ; + struct nlattr *nest; read_lock_bh(hinfo->lock); @@ -81,15 +84,17 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, continue; a->priv = p; a->order = n_i; - r = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, a->order, 0, NULL); + + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; err = tcf_action_dump_1(skb, a, 0, 0); if (err < 0) { index--; - nlmsg_trim(skb, r); + nlmsg_trim(skb, nest); goto done; } - r->rta_len = skb_tail_pointer(skb) - (u8 *)r; + nla_nest_end(skb, nest); n_i++; if (n_i >= TCA_ACT_MAX_PRIO) goto done; @@ -101,8 +106,8 @@ done: cb->args[0] += n_i; return n_i; -rtattr_failure: - nlmsg_trim(skb, r); +nla_put_failure: + nla_nest_cancel(skb, nest); goto done; } @@ -110,12 +115,13 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, struct tcf_hashinfo *hinfo) { struct tcf_common *p, *s_p; - struct rtattr *r ; + struct nlattr *nest; int i= 0, n_i = 0; - r = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, a->order, 0, NULL); - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); for (i = 0; i < (hinfo->hmask + 1); i++) { p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; @@ -127,12 +133,12 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, p = s_p; } } - RTA_PUT(skb, TCA_FCNT, 4, &n_i); - r->rta_len = skb_tail_pointer(skb) - (u8 *)r; + NLA_PUT_U32(skb, TCA_FCNT, n_i); + nla_nest_end(skb, nest); return n_i; -rtattr_failure: - nlmsg_trim(skb, r); +nla_put_failure: + nla_nest_cancel(skb, nest); return -EINVAL; } @@ -209,7 +215,7 @@ struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind, } EXPORT_SYMBOL(tcf_hash_check); -struct tcf_common *tcf_hash_create(u32 index, struct rtattr *est, struct tc_action *a, int size, int bind, u32 *idx_gen, struct tcf_hashinfo *hinfo) +struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, int size, int bind, u32 *idx_gen, struct tcf_hashinfo *hinfo) { struct tcf_common *p = kzalloc(size, GFP_KERNEL); @@ -261,6 +267,7 @@ int tcf_register_action(struct tc_action_ops *act) write_unlock(&act_mod_lock); return 0; } +EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act) { @@ -279,6 +286,7 @@ int tcf_unregister_action(struct tc_action_ops *act) write_unlock(&act_mod_lock); return err; } +EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) @@ -301,15 +309,15 @@ static struct tc_action_ops *tc_lookup_action_n(char *kind) return a; } -/* lookup by rtattr */ -static struct tc_action_ops *tc_lookup_action(struct rtattr *kind) +/* lookup by nlattr */ +static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { struct tc_action_ops *a = NULL; if (kind) { read_lock(&act_mod_lock); for (a = act_base; a; a = a->next) { - if (rtattr_strcmp(kind, a->kind) == 0) { + if (nla_strcmp(kind, a->kind) == 0) { if (!try_module_get(a->owner)) { read_unlock(&act_mod_lock); return NULL; @@ -375,6 +383,7 @@ repeat: exec_done: return ret; } +EXPORT_SYMBOL(tcf_action_exec); void tcf_action_destroy(struct tc_action *act, int bind) { @@ -409,73 +418,77 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); - struct rtattr *r; + struct nlattr *nest; if (a->ops == NULL || a->ops->dump == NULL) return err; - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); + NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); if (tcf_action_copy_stats(skb, a, 0)) - goto rtattr_failure; - r = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + goto nla_put_failure; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { - r->rta_len = skb_tail_pointer(skb) - (u8 *)r; + nla_nest_end(skb, nest); return err; } -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } +EXPORT_SYMBOL(tcf_action_dump_1); int tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { struct tc_action *a; int err = -EINVAL; - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *r ; + struct nlattr *nest; while ((a = act) != NULL) { - r = (struct rtattr *)skb_tail_pointer(skb); act = a->next; - RTA_PUT(skb, a->order, 0, NULL); + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; err = tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; - r->rta_len = skb_tail_pointer(skb) - (u8 *)r; + nla_nest_end(skb, nest); } return 0; -rtattr_failure: +nla_put_failure: err = -EINVAL; errout: - nlmsg_trim(skb, b); + nla_nest_cancel(skb, nest); return err; } -struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, - char *name, int ovr, int bind, int *err) +struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind) { struct tc_action *a; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; - struct rtattr *tb[TCA_ACT_MAX+1]; - struct rtattr *kind; - - *err = -EINVAL; + struct nlattr *tb[TCA_ACT_MAX+1]; + struct nlattr *kind; + int err; if (name == NULL) { - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) goto err_out; - kind = tb[TCA_ACT_KIND-1]; + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; if (kind == NULL) goto err_out; - if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) goto err_out; } else { + err = -EINVAL; if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) goto err_out; } @@ -496,36 +509,35 @@ struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, * indicate this using -EAGAIN. */ if (a_o != NULL) { - *err = -EAGAIN; + err = -EAGAIN; goto err_mod; } #endif - *err = -ENOENT; + err = -ENOENT; goto err_out; } - *err = -ENOMEM; + err = -ENOMEM; a = kzalloc(sizeof(*a), GFP_KERNEL); if (a == NULL) goto err_mod; /* backward compatibility for policer */ if (name == NULL) - *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind); + err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind); else - *err = a_o->init(rta, est, a, ovr, bind); - if (*err < 0) + err = a_o->init(nla, est, a, ovr, bind); + if (err < 0) goto err_free; /* module count goes up only when brand new policy is created if it exists and is only bound to in a_o->init() then ACT_P_CREATED is not returned (a zero is). */ - if (*err != ACT_P_CREATED) + if (err != ACT_P_CREATED) module_put(a_o->owner); a->ops = a_o; - *err = 0; return a; err_free: @@ -533,26 +545,26 @@ err_free: err_mod: module_put(a_o->owner); err_out: - return NULL; + return ERR_PTR(err); } -struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est, - char *name, int ovr, int bind, int *err) +struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind) { - struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; struct tc_action *head = NULL, *act, *act_prev = NULL; + int err; int i; - if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) { - *err = -EINVAL; - return head; - } + err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (err < 0) + return ERR_PTR(err); - for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(tb[i], est, name, ovr, bind, err); - if (act == NULL) + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_init_1(tb[i], est, name, ovr, bind); + if (IS_ERR(act)) goto err; - act->order = i+1; + act->order = i; if (head == NULL) head = act; @@ -565,7 +577,7 @@ struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est, err: if (head != NULL) tcf_action_destroy(head, bind); - return NULL; + return act; } int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -619,7 +631,7 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, struct tcamsg *t; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); - struct rtattr *x; + struct nlattr *nest; nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); @@ -628,18 +640,19 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; if (tcf_action_dump(skb, a, bind, ref) < 0) - goto rtattr_failure; + goto nla_put_failure; - x->rta_len = skb_tail_pointer(skb) - (u8 *)x; + nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_failure: nlmsg_trim(skb, b); return -1; @@ -658,48 +671,51 @@ act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event) return -EINVAL; } - return rtnl_unicast(skb, pid); + return rtnl_unicast(skb, &init_net, pid); } static struct tc_action * -tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err) +tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) { - struct rtattr *tb[TCA_ACT_MAX+1]; + struct nlattr *tb[TCA_ACT_MAX+1]; struct tc_action *a; int index; + int err; - *err = -EINVAL; - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) - return NULL; + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) + goto err_out; - if (tb[TCA_ACT_INDEX - 1] == NULL || - RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index)) - return NULL; - index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]); + err = -EINVAL; + if (tb[TCA_ACT_INDEX] == NULL || + nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) + goto err_out; + index = nla_get_u32(tb[TCA_ACT_INDEX]); - *err = -ENOMEM; + err = -ENOMEM; a = kzalloc(sizeof(struct tc_action), GFP_KERNEL); if (a == NULL) - return NULL; + goto err_out; - *err = -EINVAL; - a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]); + err = -EINVAL; + a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); if (a->ops == NULL) goto err_free; if (a->ops->lookup == NULL) goto err_mod; - *err = -ENOENT; + err = -ENOENT; if (a->ops->lookup(a, index) == 0) goto err_mod; module_put(a->ops->owner); - *err = 0; return a; + err_mod: module_put(a->ops->owner); err_free: kfree(a); - return NULL; +err_out: + return ERR_PTR(err); } static void cleanup_a(struct tc_action *act) @@ -725,16 +741,16 @@ static struct tc_action *create_a(int i) return act; } -static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) +static int tca_action_flush(struct nlattr *nla, struct nlmsghdr *n, u32 pid) { struct sk_buff *skb; unsigned char *b; struct nlmsghdr *nlh; struct tcamsg *t; struct netlink_callback dcb; - struct rtattr *x; - struct rtattr *tb[TCA_ACT_MAX+1]; - struct rtattr *kind; + struct nlattr *nest; + struct nlattr *tb[TCA_ACT_MAX+1]; + struct nlattr *kind; struct tc_action *a = create_a(0); int err = -EINVAL; @@ -752,10 +768,12 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) b = skb_tail_pointer(skb); - if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + if (err < 0) goto err_out; - kind = tb[TCA_ACT_KIND-1]; + err = -EINVAL; + kind = tb[TCA_ACT_KIND]; a->ops = tc_lookup_action(kind); if (a->ops == NULL) goto err_out; @@ -766,26 +784,27 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); if (err < 0) - goto rtattr_failure; + goto nla_put_failure; - x->rta_len = skb_tail_pointer(skb) - (u8 *)x; + nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; module_put(a->ops->owner); kfree(a); - err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + err = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (err > 0) return 0; return err; -rtattr_failure: +nla_put_failure: nlmsg_failure: module_put(a->ops->owner); err_out: @@ -795,25 +814,28 @@ err_out: } static int -tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) +tca_action_gd(struct nlattr *nla, struct nlmsghdr *n, u32 pid, int event) { - int i, ret = 0; - struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; + int i, ret; + struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; struct tc_action *head = NULL, *act, *act_prev = NULL; - if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) - return -EINVAL; + ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + if (ret < 0) + return ret; if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { if (tb[0] != NULL && tb[1] == NULL) return tca_action_flush(tb[0], n, pid); } - for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(tb[i], n, pid, &ret); - if (act == NULL) + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_get_1(tb[i], n, pid); + if (IS_ERR(act)) { + ret = PTR_ERR(act); goto err; - act->order = i+1; + } + act->order = i; if (head == NULL) head = act; @@ -842,7 +864,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) /* now do the delete */ tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, pid, RTNLGRP_TC, + ret = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (ret > 0) return 0; @@ -859,7 +881,7 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, struct tcamsg *t; struct nlmsghdr *nlh; struct sk_buff *skb; - struct rtattr *x; + struct nlattr *nest; unsigned char *b; int err = 0; @@ -875,23 +897,24 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; if (tcf_action_dump(skb, a, 0, 0) < 0) - goto rtattr_failure; + goto nla_put_failure; - x->rta_len = skb_tail_pointer(skb) - (u8 *)x; + nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; NETLINK_CB(skb).dst_group = RTNLGRP_TC; - err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO); + err = rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, flags&NLM_F_ECHO); if (err > 0) err = 0; return err; -rtattr_failure: +nla_put_failure: nlmsg_failure: kfree_skb(skb); return -1; @@ -899,16 +922,20 @@ nlmsg_failure: static int -tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr) +tcf_action_add(struct nlattr *nla, struct nlmsghdr *n, u32 pid, int ovr) { int ret = 0; struct tc_action *act; struct tc_action *a; u32 seq = n->nlmsg_seq; - act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret); + act = tcf_action_init(nla, NULL, NULL, ovr, 0); if (act == NULL) goto done; + if (IS_ERR(act)) { + ret = PTR_ERR(act); + goto done; + } /* dump then free all the actions after update; inserted policy * stays intact @@ -924,11 +951,19 @@ done: static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) { - struct rtattr **tca = arg; + struct net *net = skb->sk->sk_net; + struct nlattr *tca[TCA_ACT_MAX + 1]; u32 pid = skb ? NETLINK_CB(skb).pid : 0; int ret = 0, ovr = 0; - if (tca[TCA_ACT_TAB-1] == NULL) { + if (net != &init_net) + return -EINVAL; + + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); + if (ret < 0) + return ret; + + if (tca[TCA_ACT_TAB] == NULL) { printk("tc_ctl_action: received NO action attribs\n"); return -EINVAL; } @@ -946,15 +981,15 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (n->nlmsg_flags&NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr); + ret = tcf_action_add(tca[TCA_ACT_TAB], n, pid, ovr); if (ret == -EAGAIN) goto replay; break; case RTM_DELACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION); + ret = tca_action_gd(tca[TCA_ACT_TAB], n, pid, RTM_DELACTION); break; case RTM_GETACTION: - ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION); + ret = tca_action_gd(tca[TCA_ACT_TAB], n, pid, RTM_GETACTION); break; default: BUG(); @@ -963,33 +998,30 @@ replay: return ret; } -static struct rtattr * +static struct nlattr * find_dump_kind(struct nlmsghdr *n) { - struct rtattr *tb1, *tb2[TCA_ACT_MAX+1]; - struct rtattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct rtattr *rta[TCAA_MAX + 1]; - struct rtattr *kind; - int min_len = NLMSG_LENGTH(sizeof(struct tcamsg)); - int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len); - - if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0) + struct nlattr *tb1, *tb2[TCA_ACT_MAX+1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *nla[TCAA_MAX + 1]; + struct nlattr *kind; + + if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, NULL) < 0) return NULL; - tb1 = rta[TCA_ACT_TAB - 1]; + tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; - if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1), - NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0) - return NULL; - if (tb[0] == NULL) + if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), + NLMSG_ALIGN(nla_len(tb1)), NULL) < 0) return NULL; - if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]), - RTA_PAYLOAD(tb[0])) < 0) + if (tb[1] == NULL) return NULL; - kind = tb2[TCA_ACT_KIND-1]; + if (nla_parse(tb2, TCA_ACT_MAX, nla_data(tb[1]), + nla_len(tb[1]), NULL) < 0) + return NULL; + kind = tb2[TCA_ACT_KIND]; return kind; } @@ -997,14 +1029,18 @@ find_dump_kind(struct nlmsghdr *n) static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); - struct rtattr *x; + struct nlattr *nest; struct tc_action_ops *a_o; struct tc_action a; int ret = 0; struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); - struct rtattr *kind = find_dump_kind(cb->nlh); + struct nlattr *kind = find_dump_kind(cb->nlh); + + if (net != &init_net) + return 0; if (kind == NULL) { printk("tc_dump_action: action bad kind\n"); @@ -1021,7 +1057,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o->walk == NULL) { printk("tc_dump_action: %s !capable of dumping table\n", a_o->kind); - goto rtattr_failure; + goto nla_put_failure; } nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, @@ -1031,18 +1067,19 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) t->tca__pad1 = 0; t->tca__pad2 = 0; - x = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + nest = nla_nest_start(skb, TCA_ACT_TAB); + if (nest == NULL) + goto nla_put_failure; ret = a_o->walk(skb, cb, RTM_GETACTION, &a); if (ret < 0) - goto rtattr_failure; + goto nla_put_failure; if (ret > 0) { - x->rta_len = skb_tail_pointer(skb) - (u8 *)x; + nla_nest_end(skb, nest); ret = skb->len; } else - nlmsg_trim(skb, x); + nla_nest_cancel(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; if (NETLINK_CB(cb->skb).pid && ret) @@ -1050,7 +1087,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) module_put(a_o->owner); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_failure: module_put(a_o->owner); nlmsg_trim(skb, b); @@ -1067,8 +1104,3 @@ static int __init tc_action_init(void) } subsys_initcall(tc_action_init); - -EXPORT_SYMBOL(tcf_register_action); -EXPORT_SYMBOL(tcf_unregister_action); -EXPORT_SYMBOL(tcf_action_exec); -EXPORT_SYMBOL(tcf_action_dump_1); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index a9631e426d9..422872c4f14 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -53,28 +53,34 @@ typedef int (*g_rand)(struct tcf_gact *gact); static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; #endif /* CONFIG_GACT_PROB */ -static int tcf_gact_init(struct rtattr *rta, struct rtattr *est, +static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { + [TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) }, + [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, +}; + +static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_GACT_MAX]; + struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; struct tcf_gact *gact; struct tcf_common *pc; int ret = 0; + int err; - if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_GACT_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy); + if (err < 0) + return err; + + if (tb[TCA_GACT_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]); + parm = nla_data(tb[TCA_GACT_PARMS]); - if (tb[TCA_GACT_PROB-1] != NULL) -#ifdef CONFIG_GACT_PROB - if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p)) - return -EINVAL; -#else +#ifndef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB] != NULL) return -EOPNOTSUPP; #endif @@ -97,8 +103,8 @@ static int tcf_gact_init(struct rtattr *rta, struct rtattr *est, spin_lock_bh(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB - if (tb[TCA_GACT_PROB-1] != NULL) { - struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]); + if (tb[TCA_GACT_PROB] != NULL) { + struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); gact->tcfg_paction = p_parm->paction; gact->tcfg_pval = p_parm->pval; gact->tcfg_ptype = p_parm->ptype; @@ -154,23 +160,23 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int opt.refcnt = gact->tcf_refcnt - ref; opt.bindcnt = gact->tcf_bindcnt - bind; opt.action = gact->tcf_action; - RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); #ifdef CONFIG_GACT_PROB if (gact->tcfg_ptype) { struct tc_gact_p p_opt; p_opt.paction = gact->tcfg_paction; p_opt.pval = gact->tcfg_pval; p_opt.ptype = gact->tcfg_ptype; - RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); } #endif t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); - RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index fa006e06ce3..da696fd3e34 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -92,10 +92,17 @@ static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) return ret; } -static int tcf_ipt_init(struct rtattr *rta, struct rtattr *est, +static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { + [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_IPT_INDEX] = { .type = NLA_U32 }, + [TCA_IPT_TARG] = { .len = sizeof(struct ipt_entry_target) }, +}; + +static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_IPT_MAX]; + struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct tcf_common *pc; struct ipt_entry_target *td, *t; @@ -104,22 +111,24 @@ static int tcf_ipt_init(struct rtattr *rta, struct rtattr *est, u32 hook = 0; u32 index = 0; - if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_IPT_HOOK-1] == NULL || - RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32)) + err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy); + if (err < 0) + return err; + + if (tb[TCA_IPT_HOOK] == NULL) return -EINVAL; - if (tb[TCA_IPT_TARG-1] == NULL || - RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t)) + if (tb[TCA_IPT_TARG] == NULL) return -EINVAL; - td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]); - if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size) + + td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]); + if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) return -EINVAL; - if (tb[TCA_IPT_INDEX-1] != NULL && - RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32)) - index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]); + if (tb[TCA_IPT_INDEX] != NULL) + index = nla_get_u32(tb[TCA_IPT_INDEX]); pc = tcf_hash_check(index, a, bind, &ipt_hash_info); if (!pc) { @@ -136,14 +145,14 @@ static int tcf_ipt_init(struct rtattr *rta, struct rtattr *est, } ipt = to_ipt(pc); - hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]); + hook = nla_get_u32(tb[TCA_IPT_HOOK]); err = -ENOMEM; tname = kmalloc(IFNAMSIZ, GFP_KERNEL); if (unlikely(!tname)) goto err1; - if (tb[TCA_IPT_TABLE - 1] == NULL || - rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ) + if (tb[TCA_IPT_TABLE] == NULL || + nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) strcpy(tname, "mangle"); t = kmemdup(td, td->u.target_size, GFP_KERNEL); @@ -243,25 +252,25 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) - goto rtattr_failure; + goto nla_put_failure; c.bindcnt = ipt->tcf_bindcnt - bind; c.refcnt = ipt->tcf_refcnt - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); - RTA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t); - RTA_PUT(skb, TCA_IPT_INDEX, 4, &ipt->tcf_index); - RTA_PUT(skb, TCA_IPT_HOOK, 4, &ipt->tcfi_hook); - RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); - RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, ipt->tcfi_tname); + NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t); + NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index); + NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook); + NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); + NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname); tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); - RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); kfree(t); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); kfree(t); return -1; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index c3fde9180f9..1aff005d95c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -54,24 +54,31 @@ static inline int tcf_mirred_release(struct tcf_mirred *m, int bind) return 0; } -static int tcf_mirred_init(struct rtattr *rta, struct rtattr *est, +static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { + [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, +}; + +static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_MIRRED_MAX]; + struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; struct tcf_mirred *m; struct tcf_common *pc; struct net_device *dev = NULL; - int ret = 0; + int ret = 0, err; int ok_push = 0; - if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_MIRRED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy); + if (err < 0) + return err; + + if (tb[TCA_MIRRED_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]); + parm = nla_data(tb[TCA_MIRRED_PARMS]); if (parm->ifindex) { dev = __dev_get_by_index(&init_net, parm->ifindex); @@ -207,14 +214,14 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i opt.bindcnt = m->tcf_bindcnt - bind; opt.eaction = m->tcfm_eaction; opt.ifindex = m->tcfm_ifindex; - RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(m->tcf_tm.expires); - RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index c96273bcaf9..0a3c8339767 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -40,22 +40,29 @@ static struct tcf_hashinfo nat_hash_info = { .lock = &nat_lock, }; -static int tcf_nat_init(struct rtattr *rta, struct rtattr *est, +static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { + [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, +}; + +static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_NAT_MAX]; + struct nlattr *tb[TCA_NAT_MAX + 1]; struct tc_nat *parm; - int ret = 0; + int ret = 0, err; struct tcf_nat *p; struct tcf_common *pc; - if (rta == NULL || rtattr_parse_nested(tb, TCA_NAT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_NAT_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_NAT_PARMS - 1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy); + if (err < 0) + return err; + + if (tb[TCA_NAT_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_NAT_PARMS - 1]); + parm = nla_data(tb[TCA_NAT_PARMS]); pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info); if (!pc) { @@ -151,7 +158,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, else iph->daddr = new_addr; - nf_csum_replace4(&iph->check, addr, new_addr); + csum_replace4(&iph->check, addr, new_addr); } ihl = iph->ihl * 4; @@ -169,7 +176,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, goto drop; tcph = (void *)(skb_network_header(skb) + ihl); - nf_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); break; } case IPPROTO_UDP: @@ -184,8 +191,8 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, udph = (void *)(skb_network_header(skb) + ihl); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { - nf_proto_csum_replace4(&udph->check, skb, addr, - new_addr, 1); + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, 1); if (!udph->check) udph->check = CSUM_MANGLED_0; } @@ -232,8 +239,8 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, else iph->saddr = new_addr; - nf_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, - 1); + inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, + 1); break; } default: @@ -275,17 +282,17 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, opt->refcnt = p->tcf_refcnt - ref; opt->bindcnt = p->tcf_bindcnt - bind; - RTA_PUT(skb, TCA_NAT_PARMS, s, opt); + NLA_PUT(skb, TCA_NAT_PARMS, s, opt); t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - RTA_PUT(skb, TCA_NAT_TM, sizeof(t), &t); + NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t); kfree(opt); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); kfree(opt); return -1; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b46fab5fb32..3cc4cb9e500 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -33,26 +33,33 @@ static struct tcf_hashinfo pedit_hash_info = { .lock = &pedit_lock, }; -static int tcf_pedit_init(struct rtattr *rta, struct rtattr *est, +static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { + [TCA_PEDIT_PARMS] = { .len = sizeof(struct tcf_pedit) }, +}; + +static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_PEDIT_MAX]; + struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; - int ret = 0; + int ret = 0, err; struct tcf_pedit *p; struct tcf_common *pc; struct tc_pedit_key *keys = NULL; int ksize; - if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_PEDIT_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy); + if (err < 0) + return err; + + if (tb[TCA_PEDIT_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]); + parm = nla_data(tb[TCA_PEDIT_PARMS]); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize) + if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize) return -EINVAL; pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info); @@ -206,15 +213,15 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->refcnt = p->tcf_refcnt - ref; opt->bindcnt = p->tcf_bindcnt - bind; - RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt); + NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt); t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); kfree(opt); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); kfree(opt); return -1; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a73e3e6d87e..0898120bbcc 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -54,7 +54,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c { struct tcf_common *p; int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; - struct rtattr *r; + struct nlattr *nest; read_lock_bh(&police_lock); @@ -69,18 +69,19 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c continue; a->priv = p; a->order = index; - r = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, a->order, 0, NULL); + nest = nla_nest_start(skb, a->order); + if (nest == NULL) + goto nla_put_failure; if (type == RTM_DELACTION) err = tcf_action_dump_1(skb, a, 0, 1); else err = tcf_action_dump_1(skb, a, 0, 0); if (err < 0) { index--; - nlmsg_trim(skb, r); + nla_nest_cancel(skb, nest); goto done; } - r->rta_len = skb_tail_pointer(skb) - (u8 *)r; + nla_nest_end(skb, nest); n_i++; } } @@ -90,8 +91,8 @@ done: cb->args[0] += n_i; return n_i; -rtattr_failure: - nlmsg_trim(skb, r); +nla_put_failure: + nla_nest_cancel(skb, nest); goto done; } @@ -118,33 +119,37 @@ static void tcf_police_destroy(struct tcf_police *p) BUG_TRAP(0); } -static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est, +static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { + [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, + [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, + [TCA_POLICE_RESULT] = { .type = NLA_U32 }, +}; + +static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { unsigned h; int ret = 0, err; - struct rtattr *tb[TCA_POLICE_MAX]; + struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; int size; - if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_POLICE_TBF-1] == NULL) - return -EINVAL; - size = RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]); - if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) - return -EINVAL; - parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy); + if (err < 0) + return err; - if (tb[TCA_POLICE_RESULT-1] != NULL && - RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + if (tb[TCA_POLICE_TBF] == NULL) return -EINVAL; - if (tb[TCA_POLICE_RESULT-1] != NULL && - RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + size = nla_len(tb[TCA_POLICE_TBF]); + if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return -EINVAL; + parm = nla_data(tb[TCA_POLICE_TBF]); if (parm->index) { struct tcf_common *pc; @@ -174,12 +179,12 @@ static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est, override: if (parm->rate.rate) { err = -ENOMEM; - R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); + R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]); if (R_tab == NULL) goto failure; if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, - tb[TCA_POLICE_PEAKRATE-1]); + tb[TCA_POLICE_PEAKRATE]); if (P_tab == NULL) { qdisc_put_rtab(R_tab); goto failure; @@ -197,8 +202,8 @@ override: police->tcfp_P_tab = P_tab; } - if (tb[TCA_POLICE_RESULT-1]) - police->tcfp_result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); + if (tb[TCA_POLICE_RESULT]) + police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); police->tcfp_toks = police->tcfp_burst = parm->burst; police->tcfp_mtu = parm->mtu; if (police->tcfp_mtu == 0) { @@ -210,9 +215,8 @@ override: police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu); police->tcf_action = parm->action; - if (tb[TCA_POLICE_AVRATE-1]) - police->tcfp_ewma_rate = - *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); + if (tb[TCA_POLICE_AVRATE]) + police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); if (est) gen_replace_estimator(&police->tcf_bstats, &police->tcf_rate_est, @@ -332,15 +336,14 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) opt.peakrate = police->tcfp_P_tab->rate; else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); if (police->tcfp_result) - RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), - &police->tcfp_result); + NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); if (police->tcfp_ewma_rate) - RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &police->tcfp_ewma_rate); + NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index fb84ef33d14..fbde461b716 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -84,30 +84,37 @@ static int realloc_defdata(struct tcf_defact *d, u32 datalen, void *defdata) return alloc_defdata(d, datalen, defdata); } -static int tcf_simp_init(struct rtattr *rta, struct rtattr *est, +static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { + [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, +}; + +static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - struct rtattr *tb[TCA_DEF_MAX]; + struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; struct tcf_common *pc; void *defdata; u32 datalen = 0; - int ret = 0; + int ret = 0, err; - if (rta == NULL || rtattr_parse_nested(tb, TCA_DEF_MAX, rta) < 0) + if (nla == NULL) return -EINVAL; - if (tb[TCA_DEF_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_DEF_PARMS - 1]) < sizeof(*parm)) + err = nla_parse_nested(tb, TCA_DEF_MAX, nla, NULL); + if (err < 0) + return err; + + if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; - parm = RTA_DATA(tb[TCA_DEF_PARMS - 1]); - defdata = RTA_DATA(tb[TCA_DEF_DATA - 1]); + parm = nla_data(tb[TCA_DEF_PARMS]); + defdata = nla_data(tb[TCA_DEF_DATA]); if (defdata == NULL) return -EINVAL; - datalen = RTA_PAYLOAD(tb[TCA_DEF_DATA - 1]); + datalen = nla_len(tb[TCA_DEF_DATA]); if (datalen <= 0) return -EINVAL; @@ -164,15 +171,15 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, opt.refcnt = d->tcf_refcnt - ref; opt.bindcnt = d->tcf_bindcnt - bind; opt.action = d->tcf_action; - RTA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); - RTA_PUT(skb, TCA_DEF_DATA, d->tcfd_datalen, d->tcfd_defdata); + NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_DEF_DATA, d->tcfd_datalen, d->tcfd_defdata); t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - RTA_PUT(skb, TCA_DEF_TM, sizeof(t), &t); + NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 03657976fd5..3377ca0d0a0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -23,33 +23,30 @@ #include <linux/init.h> #include <linux/kmod.h> #include <linux/netlink.h> +#include <linux/err.h> +#include <net/net_namespace.h> +#include <net/sock.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - /* The list of all installed classifier types */ -static struct tcf_proto_ops *tcf_proto_base; +static struct tcf_proto_ops *tcf_proto_base __read_mostly; /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); /* Find classifier type by string name */ -static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) { struct tcf_proto_ops *t = NULL; if (kind) { read_lock(&cls_mod_lock); for (t = tcf_proto_base; t; t = t->next) { - if (rtattr_strcmp(kind, t->kind) == 0) { + if (nla_strcmp(kind, t->kind) == 0) { if (!try_module_get(t->owner)) t = NULL; break; @@ -79,6 +76,7 @@ out: write_unlock(&cls_mod_lock); return rc; } +EXPORT_SYMBOL(register_tcf_proto_ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { @@ -98,6 +96,7 @@ out: write_unlock(&cls_mod_lock); return rc; } +EXPORT_SYMBOL(unregister_tcf_proto_ops); static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, unsigned long fh, int event); @@ -105,9 +104,9 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, /* Select new prio value from the range, managed by kernel. */ -static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) +static inline u32 tcf_auto_prio(struct tcf_proto *tp) { - u32 first = TC_H_MAKE(0xC0000000U,0U); + u32 first = TC_H_MAKE(0xC0000000U, 0U); if (tp) first = tp->prio-1; @@ -119,7 +118,8 @@ static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) { - struct rtattr **tca; + struct net *net = skb->sk->sk_net; + struct nlattr *tca[TCA_MAX + 1]; struct tcmsg *t; u32 protocol; u32 prio; @@ -130,13 +130,15 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct tcf_proto **back, **chain; struct tcf_proto *tp; struct tcf_proto_ops *tp_ops; - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; unsigned long cl; unsigned long fh; int err; + if (net != &init_net) + return -EINVAL; + replay: - tca = arg; t = NLMSG_DATA(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); @@ -148,21 +150,29 @@ replay: /* If no priority is given, user wants we allocated it. */ if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) return -ENOENT; - prio = TC_H_MAKE(0x80000000U,0U); + prio = TC_H_MAKE(0x80000000U, 0U); } /* Find head of filter chain. */ /* Find link */ - if ((dev = __dev_get_by_index(&init_net, t->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(&init_net, t->tcm_ifindex); + if (dev == NULL) return -ENODEV; + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); + if (err < 0) + return err; + /* Find qdisc */ if (!parent) { q = dev->qdisc_sleeping; parent = q->handle; - } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) - return -EINVAL; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent)); + if (q == NULL) + return -EINVAL; + } /* Is it classful? */ if ((cops = q->ops->cl_ops) == NULL) @@ -196,7 +206,7 @@ replay: if (tp == NULL) { /* Proto-tcf does not exist, create new one */ - if (tca[TCA_KIND-1] == NULL || !protocol) + if (tca[TCA_KIND] == NULL || !protocol) goto errout; err = -ENOENT; @@ -207,17 +217,18 @@ replay: /* Create new proto tcf */ err = -ENOBUFS; - if ((tp = kzalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (tp == NULL) goto errout; err = -EINVAL; - tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); if (tp_ops == NULL) { #ifdef CONFIG_KMOD - struct rtattr *kind = tca[TCA_KIND-1]; + struct nlattr *kind = tca[TCA_KIND]; char name[IFNAMSIZ]; if (kind != NULL && - rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { rtnl_unlock(); request_module("cls_%s", name); rtnl_lock(); @@ -243,7 +254,9 @@ replay: tp->q = q; tp->classify = tp_ops->classify; tp->classid = parent; - if ((err = tp_ops->init(tp)) != 0) { + + err = tp_ops->init(tp); + if (err != 0) { module_put(tp_ops->owner); kfree(tp); goto errout; @@ -254,7 +267,7 @@ replay: *back = tp; qdisc_unlock_tree(dev); - } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) goto errout; fh = tp->ops->get(tp, t->tcm_handle); @@ -272,13 +285,14 @@ replay: } err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) goto errout; } else { switch (n->nlmsg_type) { case RTM_NEWTFILTER: err = -EEXIST; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) goto errout; break; case RTM_DELTFILTER: @@ -308,9 +322,8 @@ errout: return err; } -static int -tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, - u32 pid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, + unsigned long fh, u32 pid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -324,18 +337,18 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, tcm->tcm_ifindex = tp->q->dev->ifindex; tcm->tcm_parent = tp->classid; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind); tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) - goto rtattr_failure; + goto nla_put_failure; } nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -355,19 +368,20 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, return -EINVAL; } - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } -struct tcf_dump_args -{ +struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; }; -static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, + struct tcf_walker *arg) { - struct tcf_dump_args *a = (void*)arg; + struct tcf_dump_args *a = (void *)arg; return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); @@ -375,16 +389,20 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walke static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int t; int s_t; struct net_device *dev; struct Qdisc *q; struct tcf_proto *tp, **chain; - struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh); unsigned long cl = 0; - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; struct tcf_dump_args arg; + if (net != &init_net) + return 0; + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) return skb->len; if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) @@ -421,9 +439,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); if (cb->args[1] == 0) { if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER) <= 0) break; - } + cb->args[1] = 1; } if (tp->ops->walk == NULL) @@ -450,8 +469,7 @@ out: return skb->len; } -void -tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->action) { @@ -460,49 +478,48 @@ tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) } #endif } +EXPORT_SYMBOL(tcf_exts_destroy); - -int -tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb, - struct rtattr *rate_tlv, struct tcf_exts *exts, +int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, + struct nlattr *rate_tlv, struct tcf_exts *exts, struct tcf_ext_map *map) { memset(exts, 0, sizeof(*exts)); #ifdef CONFIG_NET_CLS_ACT { - int err; struct tc_action *act; - if (map->police && tb[map->police-1]) { - act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police", - TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); - if (act == NULL) - return err; + if (map->police && tb[map->police]) { + act = tcf_action_init_1(tb[map->police], rate_tlv, + "police", TCA_ACT_NOREPLACE, + TCA_ACT_BIND); + if (IS_ERR(act)) + return PTR_ERR(act); act->type = TCA_OLD_COMPAT; exts->action = act; - } else if (map->action && tb[map->action-1]) { - act = tcf_action_init(tb[map->action-1], rate_tlv, NULL, - TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); - if (act == NULL) - return err; + } else if (map->action && tb[map->action]) { + act = tcf_action_init(tb[map->action], rate_tlv, NULL, + TCA_ACT_NOREPLACE, TCA_ACT_BIND); + if (IS_ERR(act)) + return PTR_ERR(act); exts->action = act; } } #else - if ((map->action && tb[map->action-1]) || - (map->police && tb[map->police-1])) + if ((map->action && tb[map->action]) || + (map->police && tb[map->police])) return -EOPNOTSUPP; #endif return 0; } +EXPORT_SYMBOL(tcf_exts_validate); -void -tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, - struct tcf_exts *src) +void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, + struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT if (src->action) { @@ -515,9 +532,9 @@ tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, } #endif } +EXPORT_SYMBOL(tcf_exts_change); -int -tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_ext_map *map) { #ifdef CONFIG_NET_CLS_ACT @@ -527,39 +544,45 @@ tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ - struct rtattr *p_rta = (struct rtattr *)skb_tail_pointer(skb); + struct nlattr *nest; if (exts->action->type != TCA_OLD_COMPAT) { - RTA_PUT(skb, map->action, 0, NULL); + nest = nla_nest_start(skb, map->action); + if (nest == NULL) + goto nla_put_failure; if (tcf_action_dump(skb, exts->action, 0, 0) < 0) - goto rtattr_failure; - p_rta->rta_len = skb_tail_pointer(skb) - (u8 *)p_rta; + goto nla_put_failure; + nla_nest_end(skb, nest); } else if (map->police) { - RTA_PUT(skb, map->police, 0, NULL); + nest = nla_nest_start(skb, map->police); + if (nest == NULL) + goto nla_put_failure; if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) - goto rtattr_failure; - p_rta->rta_len = skb_tail_pointer(skb) - (u8 *)p_rta; + goto nla_put_failure; + nla_nest_end(skb, nest); } } #endif return 0; -rtattr_failure: __attribute__ ((unused)) +nla_put_failure: __attribute__ ((unused)) return -1; } +EXPORT_SYMBOL(tcf_exts_dump); -int -tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, - struct tcf_ext_map *map) + +int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, + struct tcf_ext_map *map) { #ifdef CONFIG_NET_CLS_ACT if (exts->action) if (tcf_action_copy_stats(skb, exts->action, 1) < 0) - goto rtattr_failure; + goto nla_put_failure; #endif return 0; -rtattr_failure: __attribute__ ((unused)) +nla_put_failure: __attribute__ ((unused)) return -1; } +EXPORT_SYMBOL(tcf_exts_dump_stats); static int __init tc_filter_init(void) { @@ -572,11 +595,3 @@ static int __init tc_filter_init(void) } subsys_initcall(tc_filter_init); - -EXPORT_SYMBOL(register_tcf_proto_ops); -EXPORT_SYMBOL(unregister_tcf_proto_ops); -EXPORT_SYMBOL(tcf_exts_validate); -EXPORT_SYMBOL(tcf_exts_destroy); -EXPORT_SYMBOL(tcf_exts_change); -EXPORT_SYMBOL(tcf_exts_dump); -EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 8dbcf2771a4..bfb4342ea88 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -129,28 +129,29 @@ static int basic_delete(struct tcf_proto *tp, unsigned long arg) return -ENOENT; } +static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { + [TCA_BASIC_CLASSID] = { .type = NLA_U32 }, + [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, +}; + static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, - unsigned long base, struct rtattr **tb, - struct rtattr *est) + unsigned long base, struct nlattr **tb, + struct nlattr *est) { int err = -EINVAL; struct tcf_exts e; struct tcf_ematch_tree t; - if (tb[TCA_BASIC_CLASSID-1]) - if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32)) - return err; - err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); if (err < 0) return err; - err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t); + err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); if (err < 0) goto errout; - if (tb[TCA_BASIC_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]); + if (tb[TCA_BASIC_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -164,23 +165,25 @@ errout: } static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { - int err = -EINVAL; + int err; struct basic_head *head = (struct basic_head *) tp->root; - struct rtattr *tb[TCA_BASIC_MAX]; + struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *f = (struct basic_filter *) *arg; - if (tca[TCA_OPTIONS-1] == NULL) + if (tca[TCA_OPTIONS] == NULL) return -EINVAL; - if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], + basic_policy); + if (err < 0) + return err; if (f != NULL) { if (handle && f->handle != handle) return -EINVAL; - return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); } err = -ENOBUFS; @@ -206,7 +209,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, f->handle = head->hgenerator; } - err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); if (err < 0) goto errout; @@ -245,33 +248,33 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct basic_filter *f = (struct basic_filter *) fh; - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->handle; - rta = (struct rtattr *) b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (f->res.classid) - RTA_PUT(skb, TCA_BASIC_CLASSID, sizeof(u32), &f->res.classid); + NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid); if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) - goto rtattr_failure; + goto nla_put_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + nla_nest_end(skb, nest); return skb->len; -rtattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct tcf_proto_ops cls_basic_ops = { +static struct tcf_proto_ops cls_basic_ops __read_mostly = { .kind = "basic", .classify = basic_classify, .init = basic_init, diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 8adbd6a37d1..436a6e7c438 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -186,39 +186,41 @@ out: return -EINVAL; } +static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { + [TCA_FW_CLASSID] = { .type = NLA_U32 }, + [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_FW_MASK] = { .type = NLA_U32 }, +}; + static int fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, - struct rtattr **tb, struct rtattr **tca, unsigned long base) + struct nlattr **tb, struct nlattr **tca, unsigned long base) { struct fw_head *head = (struct fw_head *)tp->root; struct tcf_exts e; u32 mask; int err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map); + err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map); if (err < 0) return err; err = -EINVAL; - if (tb[TCA_FW_CLASSID-1]) { - if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32)) - goto errout; - f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); + if (tb[TCA_FW_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); tcf_bind_filter(tp, &f->res, base); } #ifdef CONFIG_NET_CLS_IND - if (tb[TCA_FW_INDEV-1]) { - err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]); + if (tb[TCA_FW_INDEV]) { + err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]); if (err < 0) goto errout; } #endif /* CONFIG_NET_CLS_IND */ - if (tb[TCA_FW_MASK-1]) { - if (RTA_PAYLOAD(tb[TCA_FW_MASK-1]) != sizeof(u32)) - goto errout; - mask = *(u32*)RTA_DATA(tb[TCA_FW_MASK-1]); + if (tb[TCA_FW_MASK]) { + mask = nla_get_u32(tb[TCA_FW_MASK]); if (mask != head->mask) goto errout; } else if (head->mask != 0xFFFFFFFF) @@ -234,20 +236,21 @@ errout: static int fw_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, + struct nlattr **tca, unsigned long *arg) { struct fw_head *head = (struct fw_head*)tp->root; struct fw_filter *f = (struct fw_filter *) *arg; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_FW_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_FW_MAX + 1]; int err; if (!opt) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); + if (err < 0) + return err; if (f != NULL) { if (f->id != handle && handle) @@ -260,11 +263,8 @@ static int fw_change(struct tcf_proto *tp, unsigned long base, if (head == NULL) { u32 mask = 0xFFFFFFFF; - if (tb[TCA_FW_MASK-1]) { - if (RTA_PAYLOAD(tb[TCA_FW_MASK-1]) != sizeof(u32)) - return -EINVAL; - mask = *(u32*)RTA_DATA(tb[TCA_FW_MASK-1]); - } + if (tb[TCA_FW_MASK]) + mask = nla_get_u32(tb[TCA_FW_MASK]); head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); if (head == NULL) @@ -333,7 +333,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, struct fw_head *head = (struct fw_head *)tp->root; struct fw_filter *f = (struct fw_filter*)fh; unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; if (f == NULL) return skb->len; @@ -343,35 +343,35 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, if (!f->res.classid && !tcf_exts_is_available(&f->exts)) return skb->len; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (f->res.classid) - RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); + NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid); #ifdef CONFIG_NET_CLS_IND if (strlen(f->indev)) - RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev); + NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev); #endif /* CONFIG_NET_CLS_IND */ if (head->mask != 0xFFFFFFFF) - RTA_PUT(skb, TCA_FW_MASK, 4, &head->mask); + NLA_PUT_U32(skb, TCA_FW_MASK, head->mask); if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_fw_ops = { - .next = NULL, +static struct tcf_proto_ops cls_fw_ops __read_mostly = { .kind = "fw", .classify = fw_classify, .init = fw_init, diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 0a8409c1d28..f7e7d3955d2 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -323,9 +323,16 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) return 0; } +static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { + [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 }, + [TCA_ROUTE4_TO] = { .type = NLA_U32 }, + [TCA_ROUTE4_FROM] = { .type = NLA_U32 }, + [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, +}; + static int route4_set_parms(struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, - struct rtattr **tb, struct rtattr *est, int new) + struct nlattr **tb, struct nlattr *est, int new) { int err; u32 id = 0, to = 0, nhandle = 0x8000; @@ -339,34 +346,24 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, return err; err = -EINVAL; - if (tb[TCA_ROUTE4_CLASSID-1]) - if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32)) - goto errout; - - if (tb[TCA_ROUTE4_TO-1]) { + if (tb[TCA_ROUTE4_TO]) { if (new && handle & 0x8000) goto errout; - if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32)) - goto errout; - to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); + to = nla_get_u32(tb[TCA_ROUTE4_TO]); if (to > 0xFF) goto errout; nhandle = to; } - if (tb[TCA_ROUTE4_FROM-1]) { - if (tb[TCA_ROUTE4_IIF-1]) + if (tb[TCA_ROUTE4_FROM]) { + if (tb[TCA_ROUTE4_IIF]) goto errout; - if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32)) - goto errout; - id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]); + id = nla_get_u32(tb[TCA_ROUTE4_FROM]); if (id > 0xFF) goto errout; nhandle |= id << 16; - } else if (tb[TCA_ROUTE4_IIF-1]) { - if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32)) - goto errout; - id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); + } else if (tb[TCA_ROUTE4_IIF]) { + id = nla_get_u32(tb[TCA_ROUTE4_IIF]); if (id > 0x7FFF) goto errout; nhandle |= (id | 0x8000) << 16; @@ -398,20 +395,20 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, } tcf_tree_lock(tp); - if (tb[TCA_ROUTE4_TO-1]) + if (tb[TCA_ROUTE4_TO]) f->id = to; - if (tb[TCA_ROUTE4_FROM-1]) + if (tb[TCA_ROUTE4_FROM]) f->id = to | id<<16; - else if (tb[TCA_ROUTE4_IIF-1]) + else if (tb[TCA_ROUTE4_IIF]) f->iif = id; f->handle = nhandle; f->bkt = b; tcf_tree_unlock(tp); - if (tb[TCA_ROUTE4_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); + if (tb[TCA_ROUTE4_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); tcf_bind_filter(tp, &f->res, base); } @@ -425,14 +422,14 @@ errout: static int route4_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, + struct nlattr **tca, unsigned long *arg) { struct route4_head *head = tp->root; struct route4_filter *f, *f1, **fp; struct route4_bucket *b; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_ROUTE4_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; u32 old_handle = 0; int err; @@ -440,8 +437,9 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, if (opt == NULL) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy); + if (err < 0) + return err; if ((f = (struct route4_filter*)*arg) != NULL) { if (f->handle != handle && handle) @@ -451,7 +449,7 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, old_handle = f->handle; err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE-1], 0); + tca[TCA_RATE], 0); if (err < 0) return err; @@ -474,7 +472,7 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, goto errout; err = route4_set_parms(tp, base, f, handle, head, tb, - tca[TCA_RATE-1], 1); + tca[TCA_RATE], 1); if (err < 0) goto errout; @@ -550,7 +548,7 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, { struct route4_filter *f = (struct route4_filter*)fh; unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; u32 id; if (f == NULL) @@ -558,40 +556,40 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, t->tcm_handle = f->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (!(f->handle&0x8000)) { id = f->id&0xFF; - RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); + NLA_PUT_U32(skb, TCA_ROUTE4_TO, id); } if (f->handle&0x80000000) { if ((f->handle>>16) != 0xFFFF) - RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); + NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif); } else { id = f->id>>16; - RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id); + NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id); } if (f->res.classid) - RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); + NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid); if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_route4_ops = { - .next = NULL, +static struct tcf_proto_ops cls_route4_ops __read_mostly = { .kind = "route", .classify = route4_classify, .init = route4_init, diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 22f9ede70e8..7034ea4530e 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -397,17 +397,26 @@ static u32 gen_tunnel(struct rsvp_head *data) return 0; } +static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { + [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, + [TCA_RSVP_DST] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_SRC] = { .type = NLA_BINARY, + .len = RSVP_DST_LEN * sizeof(u32) }, + [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, +}; + static int rsvp_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, + struct nlattr **tca, unsigned long *arg) { struct rsvp_head *data = tp->root; struct rsvp_filter *f, **fp; struct rsvp_session *s, **sp; struct tc_rsvp_pinfo *pinfo = NULL; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_RSVP_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS-1]; + struct nlattr *tb[TCA_RSVP_MAX + 1]; struct tcf_exts e; unsigned h1, h2; __be32 *dst; @@ -416,8 +425,9 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, if (opt == NULL) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy); + if (err < 0) + return err; err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); if (err < 0) @@ -429,7 +439,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, if (f->handle != handle && handle) goto errout2; if (tb[TCA_RSVP_CLASSID-1]) { - f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]); tcf_bind_filter(tp, &f->res, base); } @@ -451,31 +461,18 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, h2 = 16; if (tb[TCA_RSVP_SRC-1]) { - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) - goto errout; - memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + memcpy(f->src, nla_data(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); h2 = hash_src(f->src); } if (tb[TCA_RSVP_PINFO-1]) { - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) - goto errout; - pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + pinfo = nla_data(tb[TCA_RSVP_PINFO-1]); f->spi = pinfo->spi; f->tunnelhdr = pinfo->tunnelhdr; } - if (tb[TCA_RSVP_CLASSID-1]) { - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) - goto errout; - f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); - } + if (tb[TCA_RSVP_CLASSID-1]) + f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]); - err = -EINVAL; - if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) - goto errout; - dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + dst = nla_data(tb[TCA_RSVP_DST-1]); h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); err = -ENOMEM; @@ -594,7 +591,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, struct rsvp_filter *f = (struct rsvp_filter*)fh; struct rsvp_session *s; unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; struct tc_rsvp_pinfo pinfo; if (f == NULL) @@ -603,33 +600,33 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, t->tcm_handle = f->handle; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - - RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); pinfo.dpi = s->dpi; pinfo.spi = f->spi; pinfo.protocol = s->protocol; pinfo.tunnelid = s->tunnelid; pinfo.tunnelhdr = f->tunnelhdr; pinfo.pad = 0; - RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); + NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); if (f->res.classid) - RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); + NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid); if (((f->handle>>8)&0xFF) != 16) - RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); + NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 2314820a080..ee60b2d1705 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -29,19 +29,6 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ -#if 1 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - - #define PRIV(tp) ((struct tcindex_data *) (tp)->root) @@ -104,7 +91,8 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; - D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); + pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", + skb, tp, res, p); f = tcindex_lookup(p, key); if (!f) { @@ -112,11 +100,11 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, return -1; res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); res->class = 0; - D2PRINTK("alg 0x%x\n",res->classid); + pr_debug("alg 0x%x\n", res->classid); return 0; } *res = f->res; - D2PRINTK("map 0x%x\n",res->classid); + pr_debug("map 0x%x\n", res->classid); return tcf_exts_exec(skb, &f->exts, res); } @@ -127,7 +115,7 @@ static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) struct tcindex_data *p = PRIV(tp); struct tcindex_filter_result *r; - DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle); + pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); if (p->perfect && handle >= p->alloc_hash) return 0; r = tcindex_lookup(p, handle); @@ -137,7 +125,7 @@ static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) static void tcindex_put(struct tcf_proto *tp, unsigned long f) { - DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); + pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f); } @@ -145,8 +133,8 @@ static int tcindex_init(struct tcf_proto *tp) { struct tcindex_data *p; - DPRINTK("tcindex_init(tp %p)\n",tp); - p = kzalloc(sizeof(struct tcindex_data),GFP_KERNEL); + pr_debug("tcindex_init(tp %p)\n", tp); + p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); if (!p) return -ENOMEM; @@ -166,7 +154,7 @@ __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; struct tcindex_filter *f = NULL; - DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); if (p->perfect) { if (!r->res.class) return -ENOENT; @@ -205,10 +193,18 @@ valid_perfect_hash(struct tcindex_data *p) return p->hash > (p->mask >> p->shift); } +static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { + [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, + [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, + [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, + [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, + [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, +}; + static int tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, - struct rtattr **tb, struct rtattr *est) + struct nlattr **tb, struct nlattr *est) { int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; @@ -229,24 +225,14 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, else memset(&cr, 0, sizeof(cr)); - err = -EINVAL; - if (tb[TCA_TCINDEX_HASH-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32)) - goto errout; - cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); - } + if (tb[TCA_TCINDEX_HASH]) + cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - if (tb[TCA_TCINDEX_MASK-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16)) - goto errout; - cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); - } + if (tb[TCA_TCINDEX_MASK]) + cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(int)) - goto errout; - cp.shift = *(int *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); - } + if (tb[TCA_TCINDEX_SHIFT]) + cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); err = -EBUSY; /* Hash already allocated, make sure that we still meet the @@ -260,12 +246,8 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, goto errout; err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { - if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32)) - goto errout; - cp.fall_through = - *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); - } + if (tb[TCA_TCINDEX_FALL_THROUGH]) + cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); if (!cp.hash) { /* Hash not specified, use perfect hash if the upper limit @@ -316,8 +298,8 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, goto errout_alloc; } - if (tb[TCA_TCINDEX_CLASSID-1]) { - cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); + if (tb[TCA_TCINDEX_CLASSID]) { + cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); tcf_bind_filter(tp, &cr.res, base); } @@ -356,34 +338,36 @@ errout: static int tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_TCINDEX_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_TCINDEX_MAX + 1]; struct tcindex_data *p = PRIV(tp); struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + int err; - DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," + pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," "p %p,r %p,*arg 0x%lx\n", tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L); if (!opt) return 0; - if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy); + if (err < 0) + return err; - return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]); + return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { struct tcindex_data *p = PRIV(tp); - struct tcindex_filter *f,*next; + struct tcindex_filter *f, *next; int i; - DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); + pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); if (p->perfect) { for (i = 0; i < p->hash; i++) { if (!p->perfect[i].res.class) @@ -405,7 +389,7 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) for (f = p->h[i]; f; f = next) { next = f->next; if (walker->count >= walker->skip) { - if (walker->fn(tp,(unsigned long) &f->result, + if (walker->fn(tp, (unsigned long) &f->result, walker) < 0) { walker->stop = 1; return; @@ -429,11 +413,11 @@ static void tcindex_destroy(struct tcf_proto *tp) struct tcindex_data *p = PRIV(tp); struct tcf_walker walker; - DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); + pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); walker.count = 0; walker.skip = 0; walker.fn = &tcindex_destroy_element; - tcindex_walk(tp,&walker); + tcindex_walk(tp, &walker); kfree(p->perfect); kfree(p->h); kfree(p); @@ -447,21 +431,23 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, struct tcindex_data *p = PRIV(tp); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; + + pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", + tp, fh, skb, t, p, r, b); + pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; - DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", - tp,fh,skb,t,p,r,b); - DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); - rta = (struct rtattr *) b; - RTA_PUT(skb,TCA_OPTIONS,0,NULL); if (!fh) { t->tcm_handle = ~0; /* whatever ... */ - RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); - RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); - RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); - RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), - &p->fall_through); - rta->rta_len = skb_tail_pointer(skb) - b; + NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash); + NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask); + NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift); + NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through); + nla_nest_end(skb, nest); } else { if (p->perfect) { t->tcm_handle = r-p->perfect; @@ -478,27 +464,26 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, } } } - DPRINTK("handle = %d\n",t->tcm_handle); + pr_debug("handle = %d\n", t->tcm_handle); if (r->res.class) - RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); + NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid); if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) - goto rtattr_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + goto nla_put_failure; + nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; } return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } -static struct tcf_proto_ops cls_tcindex_ops = { - .next = NULL, +static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { .kind = "tcindex", .classify = tcindex_classify, .init = tcindex_init, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c3900820916..e8a77568912 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -460,10 +460,20 @@ static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) return handle|(i>0xFFF ? 0xFFF : i); } +static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { + [TCA_U32_CLASSID] = { .type = NLA_U32 }, + [TCA_U32_HASH] = { .type = NLA_U32 }, + [TCA_U32_LINK] = { .type = NLA_U32 }, + [TCA_U32_DIVISOR] = { .type = NLA_U32 }, + [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, + [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, + [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, +}; + static int u32_set_parms(struct tcf_proto *tp, unsigned long base, struct tc_u_hnode *ht, - struct tc_u_knode *n, struct rtattr **tb, - struct rtattr *est) + struct tc_u_knode *n, struct nlattr **tb, + struct nlattr *est) { int err; struct tcf_exts e; @@ -473,8 +483,8 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base, return err; err = -EINVAL; - if (tb[TCA_U32_LINK-1]) { - u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); + if (tb[TCA_U32_LINK]) { + u32 handle = nla_get_u32(tb[TCA_U32_LINK]); struct tc_u_hnode *ht_down = NULL; if (TC_U32_KEY(handle)) @@ -495,14 +505,14 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base, if (ht_down) ht_down->refcnt--; } - if (tb[TCA_U32_CLASSID-1]) { - n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + if (tb[TCA_U32_CLASSID]) { + n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); tcf_bind_filter(tp, &n->res, base); } #ifdef CONFIG_NET_CLS_IND - if (tb[TCA_U32_INDEV-1]) { - err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]); + if (tb[TCA_U32_INDEV]) { + err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]); if (err < 0) goto errout; } @@ -516,33 +526,34 @@ errout: } static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, - struct rtattr **tca, + struct nlattr **tca, unsigned long *arg) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; struct tc_u32_sel *s; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_U32_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_U32_MAX + 1]; u32 htid; int err; if (opt == NULL) return handle ? -EINVAL : 0; - if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0) - return -EINVAL; + err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy); + if (err < 0) + return err; if ((n = (struct tc_u_knode*)*arg) != NULL) { if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]); + return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]); } - if (tb[TCA_U32_DIVISOR-1]) { - unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + if (tb[TCA_U32_DIVISOR]) { + unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); if (--divisor > 0x100) return -EINVAL; @@ -567,8 +578,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, return 0; } - if (tb[TCA_U32_HASH-1]) { - htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (tb[TCA_U32_HASH]) { + htid = nla_get_u32(tb[TCA_U32_HASH]); if (TC_U32_HTID(htid) == TC_U32_ROOT) { ht = tp->root; htid = ht->handle; @@ -592,11 +603,10 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL-1] == NULL || - RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + if (tb[TCA_U32_SEL] == NULL) return -EINVAL; - s = RTA_DATA(tb[TCA_U32_SEL-1]); + s = nla_data(tb[TCA_U32_SEL]); n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); if (n == NULL) @@ -616,23 +626,16 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; #ifdef CONFIG_CLS_U32_MARK - if (tb[TCA_U32_MARK-1]) { + if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; - if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) { -#ifdef CONFIG_CLS_U32_PERF - kfree(n->pf); -#endif - kfree(n); - return -EINVAL; - } - mark = RTA_DATA(tb[TCA_U32_MARK-1]); + mark = nla_data(tb[TCA_U32_MARK]); memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); n->mark.success = 0; } #endif - err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]); + err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]); if (err == 0) { struct tc_u_knode **ins; for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) @@ -693,66 +696,66 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode*)fh; - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; if (n == NULL) return skb->len; t->tcm_handle = n->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (TC_U32_KEY(n->handle) == 0) { struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; u32 divisor = ht->divisor+1; - RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor); } else { - RTA_PUT(skb, TCA_U32_SEL, + NLA_PUT(skb, TCA_U32_SEL, sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), &n->sel); if (n->ht_up) { u32 htid = n->handle & 0xFFFFF000; - RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + NLA_PUT_U32(skb, TCA_U32_HASH, htid); } if (n->res.classid) - RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); + NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid); if (n->ht_down) - RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); + NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle); #ifdef CONFIG_CLS_U32_MARK if (n->mark.val || n->mark.mask) - RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); + NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); #endif if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND if(strlen(n->indev)) - RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev); + NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev); #endif #ifdef CONFIG_CLS_U32_PERF - RTA_PUT(skb, TCA_U32_PCNT, + NLA_PUT(skb, TCA_U32_PCNT, sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), n->pf); #endif } - rta->rta_len = skb_tail_pointer(skb) - b; + nla_nest_end(skb, nest); + if (TC_U32_KEY(n->handle)) if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) - goto rtattr_failure; + goto nla_put_failure; return skb->len; -rtattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct tcf_proto_ops cls_u32_ops = { - .next = NULL, +static struct tcf_proto_ops cls_u32_ops __read_mostly = { .kind = "u32", .classify = u32_classify, .init = u32_init, diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index ceda8890ab0..a1e5619b187 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -542,11 +542,11 @@ static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) return r; } -static int meta_var_change(struct meta_value *dst, struct rtattr *rta) +static int meta_var_change(struct meta_value *dst, struct nlattr *nla) { - int len = RTA_PAYLOAD(rta); + int len = nla_len(nla); - dst->val = (unsigned long)kmemdup(RTA_DATA(rta), len, GFP_KERNEL); + dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL); if (dst->val == 0UL) return -ENOMEM; dst->len = len; @@ -570,10 +570,10 @@ static void meta_var_apply_extras(struct meta_value *v, static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { if (v->val && v->len) - RTA_PUT(skb, tlv, v->len, (void *) v->val); + NLA_PUT(skb, tlv, v->len, (void *) v->val); return 0; -rtattr_failure: +nla_put_failure: return -1; } @@ -594,13 +594,13 @@ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) return 1; } -static int meta_int_change(struct meta_value *dst, struct rtattr *rta) +static int meta_int_change(struct meta_value *dst, struct nlattr *nla) { - if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) { - dst->val = *(unsigned long *) RTA_DATA(rta); + if (nla_len(nla) >= sizeof(unsigned long)) { + dst->val = *(unsigned long *) nla_data(nla); dst->len = sizeof(unsigned long); - } else if (RTA_PAYLOAD(rta) == sizeof(u32)) { - dst->val = *(u32 *) RTA_DATA(rta); + } else if (nla_len(nla) == sizeof(u32)) { + dst->val = nla_get_u32(nla); dst->len = sizeof(u32); } else return -EINVAL; @@ -621,15 +621,14 @@ static void meta_int_apply_extras(struct meta_value *v, static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { if (v->len == sizeof(unsigned long)) - RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val); + NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val); else if (v->len == sizeof(u32)) { - u32 d = v->val; - RTA_PUT(skb, tlv, sizeof(d), &d); + NLA_PUT_U32(skb, tlv, v->val); } return 0; -rtattr_failure: +nla_put_failure: return -1; } @@ -641,7 +640,7 @@ struct meta_type_ops { void (*destroy)(struct meta_value *); int (*compare)(struct meta_obj *, struct meta_obj *); - int (*change)(struct meta_value *, struct rtattr *); + int (*change)(struct meta_value *, struct nlattr *); void (*apply_extras)(struct meta_value *, struct meta_obj *); int (*dump)(struct sk_buff *, struct meta_value *, int); }; @@ -729,13 +728,13 @@ static inline void meta_delete(struct meta_match *meta) kfree(meta); } -static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) +static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla) { - if (rta) { - if (RTA_PAYLOAD(rta) == 0) + if (nla) { + if (nla_len(nla) == 0) return -EINVAL; - return meta_type_ops(dst)->change(dst, rta); + return meta_type_ops(dst)->change(dst, nla); } return 0; @@ -746,21 +745,26 @@ static inline int meta_is_supported(struct meta_value *val) return (!meta_id(val) || meta_ops(val)->get); } +static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { + [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, +}; + static int em_meta_change(struct tcf_proto *tp, void *data, int len, struct tcf_ematch *m) { - int err = -EINVAL; - struct rtattr *tb[TCA_EM_META_MAX]; + int err; + struct nlattr *tb[TCA_EM_META_MAX + 1]; struct tcf_meta_hdr *hdr; struct meta_match *meta = NULL; - if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0) + err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy); + if (err < 0) goto errout; - if (tb[TCA_EM_META_HDR-1] == NULL || - RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr)) + err = -EINVAL; + if (tb[TCA_EM_META_HDR] == NULL) goto errout; - hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]); + hdr = nla_data(tb[TCA_EM_META_HDR]); if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || @@ -781,8 +785,8 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, goto errout; } - if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 || - meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0) + if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 || + meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0) goto errout; m->datalen = sizeof(*meta); @@ -811,16 +815,16 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); - RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); ops = meta_type_ops(&meta->lvalue); if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) - goto rtattr_failure; + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; } diff --git a/net/sched/em_text.c b/net/sched/em_text.c index d5cd86efb7d..853c5ead87f 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -118,11 +118,14 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) conf.pattern_len = textsearch_get_pattern_len(tm->config); conf.pad = 0; - RTA_PUT_NOHDR(skb, sizeof(conf), &conf); - RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config)); + if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0) + goto nla_put_failure; + if (nla_append(skb, conf.pattern_len, + textsearch_get_pattern(tm->config)) < 0) + goto nla_put_failure; return 0; -rtattr_failure: +nla_put_failure: return -1; } diff --git a/net/sched/ematch.c b/net/sched/ematch.c index f3a104e323b..74ff918455a 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -141,6 +141,7 @@ errout: write_unlock(&ematch_mod_lock); return err; } +EXPORT_SYMBOL(tcf_em_register); /** * tcf_em_unregister - unregster and extended match @@ -171,6 +172,7 @@ out: write_unlock(&ematch_mod_lock); return err; } +EXPORT_SYMBOL(tcf_em_unregister); static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, int index) @@ -181,11 +183,11 @@ static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *tree_hdr, - struct tcf_ematch *em, struct rtattr *rta, int idx) + struct tcf_ematch *em, struct nlattr *nla, int idx) { int err = -EINVAL; - struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta); - int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr); + struct tcf_ematch_hdr *em_hdr = nla_data(nla); + int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); if (!TCF_EM_REL_VALID(em_hdr->flags)) @@ -280,15 +282,20 @@ errout: return err; } +static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { + [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, + [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, +}; + /** * tcf_em_tree_validate - validate ematch config TLV and build ematch tree * * @tp: classifier kind handle - * @rta: ematch tree configuration TLV + * @nla: ematch tree configuration TLV * @tree: destination ematch tree variable to store the resulting * ematch tree. * - * This function validates the given configuration TLV @rta and builds an + * This function validates the given configuration TLV @nla and builds an * ematch tree in @tree. The resulting tree must later be copied into * the private classifier data using tcf_em_tree_change(). You MUST NOT * provide the ematch tree variable of the private classifier data directly, @@ -296,45 +303,43 @@ errout: * * Returns a negative error code if the configuration TLV contains errors. */ -int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, +int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, struct tcf_ematch_tree *tree) { - int idx, list_len, matches_len, err = -EINVAL; - struct rtattr *tb[TCA_EMATCH_TREE_MAX]; - struct rtattr *rt_match, *rt_hdr, *rt_list; + int idx, list_len, matches_len, err; + struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; + struct nlattr *rt_match, *rt_hdr, *rt_list; struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; - if (!rta) { + if (!nla) { memset(tree, 0, sizeof(*tree)); return 0; } - if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) + err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy); + if (err < 0) goto errout; - rt_hdr = tb[TCA_EMATCH_TREE_HDR-1]; - rt_list = tb[TCA_EMATCH_TREE_LIST-1]; + err = -EINVAL; + rt_hdr = tb[TCA_EMATCH_TREE_HDR]; + rt_list = tb[TCA_EMATCH_TREE_LIST]; if (rt_hdr == NULL || rt_list == NULL) goto errout; - if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) || - RTA_PAYLOAD(rt_list) < sizeof(*rt_match)) - goto errout; - - tree_hdr = RTA_DATA(rt_hdr); + tree_hdr = nla_data(rt_hdr); memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); - rt_match = RTA_DATA(rt_list); - list_len = RTA_PAYLOAD(rt_list); + rt_match = nla_data(rt_list); + list_len = nla_len(rt_list); matches_len = tree_hdr->nmatches * sizeof(*em); tree->matches = kzalloc(matches_len, GFP_KERNEL); if (tree->matches == NULL) goto errout; - /* We do not use rtattr_parse_nested here because the maximum + /* We do not use nla_parse_nested here because the maximum * number of attributes is unknown. This saves us the allocation * for a tb buffer which would serve no purpose at all. * @@ -342,16 +347,16 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking * to this policy will result in parsing failure. */ - for (idx = 0; RTA_OK(rt_match, list_len); idx++) { + for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; - if (rt_match->rta_type != (idx + 1)) + if (rt_match->nla_type != (idx + 1)) goto errout_abort; if (idx >= tree_hdr->nmatches) goto errout_abort; - if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr)) + if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) goto errout_abort; em = tcf_em_get_match(tree, idx); @@ -360,7 +365,7 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, if (err < 0) goto errout_abort; - rt_match = RTA_NEXT(rt_match, list_len); + rt_match = nla_next(rt_match, &list_len); } /* Check if the number of matches provided by userspace actually @@ -380,6 +385,7 @@ errout_abort: tcf_em_tree_destroy(tp, tree); return err; } +EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree @@ -413,6 +419,7 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) tree->hdr.nmatches = 0; kfree(tree->matches); } +EXPORT_SYMBOL(tcf_em_tree_destroy); /** * tcf_em_tree_dump - dump ematch tree into a rtnl message @@ -430,18 +437,22 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; u8 *tail; - struct rtattr *top_start = (struct rtattr *)skb_tail_pointer(skb); - struct rtattr *list_start; + struct nlattr *top_start; + struct nlattr *list_start; + + top_start = nla_nest_start(skb, tlv); + if (top_start == NULL) + goto nla_put_failure; - RTA_PUT(skb, tlv, 0, NULL); - RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); - list_start = (struct rtattr *)skb_tail_pointer(skb); - RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL); + list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST); + if (list_start == NULL) + goto nla_put_failure; tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { - struct rtattr *match_start = (struct rtattr *)tail; + struct nlattr *match_start = (struct nlattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, @@ -449,29 +460,30 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) .flags = em->flags }; - RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); + NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) - goto rtattr_failure; + goto nla_put_failure; } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { u32 u = em->data; - RTA_PUT_NOHDR(skb, sizeof(u), &u); + nla_put_nohdr(skb, sizeof(u), &u); } else if (em->datalen > 0) - RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data); + nla_put_nohdr(skb, em->datalen, (void *) em->data); tail = skb_tail_pointer(skb); - match_start->rta_len = tail - (u8 *)match_start; + match_start->nla_len = tail - (u8 *)match_start; } - list_start->rta_len = tail - (u8 *)list_start; - top_start->rta_len = tail - (u8 *)top_start; + nla_nest_end(skb, list_start); + nla_nest_end(skb, top_start); return 0; -rtattr_failure: +nla_put_failure: return -1; } +EXPORT_SYMBOL(tcf_em_tree_dump); static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) @@ -529,10 +541,4 @@ stack_overflow: printk("Local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } - -EXPORT_SYMBOL(tcf_em_register); -EXPORT_SYMBOL(tcf_em_unregister); -EXPORT_SYMBOL(tcf_em_tree_validate); -EXPORT_SYMBOL(tcf_em_tree_destroy); -EXPORT_SYMBOL(tcf_em_tree_dump); EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 8ae137e3522..7e3c048ba9b 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -29,6 +29,7 @@ #include <linux/hrtimer.h> #include <net/net_namespace.h> +#include <net/sock.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -157,6 +158,7 @@ out: write_unlock(&qdisc_mod_lock); return rc; } +EXPORT_SYMBOL(register_qdisc); int unregister_qdisc(struct Qdisc_ops *qops) { @@ -175,6 +177,7 @@ int unregister_qdisc(struct Qdisc_ops *qops) write_unlock(&qdisc_mod_lock); return err; } +EXPORT_SYMBOL(unregister_qdisc); /* We know handle. Find qdisc among all qdisc's attached to device (root qdisc, all its children, children of children etc.) @@ -195,7 +198,7 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; struct Qdisc *leaf; - struct Qdisc_class_ops *cops = p->ops->cl_ops; + const struct Qdisc_class_ops *cops = p->ops->cl_ops; if (cops == NULL) return NULL; @@ -210,14 +213,14 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) /* Find queueing discipline by name */ -static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) { struct Qdisc_ops *q = NULL; if (kind) { read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { - if (rtattr_strcmp(kind, q->id) == 0) { + if (nla_strcmp(kind, q->id) == 0) { if (!try_module_get(q->owner)) q = NULL; break; @@ -230,7 +233,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) static struct qdisc_rate_table *qdisc_rtab_list; -struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) { struct qdisc_rate_table *rtab; @@ -241,19 +244,21 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *ta } } - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + nla_len(tab) != TC_RTAB_SIZE) return NULL; rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; - memcpy(rtab->data, RTA_DATA(tab), 1024); + memcpy(rtab->data, nla_data(tab), 1024); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } return rtab; } +EXPORT_SYMBOL(qdisc_get_rtab); void qdisc_put_rtab(struct qdisc_rate_table *tab) { @@ -270,6 +275,7 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) } } } +EXPORT_SYMBOL(qdisc_put_rtab); static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { @@ -373,7 +379,7 @@ dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) { - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; @@ -417,7 +423,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, *old = dev_graft_qdisc(dev, new); } } else { - struct Qdisc_class_ops *cops = parent->ops->cl_ops; + const struct Qdisc_class_ops *cops = parent->ops->cl_ops; err = -EINVAL; @@ -440,10 +446,10 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, static struct Qdisc * qdisc_create(struct net_device *dev, u32 parent, u32 handle, - struct rtattr **tca, int *errp) + struct nlattr **tca, int *errp) { int err; - struct rtattr *kind = tca[TCA_KIND-1]; + struct nlattr *kind = tca[TCA_KIND]; struct Qdisc *sch; struct Qdisc_ops *ops; @@ -451,7 +457,7 @@ qdisc_create(struct net_device *dev, u32 parent, u32 handle, #ifdef CONFIG_KMOD if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; - if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to @@ -504,11 +510,11 @@ qdisc_create(struct net_device *dev, u32 parent, u32 handle, sch->handle = handle; - if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { - if (tca[TCA_RATE-1]) { + if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { + if (tca[TCA_RATE]) { err = gen_new_estimator(&sch->bstats, &sch->rate_est, sch->stats_lock, - tca[TCA_RATE-1]); + tca[TCA_RATE]); if (err) { /* * Any broken qdiscs that would require @@ -536,20 +542,20 @@ err_out: return NULL; } -static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) +static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) { - if (tca[TCA_OPTIONS-1]) { + if (tca[TCA_OPTIONS]) { int err; if (sch->ops->change == NULL) return -EINVAL; - err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); + err = sch->ops->change(sch, tca[TCA_OPTIONS]); if (err) return err; } - if (tca[TCA_RATE-1]) + if (tca[TCA_RATE]) gen_replace_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, tca[TCA_RATE-1]); + sch->stats_lock, tca[TCA_RATE]); return 0; } @@ -581,7 +587,7 @@ static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) { struct Qdisc *leaf; - struct Qdisc_class_ops *cops = q->ops->cl_ops; + const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct check_loop_arg *arg = (struct check_loop_arg *)w; leaf = cops->leaf(q, cl); @@ -599,17 +605,25 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) { + struct net *net = skb->sk->sk_net; struct tcmsg *tcm = NLMSG_DATA(n); - struct rtattr **tca = arg; + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid = tcm->tcm_parent; struct Qdisc *q = NULL; struct Qdisc *p = NULL; int err; + if (net != &init_net) + return -EINVAL; + if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) return -ENODEV; + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { @@ -632,7 +646,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return -ENOENT; } - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; if (n->nlmsg_type == RTM_DELQDISC) { @@ -660,23 +674,30 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) { + struct net *net = skb->sk->sk_net; struct tcmsg *tcm; - struct rtattr **tca; + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid; struct Qdisc *q, *p; int err; + if (net != &init_net) + return -EINVAL; + replay: /* Reinit, just in case something touches this. */ tcm = NLMSG_DATA(n); - tca = arg; clid = tcm->tcm_parent; q = p = NULL; if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) return -ENODEV; + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + if (clid) { if (clid != TC_H_ROOT) { if (clid != TC_H_INGRESS) { @@ -704,7 +725,7 @@ replay: goto create_n_graft; if (n->nlmsg_flags&NLM_F_EXCL) return -EEXIST; - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; if (q == p || (p && check_loop(q, p, 0))) @@ -737,8 +758,8 @@ replay: if ((n->nlmsg_flags&NLM_F_CREATE) && (n->nlmsg_flags&NLM_F_REPLACE) && ((n->nlmsg_flags&NLM_F_EXCL) || - (tca[TCA_KIND-1] && - rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) + (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)))) goto create_n_graft; } } @@ -753,7 +774,7 @@ replay: return -ENOENT; if (n->nlmsg_flags&NLM_F_EXCL) return -EEXIST; - if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; err = qdisc_change(q, tca); if (err == 0) @@ -814,31 +835,31 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; tcm->tcm_info = atomic_read(&q->refcnt); - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); if (q->ops->dump && q->ops->dump(q, skb) < 0) - goto rtattr_failure; + goto nla_put_failure; q->qstats.qlen = q->q.qlen; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, q->stats_lock, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, &q->qstats) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) - goto rtattr_failure; + goto nla_put_failure; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -863,7 +884,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, } if (skb->len) - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); err_out: kfree_skb(skb); @@ -872,11 +893,15 @@ err_out: static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int idx, q_idx; int s_idx, s_q_idx; struct net_device *dev; struct Qdisc *q; + if (net != &init_net) + return 0; + s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; read_lock(&dev_base_lock); @@ -920,11 +945,12 @@ done: static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) { + struct net *net = skb->sk->sk_net; struct tcmsg *tcm = NLMSG_DATA(n); - struct rtattr **tca = arg; + struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q = NULL; - struct Qdisc_class_ops *cops; + const struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; u32 pid = tcm->tcm_parent; @@ -932,9 +958,16 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) u32 qid = TC_H_MAJ(clid); int err; + if (net != &init_net) + return -EINVAL; + if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) return -ENODEV; + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. @@ -1039,7 +1072,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; - struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; + const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); tcm = NLMSG_DATA(nlh); @@ -1048,25 +1081,25 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, tcm->tcm_parent = q->handle; tcm->tcm_handle = q->handle; tcm->tcm_info = 0; - RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, q->stats_lock, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) - goto rtattr_failure; + goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) - goto rtattr_failure; + goto nla_put_failure; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; nlmsg_failure: -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1086,7 +1119,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, return -EINVAL; } - return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); } struct qdisc_dump_args @@ -1106,6 +1139,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walk static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int t; int s_t; struct net_device *dev; @@ -1113,6 +1147,9 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); struct qdisc_dump_args arg; + if (net != &init_net) + return 0; + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) return 0; if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) @@ -1268,8 +1305,3 @@ static int __init pktsched_init(void) } subsys_initcall(pktsched_init); - -EXPORT_SYMBOL(qdisc_get_rtab); -EXPORT_SYMBOL(qdisc_put_rtab); -EXPORT_SYMBOL(register_qdisc); -EXPORT_SYMBOL(unregister_qdisc); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index ddc4f2c5437..33527341638 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -16,18 +16,6 @@ extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - /* * The ATM queuing discipline provides a framework for invoking classifiers * (aka "filters"), which in turn select classes of this queuing discipline. @@ -49,7 +37,6 @@ extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ * - should lock the flow while there is data in the queue (?) */ -#define PRIV(sch) qdisc_priv(sch) #define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) struct atm_flow_data { @@ -57,7 +44,7 @@ struct atm_flow_data { struct tcf_proto *filter_list; struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ void (*old_pop)(struct atm_vcc *vcc, - struct sk_buff * skb); /* chaining */ + struct sk_buff *skb); /* chaining */ struct atm_qdisc_data *parent; /* parent qdisc */ struct socket *sock; /* for closing */ u32 classid; /* x:y type ID */ @@ -84,17 +71,17 @@ static int find_flow(struct atm_qdisc_data *qdisc, struct atm_flow_data *flow) { struct atm_flow_data *walk; - DPRINTK("find_flow(qdisc %p,flow %p)\n", qdisc, flow); + pr_debug("find_flow(qdisc %p,flow %p)\n", qdisc, flow); for (walk = qdisc->flows; walk; walk = walk->next) if (walk == flow) return 1; - DPRINTK("find_flow: not found\n"); + pr_debug("find_flow: not found\n"); return 0; } static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; for (flow = p->flows; flow; flow = flow->next) @@ -106,10 +93,10 @@ static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)arg; - DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", + pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", sch, p, flow, new, old); if (!find_flow(p, flow)) return -EINVAL; @@ -125,20 +112,20 @@ static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) { struct atm_flow_data *flow = (struct atm_flow_data *)cl; - DPRINTK("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); + pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); return flow ? flow->q : NULL; } static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid) { - struct atm_qdisc_data *p __maybe_unused = PRIV(sch); + struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); flow = lookup_flow(sch, classid); if (flow) flow->ref++; - DPRINTK("atm_tc_get: flow %p\n", flow); + pr_debug("atm_tc_get: flow %p\n", flow); return (unsigned long)flow; } @@ -155,14 +142,14 @@ static unsigned long atm_tc_bind_filter(struct Qdisc *sch, */ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; struct atm_flow_data **prev; - DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); if (--flow->ref) return; - DPRINTK("atm_tc_put: destroying\n"); + pr_debug("atm_tc_put: destroying\n"); for (prev = &p->flows; *prev; prev = &(*prev)->next) if (*prev == flow) break; @@ -171,11 +158,11 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) return; } *prev = flow->next; - DPRINTK("atm_tc_put: qdisc %p\n", flow->q); + pr_debug("atm_tc_put: qdisc %p\n", flow->q); qdisc_destroy(flow->q); tcf_destroy_chain(flow->filter_list); if (flow->sock) { - DPRINTK("atm_tc_put: f_count %d\n", + pr_debug("atm_tc_put: f_count %d\n", file_count(flow->sock->file)); flow->vcc->pop = flow->old_pop; sockfd_put(flow->sock); @@ -194,7 +181,7 @@ static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb) { struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; - D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p); + pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p); VCC2FLOW(vcc)->old_pop(vcc, skb); tasklet_schedule(&p->task); } @@ -208,19 +195,24 @@ static const u8 llc_oui_ip[] = { 0x08, 0x00 }; /* Ethertype IP (0800) */ +static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = { + [TCA_ATM_FD] = { .type = NLA_U32 }, + [TCA_ATM_EXCESS] = { .type = NLA_U32 }, +}; + static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)*arg; struct atm_flow_data *excess = NULL; - struct rtattr *opt = tca[TCA_OPTIONS - 1]; - struct rtattr *tb[TCA_ATM_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_ATM_MAX + 1]; struct socket *sock; int fd, error, hdr_len; void *hdr; - DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," + pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt); /* * The concept of parents doesn't apply for this qdisc. @@ -236,34 +228,38 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, */ if (flow) return -EBUSY; - if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (!tb[TCA_ATM_FD - 1] || RTA_PAYLOAD(tb[TCA_ATM_FD - 1]) < sizeof(fd)) + + error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy); + if (error < 0) + return error; + + if (!tb[TCA_ATM_FD]) return -EINVAL; - fd = *(int *)RTA_DATA(tb[TCA_ATM_FD - 1]); - DPRINTK("atm_tc_change: fd %d\n", fd); - if (tb[TCA_ATM_HDR - 1]) { - hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR - 1]); - hdr = RTA_DATA(tb[TCA_ATM_HDR - 1]); + fd = nla_get_u32(tb[TCA_ATM_FD]); + pr_debug("atm_tc_change: fd %d\n", fd); + if (tb[TCA_ATM_HDR]) { + hdr_len = nla_len(tb[TCA_ATM_HDR]); + hdr = nla_data(tb[TCA_ATM_HDR]); } else { hdr_len = RFC1483LLC_LEN; hdr = NULL; /* default LLC/SNAP for IP */ } - if (!tb[TCA_ATM_EXCESS - 1]) + if (!tb[TCA_ATM_EXCESS]) excess = NULL; else { - if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS - 1]) != sizeof(u32)) - return -EINVAL; excess = (struct atm_flow_data *) - atm_tc_get(sch, *(u32 *)RTA_DATA(tb[TCA_ATM_EXCESS - 1])); + atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); if (!excess) return -ENOENT; } - DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", - opt->rta_type, RTA_PAYLOAD(opt), hdr_len); - if (!(sock = sockfd_lookup(fd, &error))) + pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n", + opt->nla_type, nla_len(opt), hdr_len); + sock = sockfd_lookup(fd, &error); + if (!sock) return error; /* f_count++ */ - DPRINTK("atm_tc_change: f_count %d\n", file_count(sock->file)); + pr_debug("atm_tc_change: f_count %d\n", file_count(sock->file)); if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { error = -EPROTOTYPE; goto err_out; @@ -272,7 +268,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, on vcc->send */ if (classid) { if (TC_H_MAJ(classid ^ sch->handle)) { - DPRINTK("atm_tc_change: classid mismatch\n"); + pr_debug("atm_tc_change: classid mismatch\n"); error = -EINVAL; goto err_out; } @@ -286,26 +282,28 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, for (i = 1; i < 0x8000; i++) { classid = TC_H_MAKE(sch->handle, 0x8000 | i); - if (!(cl = atm_tc_get(sch, classid))) + cl = atm_tc_get(sch, classid); + if (!cl) break; atm_tc_put(sch, cl); } } - DPRINTK("atm_tc_change: new id %x\n", classid); + pr_debug("atm_tc_change: new id %x\n", classid); flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL); - DPRINTK("atm_tc_change: flow %p\n", flow); + pr_debug("atm_tc_change: flow %p\n", flow); if (!flow) { error = -ENOBUFS; goto err_out; } flow->filter_list = NULL; - if (!(flow->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid))) + flow->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid); + if (!flow->q) flow->q = &noop_qdisc; - DPRINTK("atm_tc_change: qdisc %p\n", flow->q); + pr_debug("atm_tc_change: qdisc %p\n", flow->q); flow->sock = sock; flow->vcc = ATM_SD(sock); /* speedup */ flow->vcc->user_back = flow; - DPRINTK("atm_tc_change: vcc %p\n", flow->vcc); + pr_debug("atm_tc_change: vcc %p\n", flow->vcc); flow->old_pop = flow->vcc->pop; flow->parent = p; flow->vcc->pop = sch_atm_pop; @@ -330,11 +328,11 @@ err_out: static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)arg; - DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - if (!find_flow(PRIV(sch), flow)) + pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + if (!find_flow(qdisc_priv(sch), flow)) return -EINVAL; if (flow->filter_list || flow == &p->link) return -EBUSY; @@ -354,10 +352,10 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); if (walker->stop) return; for (flow = p->flows; flow; flow = flow->next) { @@ -372,10 +370,10 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; - DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); + pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); return flow ? &flow->filter_list : &p->link.filter_list; } @@ -383,13 +381,13 @@ static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = NULL; /* @@@ */ struct tcf_result res; int result; int ret = NET_XMIT_POLICED; - D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); result = TC_POLICE_OK; /* be nice to gcc */ if (TC_H_MAJ(skb->priority) != sch->handle || !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) @@ -430,7 +428,8 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) #endif } - if ((ret = flow->q->enqueue(skb, flow->q)) != 0) { + ret = flow->q->enqueue(skb, flow->q); + if (ret != 0) { drop: __maybe_unused sch->qstats.drops++; if (flow) @@ -468,11 +467,11 @@ drop: __maybe_unused static void sch_atm_dequeue(unsigned long data) { struct Qdisc *sch = (struct Qdisc *)data; - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; struct sk_buff *skb; - D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); + pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); for (flow = p->link.next; flow; flow = flow->next) /* * If traffic is properly shaped, this won't generate nasty @@ -483,7 +482,7 @@ static void sch_atm_dequeue(unsigned long data) (void)flow->q->ops->requeue(skb, flow->q); break; } - D2PRINTK("atm_tc_dequeue: sending on class %p\n", flow); + pr_debug("atm_tc_dequeue: sending on class %p\n", flow); /* remove any LL header somebody else has attached */ skb_pull(skb, skb_network_offset(skb)); if (skb_headroom(skb) < flow->hdr_len) { @@ -495,7 +494,7 @@ static void sch_atm_dequeue(unsigned long data) continue; skb = new; } - D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", + pr_debug("sch_atm_dequeue: ip %p, data %p\n", skb_network_header(skb), skb->data); ATM_SKB(skb)->vcc = flow->vcc; memcpy(skb_push(skb, flow->hdr_len), flow->hdr, @@ -509,10 +508,10 @@ static void sch_atm_dequeue(unsigned long data) static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct sk_buff *skb; - D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); + pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); tasklet_schedule(&p->task); skb = p->link.q->dequeue(p->link.q); if (skb) @@ -522,10 +521,10 @@ static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) static int atm_tc_requeue(struct sk_buff *skb, struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); int ret; - D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); ret = p->link.q->ops->requeue(skb, p->link.q); if (!ret) { sch->q.qlen++; @@ -539,27 +538,27 @@ static int atm_tc_requeue(struct sk_buff *skb, struct Qdisc *sch) static unsigned int atm_tc_drop(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; unsigned int len; - DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); + pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p); for (flow = p->flows; flow; flow = flow->next) if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) return len; return 0; } -static int atm_tc_init(struct Qdisc *sch, struct rtattr *opt) +static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); - DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); p->flows = &p->link; - if (!(p->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, - sch->handle))) + p->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, sch->handle); + if (!p->link.q) p->link.q = &noop_qdisc; - DPRINTK("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); + pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); p->link.filter_list = NULL; p->link.vcc = NULL; p->link.sock = NULL; @@ -572,10 +571,10 @@ static int atm_tc_init(struct Qdisc *sch, struct rtattr *opt) static void atm_tc_reset(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); sch->q.qlen = 0; @@ -583,10 +582,10 @@ static void atm_tc_reset(struct Qdisc *sch) static void atm_tc_destroy(struct Qdisc *sch) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow; - DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); + pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); /* races ? */ while ((flow = p->flows)) { tcf_destroy_chain(flow->filter_list); @@ -608,20 +607,22 @@ static void atm_tc_destroy(struct Qdisc *sch) static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct atm_qdisc_data *p = PRIV(sch); + struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; - DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", + pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", sch, p, flow, skb, tcm); if (!find_flow(p, flow)) return -EINVAL; tcm->tcm_handle = flow->classid; tcm->tcm_info = flow->q->handle; - rta = (struct rtattr *)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - RTA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr); + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + + NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr); if (flow->vcc) { struct sockaddr_atmpvc pvc; int state; @@ -630,22 +631,21 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; pvc.sap_addr.vpi = flow->vcc->vpi; pvc.sap_addr.vci = flow->vcc->vci; - RTA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc); + NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc); state = ATM_VF2VS(flow->vcc->flags); - RTA_PUT(skb, TCA_ATM_STATE, sizeof(state), &state); + NLA_PUT_U32(skb, TCA_ATM_STATE, state); } if (flow->excess) - RTA_PUT(skb, TCA_ATM_EXCESS, sizeof(u32), &flow->classid); + NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid); else { - static u32 zero; - - RTA_PUT(skb, TCA_ATM_EXCESS, sizeof(zero), &zero); + NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0); } - rta->rta_len = skb_tail_pointer(skb) - b; + + nla_nest_end(skb, nest); return skb->len; -rtattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } static int @@ -668,7 +668,7 @@ static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) return 0; } -static struct Qdisc_class_ops atm_class_ops = { +static const struct Qdisc_class_ops atm_class_ops = { .graft = atm_tc_graft, .leaf = atm_tc_leaf, .get = atm_tc_get, @@ -683,7 +683,7 @@ static struct Qdisc_class_ops atm_class_ops = { .dump_stats = atm_tc_dump_class_stats, }; -static struct Qdisc_ops atm_qdisc_ops = { +static struct Qdisc_ops atm_qdisc_ops __read_mostly = { .cl_ops = &atm_class_ops, .id = "atm", .priv_size = sizeof(struct atm_qdisc_data), diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index f914fc43a12..507fb488bc9 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -28,7 +28,7 @@ static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) return NULL; } -static struct Qdisc_ops blackhole_qdisc_ops = { +static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .id = "blackhole", .priv_size = 0, .enqueue = blackhole_enqueue, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 4de3744e65c..09969c1fbc0 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1377,24 +1377,33 @@ static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) return 0; } -static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { + [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, + [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, + [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, + [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, + [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, + [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, +}; + +static int cbq_init(struct Qdisc *sch, struct nlattr *opt) { struct cbq_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_CBQ_MAX]; + struct nlattr *tb[TCA_CBQ_MAX + 1]; struct tc_ratespec *r; + int err; - if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || - tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || - RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) - return -EINVAL; + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; - if (tb[TCA_CBQ_LSSOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL) return -EINVAL; - r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + r = nla_data(tb[TCA_CBQ_RATE]); - if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) return -EINVAL; q->link.refcnt = 1; @@ -1427,8 +1436,8 @@ static int cbq_init(struct Qdisc *sch, struct rtattr *opt) cbq_link_class(&q->link); - if (tb[TCA_CBQ_LSSOPT-1]) - cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); cbq_addprio(q, &q->link); return 0; @@ -1438,10 +1447,10 @@ static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); - RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1463,10 +1472,10 @@ static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) opt.minidle = (u32)(-cl->minidle); opt.offtime = cl->offtime; opt.change = ~0; - RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1481,10 +1490,10 @@ static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) opt.priority = cl->priority+1; opt.cpriority = cl->cpriority+1; opt.weight = cl->weight; - RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1498,10 +1507,10 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) opt.priority2 = cl->priority2+1; opt.pad = 0; opt.penalty = cl->penalty; - RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1515,11 +1524,11 @@ static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) opt.split = cl->split ? cl->split->classid : 0; opt.defmap = cl->defmap; opt.defchange = ~0; - RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); } return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1534,11 +1543,11 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) opt.police = cl->police; opt.__res1 = 0; opt.__res2 = 0; - RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); } return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1561,18 +1570,18 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cbq_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (cbq_dump_attr(skb, &q->link) < 0) - goto rtattr_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + goto nla_put_failure; + nla_nest_end(skb, nest); return skb->len; -rtattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -1590,8 +1599,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct cbq_class *cl = (struct cbq_class*)arg; - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; if (cl->tparent) tcm->tcm_parent = cl->tparent->classid; @@ -1600,15 +1608,16 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, tcm->tcm_handle = cl->classid; tcm->tcm_info = cl->q->handle; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (cbq_dump_attr(skb, cl) < 0) - goto rtattr_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + goto nla_put_failure; + nla_nest_end(skb, nest); return skb->len; -rtattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -1753,45 +1762,23 @@ static void cbq_put(struct Qdisc *sch, unsigned long arg) } static int -cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { int err; struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class*)*arg; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_CBQ_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_CBQ_MAX + 1]; struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_CBQ_OVL_STRATEGY-1] && - RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) - return -EINVAL; - - if (tb[TCA_CBQ_FOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) - return -EINVAL; - - if (tb[TCA_CBQ_RATE-1] && - RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) - return -EINVAL; - - if (tb[TCA_CBQ_LSSOPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) - return -EINVAL; - - if (tb[TCA_CBQ_WRROPT-1] && - RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) - return -EINVAL; - -#ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE-1] && - RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) - return -EINVAL; -#endif + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + if (err < 0) + return err; if (cl) { /* Check parent */ @@ -1802,8 +1789,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t return -EINVAL; } - if (tb[TCA_CBQ_RATE-1]) { - rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (tb[TCA_CBQ_RATE]) { + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); if (rtab == NULL) return -EINVAL; } @@ -1819,45 +1806,45 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t qdisc_put_rtab(rtab); } - if (tb[TCA_CBQ_LSSOPT-1]) - cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + if (tb[TCA_CBQ_LSSOPT]) + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - if (tb[TCA_CBQ_WRROPT-1]) { + if (tb[TCA_CBQ_WRROPT]) { cbq_rmprio(q, cl); - cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); } - if (tb[TCA_CBQ_OVL_STRATEGY-1]) - cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); #ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE-1]) - cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); #endif - if (tb[TCA_CBQ_FOPT-1]) - cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); if (cl->q->q.qlen) cbq_activate_class(cl); sch_tree_unlock(sch); - if (tca[TCA_RATE-1]) + if (tca[TCA_RATE]) gen_replace_estimator(&cl->bstats, &cl->rate_est, &sch->dev->queue_lock, - tca[TCA_RATE-1]); + tca[TCA_RATE]); return 0; } if (parentid == TC_H_ROOT) return -EINVAL; - if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || - tb[TCA_CBQ_LSSOPT-1] == NULL) + if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL || + tb[TCA_CBQ_LSSOPT] == NULL) return -EINVAL; - rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); if (rtab == NULL) return -EINVAL; @@ -1912,8 +1899,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t cl->share = cl->tparent; cbq_adjust_levels(parent); cl->minidle = -0x7FFFFFFF; - cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); - cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); + cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); if (cl->ewma_log==0) cl->ewma_log = q->link.ewma_log; if (cl->maxidle==0) @@ -1921,19 +1908,19 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **t if (cl->avpkt==0) cl->avpkt = q->link.avpkt; cl->overlimit = cbq_ovl_classic; - if (tb[TCA_CBQ_OVL_STRATEGY-1]) - cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + if (tb[TCA_CBQ_OVL_STRATEGY]) + cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); #ifdef CONFIG_NET_CLS_ACT - if (tb[TCA_CBQ_POLICE-1]) - cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); + if (tb[TCA_CBQ_POLICE]) + cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); #endif - if (tb[TCA_CBQ_FOPT-1]) - cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + if (tb[TCA_CBQ_FOPT]) + cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); sch_tree_unlock(sch); - if (tca[TCA_RATE-1]) + if (tca[TCA_RATE]) gen_new_estimator(&cl->bstats, &cl->rate_est, - &sch->dev->queue_lock, tca[TCA_RATE-1]); + &sch->dev->queue_lock, tca[TCA_RATE]); *arg = (unsigned long)cl; return 0; @@ -2045,7 +2032,7 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct Qdisc_class_ops cbq_class_ops = { +static const struct Qdisc_class_ops cbq_class_ops = { .graft = cbq_graft, .leaf = cbq_leaf, .qlen_notify = cbq_qlen_notify, @@ -2061,7 +2048,7 @@ static struct Qdisc_class_ops cbq_class_ops = { .dump_stats = cbq_dump_class_stats, }; -static struct Qdisc_ops cbq_qdisc_ops = { +static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &cbq_class_ops, .id = "cbq", diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 60f89199e3d..0df911fd67b 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -10,28 +10,12 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <linux/bitops.h> #include <net/pkt_sched.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <asm/byteorder.h> - -#if 0 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - - -#define PRIV(sch) ((struct dsmark_qdisc_data *) qdisc_priv(sch)) - - /* * classid class marking * ------- ----- ------- @@ -60,17 +44,6 @@ struct dsmark_qdisc_data { int set_tc_index; }; -static inline int dsmark_valid_indices(u16 indices) -{ - while (indices != 1) { - if (indices & 1) - return 0; - indices >>= 1; - } - - return 1; -} - static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) { return (index <= p->indices && index > 0); @@ -81,9 +54,9 @@ static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) static int dsmark_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); - DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", + pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", sch, p, new, old); if (new == NULL) { @@ -104,13 +77,14 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) { - return PRIV(sch)->q; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return p->q; } static unsigned long dsmark_get(struct Qdisc *sch, u32 classid) { - DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n", - sch, PRIV(sch), classid); + pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n", + sch, qdisc_priv(sch), classid); return TC_H_MIN(classid) + 1; } @@ -125,44 +99,56 @@ static void dsmark_put(struct Qdisc *sch, unsigned long cl) { } +static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { + [TCA_DSMARK_INDICES] = { .type = NLA_U16 }, + [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 }, + [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG }, + [TCA_DSMARK_MASK] = { .type = NLA_U8 }, + [TCA_DSMARK_VALUE] = { .type = NLA_U8 }, +}; + static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_DSMARK_MAX]; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; u8 mask = 0; - DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," + pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," "arg 0x%lx\n", sch, p, classid, parent, *arg); if (!dsmark_valid_index(p, *arg)) { err = -ENOENT; - goto rtattr_failure; + goto errout; } - if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) - goto rtattr_failure; + if (!opt) + goto errout; - if (tb[TCA_DSMARK_MASK-1]) - mask = RTA_GET_U8(tb[TCA_DSMARK_MASK-1]); + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) + goto errout; + + if (tb[TCA_DSMARK_MASK]) + mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - if (tb[TCA_DSMARK_VALUE-1]) - p->value[*arg-1] = RTA_GET_U8(tb[TCA_DSMARK_VALUE-1]); + if (tb[TCA_DSMARK_VALUE]) + p->value[*arg-1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); - if (tb[TCA_DSMARK_MASK-1]) + if (tb[TCA_DSMARK_MASK]) p->mask[*arg-1] = mask; err = 0; -rtattr_failure: +errout: return err; } static int dsmark_delete(struct Qdisc *sch, unsigned long arg) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); if (!dsmark_valid_index(p, arg)) return -EINVAL; @@ -173,12 +159,12 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg) return 0; } -static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); int i; - DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); if (walker->stop) return; @@ -197,34 +183,42 @@ ignore: } } -static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) +static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch, + unsigned long cl) { - return &PRIV(sch)->filter_list; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + return &p->filter_list; } /* --------------------------- Qdisc operations ---------------------------- */ -static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; - D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); if (p->set_tc_index) { - /* FIXME: Safe with non-linear skbs? --RR */ switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) - & ~INET_ECN_MASK; - break; - case __constant_htons(ETH_P_IPV6): - skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) - & ~INET_ECN_MASK; - break; - default: - skb->tc_index = 0; - break; + case __constant_htons(ETH_P_IP): + if (skb_cow_head(skb, sizeof(struct iphdr))) + goto drop; + + skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) + & ~INET_ECN_MASK; + break; + + case __constant_htons(ETH_P_IPV6): + if (skb_cow_head(skb, sizeof(struct ipv6hdr))) + goto drop; + + skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) + & ~INET_ECN_MASK; + break; + default: + skb->tc_index = 0; + break; } } @@ -234,7 +228,7 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) struct tcf_result res; int result = tc_classify(skb, p->filter_list, &res); - D2PRINTK("result %d class 0x%04x\n", result, res.classid); + pr_debug("result %d class 0x%04x\n", result, res.classid); switch (result) { #ifdef CONFIG_NET_CLS_ACT @@ -242,14 +236,14 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) case TC_ACT_STOLEN: kfree_skb(skb); return NET_XMIT_SUCCESS; + case TC_ACT_SHOT: - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_BYPASS; + goto drop; #endif case TC_ACT_OK: skb->tc_index = TC_H_MIN(res.classid); break; + default: if (p->default_index != NO_DEFAULT_INDEX) skb->tc_index = p->default_index; @@ -257,7 +251,7 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) } } - err = p->q->enqueue(skb,p->q); + err = p->q->enqueue(skb, p->q); if (err != NET_XMIT_SUCCESS) { sch->qstats.drops++; return err; @@ -268,15 +262,20 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) sch->q.qlen++; return NET_XMIT_SUCCESS; + +drop: + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_BYPASS; } static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); struct sk_buff *skb; u32 index; - D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); + pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); skb = p->q->ops->dequeue(p->q); if (skb == NULL) @@ -285,39 +284,39 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) sch->q.qlen--; index = skb->tc_index & (p->indices - 1); - D2PRINTK("index %d->%d\n", skb->tc_index, index); + pr_debug("index %d->%d\n", skb->tc_index, index); switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), p->mask[index], - p->value[index]); - break; - case __constant_htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], - p->value[index]); + case __constant_htons(ETH_P_IP): + ipv4_change_dsfield(ip_hdr(skb), p->mask[index], + p->value[index]); break; - default: - /* - * Only complain if a change was actually attempted. - * This way, we can send non-IP traffic through dsmark - * and don't need yet another qdisc as a bypass. - */ - if (p->mask[index] != 0xff || p->value[index]) - printk(KERN_WARNING "dsmark_dequeue: " - "unsupported protocol %d\n", - ntohs(skb->protocol)); + case __constant_htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index], + p->value[index]); break; + default: + /* + * Only complain if a change was actually attempted. + * This way, we can send non-IP traffic through dsmark + * and don't need yet another qdisc as a bypass. + */ + if (p->mask[index] != 0xff || p->value[index]) + printk(KERN_WARNING + "dsmark_dequeue: unsupported protocol %d\n", + ntohs(skb->protocol)); + break; } return skb; } -static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) +static int dsmark_requeue(struct sk_buff *skb, struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; - D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + pr_debug("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); err = p->q->ops->requeue(skb, p->q); if (err != NET_XMIT_SUCCESS) { @@ -333,10 +332,10 @@ static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) static unsigned int dsmark_drop(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); unsigned int len; - DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); if (p->q->ops->drop == NULL) return 0; @@ -348,26 +347,32 @@ static unsigned int dsmark_drop(struct Qdisc *sch) return len; } -static int dsmark_init(struct Qdisc *sch, struct rtattr *opt) +static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *tb[TCA_DSMARK_MAX]; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *tb[TCA_DSMARK_MAX + 1]; int err = -EINVAL; u32 default_index = NO_DEFAULT_INDEX; u16 indices; u8 *mask; - DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); - if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt) < 0) + if (!opt) goto errout; - indices = RTA_GET_U16(tb[TCA_DSMARK_INDICES-1]); - if (!indices || !dsmark_valid_indices(indices)) + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); + + if (hweight32(indices) != 1) goto errout; - if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) - default_index = RTA_GET_U16(tb[TCA_DSMARK_DEFAULT_INDEX-1]); + if (tb[TCA_DSMARK_DEFAULT_INDEX]) + default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); mask = kmalloc(indices * 2, GFP_KERNEL); if (mask == NULL) { @@ -383,34 +388,33 @@ static int dsmark_init(struct Qdisc *sch, struct rtattr *opt) p->indices = indices; p->default_index = default_index; - p->set_tc_index = RTA_GET_FLAG(tb[TCA_DSMARK_SET_TC_INDEX-1]); + p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, sch->handle); if (p->q == NULL) p->q = &noop_qdisc; - DPRINTK("dsmark_init: qdisc %p\n", p->q); + pr_debug("dsmark_init: qdisc %p\n", p->q); err = 0; errout: -rtattr_failure: return err; } static void dsmark_reset(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); - DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); + pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); qdisc_reset(p->q); sch->q.qlen = 0; } static void dsmark_destroy(struct Qdisc *sch) { - struct dsmark_qdisc_data *p = PRIV(sch); + struct dsmark_qdisc_data *p = qdisc_priv(sch); - DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); + pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); tcf_destroy_chain(p->filter_list); qdisc_destroy(p->q); @@ -420,10 +424,10 @@ static void dsmark_destroy(struct Qdisc *sch) static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opts = NULL; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; - DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); + pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); if (!dsmark_valid_index(p, cl)) return -EINVAL; @@ -431,37 +435,41 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1); tcm->tcm_info = p->q->handle; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT_U8(skb,TCA_DSMARK_MASK, p->mask[cl-1]); - RTA_PUT_U8(skb,TCA_DSMARK_VALUE, p->value[cl-1]); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl-1]); + NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl-1]); - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + return nla_nest_cancel(skb, opts); } static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct dsmark_qdisc_data *p = PRIV(sch); - struct rtattr *opts = NULL; + struct dsmark_qdisc_data *p = qdisc_priv(sch); + struct nlattr *opts = NULL; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); if (p->default_index != NO_DEFAULT_INDEX) - RTA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); + NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); if (p->set_tc_index) - RTA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); + NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + return nla_nest_cancel(skb, opts); } -static struct Qdisc_class_ops dsmark_class_ops = { +static const struct Qdisc_class_ops dsmark_class_ops = { .graft = dsmark_graft, .leaf = dsmark_leaf, .get = dsmark_get, @@ -475,7 +483,7 @@ static struct Qdisc_class_ops dsmark_class_ops = { .dump = dsmark_dump_class, }; -static struct Qdisc_ops dsmark_qdisc_ops = { +static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &dsmark_class_ops, .id = "dsmark", diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index c264308f17c..95ed4822165 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -43,7 +43,7 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) return qdisc_reshape_fail(skb, sch); } -static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +static int fifo_init(struct Qdisc *sch, struct nlattr *opt) { struct fifo_sched_data *q = qdisc_priv(sch); @@ -55,9 +55,9 @@ static int fifo_init(struct Qdisc *sch, struct rtattr *opt) q->limit = limit; } else { - struct tc_fifo_qopt *ctl = RTA_DATA(opt); + struct tc_fifo_qopt *ctl = nla_data(opt); - if (RTA_PAYLOAD(opt) < sizeof(*ctl)) + if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; q->limit = ctl->limit; @@ -71,14 +71,14 @@ static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) struct fifo_sched_data *q = qdisc_priv(sch); struct tc_fifo_qopt opt = { .limit = q->limit }; - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); return skb->len; -rtattr_failure: +nla_put_failure: return -1; } -struct Qdisc_ops pfifo_qdisc_ops = { +struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", .priv_size = sizeof(struct fifo_sched_data), .enqueue = pfifo_enqueue, @@ -91,8 +91,9 @@ struct Qdisc_ops pfifo_qdisc_ops = { .dump = fifo_dump, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(pfifo_qdisc_ops); -struct Qdisc_ops bfifo_qdisc_ops = { +struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .id = "bfifo", .priv_size = sizeof(struct fifo_sched_data), .enqueue = bfifo_enqueue, @@ -105,6 +106,4 @@ struct Qdisc_ops bfifo_qdisc_ops = { .dump = fifo_dump, .owner = THIS_MODULE, }; - EXPORT_SYMBOL(bfifo_qdisc_ops); -EXPORT_SYMBOL(pfifo_qdisc_ops); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e595e6570ce..10b5c0887ff 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -40,16 +40,22 @@ */ void qdisc_lock_tree(struct net_device *dev) + __acquires(dev->queue_lock) + __acquires(dev->ingress_lock) { spin_lock_bh(&dev->queue_lock); spin_lock(&dev->ingress_lock); } +EXPORT_SYMBOL(qdisc_lock_tree); void qdisc_unlock_tree(struct net_device *dev) + __releases(dev->ingress_lock) + __releases(dev->queue_lock) { spin_unlock(&dev->ingress_lock); spin_unlock_bh(&dev->queue_lock); } +EXPORT_SYMBOL(qdisc_unlock_tree); static inline int qdisc_qlen(struct Qdisc *q) { @@ -211,13 +217,6 @@ static void dev_watchdog(unsigned long arg) dev_put(dev); } -static void dev_watchdog_init(struct net_device *dev) -{ - init_timer(&dev->watchdog_timer); - dev->watchdog_timer.data = (unsigned long)dev; - dev->watchdog_timer.function = dev_watchdog; -} - void __netdev_watchdog_up(struct net_device *dev) { if (dev->tx_timeout) { @@ -256,6 +255,7 @@ void netif_carrier_on(struct net_device *dev) __netdev_watchdog_up(dev); } } +EXPORT_SYMBOL(netif_carrier_on); /** * netif_carrier_off - clear carrier @@ -268,6 +268,7 @@ void netif_carrier_off(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) linkwatch_fire_event(dev); } +EXPORT_SYMBOL(netif_carrier_off); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or @@ -294,7 +295,7 @@ static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) return NET_XMIT_CN; } -struct Qdisc_ops noop_qdisc_ops = { +struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, @@ -310,8 +311,9 @@ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .list = LIST_HEAD_INIT(noop_qdisc.list), }; +EXPORT_SYMBOL(noop_qdisc); -static struct Qdisc_ops noqueue_qdisc_ops = { +static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .enqueue = noop_enqueue, @@ -395,14 +397,14 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); return skb->len; -rtattr_failure: +nla_put_failure: return -1; } -static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) +static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) { int prio; struct sk_buff_head *list = qdisc_priv(qdisc); @@ -413,7 +415,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) return 0; } -static struct Qdisc_ops pfifo_fast_ops = { +static struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head), .enqueue = pfifo_fast_enqueue, @@ -474,16 +476,18 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops, errout: return NULL; } +EXPORT_SYMBOL(qdisc_create_dflt); /* Under dev->queue_lock and BH! */ void qdisc_reset(struct Qdisc *qdisc) { - struct Qdisc_ops *ops = qdisc->ops; + const struct Qdisc_ops *ops = qdisc->ops; if (ops->reset) ops->reset(qdisc); } +EXPORT_SYMBOL(qdisc_reset); /* this is the rcu callback function to clean up a qdisc when there * are no further references to it */ @@ -498,7 +502,7 @@ static void __qdisc_destroy(struct rcu_head *head) void qdisc_destroy(struct Qdisc *qdisc) { - struct Qdisc_ops *ops = qdisc->ops; + const struct Qdisc_ops *ops = qdisc->ops; if (qdisc->flags & TCQ_F_BUILTIN || !atomic_dec_and_test(&qdisc->refcnt)) @@ -515,6 +519,7 @@ void qdisc_destroy(struct Qdisc *qdisc) dev_put(qdisc->dev); call_rcu(&qdisc->q_rcu, __qdisc_destroy); } +EXPORT_SYMBOL(qdisc_destroy); void dev_activate(struct net_device *dev) { @@ -608,7 +613,7 @@ void dev_init_scheduler(struct net_device *dev) INIT_LIST_HEAD(&dev->qdisc_list); qdisc_unlock_tree(dev); - dev_watchdog_init(dev); + setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); } void dev_shutdown(struct net_device *dev) @@ -629,12 +634,3 @@ void dev_shutdown(struct net_device *dev) BUG_TRAP(!timer_pending(&dev->watchdog_timer)); qdisc_unlock_tree(dev); } - -EXPORT_SYMBOL(netif_carrier_on); -EXPORT_SYMBOL(netif_carrier_off); -EXPORT_SYMBOL(noop_qdisc); -EXPORT_SYMBOL(qdisc_create_dflt); -EXPORT_SYMBOL(qdisc_destroy); -EXPORT_SYMBOL(qdisc_reset); -EXPORT_SYMBOL(qdisc_lock_tree); -EXPORT_SYMBOL(qdisc_unlock_tree); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 3cc6dda02e2..3a9d226ff1e 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -350,16 +350,16 @@ static inline void gred_destroy_vq(struct gred_sched_data *q) kfree(q); } -static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) +static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_sopt *sopt; int i; - if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) + if (dps == NULL) return -EINVAL; - sopt = RTA_DATA(dps); + sopt = nla_data(dps); if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) return -EINVAL; @@ -425,28 +425,37 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, return 0; } -static int gred_change(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { + [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, + [TCA_GRED_STAB] = { .len = 256 }, + [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, +}; + +static int gred_change(struct Qdisc *sch, struct nlattr *opt) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt *ctl; - struct rtattr *tb[TCA_GRED_MAX]; - int err = -EINVAL, prio = GRED_DEF_PRIO; + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err, prio = GRED_DEF_PRIO; u8 *stab; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) return gred_change_table_def(sch, opt); - if (tb[TCA_GRED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || - tb[TCA_GRED_STAB-1] == NULL || - RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + if (tb[TCA_GRED_PARMS] == NULL || + tb[TCA_GRED_STAB] == NULL) return -EINVAL; - ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); - stab = RTA_DATA(tb[TCA_GRED_STAB-1]); + err = -EINVAL; + ctl = nla_data(tb[TCA_GRED_PARMS]); + stab = nla_data(tb[TCA_GRED_STAB]); if (ctl->DP >= table->DPs) goto errout; @@ -486,23 +495,28 @@ errout: return err; } -static int gred_init(struct Qdisc *sch, struct rtattr *opt) +static int gred_init(struct Qdisc *sch, struct nlattr *opt) { - struct rtattr *tb[TCA_GRED_MAX]; + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + if (err < 0) + return err; + + if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; - return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) { struct gred_sched *table = qdisc_priv(sch); - struct rtattr *parms, *opts = NULL; + struct nlattr *parms, *opts = NULL; int i; struct tc_gred_sopt sopt = { .DPs = table->DPs, @@ -511,9 +525,13 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) .flags = table->red_flags, }; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); - parms = RTA_NEST(skb, TCA_GRED_PARMS); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + parms = nla_nest_start(skb, TCA_GRED_PARMS); + if (parms == NULL) + goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; @@ -555,15 +573,16 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); append_opt: - RTA_APPEND(skb, sizeof(opt), &opt); + if (nla_append(skb, sizeof(opt), &opt) < 0) + goto nla_put_failure; } - RTA_NEST_END(skb, parms); + nla_nest_end(skb, parms); - return RTA_NEST_END(skb, opts); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + return nla_nest_cancel(skb, opts); } static void gred_destroy(struct Qdisc *sch) @@ -577,7 +596,7 @@ static void gred_destroy(struct Qdisc *sch) } } -static struct Qdisc_ops gred_qdisc_ops = { +static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .id = "gred", .priv_size = sizeof(struct gred_sched), .enqueue = gred_enqueue, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index a6ad491e434..87293d0db1d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -986,41 +986,46 @@ hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, cl->cl_flags |= HFSC_USC; } +static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { + [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) }, +}; + static int hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)*arg; struct hfsc_class *parent = NULL; - struct rtattr *opt = tca[TCA_OPTIONS-1]; - struct rtattr *tb[TCA_HFSC_MAX]; + struct nlattr *opt = tca[TCA_OPTIONS]; + struct nlattr *tb[TCA_HFSC_MAX + 1]; struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; u64 cur_time; + int err; - if (opt == NULL || rtattr_parse_nested(tb, TCA_HFSC_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_HFSC_RSC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) - return -EINVAL; - rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); + err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy); + if (err < 0) + return err; + + if (tb[TCA_HFSC_RSC]) { + rsc = nla_data(tb[TCA_HFSC_RSC]); if (rsc->m1 == 0 && rsc->m2 == 0) rsc = NULL; } - if (tb[TCA_HFSC_FSC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) - return -EINVAL; - fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); + if (tb[TCA_HFSC_FSC]) { + fsc = nla_data(tb[TCA_HFSC_FSC]); if (fsc->m1 == 0 && fsc->m2 == 0) fsc = NULL; } - if (tb[TCA_HFSC_USC-1]) { - if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) - return -EINVAL; - usc = RTA_DATA(tb[TCA_HFSC_USC-1]); + if (tb[TCA_HFSC_USC]) { + usc = nla_data(tb[TCA_HFSC_USC]); if (usc->m1 == 0 && usc->m2 == 0) usc = NULL; } @@ -1050,10 +1055,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_unlock(sch); - if (tca[TCA_RATE-1]) + if (tca[TCA_RATE]) gen_replace_estimator(&cl->bstats, &cl->rate_est, &sch->dev->queue_lock, - tca[TCA_RATE-1]); + tca[TCA_RATE]); return 0; } @@ -1106,9 +1111,9 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->cl_pcvtoff = parent->cl_cvtoff; sch_tree_unlock(sch); - if (tca[TCA_RATE-1]) + if (tca[TCA_RATE]) gen_new_estimator(&cl->bstats, &cl->rate_est, - &sch->dev->queue_lock, tca[TCA_RATE-1]); + &sch->dev->queue_lock, tca[TCA_RATE]); *arg = (unsigned long)cl; return 0; } @@ -1304,11 +1309,11 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) tsc.m1 = sm2m(sc->sm1); tsc.d = dx2d(sc->dx); tsc.m2 = sm2m(sc->sm2); - RTA_PUT(skb, attr, sizeof(tsc), &tsc); + NLA_PUT(skb, attr, sizeof(tsc), &tsc); return skb->len; - rtattr_failure: + nla_put_failure: return -1; } @@ -1317,19 +1322,19 @@ hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) { if ((cl->cl_flags & HFSC_RSC) && (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) - goto rtattr_failure; + goto nla_put_failure; if ((cl->cl_flags & HFSC_FSC) && (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) - goto rtattr_failure; + goto nla_put_failure; if ((cl->cl_flags & HFSC_USC) && (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) - goto rtattr_failure; + goto nla_put_failure; return skb->len; - rtattr_failure: + nla_put_failure: return -1; } @@ -1338,22 +1343,23 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct hfsc_class *cl = (struct hfsc_class *)arg; - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta = (struct rtattr *)b; + struct nlattr *nest; tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT; tcm->tcm_handle = cl->classid; if (cl->level == 0) tcm->tcm_info = cl->qdisc->handle; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; if (hfsc_dump_curves(skb, cl) < 0) - goto rtattr_failure; - rta->rta_len = skb_tail_pointer(skb) - b; + goto nla_put_failure; + nla_nest_end(skb, nest); return skb->len; - rtattr_failure: - nlmsg_trim(skb, b); + nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -1423,15 +1429,15 @@ hfsc_schedule_watchdog(struct Qdisc *sch) } static int -hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) +hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; unsigned int i; - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; - qopt = RTA_DATA(opt); + qopt = nla_data(opt); q->defcls = qopt->defcls; for (i = 0; i < HFSC_HSIZE; i++) @@ -1459,14 +1465,14 @@ hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) } static int -hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) +hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; - qopt = RTA_DATA(opt); + qopt = nla_data(opt); sch_tree_lock(sch); q->defcls = qopt->defcls; @@ -1550,10 +1556,10 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) struct tc_hfsc_qopt qopt; qopt.defcls = q->defcls; - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); return skb->len; - rtattr_failure: + nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -1698,7 +1704,7 @@ hfsc_drop(struct Qdisc *sch) return 0; } -static struct Qdisc_class_ops hfsc_class_ops = { +static const struct Qdisc_class_ops hfsc_class_ops = { .change = hfsc_change_class, .delete = hfsc_delete_class, .graft = hfsc_graft_class, @@ -1714,7 +1720,7 @@ static struct Qdisc_class_ops hfsc_class_ops = { .walk = hfsc_walk }; -static struct Qdisc_ops hfsc_qdisc_ops = { +static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .id = "hfsc", .init = hfsc_init_qdisc, .change = hfsc_change_qdisc, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 5e608a64935..e1a579efc21 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -214,10 +214,6 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) * then finish and return direct queue. */ #define HTB_DIRECT (struct htb_class*)-1 -static inline u32 htb_classid(struct htb_class *cl) -{ - return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC; -} static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) @@ -996,19 +992,33 @@ static void htb_reset(struct Qdisc *sch) INIT_LIST_HEAD(q->drops + i); } -static int htb_init(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { + [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) }, + [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) }, + [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, +}; + +static int htb_init(struct Qdisc *sch, struct nlattr *opt) { struct htb_sched *q = qdisc_priv(sch); - struct rtattr *tb[TCA_HTB_INIT]; + struct nlattr *tb[TCA_HTB_INIT + 1]; struct tc_htb_glob *gopt; + int err; int i; - if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) || - tb[TCA_HTB_INIT - 1] == NULL || - RTA_PAYLOAD(tb[TCA_HTB_INIT - 1]) < sizeof(*gopt)) { + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy); + if (err < 0) + return err; + + if (tb[TCA_HTB_INIT] == NULL) { printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); return -EINVAL; } - gopt = RTA_DATA(tb[TCA_HTB_INIT - 1]); + gopt = nla_data(tb[TCA_HTB_INIT]); if (gopt->version != HTB_VER >> 16) { printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", @@ -1039,25 +1049,29 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt) static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) { struct htb_sched *q = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; struct tc_htb_glob gopt; + spin_lock_bh(&sch->dev->queue_lock); - gopt.direct_pkts = q->direct_pkts; + gopt.direct_pkts = q->direct_pkts; gopt.version = HTB_VER; gopt.rate2quantum = q->rate2quantum; gopt.defcls = q->defcls; gopt.debug = 0; - rta = (struct rtattr *)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); - rta->rta_len = skb_tail_pointer(skb) - b; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); + nla_nest_end(skb, nest); + spin_unlock_bh(&sch->dev->queue_lock); return skb->len; -rtattr_failure: + +nla_put_failure: spin_unlock_bh(&sch->dev->queue_lock); - nlmsg_trim(skb, skb_tail_pointer(skb)); + nla_nest_cancel(skb, nest); return -1; } @@ -1065,8 +1079,7 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct htb_class *cl = (struct htb_class *)arg; - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; struct tc_htb_opt opt; spin_lock_bh(&sch->dev->queue_lock); @@ -1075,8 +1088,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, if (!cl->level && cl->un.leaf.q) tcm->tcm_info = cl->un.leaf.q->handle; - rta = (struct rtattr *)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; memset(&opt, 0, sizeof(opt)); @@ -1087,13 +1101,15 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; opt.level = cl->level; - RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); - rta->rta_len = skb_tail_pointer(skb) - b; + NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); + + nla_nest_end(skb, nest); spin_unlock_bh(&sch->dev->queue_lock); return skb->len; -rtattr_failure: + +nla_put_failure: spin_unlock_bh(&sch->dev->queue_lock); - nlmsg_trim(skb, b); + nla_nest_cancel(skb, nest); return -1; } @@ -1294,29 +1310,35 @@ static void htb_put(struct Qdisc *sch, unsigned long arg) } static int htb_change_class(struct Qdisc *sch, u32 classid, - u32 parentid, struct rtattr **tca, + u32 parentid, struct nlattr **tca, unsigned long *arg) { int err = -EINVAL; struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; - struct rtattr *opt = tca[TCA_OPTIONS - 1]; + struct nlattr *opt = tca[TCA_OPTIONS]; struct qdisc_rate_table *rtab = NULL, *ctab = NULL; - struct rtattr *tb[TCA_HTB_RTAB]; + struct nlattr *tb[TCA_HTB_RTAB + 1]; struct tc_htb_opt *hopt; /* extract all subattrs from opt attr */ - if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) || - tb[TCA_HTB_PARMS - 1] == NULL || - RTA_PAYLOAD(tb[TCA_HTB_PARMS - 1]) < sizeof(*hopt)) + if (!opt) + goto failure; + + err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy); + if (err < 0) + goto failure; + + err = -EINVAL; + if (tb[TCA_HTB_PARMS] == NULL) goto failure; parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch); - hopt = RTA_DATA(tb[TCA_HTB_PARMS - 1]); + hopt = nla_data(tb[TCA_HTB_PARMS]); - rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB - 1]); - ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB - 1]); + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); if (!rtab || !ctab) goto failure; @@ -1324,12 +1346,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct Qdisc *new_q; int prio; struct { - struct rtattr rta; + struct nlattr nla; struct gnet_estimator opt; } est = { - .rta = { - .rta_len = RTA_LENGTH(sizeof(est.opt)), - .rta_type = TCA_RATE, + .nla = { + .nla_len = nla_attr_size(sizeof(est.opt)), + .nla_type = TCA_RATE, }, .opt = { /* 4s interval, 16s averaging constant */ @@ -1354,7 +1376,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, gen_new_estimator(&cl->bstats, &cl->rate_est, &sch->dev->queue_lock, - tca[TCA_RATE-1] ? : &est.rta); + tca[TCA_RATE] ? : &est.nla); cl->refcnt = 1; INIT_LIST_HEAD(&cl->sibling); INIT_HLIST_NODE(&cl->hlist); @@ -1407,10 +1429,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); } else { - if (tca[TCA_RATE-1]) + if (tca[TCA_RATE]) gen_replace_estimator(&cl->bstats, &cl->rate_est, &sch->dev->queue_lock, - tca[TCA_RATE-1]); + tca[TCA_RATE]); sch_tree_lock(sch); } @@ -1529,7 +1551,7 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct Qdisc_class_ops htb_class_ops = { +static const struct Qdisc_class_ops htb_class_ops = { .graft = htb_graft, .leaf = htb_leaf, .qlen_notify = htb_qlen_notify, @@ -1545,7 +1567,7 @@ static struct Qdisc_class_ops htb_class_ops = { .dump_stats = htb_dump_class_stats, }; -static struct Qdisc_ops htb_qdisc_ops = { +static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &htb_class_ops, .id = "htb", diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 3f8335e6ea2..3f72d528273 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -19,127 +19,71 @@ #include <net/pkt_sched.h> -#undef DEBUG_INGRESS - -#ifdef DEBUG_INGRESS /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif - - -#define PRIV(sch) qdisc_priv(sch) - - -/* Thanks to Doron Oz for this hack -*/ -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER +/* Thanks to Doron Oz for this hack */ +#if !defined(CONFIG_NET_CLS_ACT) && defined(CONFIG_NETFILTER) static int nf_registered; #endif -#endif struct ingress_qdisc_data { - struct Qdisc *q; struct tcf_proto *filter_list; }; - /* ------------------------- Class/flow operations ------------------------- */ - -static int ingress_graft(struct Qdisc *sch,unsigned long arg, - struct Qdisc *new,struct Qdisc **old) +static int ingress_graft(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - - DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", - sch, p, new, old); - DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); - return 1; + return -EOPNOTSUPP; } - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } - -static unsigned long ingress_get(struct Qdisc *sch,u32 classid) +static unsigned long ingress_get(struct Qdisc *sch, u32 classid) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); return TC_H_MIN(classid) + 1; } - static unsigned long ingress_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) + unsigned long parent, u32 classid) { return ingress_get(sch, classid); } - static void ingress_put(struct Qdisc *sch, unsigned long cl) { } - static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x)," - "arg 0x%lx\n", sch, p, classid, parent, *arg); - DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); return 0; } - - -static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker) +static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); - DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); + return; } - -static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl) +static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = PRIV(sch); + struct ingress_qdisc_data *p = qdisc_priv(sch); return &p->filter_list; } - /* --------------------------- Qdisc operations ---------------------------- */ - -static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) +static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct ingress_qdisc_data *p = PRIV(sch); + struct ingress_qdisc_data *p = qdisc_priv(sch); struct tcf_result res; int result; - D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); result = tc_classify(skb, p->filter_list, &res); - D2PRINTK("result %d class 0x%04x\n", result, res.classid); + /* * Unlike normal "enqueue" functions, ingress_enqueue returns a * firewall FW_* code. @@ -148,23 +92,22 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) sch->bstats.packets++; sch->bstats.bytes += skb->len; switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - sch->qstats.drops++; - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - default: - result = TC_ACT_OK; - break; + case TC_ACT_SHOT: + result = TC_ACT_SHOT; + sch->qstats.drops++; + break; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + result = TC_ACT_STOLEN; + break; + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + skb->tc_index = TC_H_MIN(res.classid); + default: + result = TC_ACT_OK; + break; } #else - D2PRINTK("Overriding result to ACCEPT\n"); result = NF_ACCEPT; sch->bstats.packets++; sch->bstats.bytes += skb->len; @@ -173,39 +116,8 @@ static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) return result; } - -static struct sk_buff *ingress_dequeue(struct Qdisc *sch) -{ -/* - struct ingress_qdisc_data *p = PRIV(sch); - D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p)); -*/ - return NULL; -} - - -static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch) -{ -/* - struct ingress_qdisc_data *p = PRIV(sch); - D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p)); -*/ - return 0; -} - -static unsigned int ingress_drop(struct Qdisc *sch) -{ -#ifdef DEBUG_INGRESS - struct ingress_qdisc_data *p = PRIV(sch); -#endif - DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p); - return 0; -} - -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER -static unsigned int -ing_hook(unsigned int hook, struct sk_buff *skb, +#if !defined(CONFIG_NET_CLS_ACT) && defined(CONFIG_NETFILTER) +static unsigned int ing_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *indev, const struct net_device *outdev, int (*okfn)(struct sk_buff *)) @@ -213,12 +125,7 @@ ing_hook(unsigned int hook, struct sk_buff *skb, struct Qdisc *q; struct net_device *dev = skb->dev; - int fwres=NF_ACCEPT; - - DPRINTK("ing_hook: skb %s dev=%s len=%u\n", - skb->sk ? "(owned)" : "(unowned)", - skb->dev ? skb->dev->name : "(no dev)", - skb->len); + int fwres = NF_ACCEPT; if (dev->qdisc_ingress) { spin_lock(&dev->ingress_lock); @@ -231,168 +138,101 @@ ing_hook(unsigned int hook, struct sk_buff *skb, } /* after ipt_filter */ -static struct nf_hook_ops ing_ops = { - .hook = ing_hook, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_FILTER + 1, -}; - -static struct nf_hook_ops ing6_ops = { - .hook = ing_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_IP6_PRE_ROUTING, - .priority = NF_IP6_PRI_FILTER + 1, +static struct nf_hook_ops ing_ops[] __read_mostly = { + { + .hook = ing_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_FILTER + 1, + }, + { + .hook = ing_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_FILTER + 1, + }, }; - -#endif #endif -static int ingress_init(struct Qdisc *sch,struct rtattr *opt) +static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { - struct ingress_qdisc_data *p = PRIV(sch); - -/* Make sure either netfilter or preferably CLS_ACT is -* compiled in */ -#ifndef CONFIG_NET_CLS_ACT -#ifndef CONFIG_NETFILTER - printk("You MUST compile classifier actions into the kernel\n"); - return -EINVAL; -#else +#if !defined(CONFIG_NET_CLS_ACT) && defined(CONFIG_NETFILTER) printk("Ingress scheduler: Classifier actions prefered over netfilter\n"); -#endif -#endif -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER if (!nf_registered) { - if (nf_register_hook(&ing_ops) < 0) { + if (nf_register_hooks(ing_ops, ARRAY_SIZE(ing_ops)) < 0) { printk("ingress qdisc registration error \n"); return -EINVAL; } nf_registered++; - - if (nf_register_hook(&ing6_ops) < 0) { - printk("IPv6 ingress qdisc registration error, " \ - "disabling IPv6 support.\n"); - } else - nf_registered++; } #endif -#endif - - DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - p->q = &noop_qdisc; return 0; } - -static void ingress_reset(struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = PRIV(sch); - - DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); - -/* -#if 0 -*/ -/* for future use */ - qdisc_reset(p->q); -/* -#endif -*/ -} - -/* ------------------------------------------------------------- */ - - /* ------------------------------------------------------------- */ static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = PRIV(sch); + struct ingress_qdisc_data *p = qdisc_priv(sch); - DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); tcf_destroy_chain(p->filter_list); -#if 0 -/* for future use */ - qdisc_destroy(p->q); -#endif } - static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) { - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; - rta = (struct rtattr *) b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - rta->rta_len = skb_tail_pointer(skb) - b; + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + nla_nest_end(skb, nest); return skb->len; -rtattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } -static struct Qdisc_class_ops ingress_class_ops = { +static const struct Qdisc_class_ops ingress_class_ops = { .graft = ingress_graft, .leaf = ingress_leaf, .get = ingress_get, .put = ingress_put, .change = ingress_change, - .delete = NULL, .walk = ingress_walk, .tcf_chain = ingress_find_tcf, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_put, - .dump = NULL, }; -static struct Qdisc_ops ingress_qdisc_ops = { - .next = NULL, +static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = sizeof(struct ingress_qdisc_data), .enqueue = ingress_enqueue, - .dequeue = ingress_dequeue, - .requeue = ingress_requeue, - .drop = ingress_drop, .init = ingress_init, - .reset = ingress_reset, .destroy = ingress_destroy, - .change = NULL, .dump = ingress_dump, .owner = THIS_MODULE, }; static int __init ingress_module_init(void) { - int ret = 0; - - if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { - printk("Unable to register Ingress qdisc\n"); - return ret; - } - - return ret; + return register_qdisc(&ingress_qdisc_ops); } + static void __exit ingress_module_exit(void) { unregister_qdisc(&ingress_qdisc_ops); -#ifndef CONFIG_NET_CLS_ACT -#ifdef CONFIG_NETFILTER - if (nf_registered) { - nf_unregister_hook(&ing_ops); - if (nf_registered > 1) - nf_unregister_hook(&ing6_ops); - } -#endif +#if !defined(CONFIG_NET_CLS_ACT) && defined(CONFIG_NETFILTER) + if (nf_registered) + nf_unregister_hooks(ing_ops, ARRAY_SIZE(ing_ops)); #endif } + module_init(ingress_module_init) module_exit(ingress_module_exit) MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 9e5e87e81f0..c9c649b26ea 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -313,21 +313,21 @@ static void netem_reset(struct Qdisc *sch) /* Pass size change message down to embedded FIFO */ static int set_fifo_limit(struct Qdisc *q, int limit) { - struct rtattr *rta; + struct nlattr *nla; int ret = -ENOMEM; /* Hack to avoid sending change message to non-FIFO */ if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (rta) { - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (nla) { + nla->nla_type = RTM_NEWQDISC; + nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; - ret = q->ops->change(q, rta); - kfree(rta); + ret = q->ops->change(q, nla); + kfree(nla); } return ret; } @@ -336,11 +336,11 @@ static int set_fifo_limit(struct Qdisc *q, int limit) * Distribution data is a variable size payload containing * signed 16 bit values. */ -static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) +static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); - unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16); - const __s16 *data = RTA_DATA(attr); + unsigned long n = nla_len(attr)/sizeof(__s16); + const __s16 *data = nla_data(attr); struct disttable *d; int i; @@ -363,13 +363,10 @@ static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) return 0; } -static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) +static int get_correlation(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_corr *c = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*c)) - return -EINVAL; + const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); init_crandom(&q->loss_cor, c->loss_corr); @@ -377,43 +374,48 @@ static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) return 0; } -static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) +static int get_reorder(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_reorder *r = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*r)) - return -EINVAL; + const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); return 0; } -static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr) +static int get_corrupt(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); - const struct tc_netem_corrupt *r = RTA_DATA(attr); - - if (RTA_PAYLOAD(attr) != sizeof(*r)) - return -EINVAL; + const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); return 0; } +static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { + [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, + [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, + [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, +}; + /* Parse netlink message to set options */ -static int netem_change(struct Qdisc *sch, struct rtattr *opt) +static int netem_change(struct Qdisc *sch, struct nlattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_NETEM_MAX + 1]; struct tc_netem_qopt *qopt; int ret; - if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + if (opt == NULL) return -EINVAL; - qopt = RTA_DATA(opt); + ret = nla_parse_nested_compat(tb, TCA_NETEM_MAX, opt, netem_policy, + qopt, sizeof(*qopt)); + if (ret < 0) + return ret; + ret = set_fifo_limit(q->qdisc, qopt->limit); if (ret) { pr_debug("netem: can't set fifo limit\n"); @@ -434,39 +436,28 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) if (q->gap) q->reorder = ~0; - /* Handle nested options after initial queue options. - * Should have put all options in nested format but too late now. - */ - if (RTA_PAYLOAD(opt) > sizeof(*qopt)) { - struct rtattr *tb[TCA_NETEM_MAX]; - if (rtattr_parse(tb, TCA_NETEM_MAX, - RTA_DATA(opt) + sizeof(*qopt), - RTA_PAYLOAD(opt) - sizeof(*qopt))) - return -EINVAL; - - if (tb[TCA_NETEM_CORR-1]) { - ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]); - if (ret) - return ret; - } + if (tb[TCA_NETEM_CORR]) { + ret = get_correlation(sch, tb[TCA_NETEM_CORR]); + if (ret) + return ret; + } - if (tb[TCA_NETEM_DELAY_DIST-1]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]); - if (ret) - return ret; - } + if (tb[TCA_NETEM_DELAY_DIST]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); + if (ret) + return ret; + } - if (tb[TCA_NETEM_REORDER-1]) { - ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); - if (ret) - return ret; - } + if (tb[TCA_NETEM_REORDER]) { + ret = get_reorder(sch, tb[TCA_NETEM_REORDER]); + if (ret) + return ret; + } - if (tb[TCA_NETEM_CORRUPT-1]) { - ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]); - if (ret) - return ret; - } + if (tb[TCA_NETEM_CORRUPT]) { + ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); + if (ret) + return ret; } return 0; @@ -515,13 +506,13 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) return qdisc_reshape_fail(nskb, sch); } -static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) +static int tfifo_init(struct Qdisc *sch, struct nlattr *opt) { struct fifo_sched_data *q = qdisc_priv(sch); if (opt) { - struct tc_fifo_qopt *ctl = RTA_DATA(opt); - if (RTA_PAYLOAD(opt) < sizeof(*ctl)) + struct tc_fifo_qopt *ctl = nla_data(opt); + if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; q->limit = ctl->limit; @@ -537,14 +528,14 @@ static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) struct fifo_sched_data *q = qdisc_priv(sch); struct tc_fifo_qopt opt = { .limit = q->limit }; - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); return skb->len; -rtattr_failure: +nla_put_failure: return -1; } -static struct Qdisc_ops tfifo_qdisc_ops = { +static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = { .id = "tfifo", .priv_size = sizeof(struct fifo_sched_data), .enqueue = tfifo_enqueue, @@ -557,7 +548,7 @@ static struct Qdisc_ops tfifo_qdisc_ops = { .dump = tfifo_dump, }; -static int netem_init(struct Qdisc *sch, struct rtattr *opt) +static int netem_init(struct Qdisc *sch, struct nlattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); int ret; @@ -595,7 +586,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta = (struct rtattr *) b; + struct nlattr *nla = (struct nlattr *) b; struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; @@ -607,26 +598,26 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; - RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; - RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; - RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; - RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); - rta->rta_len = skb_tail_pointer(skb) - b; + nla->nla_len = skb_tail_pointer(skb) - b; return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -678,7 +669,7 @@ static void netem_put(struct Qdisc *sch, unsigned long arg) } static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { return -ENOSYS; } @@ -705,7 +696,7 @@ static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl) return NULL; } -static struct Qdisc_class_ops netem_class_ops = { +static const struct Qdisc_class_ops netem_class_ops = { .graft = netem_graft, .leaf = netem_leaf, .get = netem_get, @@ -717,7 +708,7 @@ static struct Qdisc_class_ops netem_class_ops = { .dump = netem_dump_class, }; -static struct Qdisc_ops netem_qdisc_ops = { +static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .id = "netem", .cl_ops = &netem_class_ops, .priv_size = sizeof(struct netem_sched_data), diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index de894096e44..4aa2b45dad0 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -224,16 +224,19 @@ prio_destroy(struct Qdisc* sch) qdisc_destroy(q->queues[prio]); } -static int prio_tune(struct Qdisc *sch, struct rtattr *opt) +static int prio_tune(struct Qdisc *sch, struct nlattr *opt) { struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt *qopt; - struct rtattr *tb[TCA_PRIO_MAX]; + struct nlattr *tb[TCA_PRIO_MAX + 1]; + int err; int i; - if (rtattr_parse_nested_compat(tb, TCA_PRIO_MAX, opt, qopt, - sizeof(*qopt))) - return -EINVAL; + err = nla_parse_nested_compat(tb, TCA_PRIO_MAX, opt, NULL, qopt, + sizeof(*qopt)); + if (err < 0) + return err; + q->bands = qopt->bands; /* If we're multiqueue, make sure the number of incoming bands * matches the number of queues on the device we're associating with. @@ -242,7 +245,7 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) * only one that is enabled for multiqueue, since it's the only one * that interacts with the underlying device. */ - q->mq = RTA_GET_FLAG(tb[TCA_PRIO_MQ - 1]); + q->mq = nla_get_flag(tb[TCA_PRIO_MQ]); if (q->mq) { if (sch->parent != TC_H_ROOT) return -EINVAL; @@ -296,7 +299,7 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) return 0; } -static int prio_init(struct Qdisc *sch, struct rtattr *opt) +static int prio_init(struct Qdisc *sch, struct nlattr *opt) { struct prio_sched_data *q = qdisc_priv(sch); int i; @@ -319,20 +322,24 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); - struct rtattr *nest; + struct nlattr *nest; struct tc_prio_qopt opt; opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); - nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt); - if (q->mq) - RTA_PUT_FLAG(skb, TCA_PRIO_MQ); - RTA_NEST_COMPAT_END(skb, nest); + nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nest == NULL) + goto nla_put_failure; + if (q->mq) { + if (nla_put_flag(skb, TCA_PRIO_MQ) < 0) + goto nla_put_failure; + } + nla_nest_compat_end(skb, nest); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } @@ -392,7 +399,7 @@ static void prio_put(struct Qdisc *q, unsigned long cl) return; } -static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct nlattr **tca, unsigned long *arg) { unsigned long cl = *arg; struct prio_sched_data *q = qdisc_priv(sch); @@ -468,7 +475,7 @@ static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) return &q->filter_list; } -static struct Qdisc_class_ops prio_class_ops = { +static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, .get = prio_get, @@ -483,7 +490,7 @@ static struct Qdisc_class_ops prio_class_ops = { .dump_stats = prio_dump_class_stats, }; -static struct Qdisc_ops prio_qdisc_ops = { +static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "prio", @@ -500,7 +507,7 @@ static struct Qdisc_ops prio_qdisc_ops = { .owner = THIS_MODULE, }; -static struct Qdisc_ops rr_qdisc_ops = { +static struct Qdisc_ops rr_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "rr", diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 9b95fefb70f..3dcd493f4f4 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -177,21 +177,21 @@ static void red_destroy(struct Qdisc *sch) static struct Qdisc *red_create_dflt(struct Qdisc *sch, u32 limit) { struct Qdisc *q; - struct rtattr *rta; + struct nlattr *nla; int ret; q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1)); if (q) { - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (rta) { - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + if (nla) { + nla->nla_type = RTM_NEWQDISC; + nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; - ret = q->ops->change(q, rta); - kfree(rta); + ret = q->ops->change(q, nla); + kfree(nla); if (ret == 0) return q; @@ -201,23 +201,31 @@ static struct Qdisc *red_create_dflt(struct Qdisc *sch, u32 limit) return NULL; } -static int red_change(struct Qdisc *sch, struct rtattr *opt) +static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { + [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, + [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, +}; + +static int red_change(struct Qdisc *sch, struct nlattr *opt) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_RED_MAX]; + struct nlattr *tb[TCA_RED_MAX + 1]; struct tc_red_qopt *ctl; struct Qdisc *child = NULL; + int err; - if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt)) + if (opt == NULL) return -EINVAL; - if (tb[TCA_RED_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || - tb[TCA_RED_STAB-1] == NULL || - RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE) + err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy); + if (err < 0) + return err; + + if (tb[TCA_RED_PARMS] == NULL || + tb[TCA_RED_STAB] == NULL) return -EINVAL; - ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + ctl = nla_data(tb[TCA_RED_PARMS]); if (ctl->limit > 0) { child = red_create_dflt(sch, ctl->limit); @@ -235,7 +243,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt) red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, - RTA_DATA(tb[TCA_RED_STAB-1])); + nla_data(tb[TCA_RED_STAB])); if (skb_queue_empty(&sch->q)) red_end_of_idle_period(&q->parms); @@ -244,7 +252,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt) return 0; } -static int red_init(struct Qdisc* sch, struct rtattr *opt) +static int red_init(struct Qdisc* sch, struct nlattr *opt) { struct red_sched_data *q = qdisc_priv(sch); @@ -255,7 +263,7 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt) static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *opts = NULL; + struct nlattr *opts = NULL; struct tc_red_qopt opt = { .limit = q->limit, .flags = q->flags, @@ -266,12 +274,14 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) .Scell_log = q->parms.Scell_log, }; - opts = RTA_NEST(skb, TCA_OPTIONS); - RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); - return RTA_NEST_END(skb, opts); + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + return nla_nest_end(skb, opts); -rtattr_failure: - return RTA_NEST_CANCEL(skb, opts); +nla_put_failure: + return nla_nest_cancel(skb, opts); } static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) @@ -332,7 +342,7 @@ static void red_put(struct Qdisc *sch, unsigned long arg) } static int red_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { return -ENOSYS; } @@ -359,7 +369,7 @@ static struct tcf_proto **red_find_tcf(struct Qdisc *sch, unsigned long cl) return NULL; } -static struct Qdisc_class_ops red_class_ops = { +static const struct Qdisc_class_ops red_class_ops = { .graft = red_graft, .leaf = red_leaf, .get = red_get, @@ -371,7 +381,7 @@ static struct Qdisc_class_ops red_class_ops = { .dump = red_dump_class, }; -static struct Qdisc_ops red_qdisc_ops = { +static struct Qdisc_ops red_qdisc_ops __read_mostly = { .id = "red", .priv_size = sizeof(struct red_sched_data), .cl_ops = &red_class_ops, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b542c875e15..91af539ab6e 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -122,7 +122,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); h = iph->daddr; - h2 = iph->saddr^iph->protocol; + h2 = iph->saddr ^ iph->protocol; if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP || @@ -137,7 +137,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); h = iph->daddr.s6_addr32[3]; - h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; + h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr; if (iph->nexthdr == IPPROTO_TCP || iph->nexthdr == IPPROTO_UDP || iph->nexthdr == IPPROTO_UDPLITE || @@ -148,9 +148,10 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) break; } default: - h = (u32)(unsigned long)skb->dst^skb->protocol; - h2 = (u32)(unsigned long)skb->sk; + h = (unsigned long)skb->dst ^ skb->protocol; + h2 = (unsigned long)skb->sk; } + return sfq_fold_hash(q, h, h2); } @@ -208,7 +209,7 @@ static unsigned int sfq_drop(struct Qdisc *sch) drop a packet from it */ if (d > 1) { - sfq_index x = q->dep[d+SFQ_DEPTH].next; + sfq_index x = q->dep[d + SFQ_DEPTH].next; skb = q->qs[x].prev; len = skb->len; __skb_unlink(skb, &q->qs[x]); @@ -241,7 +242,7 @@ static unsigned int sfq_drop(struct Qdisc *sch) } static int -sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned hash = sfq_hash(q, skb); @@ -252,6 +253,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) q->ht[hash] = x = q->dep[SFQ_DEPTH].next; q->hash[x] = hash; } + /* If selected queue has length q->limit, this means that * all another queues are empty and that we do simple tail drop, * i.e. drop _this_ packet. @@ -284,7 +286,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) } static int -sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) +sfq_requeue(struct sk_buff *skb, struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned hash = sfq_hash(q, skb); @@ -295,6 +297,7 @@ sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) q->ht[hash] = x = q->dep[SFQ_DEPTH].next; q->hash[x] = hash; } + sch->qstats.backlog += skb->len; __skb_queue_head(&q->qs[x], skb); /* If selected queue has length q->limit+1, this means that @@ -310,6 +313,7 @@ sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) kfree_skb(skb); return NET_XMIT_CN; } + sfq_inc(q, x); if (q->qs[x].qlen == 1) { /* The flow is new */ if (q->tail == SFQ_DEPTH) { /* It is the first flow */ @@ -322,6 +326,7 @@ sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) q->tail = x; } } + if (++sch->q.qlen <= q->limit) { sch->qstats.requeues++; return 0; @@ -336,7 +341,7 @@ sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) static struct sk_buff * -sfq_dequeue(struct Qdisc* sch) +sfq_dequeue(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -373,7 +378,7 @@ sfq_dequeue(struct Qdisc* sch) } static void -sfq_reset(struct Qdisc* sch) +sfq_reset(struct Qdisc *sch) { struct sk_buff *skb; @@ -383,27 +388,27 @@ sfq_reset(struct Qdisc* sch) static void sfq_perturbation(unsigned long arg) { - struct Qdisc *sch = (struct Qdisc*)arg; + struct Qdisc *sch = (struct Qdisc *)arg; struct sfq_sched_data *q = qdisc_priv(sch); - get_random_bytes(&q->perturbation, 4); + q->perturbation = net_random(); if (q->perturb_period) mod_timer(&q->perturb_timer, jiffies + q->perturb_period); } -static int sfq_change(struct Qdisc *sch, struct rtattr *opt) +static int sfq_change(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); - struct tc_sfq_qopt *ctl = RTA_DATA(opt); + struct tc_sfq_qopt *ctl = nla_data(opt); unsigned int qlen; - if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; sch_tree_lock(sch); q->quantum = ctl->quantum ? : psched_mtu(sch->dev); - q->perturb_period = ctl->perturb_period*HZ; + q->perturb_period = ctl->perturb_period * HZ; if (ctl->limit) q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1); @@ -415,41 +420,44 @@ static int sfq_change(struct Qdisc *sch, struct rtattr *opt) del_timer(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); - get_random_bytes(&q->perturbation, 4); + q->perturbation = net_random(); } sch_tree_unlock(sch); return 0; } -static int sfq_init(struct Qdisc *sch, struct rtattr *opt) +static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); int i; - init_timer(&q->perturb_timer); - q->perturb_timer.data = (unsigned long)sch; q->perturb_timer.function = sfq_perturbation; + q->perturb_timer.data = (unsigned long)sch;; + init_timer_deferrable(&q->perturb_timer); - for (i=0; i<SFQ_HASH_DIVISOR; i++) + for (i = 0; i < SFQ_HASH_DIVISOR; i++) q->ht[i] = SFQ_DEPTH; - for (i=0; i<SFQ_DEPTH; i++) { + + for (i = 0; i < SFQ_DEPTH; i++) { skb_queue_head_init(&q->qs[i]); - q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; - q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH; + q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH; } + q->limit = SFQ_DEPTH - 1; q->max_depth = 0; q->tail = SFQ_DEPTH; if (opt == NULL) { q->quantum = psched_mtu(sch->dev); q->perturb_period = 0; - get_random_bytes(&q->perturbation, 4); + q->perturbation = net_random(); } else { int err = sfq_change(sch, opt); if (err) return err; } - for (i=0; i<SFQ_DEPTH; i++) + + for (i = 0; i < SFQ_DEPTH; i++) sfq_link(q, i); return 0; } @@ -467,22 +475,22 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_sfq_qopt opt; opt.quantum = q->quantum; - opt.perturb_period = q->perturb_period/HZ; + opt.perturb_period = q->perturb_period / HZ; opt.limit = q->limit; opt.divisor = SFQ_HASH_DIVISOR; opt.flows = q->limit; - RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); return skb->len; -rtattr_failure: +nla_put_failure: nlmsg_trim(skb, b); return -1; } -static struct Qdisc_ops sfq_qdisc_ops = { +static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = NULL, .id = "sfq", diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b0d81098b0e..0b7d78f59d8 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -245,20 +245,21 @@ static void tbf_reset(struct Qdisc* sch) static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit) { struct Qdisc *q; - struct rtattr *rta; + struct nlattr *nla; int ret; q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1)); if (q) { - rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); - if (rta) { - rta->rta_type = RTM_NEWQDISC; - rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); - ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), + GFP_KERNEL); + if (nla) { + nla->nla_type = RTM_NEWQDISC; + nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit; - ret = q->ops->change(q, rta); - kfree(rta); + ret = q->ops->change(q, nla); + kfree(nla); if (ret == 0) return q; @@ -269,30 +270,39 @@ static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit) return NULL; } -static int tbf_change(struct Qdisc* sch, struct rtattr *opt) +static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { + [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, + [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, + [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, +}; + +static int tbf_change(struct Qdisc* sch, struct nlattr *opt) { - int err = -EINVAL; + int err; struct tbf_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_TBF_PTAB]; + struct nlattr *tb[TCA_TBF_PTAB + 1]; struct tc_tbf_qopt *qopt; struct qdisc_rate_table *rtab = NULL; struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; int max_size,n; - if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) || - tb[TCA_TBF_PARMS-1] == NULL || - RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) + err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_TBF_PARMS] == NULL) goto done; - qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); - rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); + qopt = nla_data(tb[TCA_TBF_PARMS]); + rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]); if (rtab == NULL) goto done; if (qopt->peakrate.rate) { if (qopt->peakrate.rate > qopt->rate.rate) - ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]); + ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]); if (ptab == NULL) goto done; } @@ -339,7 +349,7 @@ done: return err; } -static int tbf_init(struct Qdisc* sch, struct rtattr *opt) +static int tbf_init(struct Qdisc* sch, struct nlattr *opt) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -370,12 +380,12 @@ static void tbf_destroy(struct Qdisc *sch) static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tbf_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); - struct rtattr *rta; + struct nlattr *nest; struct tc_tbf_qopt opt; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; opt.limit = q->limit; opt.rate = q->R_tab->rate; @@ -385,13 +395,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&opt.peakrate, 0, sizeof(opt.peakrate)); opt.mtu = q->mtu; opt.buffer = q->buffer; - RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); - rta->rta_len = skb_tail_pointer(skb) - b; + NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + nla_nest_end(skb, nest); return skb->len; -rtattr_failure: - nlmsg_trim(skb, b); +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; } @@ -442,7 +452,7 @@ static void tbf_put(struct Qdisc *sch, unsigned long arg) } static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, - struct rtattr **tca, unsigned long *arg) + struct nlattr **tca, unsigned long *arg) { return -ENOSYS; } @@ -469,7 +479,7 @@ static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl) return NULL; } -static struct Qdisc_class_ops tbf_class_ops = +static const struct Qdisc_class_ops tbf_class_ops = { .graft = tbf_graft, .leaf = tbf_leaf, @@ -482,7 +492,7 @@ static struct Qdisc_class_ops tbf_class_ops = .dump = tbf_dump_class, }; -static struct Qdisc_ops tbf_qdisc_ops = { +static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &tbf_class_ops, .id = "tbf", diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index c0ed06d4a50..1411c7b1fbd 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -168,7 +168,7 @@ teql_destroy(struct Qdisc* sch) } } -static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) +static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) { struct net_device *dev = sch->dev; struct teql_master *m = (struct teql_master*)sch->ops; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 5390bc79215..0b79f869c4e 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -10,6 +10,7 @@ menuconfig IP_SCTP select CRYPTO_HMAC select CRYPTO_SHA1 select CRYPTO_MD5 if SCTP_HMAC_MD5 + select LIBCRC32C ---help--- Stream Control Transmission Protocol diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 1da7204d9b4..f5356b9d5ee 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -9,7 +9,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ inqueue.o outqueue.o ulpqueue.o command.o \ tsnmap.o bind_addr.o socket.o primitive.o \ - output.o input.o debug.o ssnmap.o proc.o crc32c.o \ + output.o input.o debug.o ssnmap.o proc.o \ auth.o sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 013e3d3ab0f..a016e78061f 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -61,6 +61,7 @@ /* Forward declarations for internal functions. */ static void sctp_assoc_bh_rcv(struct work_struct *work); +static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc); /* 1st Level Abstractions. */ @@ -167,11 +168,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a sp->autoclose * HZ; /* Initilizes the timers */ - for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { - init_timer(&asoc->timers[i]); - asoc->timers[i].function = sctp_timer_events[i]; - asoc->timers[i].data = (unsigned long) asoc; - } + for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) + setup_timer(&asoc->timers[i], sctp_timer_events[i], + (unsigned long)asoc); /* Pull default initialization values from the sock options. * Note: This assumes that the values have already been @@ -244,6 +243,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->addip_serial = asoc->c.initial_tsn; INIT_LIST_HEAD(&asoc->addip_chunk_list); + INIT_LIST_HEAD(&asoc->asconf_ack_list); /* Make an empty list of remote transport addresses. */ INIT_LIST_HEAD(&asoc->peer.transport_addr_list); @@ -433,8 +433,7 @@ void sctp_association_free(struct sctp_association *asoc) asoc->peer.transport_count = 0; /* Free any cached ASCONF_ACK chunk. */ - if (asoc->addip_last_asconf_ack) - sctp_chunk_free(asoc->addip_last_asconf_ack); + sctp_assoc_free_asconf_acks(asoc); /* Free any cached ASCONF chunk. */ if (asoc->addip_last_asconf) @@ -732,6 +731,23 @@ struct sctp_transport *sctp_assoc_lookup_paddr( return NULL; } +/* Remove all transports except a give one */ +void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc, + struct sctp_transport *primary) +{ + struct sctp_transport *temp; + struct sctp_transport *t; + + list_for_each_entry_safe(t, temp, &asoc->peer.transport_addr_list, + transports) { + /* if the current transport is not the primary one, delete it */ + if (t != primary) + sctp_assoc_rm_peer(asoc, t); + } + + return; +} + /* Engage in transport control operations. * Mark the transport up or down and send a notification to the user. * Select and update the new active and retran paths. @@ -1470,3 +1486,56 @@ retry: asoc->assoc_id = (sctp_assoc_t) assoc_id; return error; } + +/* Free asconf_ack cache */ +static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc) +{ + struct sctp_chunk *ack; + struct sctp_chunk *tmp; + + list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, + transmitted_list) { + list_del_init(&ack->transmitted_list); + sctp_chunk_free(ack); + } +} + +/* Clean up the ASCONF_ACK queue */ +void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc) +{ + struct sctp_chunk *ack; + struct sctp_chunk *tmp; + + /* We can remove all the entries from the queue upto + * the "Peer-Sequence-Number". + */ + list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, + transmitted_list) { + if (ack->subh.addip_hdr->serial == + htonl(asoc->peer.addip_serial)) + break; + + list_del_init(&ack->transmitted_list); + sctp_chunk_free(ack); + } +} + +/* Find the ASCONF_ACK whose serial number matches ASCONF */ +struct sctp_chunk *sctp_assoc_lookup_asconf_ack( + const struct sctp_association *asoc, + __be32 serial) +{ + struct sctp_chunk *ack = NULL; + + /* Walk through the list of cached ASCONF-ACKs and find the + * ack chunk whose serial number matches that of the request. + */ + list_for_each_entry(ack, &asoc->asconf_ack_list, transmitted_list) { + if (ack->subh.addip_hdr->serial == serial) { + sctp_chunk_hold(ack); + break; + } + } + + return ack; +} diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 6a7d01091f0..13fbfb449a5 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -171,7 +171,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp) /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, - __u8 use_as_src, gfp_t gfp) + __u8 addr_state, gfp_t gfp) { struct sctp_sockaddr_entry *addr; @@ -188,7 +188,7 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, if (!addr->a.v4.sin_port) addr->a.v4.sin_port = htons(bp->port); - addr->use_as_src = use_as_src; + addr->state = addr_state; addr->valid = 1; INIT_LIST_HEAD(&addr->list); @@ -312,7 +312,7 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, } af->from_addr_param(&addr, rawaddr, htons(port), 0); - retval = sctp_add_bind_addr(bp, &addr, 1, gfp); + retval = sctp_add_bind_addr(bp, &addr, SCTP_ADDR_SRC, gfp); if (retval) { /* Can't finish building the list, clean up. */ sctp_bind_addr_clean(bp); @@ -353,6 +353,32 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp, return match; } +/* Get the state of the entry in the bind_addr_list */ +int sctp_bind_addr_state(const struct sctp_bind_addr *bp, + const union sctp_addr *addr) +{ + struct sctp_sockaddr_entry *laddr; + struct sctp_af *af; + int state = -1; + + af = sctp_get_af_specific(addr->sa.sa_family); + if (unlikely(!af)) + return state; + + rcu_read_lock(); + list_for_each_entry_rcu(laddr, &bp->address_list, list) { + if (!laddr->valid) + continue; + if (af->cmp_addr(&laddr->a, addr)) { + state = laddr->state; + break; + } + } + rcu_read_unlock(); + + return state; +} + /* Find the first address in the bind address list that is not present in * the addrs packed array. */ @@ -411,7 +437,8 @@ static int sctp_copy_one_addr(struct sctp_bind_addr *dest, (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && (flags & SCTP_ADDR6_PEERSUPP)))) - error = sctp_add_bind_addr(dest, addr, 1, gfp); + error = sctp_add_bind_addr(dest, addr, SCTP_ADDR_SRC, + gfp); } return error; diff --git a/net/sctp/crc32c.c b/net/sctp/crc32c.c deleted file mode 100644 index 181edabdb8c..00000000000 --- a/net/sctp/crc32c.c +++ /dev/null @@ -1,222 +0,0 @@ -/* SCTP kernel reference Implementation - * Copyright (c) 1999-2001 Motorola, Inc. - * Copyright (c) 2001-2003 International Business Machines, Corp. - * - * This file is part of the SCTP kernel reference Implementation - * - * SCTP Checksum functions - * - * The SCTP reference implementation is free software; - * you can redistribute it and/or modify it under the terms of - * the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * The SCTP reference implementation is distributed in the hope that it - * will be useful, but WITHOUT ANY WARRANTY; without even the implied - * ************************ - * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. If not, write to - * the Free Software Foundation, 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - * - * Please send any bug reports or fixes you make to the - * email address(es): - * lksctp developers <lksctp-developers@lists.sourceforge.net> - * - * Or submit a bug report through the following website: - * http://www.sf.net/projects/lksctp - * - * Written or modified by: - * Dinakaran Joseph - * Jon Grimm <jgrimm@us.ibm.com> - * Sridhar Samudrala <sri@us.ibm.com> - * - * Any bugs reported given to us we will try to fix... any fixes shared will - * be incorporated into the next SCTP release. - */ - -/* The following code has been taken directly from - * draft-ietf-tsvwg-sctpcsum-03.txt - * - * The code has now been modified specifically for SCTP knowledge. - */ - -#include <linux/types.h> -#include <net/sctp/sctp.h> - -#define CRC32C_POLY 0x1EDC6F41 -#define CRC32C(c,d) (c=(c>>8)^crc_c[(c^(d))&0xFF]) -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -/* Copyright 2001, D. Otis. Use this program, code or tables */ -/* extracted from it, as desired without restriction. */ -/* */ -/* 32 Bit Reflected CRC table generation for SCTP. */ -/* To accommodate serial byte data being shifted out least */ -/* significant bit first, the table's 32 bit words are reflected */ -/* which flips both byte and bit MS and LS positions. The CRC */ -/* is calculated MS bits first from the perspective of the serial*/ -/* stream. The x^32 term is implied and the x^0 term may also */ -/* be shown as +1. The polynomial code used is 0x1EDC6F41. */ -/* Castagnoli93 */ -/* x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+ */ -/* x^11+x^10+x^9+x^8+x^6+x^0 */ -/* Guy Castagnoli Stefan Braeuer and Martin Herrman */ -/* "Optimization of Cyclic Redundancy-Check Codes */ -/* with 24 and 32 Parity Bits", */ -/* IEEE Transactions on Communications, Vol.41, No.6, June 1993 */ -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -static const __u32 crc_c[256] = { - 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, - 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, - 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, - 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, - 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, - 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, - 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, - 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, - 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, - 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, - 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, - 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, - 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, - 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, - 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, - 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, - 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, - 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, - 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, - 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, - 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, - 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, - 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, - 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, - 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, - 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, - 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, - 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, - 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, - 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, - 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, - 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, - 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, - 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, - 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, - 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, - 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, - 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, - 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, - 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, - 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, - 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, - 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, - 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, - 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, - 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, - 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, - 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, - 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, - 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, - 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, - 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, - 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, - 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, - 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, - 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, - 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, - 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, - 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, - 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, - 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, - 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, - 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, - 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351, -}; - -__u32 sctp_start_cksum(__u8 *buffer, __u16 length) -{ - __u32 crc32 = ~(__u32) 0; - __u32 i; - - /* Optimize this routine to be SCTP specific, knowing how - * to skip the checksum field of the SCTP header. - */ - - /* Calculate CRC up to the checksum. */ - for (i = 0; i < (sizeof(struct sctphdr) - sizeof(__u32)); i++) - CRC32C(crc32, buffer[i]); - - /* Skip checksum field of the header. */ - for (i = 0; i < sizeof(__u32); i++) - CRC32C(crc32, 0); - - /* Calculate the rest of the CRC. */ - for (i = sizeof(struct sctphdr); i < length ; i++) - CRC32C(crc32, buffer[i]); - - return crc32; -} - -__u32 sctp_update_cksum(__u8 *buffer, __u16 length, __u32 crc32) -{ - __u32 i; - - for (i = 0; i < length ; i++) - CRC32C(crc32, buffer[i]); - - return crc32; -} - -#if 0 -__u32 sctp_update_copy_cksum(__u8 *to, __u8 *from, __u16 length, __u32 crc32) -{ - __u32 i; - __u32 *_to = (__u32 *)to; - __u32 *_from = (__u32 *)from; - - for (i = 0; i < (length/4); i++) { - _to[i] = _from[i]; - CRC32C(crc32, from[i*4]); - CRC32C(crc32, from[i*4+1]); - CRC32C(crc32, from[i*4+2]); - CRC32C(crc32, from[i*4+3]); - } - - return crc32; -} -#endif /* 0 */ - -__u32 sctp_end_cksum(__u32 crc32) -{ - __u32 result; - __u8 byte0, byte1, byte2, byte3; - - result = ~crc32; - - /* result now holds the negated polynomial remainder; - * since the table and algorithm is "reflected" [williams95]. - * That is, result has the same value as if we mapped the message - * to a polyomial, computed the host-bit-order polynomial - * remainder, performed final negation, then did an end-for-end - * bit-reversal. - * Note that a 32-bit bit-reversal is identical to four inplace - * 8-bit reversals followed by an end-for-end byteswap. - * In other words, the bytes of each bit are in the right order, - * but the bytes have been byteswapped. So we now do an explicit - * byteswap. On a little-endian machine, this byteswap and - * the final ntohl cancel out and could be elided. - */ - byte0 = result & 0xff; - byte1 = (result>>8) & 0xff; - byte2 = (result>>16) & 0xff; - byte3 = (result>>24) & 0xff; - - crc32 = ((byte0 << 24) | - (byte1 << 16) | - (byte2 << 8) | - byte3); - return crc32; -} diff --git a/net/sctp/input.c b/net/sctp/input.c index 91ae463b079..d695f710fc7 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -60,6 +60,7 @@ #include <net/xfrm.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/checksum.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); @@ -890,14 +891,6 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb, ch = (sctp_chunkhdr_t *) skb->data; - /* The code below will attempt to walk the chunk and extract - * parameter information. Before we do that, we need to verify - * that the chunk length doesn't cause overflow. Otherwise, we'll - * walk off the end. - */ - if (WORD_ROUND(ntohs(ch->length)) > skb->len) - return NULL; - /* * This code will NOT touch anything inside the chunk--it is * strictly READ-ONLY. @@ -934,6 +927,44 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb, return NULL; } +/* ADD-IP, Section 5.2 + * When an endpoint receives an ASCONF Chunk from the remote peer + * special procedures may be needed to identify the association the + * ASCONF Chunk is associated with. To properly find the association + * the following procedures SHOULD be followed: + * + * D2) If the association is not found, use the address found in the + * Address Parameter TLV combined with the port number found in the + * SCTP common header. If found proceed to rule D4. + * + * D2-ext) If more than one ASCONF Chunks are packed together, use the + * address found in the ASCONF Address Parameter TLV of each of the + * subsequent ASCONF Chunks. If found, proceed to rule D4. + */ +static struct sctp_association *__sctp_rcv_asconf_lookup( + sctp_chunkhdr_t *ch, + const union sctp_addr *laddr, + __be32 peer_port, + struct sctp_transport **transportp) +{ + sctp_addip_chunk_t *asconf = (struct sctp_addip_chunk *)ch; + struct sctp_af *af; + union sctp_addr_param *param; + union sctp_addr paddr; + + /* Skip over the ADDIP header and find the Address parameter */ + param = (union sctp_addr_param *)(asconf + 1); + + af = sctp_get_af_specific(param_type2af(param->v4.param_hdr.type)); + if (unlikely(!af)) + return NULL; + + af->from_addr_param(&paddr, param, peer_port, 0); + + return __sctp_lookup_association(laddr, &paddr, transportp); +} + + /* SCTP-AUTH, Section 6.3: * If the receiver does not find a STCB for a packet containing an AUTH * chunk as the first chunk and not a COOKIE-ECHO chunk as the second @@ -942,20 +973,64 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb, * * This means that any chunks that can help us identify the association need * to be looked at to find this assocation. -* -* TODO: The only chunk currently defined that can do that is ASCONF, but we -* don't support that functionality yet. */ -static struct sctp_association *__sctp_rcv_auth_lookup(struct sk_buff *skb, - const union sctp_addr *paddr, +static struct sctp_association *__sctp_rcv_walk_lookup(struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp) { - /* XXX - walk through the chunks looking for something that can - * help us find the association. INIT, and INIT-ACK are not permitted. - * That leaves ASCONF, but we don't support that yet. + struct sctp_association *asoc = NULL; + sctp_chunkhdr_t *ch; + int have_auth = 0; + unsigned int chunk_num = 1; + __u8 *ch_end; + + /* Walk through the chunks looking for AUTH or ASCONF chunks + * to help us find the association. */ - return NULL; + ch = (sctp_chunkhdr_t *) skb->data; + do { + /* Break out if chunk length is less then minimal. */ + if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) + break; + + ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + if (ch_end > skb_tail_pointer(skb)) + break; + + switch(ch->type) { + case SCTP_CID_AUTH: + have_auth = chunk_num; + break; + + case SCTP_CID_COOKIE_ECHO: + /* If a packet arrives containing an AUTH chunk as + * a first chunk, a COOKIE-ECHO chunk as the second + * chunk, and possibly more chunks after them, and + * the receiver does not have an STCB for that + * packet, then authentication is based on + * the contents of the COOKIE- ECHO chunk. + */ + if (have_auth == 1 && chunk_num == 2) + return NULL; + break; + + case SCTP_CID_ASCONF: + if (have_auth || sctp_addip_noauth) + asoc = __sctp_rcv_asconf_lookup(ch, laddr, + sctp_hdr(skb)->source, + transportp); + default: + break; + } + + if (asoc) + break; + + ch = (sctp_chunkhdr_t *) ch_end; + chunk_num++; + } while (ch_end < skb_tail_pointer(skb)); + + return asoc; } /* @@ -965,7 +1040,6 @@ static struct sctp_association *__sctp_rcv_auth_lookup(struct sk_buff *skb, * chunks. */ static struct sctp_association *__sctp_rcv_lookup_harder(struct sk_buff *skb, - const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp) { @@ -973,6 +1047,14 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct sk_buff *skb, ch = (sctp_chunkhdr_t *) skb->data; + /* The code below will attempt to walk the chunk and extract + * parameter information. Before we do that, we need to verify + * that the chunk length doesn't cause overflow. Otherwise, we'll + * walk off the end. + */ + if (WORD_ROUND(ntohs(ch->length)) > skb->len) + return NULL; + /* If this is INIT/INIT-ACK look inside the chunk too. */ switch (ch->type) { case SCTP_CID_INIT: @@ -980,11 +1062,12 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct sk_buff *skb, return __sctp_rcv_init_lookup(skb, laddr, transportp); break; - case SCTP_CID_AUTH: - return __sctp_rcv_auth_lookup(skb, paddr, laddr, transportp); + default: + return __sctp_rcv_walk_lookup(skb, laddr, transportp); break; } + return NULL; } @@ -1003,7 +1086,7 @@ static struct sctp_association *__sctp_rcv_lookup(struct sk_buff *skb, * parameters within the INIT or INIT-ACK. */ if (!asoc) - asoc = __sctp_rcv_lookup_harder(skb, paddr, laddr, transportp); + asoc = __sctp_rcv_lookup_harder(skb, laddr, transportp); return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7f31ff638bc..74f106a7a7e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -330,7 +330,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc, list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; - if ((laddr->use_as_src) && + if ((laddr->state == SCTP_ADDR_SRC) && (laddr->a.sa.sa_family == AF_INET6) && (scope <= sctp_scope(&laddr->a))) { bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); @@ -556,7 +556,7 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) if (!(type & IPV6_ADDR_UNICAST)) return 0; - return ipv6_chk_addr(in6, NULL, 0); + return ipv6_chk_addr(&init_net, in6, NULL, 0); } /* This function checks if the address is a valid address to be used for @@ -858,7 +858,8 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) dev = dev_get_by_index(&init_net, addr->v6.sin6_scope_id); if (!dev) return 0; - if (!ipv6_chk_addr(&addr->v6.sin6_addr, dev, 0)) { + if (!ipv6_chk_addr(&init_net, &addr->v6.sin6_addr, + dev, 0)) { dev_put(dev); return 0; } diff --git a/net/sctp/output.c b/net/sctp/output.c index 847639d542c..5e811b91f21 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -60,6 +60,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/checksum.h> /* Forward declarations for private helpers. */ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index fa76f235169..a42af865c2e 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -716,7 +716,29 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) new_transport = chunk->transport; if (!new_transport) { - new_transport = asoc->peer.active_path; + /* + * If we have a prior transport pointer, see if + * the destination address of the chunk + * matches the destination address of the + * current transport. If not a match, then + * try to look up the transport with a given + * destination address. We do this because + * after processing ASCONFs, we may have new + * transports created. + */ + if (transport && + sctp_cmp_addr_exact(&chunk->dest, + &transport->ipaddr)) + new_transport = transport; + else + new_transport = sctp_assoc_lookup_paddr(asoc, + &chunk->dest); + + /* if we still don't have a new transport, then + * use the current active path. + */ + if (!new_transport) + new_transport = asoc->peer.active_path; } else if ((new_transport->state == SCTP_INACTIVE) || (new_transport->state == SCTP_UNCONFIRMED)) { /* If the chunk is Heartbeat or Heartbeat Ack, @@ -729,9 +751,12 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) * address of the IP datagram containing the * HEARTBEAT chunk to which this ack is responding. * ... + * + * ASCONF_ACKs also must be sent to the source. */ if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT && - chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK) + chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK && + chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK) new_transport = asoc->peer.active_path; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d50f610d1b0..1339742e49f 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -229,8 +229,8 @@ int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, (((AF_INET6 == addr->a.sa.sa_family) && (copy_flags & SCTP_ADDR6_ALLOWED) && (copy_flags & SCTP_ADDR6_PEERSUPP)))) { - error = sctp_add_bind_addr(bp, &addr->a, 1, - GFP_ATOMIC); + error = sctp_add_bind_addr(bp, &addr->a, + SCTP_ADDR_SRC, GFP_ATOMIC); if (error) goto end_copy; } @@ -359,7 +359,7 @@ static int sctp_v4_addr_valid(union sctp_addr *addr, const struct sk_buff *skb) { /* Is this a non-unicast address or a unusable SCTP address? */ - if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) + if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) return 0; /* Is this a broadcast address? */ @@ -372,7 +372,7 @@ static int sctp_v4_addr_valid(union sctp_addr *addr, /* Should this be available for binding? */ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) { - int ret = inet_addr_type(addr->v4.sin_addr.s_addr); + int ret = inet_addr_type(&init_net, addr->v4.sin_addr.s_addr); if (addr->v4.sin_addr.s_addr != INADDR_ANY && @@ -408,13 +408,15 @@ static sctp_scope_t sctp_v4_scope(union sctp_addr *addr) */ /* Check for unusable SCTP addresses. */ - if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) { + if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_UNUSABLE; - } else if (LOOPBACK(addr->v4.sin_addr.s_addr)) { + } else if (ipv4_is_loopback(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LOOPBACK; - } else if (IS_IPV4_LINK_ADDRESS(&addr->v4.sin_addr.s_addr)) { + } else if (ipv4_is_linklocal_169(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LINK; - } else if (IS_IPV4_PRIVATE_ADDRESS(&addr->v4.sin_addr.s_addr)) { + } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || + ipv4_is_private_172(addr->v4.sin_addr.s_addr) || + ipv4_is_private_192(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_PRIVATE; } else { retval = SCTP_SCOPE_GLOBAL; @@ -452,7 +454,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, __FUNCTION__, NIPQUAD(fl.fl4_dst), NIPQUAD(fl.fl4_src)); - if (!ip_route_output_key(&rt, &fl)) { + if (!ip_route_output_key(&init_net, &rt, &fl)) { dst = &rt->u.dst; } @@ -470,7 +472,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { - if (!laddr->valid || !laddr->use_as_src) + if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC)) continue; sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port)); if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a)) @@ -492,10 +494,10 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; - if ((laddr->use_as_src) && + if ((laddr->state == SCTP_ADDR_SRC) && (AF_INET == laddr->a.sa.sa_family)) { fl.fl4_src = laddr->a.v4.sin_addr.s_addr; - if (!ip_route_output_key(&rt, &fl)) { + if (!ip_route_output_key(&init_net, &rt, &fl)) { dst = &rt->u.dst; goto out_unlock; } @@ -1107,7 +1109,7 @@ SCTP_STATIC __init int sctp_init(void) sysctl_sctp_rmem[1] = (1500 *(sizeof(struct sk_buff) + 1)); sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share); - sysctl_sctp_wmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_sctp_wmem[0] = SK_MEM_QUANTUM; sysctl_sctp_wmem[1] = 16*1024; sysctl_sctp_wmem[2] = max(64*1024, max_share); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 3cc629d3c9f..dd98763c8b0 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1275,6 +1275,9 @@ nodata: /* Release the memory occupied by a chunk. */ static void sctp_chunk_destroy(struct sctp_chunk *chunk) { + BUG_ON(!list_empty(&chunk->list)); + list_del_init(&chunk->transmitted_list); + /* Free the chunk skb data and the SCTP_chunk stub itself. */ dev_kfree_skb(chunk->skb); @@ -1285,9 +1288,6 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk) /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { - BUG_ON(!list_empty(&chunk->list)); - list_del_init(&chunk->transmitted_list); - /* Release our reference on the message tracker. */ if (chunk->msg) sctp_datamsg_put(chunk->msg); @@ -1692,8 +1692,8 @@ no_hmac: /* Also, add the destination address. */ if (list_empty(&retval->base.bind_addr.address_list)) { - sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, 1, - GFP_ATOMIC); + sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, + SCTP_ADDR_SRC, GFP_ATOMIC); } retval->next_tsn = retval->c.initial_tsn; @@ -1836,6 +1836,39 @@ static int sctp_process_hn_param(const struct sctp_association *asoc, return 0; } +static int sctp_verify_ext_param(union sctp_params param) +{ + __u16 num_ext = ntohs(param.p->length) - sizeof(sctp_paramhdr_t); + int have_auth = 0; + int have_asconf = 0; + int i; + + for (i = 0; i < num_ext; i++) { + switch (param.ext->chunks[i]) { + case SCTP_CID_AUTH: + have_auth = 1; + break; + case SCTP_CID_ASCONF: + case SCTP_CID_ASCONF_ACK: + have_asconf = 1; + break; + } + } + + /* ADD-IP Security: The draft requires us to ABORT or ignore the + * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this + * only if ADD-IP is turned on and we are not backward-compatible + * mode. + */ + if (sctp_addip_noauth) + return 1; + + if (sctp_addip_enable && !have_auth && have_asconf) + return 0; + + return 1; +} + static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { @@ -1966,9 +1999,18 @@ static sctp_ierror_t sctp_verify_param(const struct sctp_association *asoc, case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: case SCTP_PARAM_ECN_CAPABLE: case SCTP_PARAM_ADAPTATION_LAYER_IND: + break; + case SCTP_PARAM_SUPPORTED_EXT: + if (!sctp_verify_ext_param(param)) + return SCTP_IERROR_ABORT; break; + case SCTP_PARAM_SET_PRIMARY: + if (sctp_addip_enable) + break; + goto fallthrough; + case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ sctp_process_hn_param(asoc, param, chunk, err_chunk); @@ -2134,10 +2176,11 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, !asoc->peer.peer_hmacs)) asoc->peer.auth_capable = 0; - - /* If the peer claims support for ADD-IP without support - * for AUTH, disable support for ADD-IP. - * Do this only if backward compatible mode is turned off. + /* In a non-backward compatible mode, if the peer claims + * support for ADD-IP but not AUTH, the ADD-IP spec states + * that we MUST ABORT the association. Section 6. The section + * also give us an option to silently ignore the packet, which + * is what we'll do here. */ if (!sctp_addip_noauth && (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { @@ -2145,6 +2188,7 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); asoc->peer.asconf_capable = 0; + goto clean_up; } /* Walk list of transports, removing transports in the UNKNOWN state. */ @@ -2286,6 +2330,8 @@ static int sctp_process_param(struct sctp_association *asoc, sctp_scope_t scope; time_t stale; struct sctp_af *af; + union sctp_addr_param *addr_param; + struct sctp_transport *t; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters @@ -2376,6 +2422,26 @@ static int sctp_process_param(struct sctp_association *asoc, asoc->peer.adaptation_ind = param.aind->adaptation_ind; break; + case SCTP_PARAM_SET_PRIMARY: + addr_param = param.v + sizeof(sctp_addip_param_t); + + af = sctp_get_af_specific(param_type2af(param.p->type)); + af->from_addr_param(&addr, addr_param, + htons(asoc->peer.port), 0); + + /* if the address is invalid, we can't process it. + * XXX: see spec for what to do. + */ + if (!af->addr_valid(&addr, NULL, NULL)) + break; + + t = sctp_assoc_lookup_paddr(asoc, &addr); + if (!t) + break; + + sctp_assoc_set_primary(asoc, t); + break; + case SCTP_PARAM_SUPPORTED_EXT: sctp_process_ext_param(asoc, param); break; @@ -2727,7 +2793,6 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, struct sctp_transport *peer; struct sctp_af *af; union sctp_addr addr; - struct list_head *pos; union sctp_addr_param *addr_param; addr_param = (union sctp_addr_param *) @@ -2738,8 +2803,24 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, return SCTP_ERROR_INV_PARAM; af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0); + + /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast + * or multicast address. + * (note: wildcard is permitted and requires special handling so + * make sure we check for that) + */ + if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb)) + return SCTP_ERROR_INV_PARAM; + switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: + /* Section 4.2.1: + * If the address 0.0.0.0 or ::0 is provided, the source + * address of the packet MUST be added. + */ + if (af->is_any(&addr)) + memcpy(&addr, &asconf->source, sizeof(addr)); + /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address * request and does not have the local resources to add this * new address to the association, it MUST return an Error @@ -2761,8 +2842,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, * MUST send an Error Cause TLV with the error cause set to the * new error code 'Request to Delete Last Remaining IP Address'. */ - pos = asoc->peer.transport_addr_list.next; - if (pos->next == &asoc->peer.transport_addr_list) + if (asoc->peer.transport_count == 1) return SCTP_ERROR_DEL_LAST_IP; /* ADDIP 4.3 D8) If a request is received to delete an IP @@ -2775,9 +2855,27 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, if (sctp_cmp_addr_exact(sctp_source(asconf), &addr)) return SCTP_ERROR_DEL_SRC_IP; - sctp_assoc_del_peer(asoc, &addr); + /* Section 4.2.2 + * If the address 0.0.0.0 or ::0 is provided, all + * addresses of the peer except the source address of the + * packet MUST be deleted. + */ + if (af->is_any(&addr)) { + sctp_assoc_set_primary(asoc, asconf->transport); + sctp_assoc_del_nonprimary_peers(asoc, + asconf->transport); + } else + sctp_assoc_del_peer(asoc, &addr); break; case SCTP_PARAM_SET_PRIMARY: + /* ADDIP Section 4.2.4 + * If the address 0.0.0.0 or ::0 is provided, the receiver + * MAY mark the source address of the packet as its + * primary. + */ + if (af->is_any(&addr)) + memcpy(&addr.v4, sctp_source(asconf), sizeof(addr)); + peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_INV_PARAM; @@ -2921,11 +3019,9 @@ done: * after freeing the reference to old asconf ack if any. */ if (asconf_ack) { - if (asoc->addip_last_asconf_ack) - sctp_chunk_free(asoc->addip_last_asconf_ack); - sctp_chunk_hold(asconf_ack); - asoc->addip_last_asconf_ack = asconf_ack; + list_add_tail(&asconf_ack->transmitted_list, + &asoc->asconf_ack_list); } return asconf_ack; @@ -2959,7 +3055,7 @@ static int sctp_asconf_param_success(struct sctp_association *asoc, local_bh_disable(); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, &addr)) - saddr->use_as_src = 1; + saddr->state = SCTP_ADDR_SRC; } local_bh_enable(); break; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d247ed4ee42..61cbd5a8dd0 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -143,6 +143,12 @@ static sctp_ierror_t sctp_sf_authenticate(const struct sctp_endpoint *ep, const sctp_subtype_t type, struct sctp_chunk *chunk); +static sctp_disposition_t __sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); + /* Small helper function that checks if the chunk length * is of the appropriate length. The 'required_length' argument * is set to be the size of a specific chunk we are testing. @@ -475,7 +481,6 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep, sctp_init_chunk_t *initchunk; struct sctp_chunk *err_chunk; struct sctp_packet *packet; - sctp_error_t error; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); @@ -500,8 +505,12 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep, (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, &err_chunk)) { + sctp_error_t error = SCTP_ERROR_NO_RESOURCE; + /* This chunk contains fatal error. It is to be discarded. - * Send an ABORT, with causes if there is any. + * Send an ABORT, with causes. If there are no causes, + * then there wasn't enough memory. Just terminate + * the association. */ if (err_chunk) { packet = sctp_abort_pkt_new(ep, asoc, arg, @@ -517,12 +526,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep, SCTP_PACKET(packet)); SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); error = SCTP_ERROR_INV_PARAM; - } else { - error = SCTP_ERROR_NO_RESOURCE; } - } else { - sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); - error = SCTP_ERROR_INV_PARAM; } /* SCTP-AUTH, Section 6.3: @@ -2073,11 +2077,20 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + /* ADD-IP: Special case for ABORT chunks + * F4) One special consideration is that ABORT Chunks arriving + * destined to the IP address being deleted MUST be + * ignored (see Section 5.3.1 for further details). + */ + if (SCTP_ADDR_DEL == + sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + /* Stop the T5-shutdown guard timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); - return sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); } /* @@ -2109,6 +2122,15 @@ sctp_disposition_t sctp_sf_shutdown_sent_abort(const struct sctp_endpoint *ep, if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + /* ADD-IP: Special case for ABORT chunks + * F4) One special consideration is that ABORT Chunks arriving + * destined to the IP address being deleted MUST be + * ignored (see Section 5.3.1 for further details). + */ + if (SCTP_ADDR_DEL == + sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + /* Stop the T2-shutdown timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); @@ -2117,7 +2139,7 @@ sctp_disposition_t sctp_sf_shutdown_sent_abort(const struct sctp_endpoint *ep, sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); - return sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); } /* @@ -2344,8 +2366,6 @@ sctp_disposition_t sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - unsigned len; - __be16 error = SCTP_ERROR_NO_ERROR; if (!sctp_vtag_verify_either(chunk, asoc)) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); @@ -2363,6 +2383,28 @@ sctp_disposition_t sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + /* ADD-IP: Special case for ABORT chunks + * F4) One special consideration is that ABORT Chunks arriving + * destined to the IP address being deleted MUST be + * ignored (see Section 5.3.1 for further details). + */ + if (SCTP_ADDR_DEL == + sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + + return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); +} + +static sctp_disposition_t __sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + unsigned len; + __be16 error = SCTP_ERROR_NO_ERROR; + /* See if we have an error cause code in the chunk. */ len = ntohs(chunk->chunk_hdr->length); if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) @@ -3377,6 +3419,15 @@ sctp_disposition_t sctp_sf_do_asconf(const struct sctp_endpoint *ep, return sctp_sf_pdiscard(ep, asoc, type, arg, commands); } + /* ADD-IP: Section 4.1.1 + * This chunk MUST be sent in an authenticated way by using + * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk + * is received unauthenticated it MUST be silently discarded as + * described in [I-D.ietf-tsvwg-sctp-auth]. + */ + if (!sctp_addip_noauth && !chunk->auth) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + /* Make sure that the ASCONF ADDIP chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_addip_chunk_t))) return sctp_sf_violation_chunklen(ep, asoc, type, arg, @@ -3393,48 +3444,68 @@ sctp_disposition_t sctp_sf_do_asconf(const struct sctp_endpoint *ep, /* Verify the ASCONF chunk before processing it. */ if (!sctp_verify_asconf(asoc, - (sctp_paramhdr_t *)((void *)addr_param + length), - (void *)chunk->chunk_end, - &err_param)) + (sctp_paramhdr_t *)((void *)addr_param + length), + (void *)chunk->chunk_end, + &err_param)) return sctp_sf_violation_paramlen(ep, asoc, type, - (void *)&err_param, commands); + (void *)&err_param, commands); - /* ADDIP 4.2 C1) Compare the value of the serial number to the value + /* ADDIP 5.2 E1) Compare the value of the serial number to the value * the endpoint stored in a new association variable * 'Peer-Serial-Number'. */ if (serial == asoc->peer.addip_serial + 1) { - /* ADDIP 4.2 C2) If the value found in the serial number is - * equal to the ('Peer-Serial-Number' + 1), the endpoint MUST - * do V1-V5. + /* If this is the first instance of ASCONF in the packet, + * we can clean our old ASCONF-ACKs. + */ + if (!chunk->has_asconf) + sctp_assoc_clean_asconf_ack_cache(asoc); + + /* ADDIP 5.2 E4) When the Sequence Number matches the next one + * expected, process the ASCONF as described below and after + * processing the ASCONF Chunk, append an ASCONF-ACK Chunk to + * the response packet and cache a copy of it (in the event it + * later needs to be retransmitted). + * + * Essentially, do V1-V5. */ asconf_ack = sctp_process_asconf((struct sctp_association *) asoc, chunk); if (!asconf_ack) return SCTP_DISPOSITION_NOMEM; - } else if (serial == asoc->peer.addip_serial) { - /* ADDIP 4.2 C3) If the value found in the serial number is - * equal to the value stored in the 'Peer-Serial-Number' - * IMPLEMENTATION NOTE: As an optimization a receiver may wish - * to save the last ASCONF-ACK for some predetermined period of - * time and instead of re-processing the ASCONF (with the same - * serial number) it may just re-transmit the ASCONF-ACK. + } else if (serial < asoc->peer.addip_serial + 1) { + /* ADDIP 5.2 E2) + * If the value found in the Sequence Number is less than the + * ('Peer- Sequence-Number' + 1), simply skip to the next + * ASCONF, and include in the outbound response packet + * any previously cached ASCONF-ACK response that was + * sent and saved that matches the Sequence Number of the + * ASCONF. Note: It is possible that no cached ASCONF-ACK + * Chunk exists. This will occur when an older ASCONF + * arrives out of order. In such a case, the receiver + * should skip the ASCONF Chunk and not include ASCONF-ACK + * Chunk for that chunk. */ - if (asoc->addip_last_asconf_ack) - asconf_ack = asoc->addip_last_asconf_ack; - else + asconf_ack = sctp_assoc_lookup_asconf_ack(asoc, hdr->serial); + if (!asconf_ack) return SCTP_DISPOSITION_DISCARD; } else { - /* ADDIP 4.2 C4) Otherwise, the ASCONF Chunk is discarded since + /* ADDIP 5.2 E5) Otherwise, the ASCONF Chunk is discarded since * it must be either a stale packet or from an attacker. */ return SCTP_DISPOSITION_DISCARD; } - /* ADDIP 4.2 C5) In both cases C2 and C3 the ASCONF-ACK MUST be sent - * back to the source address contained in the IP header of the ASCONF - * being responded to. + /* ADDIP 5.2 E6) The destination address of the SCTP packet + * containing the ASCONF-ACK Chunks MUST be the source address of + * the SCTP packet that held the ASCONF Chunks. + * + * To do this properly, we'll set the destination address of the chunk + * and at the transmit time, will try look up the transport to use. + * Since ASCONFs may be bundled, the correct transport may not be + * created untill we process the entire packet, thus this workaround. */ + asconf_ack->dest = chunk->source; sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf_ack)); return SCTP_DISPOSITION_CONSUME; @@ -3463,6 +3534,15 @@ sctp_disposition_t sctp_sf_do_asconf_ack(const struct sctp_endpoint *ep, return sctp_sf_pdiscard(ep, asoc, type, arg, commands); } + /* ADD-IP, Section 4.1.2: + * This chunk MUST be sent in an authenticated way by using + * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk + * is received unauthenticated it MUST be silently discarded as + * described in [I-D.ietf-tsvwg-sctp-auth]. + */ + if (!sctp_addip_noauth && !asconf_ack->auth) + return sctp_sf_discard_chunk(ep, asoc, type, arg, commands); + /* Make sure that the ADDIP chunk has a valid length. */ if (!sctp_chunk_length_valid(asconf_ack, sizeof(sctp_addip_chunk_t))) return sctp_sf_violation_chunklen(ep, asoc, type, arg, @@ -5763,7 +5843,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, /* * Also try to renege to limit our memory usage in the event that * we are under memory pressure - * If we can't renege, don't worry about it, the sk_stream_rmem_schedule + * If we can't renege, don't worry about it, the sk_rmem_schedule * in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our * memory usage too much */ diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index a93a4bc8f68..e6016e41ffa 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -457,11 +457,11 @@ static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF */ @@ -478,11 +478,11 @@ static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_ASCONF_ACK */ @@ -691,11 +691,11 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ - TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ - TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ - TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ + TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \ } /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ea9649ca0b2..710df67a678 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -174,7 +174,8 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) sizeof(struct sctp_chunk); atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); - sk_charge_skb(sk, chunk->skb); + sk->sk_wmem_queued += chunk->skb->truesize; + sk_mem_charge(sk, chunk->skb->truesize); } /* Verify that this is a valid address. */ @@ -390,7 +391,7 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) /* Add the address to the bind address list. * Use GFP_ATOMIC since BHs will be disabled. */ - ret = sctp_add_bind_addr(bp, addr, 1, GFP_ATOMIC); + ret = sctp_add_bind_addr(bp, addr, SCTP_ADDR_SRC, GFP_ATOMIC); /* Copy back into socket for getsockname() use. */ if (!ret) { @@ -585,8 +586,8 @@ static int sctp_send_asconf_add_ip(struct sock *sk, addr = (union sctp_addr *)addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); memcpy(&saveaddr, addr, af->sockaddr_len); - retval = sctp_add_bind_addr(bp, &saveaddr, 0, - GFP_ATOMIC); + retval = sctp_add_bind_addr(bp, &saveaddr, + SCTP_ADDR_NEW, GFP_ATOMIC); addr_buf += af->sockaddr_len; } } @@ -777,7 +778,7 @@ static int sctp_send_asconf_del_ip(struct sock *sk, af = sctp_get_af_specific(laddr->v4.sin_family); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, laddr)) - saddr->use_as_src = 0; + saddr->state = SCTP_ADDR_DEL; } addr_buf += af->sockaddr_len; } @@ -6008,7 +6009,8 @@ static void __sctp_write_space(struct sctp_association *asoc) */ if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, 2, POLL_OUT); + sock_wake_async(sock, + SOCK_WAKE_SPACE, POLL_OUT); } } } @@ -6034,10 +6036,10 @@ static void sctp_wfree(struct sk_buff *skb) atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc); /* - * This undoes what is done via sk_charge_skb + * This undoes what is done via sctp_set_owner_w and sk_mem_charge */ sk->sk_wmem_queued -= skb->truesize; - sk->sk_forward_alloc += skb->truesize; + sk_mem_uncharge(sk, skb->truesize); sock_wfree(skb); __sctp_write_space(asoc); @@ -6058,9 +6060,9 @@ void sctp_sock_rfree(struct sk_buff *skb) atomic_sub(event->rmem_len, &sk->sk_rmem_alloc); /* - * Mimic the behavior of sk_stream_rfree + * Mimic the behavior of sock_rfree */ - sk->sk_forward_alloc += event->rmem_len; + sk_mem_uncharge(sk, event->rmem_len); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index da4f15734fb..5eb6ea829b5 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -275,24 +275,10 @@ static ctl_table sctp_table[] = { { .ctl_name = 0 } }; -static ctl_table sctp_net_table[] = { - { - .ctl_name = NET_SCTP, - .procname = "sctp", - .mode = 0555, - .child = sctp_table - }, - { .ctl_name = 0 } -}; - -static ctl_table sctp_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = sctp_net_table - }, - { .ctl_name = 0 } +static struct ctl_path sctp_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "sctp", .ctl_name = NET_SCTP, }, + { } }; static struct ctl_table_header * sctp_sysctl_header; @@ -300,7 +286,7 @@ static struct ctl_table_header * sctp_sysctl_header; /* Sysctl registration. */ void sctp_sysctl_register(void) { - sctp_sysctl_header = register_sysctl_table(sctp_root_table); + sctp_sysctl_header = register_sysctl_paths(sctp_path, sctp_table); } /* Sysctl deregistration. */ diff --git a/net/sctp/transport.c b/net/sctp/transport.c index d55ce83a020..dfa109341ae 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -99,15 +99,10 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, INIT_LIST_HEAD(&peer->send_ready); INIT_LIST_HEAD(&peer->transports); - /* Set up the retransmission timer. */ - init_timer(&peer->T3_rtx_timer); - peer->T3_rtx_timer.function = sctp_generate_t3_rtx_event; - peer->T3_rtx_timer.data = (unsigned long)peer; - - /* Set up the heartbeat timer. */ - init_timer(&peer->hb_timer); - peer->hb_timer.function = sctp_generate_heartbeat_event; - peer->hb_timer.data = (unsigned long)peer; + setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, + (unsigned long)peer); + setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, + (unsigned long)peer); /* Initialize the 64-bit random nonce sent with heartbeat. */ get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 307314356e1..047c27df98f 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -700,7 +700,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, if (rx_count >= asoc->base.sk->sk_rcvbuf) { if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_stream_rmem_schedule(asoc->base.sk, chunk->skb))) + (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) goto fail; } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 1733fa29a50..c25caefa3bc 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -1046,7 +1046,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, sctp_ulpq_partial_delivery(ulpq, chunk, gfp); } - sk_stream_mem_reclaim(asoc->base.sk); + sk_mem_reclaim(asoc->base.sk); return; } diff --git a/net/socket.c b/net/socket.c index 74784dfe8e5..7651de00850 100644 --- a/net/socket.c +++ b/net/socket.c @@ -112,6 +112,9 @@ static long compat_sock_ioctl(struct file *file, static int sock_fasync(int fd, struct file *filp, int on); static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more); +static ssize_t sock_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); /* * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear @@ -134,6 +137,7 @@ static const struct file_operations socket_file_ops = { .fasync = sock_fasync, .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, + .splice_read = sock_splice_read, }; /* @@ -691,6 +695,15 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, return sock->ops->sendpage(sock, page, offset, size, flags); } +static ssize_t sock_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct socket *sock = file->private_data; + + return sock->ops->splice_read(sock, ppos, pipe, len, flags); +} + static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, struct sock_iocb *siocb) { @@ -1057,20 +1070,19 @@ int sock_wake_async(struct socket *sock, int how, int band) if (!sock || !sock->fasync_list) return -1; switch (how) { - case 1: - + case SOCK_WAKE_WAITD: if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) break; goto call_kill; - case 2: + case SOCK_WAKE_SPACE: if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) break; /* fall through */ - case 0: + case SOCK_WAKE_IO: call_kill: __kill_fasync(sock->fasync_list, SIGIO, band); break; - case 3: + case SOCK_WAKE_URG: __kill_fasync(sock->fasync_list, SIGURG, band); } return 0; @@ -1353,17 +1365,17 @@ asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) * ready for listening. */ -int sysctl_somaxconn __read_mostly = SOMAXCONN; - asmlinkage long sys_listen(int fd, int backlog) { struct socket *sock; int err, fput_needed; + int somaxconn; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { - if ((unsigned)backlog > sysctl_somaxconn) - backlog = sysctl_somaxconn; + somaxconn = sock->sk->sk_net->sysctl_somaxconn; + if ((unsigned)backlog > somaxconn) + backlog = somaxconn; err = security_socket_listen(sock, backlog); if (!err) @@ -1581,16 +1593,11 @@ asmlinkage long sys_sendto(int fd, void __user *buff, size_t len, struct msghdr msg; struct iovec iov; int fput_needed; - struct file *sock_file; - sock_file = fget_light(fd, &fput_needed); - err = -EBADF; - if (!sock_file) + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) goto out; - sock = sock_from_file(sock_file, &err); - if (!sock) - goto out_put; iov.iov_base = buff; iov.iov_len = len; msg.msg_name = NULL; @@ -1612,7 +1619,7 @@ asmlinkage long sys_sendto(int fd, void __user *buff, size_t len, err = sock_sendmsg(sock, &msg, len); out_put: - fput_light(sock_file, fput_needed); + fput_light(sock->file, fput_needed); out: return err; } @@ -1641,17 +1648,11 @@ asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size, struct msghdr msg; char address[MAX_SOCK_ADDR]; int err, err2; - struct file *sock_file; int fput_needed; - sock_file = fget_light(fd, &fput_needed); - err = -EBADF; - if (!sock_file) - goto out; - - sock = sock_from_file(sock_file, &err); + sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) - goto out_put; + goto out; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -1670,8 +1671,8 @@ asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size, if (err2 < 0) err = err2; } -out_put: - fput_light(sock_file, fput_needed); + + fput_light(sock->file, fput_needed); out: return err; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 8e05557414c..73f053d0cc7 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1127,6 +1127,7 @@ struct handle { }; static void *c_start(struct seq_file *m, loff_t *pos) + __acquires(cd->hash_lock) { loff_t n = *pos; unsigned hash, entry; @@ -1183,6 +1184,7 @@ static void *c_next(struct seq_file *m, void *p, loff_t *pos) } static void c_stop(struct seq_file *m, void *p) + __releases(cd->hash_lock) { struct cache_detail *cd = ((struct handle*)m->private)->cd; read_unlock(&cd->hash_lock); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c98873f39ae..eed5dd9819c 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -811,9 +811,8 @@ EXPORT_SYMBOL_GPL(rpc_free); void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata) { memset(task, 0, sizeof(*task)); - init_timer(&task->tk_timer); - task->tk_timer.data = (unsigned long) task; - task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer; + setup_timer(&task->tk_timer, (void (*)(unsigned long))rpc_run_timer, + (unsigned long)task); atomic_set(&task->tk_count, 1); task->tk_client = clnt; task->tk_flags = flags; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index cd641c8634f..fb92f51405c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1011,9 +1011,8 @@ found: INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); INIT_WORK(&xprt->task_cleanup, xprt_autoclose); - init_timer(&xprt->timer); - xprt->timer.function = xprt_init_autodisconnect; - xprt->timer.data = (unsigned long) xprt; + setup_timer(&xprt->timer, xprt_init_autodisconnect, + (unsigned long)xprt); xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; xprt->bind_index = 0; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ee8de7af2a5..1aa1580cda6 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -380,7 +380,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_xid = rqst->rq_xid; headerp->rm_vers = xdr_one; headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); - headerp->rm_type = __constant_htonl(RDMA_MSG); + headerp->rm_type = htonl(RDMA_MSG); /* * Chunks needed for results? @@ -458,11 +458,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) RPCRDMA_INLINE_PAD_VALUE(rqst)); if (padlen) { - headerp->rm_type = __constant_htonl(RDMA_MSGP); + headerp->rm_type = htonl(RDMA_MSGP); headerp->rm_body.rm_padded.rm_align = htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); headerp->rm_body.rm_padded.rm_thresh = - __constant_htonl(RPCRDMA_INLINE_PAD_THRESH); + htonl(RPCRDMA_INLINE_PAD_THRESH); headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2f630a512ab..6fa52f44de0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -838,8 +838,12 @@ static void xs_udp_data_ready(struct sock *sk, int len) copied = repsize; /* Suck it into the iovec, verify checksum if not done by hw. */ - if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) + if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { + UDPX_INC_STATS_BH(sk, UDP_MIB_INERRORS); goto out_unlock; + } + + UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS); /* Something worked... */ dst_confirm(skb->dst); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index cd4eafbab1b..665e856675a 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -14,6 +14,7 @@ #include <linux/mm.h> #include <linux/sysctl.h> +#include <linux/nsproxy.h> #include <net/sock.h> @@ -29,28 +30,58 @@ #include <linux/if_tr.h> #endif -struct ctl_table net_table[] = { - { - .ctl_name = NET_CORE, - .procname = "core", - .mode = 0555, - .child = core_table, - }, -#ifdef CONFIG_INET - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = ipv4_table - }, -#endif -#ifdef CONFIG_TR - { - .ctl_name = NET_TR, - .procname = "token-ring", - .mode = 0555, - .child = tr_table, - }, -#endif - { 0 }, +static struct list_head * +net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + return &namespaces->net_ns->sysctl_table_headers; +} + +static struct ctl_table_root net_sysctl_root = { + .lookup = net_ctl_header_lookup, +}; + +static int sysctl_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->sysctl_table_headers); + return 0; +} + +static void sysctl_net_exit(struct net *net) +{ + WARN_ON(!list_empty(&net->sysctl_table_headers)); + return; +} + +static struct pernet_operations sysctl_pernet_ops = { + .init = sysctl_net_init, + .exit = sysctl_net_exit, }; + +static __init int sysctl_init(void) +{ + int ret; + ret = register_pernet_subsys(&sysctl_pernet_ops); + if (ret) + goto out; + register_sysctl_root(&net_sysctl_root); +out: + return ret; +} +subsys_initcall(sysctl_init); + +struct ctl_table_header *register_net_sysctl_table(struct net *net, + const struct ctl_path *path, struct ctl_table *table) +{ + struct nsproxy namespaces; + namespaces = *current->nsproxy; + namespaces.net_ns = net; + return __register_sysctl_paths(&net_sysctl_root, + &namespaces, path, table); +} +EXPORT_SYMBOL_GPL(register_net_sysctl_table); + +void unregister_net_sysctl_table(struct ctl_table_header *header) +{ + return unregister_sysctl_table(header); +} +EXPORT_SYMBOL_GPL(unregister_net_sysctl_table); diff --git a/net/tipc/core.h b/net/tipc/core.h index e40ada964d6..feabca58082 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -212,9 +212,7 @@ static inline void k_init_timer(struct timer_list *timer, Handler routine, unsigned long argument) { dbg("initializing timer %p\n", timer); - init_timer(timer); - timer->function = routine; - timer->data = argument; + setup_timer(timer, routine, argument); } /** diff --git a/net/tipc/port.c b/net/tipc/port.c index 76088153524..f508614ca59 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -340,7 +340,7 @@ int tipc_portunreliable(u32 ref, unsigned int *isunreliable) if (!p_ptr) return -EINVAL; *isunreliable = port_unreliable(p_ptr); - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); return TIPC_OK; } @@ -369,7 +369,7 @@ int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable) if (!p_ptr) return -EINVAL; *isunrejectable = port_unreturnable(p_ptr); - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); return TIPC_OK; } @@ -843,7 +843,7 @@ static void port_dispatcher_sigh(void *dummy) u32 peer_port = port_peerport(p_ptr); u32 peer_node = port_peernode(p_ptr); - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); if (unlikely(!connected)) { if (unlikely(published)) goto reject; @@ -867,7 +867,7 @@ static void port_dispatcher_sigh(void *dummy) case TIPC_DIRECT_MSG:{ tipc_msg_event cb = up_ptr->msg_cb; - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); if (unlikely(connected)) goto reject; if (unlikely(!cb)) @@ -882,7 +882,7 @@ static void port_dispatcher_sigh(void *dummy) case TIPC_NAMED_MSG:{ tipc_named_msg_event cb = up_ptr->named_msg_cb; - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); if (unlikely(connected)) goto reject; if (unlikely(!cb)) @@ -913,7 +913,7 @@ err: u32 peer_port = port_peerport(p_ptr); u32 peer_node = port_peernode(p_ptr); - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); if (!connected || !cb) break; if (msg_origport(msg) != peer_port) @@ -929,7 +929,7 @@ err: case TIPC_DIRECT_MSG:{ tipc_msg_err_event cb = up_ptr->err_cb; - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); if (connected || !cb) break; skb_pull(buf, msg_hdr_sz(msg)); @@ -942,7 +942,7 @@ err: tipc_named_msg_err_event cb = up_ptr->named_err_cb; - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); if (connected || !cb) break; dseq.type = msg_nametype(msg); @@ -1107,7 +1107,7 @@ int tipc_portimportance(u32 ref, unsigned int *importance) if (!p_ptr) return -EINVAL; *importance = (unsigned int)msg_importance(&p_ptr->publ.phdr); - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); return TIPC_OK; } @@ -1122,7 +1122,7 @@ int tipc_set_portimportance(u32 ref, unsigned int imp) if (!p_ptr) return -EINVAL; msg_set_importance(&p_ptr->publ.phdr, (u32)imp); - spin_unlock_bh(p_ptr->publ.lock); + tipc_port_unlock(p_ptr); return TIPC_OK; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 060bba4567d..eea75888805 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,8 +117,6 @@ #include <net/checksum.h> #include <linux/security.h> -int sysctl_unix_max_dgram_qlen __read_mostly = 10; - static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); static atomic_t unix_nr_socks = ATOMIC_INIT(0); @@ -127,32 +125,6 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE) -static struct sock *first_unix_socket(int *i) -{ - for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); - } - return NULL; -} - -static struct sock *next_unix_socket(int *i, struct sock *s) -{ - struct sock *next = sk_next(s); - /* More in this chain? */ - if (next) - return next; - /* Look for next non-empty chain. */ - for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); - } - return NULL; -} - -#define forall_unix_sockets(i, s) \ - for (s = first_unix_socket(&(i)); s; s = next_unix_socket(&(i),(s))) - #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { @@ -270,7 +242,8 @@ static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) spin_unlock(&unix_table_lock); } -static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, +static struct sock *__unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash) { struct sock *s; @@ -279,6 +252,9 @@ static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, sk_for_each(s, node, &unix_socket_table[hash ^ type]) { struct unix_sock *u = unix_sk(s); + if (s->sk_net != net) + continue; + if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) goto found; @@ -288,21 +264,22 @@ found: return s; } -static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, +static inline struct sock *unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash) { struct sock *s; spin_lock(&unix_table_lock); - s = __unix_find_socket_byname(sunname, len, type, hash); + s = __unix_find_socket_byname(net, sunname, len, type, hash); if (s) sock_hold(s); spin_unlock(&unix_table_lock); return s; } -static struct sock *unix_find_socket_byinode(struct inode *i) +static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) { struct sock *s; struct hlist_node *node; @@ -312,6 +289,9 @@ static struct sock *unix_find_socket_byinode(struct inode *i) &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; + if (s->sk_net != net) + continue; + if(dentry && dentry->d_inode == i) { sock_hold(s); @@ -335,7 +315,7 @@ static void unix_write_space(struct sock *sk) if (unix_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible_sync(sk->sk_sleep); - sk_wake_async(sk, 2, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } read_unlock(&sk->sk_callback_lock); } @@ -421,7 +401,7 @@ static int unix_release_sock (struct sock *sk, int embrion) unix_state_unlock(skpair); skpair->sk_state_change(skpair); read_lock(&skpair->sk_callback_lock); - sk_wake_async(skpair,1,POLL_HUP); + sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); read_unlock(&skpair->sk_callback_lock); } sock_put(skpair); /* It may now die */ @@ -612,7 +592,7 @@ static struct sock * unix_create1(struct net *net, struct socket *sock) &af_unix_sk_receive_queue_lock_key); sk->sk_write_space = unix_write_space; - sk->sk_max_ack_backlog = sysctl_unix_max_dgram_qlen; + sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->dentry = NULL; @@ -631,9 +611,6 @@ out: static int unix_create(struct net *net, struct socket *sock, int protocol) { - if (net != &init_net) - return -EAFNOSUPPORT; - if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -677,6 +654,7 @@ static int unix_release(struct socket *sock) static int unix_autobind(struct socket *sock) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk); static u32 ordernum = 1; struct unix_address * addr; @@ -703,7 +681,7 @@ retry: spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(addr->name, addr->len, sock->type, + if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, addr->hash)) { spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... */ @@ -723,7 +701,8 @@ out: mutex_unlock(&u->readlock); return err; } -static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, +static struct sock *unix_find_other(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash, int *error) { struct sock *u; @@ -741,7 +720,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, err = -ECONNREFUSED; if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) goto put_fail; - u=unix_find_socket_byinode(nd.dentry->d_inode); + u=unix_find_socket_byinode(net, nd.dentry->d_inode); if (!u) goto put_fail; @@ -757,7 +736,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, } } else { err = -ECONNREFUSED; - u=unix_find_socket_byname(sunname, len, type, hash); + u=unix_find_socket_byname(net, sunname, len, type, hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->dentry; @@ -779,6 +758,7 @@ fail: static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; struct dentry * dentry = NULL; @@ -853,7 +833,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; - if (__unix_find_socket_byname(sunaddr, addr_len, + if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { unix_release_addr(addr); goto out_unlock; @@ -919,6 +899,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr; struct sock *other; unsigned hash; @@ -935,7 +916,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, goto out; restart: - other=unix_find_other(sunaddr, alen, sock->type, hash, &err); + other=unix_find_other(net, sunaddr, alen, sock->type, hash, &err); if (!other) goto out; @@ -1015,6 +996,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, { struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct sock *newsk = NULL; struct sock *other = NULL; @@ -1054,7 +1036,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); if (!other) goto out; @@ -1330,6 +1312,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, { struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; + struct net *net = sk->sk_net; struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr=msg->msg_name; struct sock *other = NULL; @@ -1393,7 +1376,7 @@ restart: if (sunaddr == NULL) goto out_free; - other = unix_find_other(sunaddr, namelen, sk->sk_type, + other = unix_find_other(net, sunaddr, namelen, sk->sk_type, hash, &err); if (other==NULL) goto out_free; @@ -1915,9 +1898,9 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_state_change(other); read_lock(&other->sk_callback_lock); if (peer_mode == SHUTDOWN_MASK) - sk_wake_async(other,1,POLL_HUP); + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) - sk_wake_async(other,1,POLL_IN); + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); read_unlock(&other->sk_callback_lock); } if (other) @@ -2006,12 +1989,41 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl #ifdef CONFIG_PROC_FS -static struct sock *unix_seq_idx(int *iter, loff_t pos) +static struct sock *first_unix_socket(int *i) +{ + for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { + if (!hlist_empty(&unix_socket_table[*i])) + return __sk_head(&unix_socket_table[*i]); + } + return NULL; +} + +static struct sock *next_unix_socket(int *i, struct sock *s) +{ + struct sock *next = sk_next(s); + /* More in this chain? */ + if (next) + return next; + /* Look for next non-empty chain. */ + for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { + if (!hlist_empty(&unix_socket_table[*i])) + return __sk_head(&unix_socket_table[*i]); + } + return NULL; +} + +struct unix_iter_state { + struct seq_net_private p; + int i; +}; +static struct sock *unix_seq_idx(struct unix_iter_state *iter, loff_t pos) { loff_t off = 0; struct sock *s; - for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) { + for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { + if (s->sk_net != iter->p.net) + continue; if (off == pos) return s; ++off; @@ -2021,21 +2033,30 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos) static void *unix_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(unix_table_lock) { + struct unix_iter_state *iter = seq->private; spin_lock(&unix_table_lock); - return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); + return *pos ? unix_seq_idx(iter, *pos - 1) : ((void *) 1); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct unix_iter_state *iter = seq->private; + struct sock *sk = v; ++*pos; if (v == (void *)1) - return first_unix_socket(seq->private); - return next_unix_socket(seq->private, v); + sk = first_unix_socket(&iter->i); + else + sk = next_unix_socket(&iter->i, sk); + while (sk && (sk->sk_net != iter->p.net)) + sk = next_unix_socket(&iter->i, sk); + return sk; } static void unix_seq_stop(struct seq_file *seq, void *v) + __releases(unix_table_lock) { spin_unlock(&unix_table_lock); } @@ -2094,7 +2115,8 @@ static const struct seq_operations unix_seq_ops = { static int unix_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &unix_seq_ops, sizeof(int)); + return seq_open_net(inode, file, &unix_seq_ops, + sizeof(struct unix_iter_state)); } static const struct file_operations unix_seq_fops = { @@ -2102,7 +2124,7 @@ static const struct file_operations unix_seq_fops = { .open = unix_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif @@ -2113,6 +2135,37 @@ static struct net_proto_family unix_family_ops = { .owner = THIS_MODULE, }; + +static int unix_net_init(struct net *net) +{ + int error = -ENOMEM; + + net->unx.sysctl_max_dgram_qlen = 10; + if (unix_sysctl_register(net)) + goto out; + +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) { + unix_sysctl_unregister(net); + goto out; + } +#endif + error = 0; +out: + return 0; +} + +static void unix_net_exit(struct net *net) +{ + unix_sysctl_unregister(net); + proc_net_remove(net, "unix"); +} + +static struct pernet_operations unix_net_ops = { + .init = unix_net_init, + .exit = unix_net_exit, +}; + static int __init af_unix_init(void) { int rc = -1; @@ -2128,10 +2181,7 @@ static int __init af_unix_init(void) } sock_register(&unix_family_ops); -#ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "unix", 0, &unix_seq_fops); -#endif - unix_sysctl_register(); + register_pernet_subsys(&unix_net_ops); out: return rc; } @@ -2139,9 +2189,8 @@ out: static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); - unix_sysctl_unregister(); - proc_net_remove(&init_net, "unix"); proto_unregister(&unix_proto); + unregister_pernet_subsys(&unix_net_ops); } module_init(af_unix_init); diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index eb0bd57ebad..77513d7e35f 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -18,7 +18,7 @@ static ctl_table unix_table[] = { { .ctl_name = NET_UNIX_MAX_DGRAM_QLEN, .procname = "max_dgram_qlen", - .data = &sysctl_unix_max_dgram_qlen, + .data = &init_net.unx.sysctl_max_dgram_qlen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec @@ -26,35 +26,39 @@ static ctl_table unix_table[] = { { .ctl_name = 0 } }; -static ctl_table unix_net_table[] = { - { - .ctl_name = NET_UNIX, - .procname = "unix", - .mode = 0555, - .child = unix_table - }, - { .ctl_name = 0 } +static struct ctl_path unix_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "unix", .ctl_name = NET_UNIX, }, + { }, }; -static ctl_table unix_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = unix_net_table - }, - { .ctl_name = 0 } -}; +int unix_sysctl_register(struct net *net) +{ + struct ctl_table *table; -static struct ctl_table_header * unix_sysctl_header; + table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; -void unix_sysctl_register(void) -{ - unix_sysctl_header = register_sysctl_table(unix_root_table); + table[0].data = &net->unx.sysctl_max_dgram_qlen; + net->unx.ctl = register_net_sysctl_table(net, unix_path, table); + if (net->unx.ctl == NULL) + goto err_reg; + + return 0; + +err_reg: + kfree(table); +err_alloc: + return -ENOMEM; } -void unix_sysctl_unregister(void) +void unix_sysctl_unregister(struct net *net) { - unregister_sysctl_table(unix_sysctl_header); + struct ctl_table *table; + + table = net->unx.ctl->ctl_table_arg; + unregister_sysctl_table(net->unx.ctl); + kfree(table); } diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 6426055a8be..79270903bda 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -6,13 +6,13 @@ config NL80211 depends on CFG80211 default y ---help--- - This option turns on the new netlink interface - (nl80211) support in cfg80211. + This option turns on the new netlink interface + (nl80211) support in cfg80211. - If =n, drivers using mac80211 will be configured via - wireless extension support provided by that subsystem. + If =n, drivers using mac80211 will be configured via + wireless extension support provided by that subsystem. - If unsure, say Y. + If unsure, say Y. config WIRELESS_EXT bool "Wireless extensions" diff --git a/net/wireless/core.c b/net/wireless/core.c index febc33bc9c0..cfc5fc5f9e7 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -184,6 +184,9 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv) struct cfg80211_registered_device *drv; int alloc_size; + WARN_ON(!ops->add_key && ops->del_key); + WARN_ON(ops->add_key && !ops->del_key); + alloc_size = sizeof(*drv) + sizeof_priv; drv = kzalloc(alloc_size, GFP_KERNEL); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 48b0d453e4e..e3a214f63f9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -61,6 +61,27 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, + + [NL80211_ATTR_MAC] = { .type = NLA_BINARY, .len = ETH_ALEN }, + + [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, + .len = WLAN_MAX_KEY_LEN }, + [NL80211_ATTR_KEY_IDX] = { .type = NLA_U8 }, + [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 }, + [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG }, + + [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, + [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, + [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_BEACON_TAIL] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_STA_AID] = { .type = NLA_U16 }, + [NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED }, + [NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 }, + [NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 }, }; /* message building helper */ @@ -335,6 +356,655 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) return err; } +struct get_key_cookie { + struct sk_buff *msg; + int error; +}; + +static void get_key_callback(void *c, struct key_params *params) +{ + struct get_key_cookie *cookie = c; + + if (params->key) + NLA_PUT(cookie->msg, NL80211_ATTR_KEY_DATA, + params->key_len, params->key); + + if (params->seq) + NLA_PUT(cookie->msg, NL80211_ATTR_KEY_SEQ, + params->seq_len, params->seq); + + if (params->cipher) + NLA_PUT_U32(cookie->msg, NL80211_ATTR_KEY_CIPHER, + params->cipher); + + return; + nla_put_failure: + cookie->error = 1; +} + +static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + u8 key_idx = 0; + u8 *mac_addr = NULL; + struct get_key_cookie cookie = { + .error = 0, + }; + void *hdr; + struct sk_buff *msg; + + if (info->attrs[NL80211_ATTR_KEY_IDX]) + key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); + + if (key_idx > 3) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + if (!drv->ops->get_key) { + err = -EOPNOTSUPP; + goto out; + } + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, + NL80211_CMD_NEW_KEY); + + if (IS_ERR(hdr)) { + err = PTR_ERR(hdr); + goto out; + } + + cookie.msg = msg; + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT_U8(msg, NL80211_ATTR_KEY_IDX, key_idx); + if (mac_addr) + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr); + + rtnl_lock(); + err = drv->ops->get_key(&drv->wiphy, dev, key_idx, mac_addr, + &cookie, get_key_callback); + rtnl_unlock(); + + if (err) + goto out; + + if (cookie.error) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + err = genlmsg_unicast(msg, info->snd_pid); + goto out; + + nla_put_failure: + err = -ENOBUFS; + nlmsg_free(msg); + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + u8 key_idx; + + if (!info->attrs[NL80211_ATTR_KEY_IDX]) + return -EINVAL; + + key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); + + if (key_idx > 3) + return -EINVAL; + + /* currently only support setting default key */ + if (!info->attrs[NL80211_ATTR_KEY_DEFAULT]) + return -EINVAL; + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + if (!drv->ops->set_default_key) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->set_default_key(&drv->wiphy, dev, key_idx); + rtnl_unlock(); + + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + struct key_params params; + u8 key_idx = 0; + u8 *mac_addr = NULL; + + memset(¶ms, 0, sizeof(params)); + + if (!info->attrs[NL80211_ATTR_KEY_CIPHER]) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_KEY_DATA]) { + params.key = nla_data(info->attrs[NL80211_ATTR_KEY_DATA]); + params.key_len = nla_len(info->attrs[NL80211_ATTR_KEY_DATA]); + } + + if (info->attrs[NL80211_ATTR_KEY_IDX]) + key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); + + params.cipher = nla_get_u32(info->attrs[NL80211_ATTR_KEY_CIPHER]); + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (key_idx > 3) + return -EINVAL; + + /* + * Disallow pairwise keys with non-zero index unless it's WEP + * (because current deployments use pairwise WEP keys with + * non-zero indizes but 802.11i clearly specifies to use zero) + */ + if (mac_addr && key_idx && + params.cipher != WLAN_CIPHER_SUITE_WEP40 && + params.cipher != WLAN_CIPHER_SUITE_WEP104) + return -EINVAL; + + /* TODO: add definitions for the lengths to linux/ieee80211.h */ + switch (params.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + if (params.key_len != 5) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_TKIP: + if (params.key_len != 32) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_CCMP: + if (params.key_len != 16) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_WEP104: + if (params.key_len != 13) + return -EINVAL; + break; + default: + return -EINVAL; + } + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + if (!drv->ops->add_key) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->add_key(&drv->wiphy, dev, key_idx, mac_addr, ¶ms); + rtnl_unlock(); + + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + u8 key_idx = 0; + u8 *mac_addr = NULL; + + if (info->attrs[NL80211_ATTR_KEY_IDX]) + key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); + + if (key_idx > 3) + return -EINVAL; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + if (!drv->ops->del_key) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->del_key(&drv->wiphy, dev, key_idx, mac_addr); + rtnl_unlock(); + + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info) +{ + int (*call)(struct wiphy *wiphy, struct net_device *dev, + struct beacon_parameters *info); + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + struct beacon_parameters params; + int haveinfo = 0; + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + switch (info->genlhdr->cmd) { + case NL80211_CMD_NEW_BEACON: + /* these are required for NEW_BEACON */ + if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] || + !info->attrs[NL80211_ATTR_DTIM_PERIOD] || + !info->attrs[NL80211_ATTR_BEACON_HEAD]) { + err = -EINVAL; + goto out; + } + + call = drv->ops->add_beacon; + break; + case NL80211_CMD_SET_BEACON: + call = drv->ops->set_beacon; + break; + default: + WARN_ON(1); + err = -EOPNOTSUPP; + goto out; + } + + if (!call) { + err = -EOPNOTSUPP; + goto out; + } + + memset(¶ms, 0, sizeof(params)); + + if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { + params.interval = + nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); + haveinfo = 1; + } + + if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) { + params.dtim_period = + nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]); + haveinfo = 1; + } + + if (info->attrs[NL80211_ATTR_BEACON_HEAD]) { + params.head = nla_data(info->attrs[NL80211_ATTR_BEACON_HEAD]); + params.head_len = + nla_len(info->attrs[NL80211_ATTR_BEACON_HEAD]); + haveinfo = 1; + } + + if (info->attrs[NL80211_ATTR_BEACON_TAIL]) { + params.tail = nla_data(info->attrs[NL80211_ATTR_BEACON_TAIL]); + params.tail_len = + nla_len(info->attrs[NL80211_ATTR_BEACON_TAIL]); + haveinfo = 1; + } + + if (!haveinfo) { + err = -EINVAL; + goto out; + } + + rtnl_lock(); + err = call(&drv->wiphy, dev, ¶ms); + rtnl_unlock(); + + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + if (!drv->ops->del_beacon) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->del_beacon(&drv->wiphy, dev); + rtnl_unlock(); + + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { + [NL80211_STA_FLAG_AUTHORIZED] = { .type = NLA_FLAG }, + [NL80211_STA_FLAG_SHORT_PREAMBLE] = { .type = NLA_FLAG }, + [NL80211_STA_FLAG_WME] = { .type = NLA_FLAG }, +}; + +static int parse_station_flags(struct nlattr *nla, u32 *staflags) +{ + struct nlattr *flags[NL80211_STA_FLAG_MAX + 1]; + int flag; + + *staflags = 0; + + if (!nla) + return 0; + + if (nla_parse_nested(flags, NL80211_STA_FLAG_MAX, + nla, sta_flags_policy)) + return -EINVAL; + + *staflags = STATION_FLAG_CHANGED; + + for (flag = 1; flag <= NL80211_STA_FLAG_MAX; flag++) + if (flags[flag]) + *staflags |= (1<<flag); + + return 0; +} + +static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq, + int flags, struct net_device *dev, + u8 *mac_addr, struct station_stats *stats) +{ + void *hdr; + struct nlattr *statsattr; + + hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_STATION); + if (!hdr) + return -1; + + NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex); + NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr); + + statsattr = nla_nest_start(msg, NL80211_ATTR_STA_STATS); + if (!statsattr) + goto nla_put_failure; + if (stats->filled & STATION_STAT_INACTIVE_TIME) + NLA_PUT_U32(msg, NL80211_STA_STAT_INACTIVE_TIME, + stats->inactive_time); + if (stats->filled & STATION_STAT_RX_BYTES) + NLA_PUT_U32(msg, NL80211_STA_STAT_RX_BYTES, + stats->rx_bytes); + if (stats->filled & STATION_STAT_TX_BYTES) + NLA_PUT_U32(msg, NL80211_STA_STAT_TX_BYTES, + stats->tx_bytes); + + nla_nest_end(msg, statsattr); + + return genlmsg_end(msg, hdr); + + nla_put_failure: + return genlmsg_cancel(msg, hdr); +} + + +static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + struct station_stats stats; + struct sk_buff *msg; + u8 *mac_addr = NULL; + + memset(&stats, 0, sizeof(stats)); + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + if (!drv->ops->get_station) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->get_station(&drv->wiphy, dev, mac_addr, &stats); + rtnl_unlock(); + + msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + goto out; + + if (nl80211_send_station(msg, info->snd_pid, info->snd_seq, 0, + dev, mac_addr, &stats) < 0) + goto out_free; + + err = genlmsg_unicast(msg, info->snd_pid); + goto out; + + out_free: + nlmsg_free(msg); + + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +/* + * Get vlan interface making sure it is on the right wiphy. + */ +static int get_vlan(struct nlattr *vlanattr, + struct cfg80211_registered_device *rdev, + struct net_device **vlan) +{ + *vlan = NULL; + + if (vlanattr) { + *vlan = dev_get_by_index(&init_net, nla_get_u32(vlanattr)); + if (!*vlan) + return -ENODEV; + if (!(*vlan)->ieee80211_ptr) + return -EINVAL; + if ((*vlan)->ieee80211_ptr->wiphy != &rdev->wiphy) + return -EINVAL; + } + return 0; +} + +static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + struct station_parameters params; + u8 *mac_addr = NULL; + + memset(¶ms, 0, sizeof(params)); + + params.listen_interval = -1; + + if (info->attrs[NL80211_ATTR_STA_AID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) { + params.supported_rates = + nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.supported_rates_len = + nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + } + + if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) + params.listen_interval = + nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + + if (parse_station_flags(info->attrs[NL80211_ATTR_STA_FLAGS], + ¶ms.station_flags)) + return -EINVAL; + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + err = get_vlan(info->attrs[NL80211_ATTR_STA_VLAN], drv, ¶ms.vlan); + if (err) + goto out; + + if (!drv->ops->change_station) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->change_station(&drv->wiphy, dev, mac_addr, ¶ms); + rtnl_unlock(); + + out: + if (params.vlan) + dev_put(params.vlan); + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + struct station_parameters params; + u8 *mac_addr = NULL; + + memset(¶ms, 0, sizeof(params)); + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STA_AID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) + return -EINVAL; + + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + params.supported_rates = + nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.supported_rates_len = + nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]); + params.listen_interval = + nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + + if (parse_station_flags(info->attrs[NL80211_ATTR_STA_FLAGS], + ¶ms.station_flags)) + return -EINVAL; + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + err = get_vlan(info->attrs[NL80211_ATTR_STA_VLAN], drv, ¶ms.vlan); + if (err) + goto out; + + if (!drv->ops->add_station) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->add_station(&drv->wiphy, dev, mac_addr, ¶ms); + rtnl_unlock(); + + out: + if (params.vlan) + dev_put(params.vlan); + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + +static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *drv; + int err; + struct net_device *dev; + u8 *mac_addr = NULL; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + err = get_drv_dev_by_info_ifindex(info, &drv, &dev); + if (err) + return err; + + if (!drv->ops->del_station) { + err = -EOPNOTSUPP; + goto out; + } + + rtnl_lock(); + err = drv->ops->del_station(&drv->wiphy, dev, mac_addr); + rtnl_unlock(); + + out: + cfg80211_put_dev(drv); + dev_put(dev); + return err; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -374,6 +1044,73 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_GET_KEY, + .doit = nl80211_get_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_SET_KEY, + .doit = nl80211_set_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_NEW_KEY, + .doit = nl80211_new_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_DEL_KEY, + .doit = nl80211_del_key, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_SET_BEACON, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .doit = nl80211_addset_beacon, + }, + { + .cmd = NL80211_CMD_NEW_BEACON, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .doit = nl80211_addset_beacon, + }, + { + .cmd = NL80211_CMD_DEL_BEACON, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .doit = nl80211_del_beacon, + }, + { + .cmd = NL80211_CMD_GET_STATION, + .doit = nl80211_get_station, + /* TODO: implement dumpit */ + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_SET_STATION, + .doit = nl80211_set_station, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_NEW_STATION, + .doit = nl80211_new_station, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_DEL_STATION, + .doit = nl80211_del_station, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; /* multicast groups */ diff --git a/net/wireless/wext.c b/net/wireless/wext.c index 47e80cc2077..2c569b63e7d 100644 --- a/net/wireless/wext.c +++ b/net/wireless/wext.c @@ -417,20 +417,6 @@ static const int event_type_size[] = { IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ }; -/* Size (in bytes) of various events, as packed */ -static const int event_type_pk_size[] = { - IW_EV_LCP_PK_LEN, /* IW_HEADER_TYPE_NULL */ - 0, - IW_EV_CHAR_PK_LEN, /* IW_HEADER_TYPE_CHAR */ - 0, - IW_EV_UINT_PK_LEN, /* IW_HEADER_TYPE_UINT */ - IW_EV_FREQ_PK_LEN, /* IW_HEADER_TYPE_FREQ */ - IW_EV_ADDR_PK_LEN, /* IW_HEADER_TYPE_ADDR */ - 0, - IW_EV_POINT_PK_LEN, /* Without variable payload */ - IW_EV_PARAM_PK_LEN, /* IW_HEADER_TYPE_PARAM */ - IW_EV_QUAL_PK_LEN, /* IW_HEADER_TYPE_QUAL */ -}; /************************ COMMON SUBROUTINES ************************/ /* @@ -673,26 +659,8 @@ static const struct seq_operations wireless_seq_ops = { static int wireless_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int res; - res = seq_open(file, &wireless_seq_ops); - if (!res) { - seq = file->private_data; - seq->private = get_proc_net(inode); - if (!seq->private) { - seq_release(inode, file); - res = -ENXIO; - } - } - return res; -} - -static int wireless_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct net *net = seq->private; - put_net(net); - return seq_release(inode, file); + return seq_open_net(inode, file, &wireless_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations wireless_seq_fops = { @@ -700,7 +668,7 @@ static const struct file_operations wireless_seq_fops = { .open = wireless_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = wireless_seq_release, + .release = seq_release_net, }; int wext_proc_init(struct net *net) @@ -1137,7 +1105,7 @@ static void wireless_nlevent_process(unsigned long data) struct sk_buff *skb; while ((skb = skb_dequeue(&wireless_nlevent_queue))) - rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + rtnl_notify(skb, &init_net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); } static DECLARE_TASKLET(wireless_nlevent_tasklet, wireless_nlevent_process, 0); @@ -1189,6 +1157,9 @@ static void rtmsg_iwinfo(struct net_device *dev, char *event, int event_len) struct sk_buff *skb; int err; + if (dev->nd_net != &init_net) + return; + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 92cfe8e3e0b..07fad7ccf83 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -83,9 +83,9 @@ struct compat_x25_subscrip_struct { int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { - int called_len, calling_len; + unsigned int called_len, calling_len; char *called, *calling; - int i; + unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c index a59b77f1823..6ebda25c24e 100644 --- a/net/x25/sysctl_net_x25.c +++ b/net/x25/sysctl_net_x25.c @@ -84,29 +84,15 @@ static struct ctl_table x25_table[] = { { 0, }, }; -static struct ctl_table x25_dir_table[] = { - { - .ctl_name = NET_X25, - .procname = "x25", - .mode = 0555, - .child = x25_table, - }, - { 0, }, -}; - -static struct ctl_table x25_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = x25_dir_table, - }, - { 0, }, +static struct ctl_path x25_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "x25", .ctl_name = NET_X25, }, + { } }; void __init x25_register_sysctl(void) { - x25_table_header = register_sysctl_table(x25_root_table); + x25_table_header = register_sysctl_paths(x25_path, x25_table); } void x25_unregister_sysctl(void) diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index dec404afa11..a21f6646eb3 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -205,9 +205,7 @@ int x25_create_facilities(unsigned char *buffer, } if (dte_facs->calling_len && (facil_mask & X25_MASK_CALLING_AE)) { - unsigned bytecount = (dte_facs->calling_len % 2) ? - dte_facs->calling_len / 2 + 1 : - dte_facs->calling_len / 2; + unsigned bytecount = (dte_facs->calling_len + 1) >> 1; *p++ = X25_FAC_CALLING_AE; *p++ = 1 + bytecount; *p++ = dte_facs->calling_len; diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c index 34478035e05..056a55f3a87 100644 --- a/net/x25/x25_forward.c +++ b/net/x25/x25_forward.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <net/x25.h> -struct list_head x25_forward_list = LIST_HEAD_INIT(x25_forward_list); +LIST_HEAD(x25_forward_list); DEFINE_RWLOCK(x25_forward_list_lock); int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 1c88762c279..7d7c3abf38b 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -247,7 +247,7 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp break; } if (atomic_read(&sk->sk_rmem_alloc) > - (sk->sk_rcvbuf / 2)) + (sk->sk_rcvbuf >> 1)) x25->condition |= X25_COND_OWN_RX_BUSY; } /* diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 741ce95d4ad..e4e1b6e4953 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -30,7 +30,7 @@ #include <linux/init.h> #include <net/x25.h> -static struct list_head x25_neigh_list = LIST_HEAD_INIT(x25_neigh_list); +static LIST_HEAD(x25_neigh_list); static DEFINE_RWLOCK(x25_neigh_list_lock); static void x25_t20timer_expiry(unsigned long); @@ -247,10 +247,7 @@ void x25_link_device_up(struct net_device *dev) return; skb_queue_head_init(&nb->queue); - - init_timer(&nb->t20timer); - nb->t20timer.data = (unsigned long)nb; - nb->t20timer.function = &x25_t20timer_expiry; + setup_timer(&nb->t20timer, x25_t20timer_expiry, (unsigned long)nb); dev_hold(dev); nb->dev = dev; diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c index 7d55e50c936..3f52b09bed0 100644 --- a/net/x25/x25_proc.c +++ b/net/x25/x25_proc.c @@ -41,6 +41,7 @@ found: } static void *x25_seq_route_start(struct seq_file *seq, loff_t *pos) + __acquires(x25_route_list_lock) { loff_t l = *pos; @@ -70,6 +71,7 @@ out: } static void x25_seq_route_stop(struct seq_file *seq, void *v) + __releases(x25_route_list_lock) { read_unlock_bh(&x25_route_list_lock); } @@ -105,6 +107,7 @@ found: } static void *x25_seq_socket_start(struct seq_file *seq, loff_t *pos) + __acquires(x25_list_lock) { loff_t l = *pos; @@ -127,6 +130,7 @@ out: } static void x25_seq_socket_stop(struct seq_file *seq, void *v) + __releases(x25_list_lock) { read_unlock_bh(&x25_list_lock); } @@ -183,6 +187,7 @@ found: } static void *x25_seq_forward_start(struct seq_file *seq, loff_t *pos) + __acquires(x25_forward_list_lock) { loff_t l = *pos; @@ -213,6 +218,7 @@ out: } static void x25_seq_forward_stop(struct seq_file *seq, void *v) + __releases(x25_forward_list_lock) { read_unlock_bh(&x25_forward_list_lock); } @@ -287,7 +293,7 @@ static const struct file_operations x25_seq_route_fops = { .release = seq_release, }; -static struct file_operations x25_seq_forward_fops = { +static const struct file_operations x25_seq_forward_fops = { .owner = THIS_MODULE, .open = x25_seq_forward_open, .read = seq_read, diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c index 86b5b4da097..2c999ccf504 100644 --- a/net/x25/x25_route.c +++ b/net/x25/x25_route.c @@ -21,7 +21,7 @@ #include <linux/init.h> #include <net/x25.h> -struct list_head x25_route_list = LIST_HEAD_INIT(x25_route_list); +LIST_HEAD(x25_route_list); DEFINE_RWLOCK(x25_route_list_lock); /* diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 8d6220aa5d0..511a5986af3 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -359,7 +359,7 @@ void x25_check_rbuf(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) && + if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf >> 1) && (x25->condition & X25_COND_OWN_RX_BUSY)) { x25->condition &= ~X25_COND_OWN_RX_BUSY; x25->condition &= ~X25_COND_ACK_PENDING; diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 2af190dc5b0..d3e3e54db93 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -33,9 +33,7 @@ void x25_init_timers(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); - init_timer(&x25->timer); - x25->timer.data = (unsigned long)sk; - x25->timer.function = &x25_timer_expiry; + setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk); /* initialized by sock_init_data */ sk->sk_timer.data = (unsigned long)sk; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 577a4f821b9..8f9dbec319b 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -3,6 +3,7 @@ # config XFRM bool + select CRYPTO depends on NET config XFRM_USER @@ -35,6 +36,16 @@ config XFRM_MIGRATE If unsure, say N. +config XFRM_STATISTICS + bool "Transformation statistics (EXPERIMENTAL)" + depends on XFRM && PROC_FS && EXPERIMENTAL + ---help--- + This statistics is not a SNMP/MIB specification but shows + statistics about transformation error (or almost error) factor + at packet processing for developer. + + If unsure, say N. + config NET_KEY tristate "PF_KEY sockets" select XFRM diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 45744a3d3a5..332cfb0ff56 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o xfrm_algo.o +obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 1686f64c435..b5c5347aed6 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -486,7 +486,6 @@ EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx); */ void xfrm_probe_algs(void) { -#ifdef CONFIG_CRYPTO int i, status; BUG_ON(in_softirq()); @@ -511,7 +510,6 @@ void xfrm_probe_algs(void) if (calg_list[i].available != status) calg_list[i].available = status; } -#endif } EXPORT_SYMBOL_GPL(xfrm_probe_algs); diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c index 55ab5792af5..a2023ec5232 100644 --- a/net/xfrm/xfrm_hash.c +++ b/net/xfrm/xfrm_hash.c @@ -17,17 +17,14 @@ struct hlist_head *xfrm_hash_alloc(unsigned int sz) struct hlist_head *n; if (sz <= PAGE_SIZE) - n = kmalloc(sz, GFP_KERNEL); + n = kzalloc(sz, GFP_KERNEL); else if (hashdist) - n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL); + n = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); else n = (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_NOWARN, + __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); - if (n) - memset(n, 0, sz); - return n; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index cb97fda1b6d..039e7019c48 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -9,6 +9,8 @@ #include <linux/slab.h> #include <linux/module.h> +#include <linux/netdevice.h> +#include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> @@ -81,6 +83,180 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) } EXPORT_SYMBOL(xfrm_parse_spi); +int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = x->outer_mode->afinfo->extract_input(x, skb); + if (err) + return err; + + skb->protocol = x->inner_mode->afinfo->eth_proto; + return x->inner_mode->input2(x, skb); +} +EXPORT_SYMBOL(xfrm_prepare_input); + +int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + int err; + __be32 seq; + struct xfrm_state *x; + xfrm_address_t *daddr; + unsigned int family; + int decaps = 0; + int async = 0; + + /* A negative encap_type indicates async resumption. */ + if (encap_type < 0) { + async = 1; + x = xfrm_input_state(skb); + seq = XFRM_SKB_CB(skb)->seq; + goto resume; + } + + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(LINUX_MIB_XFRMINERROR); + goto drop; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + daddr = (xfrm_address_t *)(skb_network_header(skb) + + XFRM_SPI_SKB_CB(skb)->daddroff); + family = XFRM_SPI_SKB_CB(skb)->family; + + seq = 0; + if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + XFRM_INC_STATS(LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + + do { + if (skb->sp->len == XFRM_MAX_DEPTH) { + XFRM_INC_STATS(LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + x = xfrm_state_lookup(daddr, spi, nexthdr, family); + if (x == NULL) { + XFRM_INC_STATS(LINUX_MIB_XFRMINNOSTATES); + xfrm_audit_state_notfound(skb, family, spi, seq); + goto drop; + } + + skb->sp->xvec[skb->sp->len++] = x; + + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINSTATEINVALID); + goto drop_unlock; + } + + if ((x->encap ? x->encap->encap_type : 0) != encap_type) { + XFRM_INC_STATS(LINUX_MIB_XFRMINSTATEINVALID); + goto drop_unlock; + } + + if (x->props.replay_window && xfrm_replay_check(x, skb, seq)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINSEQOUTOFWINDOW); + goto drop_unlock; + } + + if (xfrm_state_check_expire(x)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINSTATEEXPIRED); + goto drop_unlock; + } + + spin_unlock(&x->lock); + + XFRM_SKB_CB(skb)->seq = seq; + + nexthdr = x->type->input(x, skb); + + if (nexthdr == -EINPROGRESS) + return 0; + +resume: + spin_lock(&x->lock); + if (nexthdr <= 0) { + if (nexthdr == -EBADMSG) { + xfrm_audit_state_icvfail(x, skb, + x->type->proto); + x->stats.integrity_failed++; + } + XFRM_INC_STATS(LINUX_MIB_XFRMINSTATEPROTOERROR); + goto drop_unlock; + } + + /* only the first xfrm gets the encap type */ + encap_type = 0; + + if (x->props.replay_window) + xfrm_replay_advance(x, seq); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; + + if (x->inner_mode->input(x, skb)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINSTATEMODEERROR); + goto drop; + } + + if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { + decaps = 1; + break; + } + + /* + * We need the inner address. However, we only get here for + * transport mode so the outer address is identical. + */ + daddr = &x->id.daddr; + family = x->outer_mode->afinfo->family; + + err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); + if (err < 0) { + XFRM_INC_STATS(LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + } while (!err); + + nf_reset(skb); + + if (decaps) { + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return 0; + } else { + return x->inner_mode->afinfo->transport_finish(skb, async); + } + +drop_unlock: + spin_unlock(&x->lock); +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm_input); + +int xfrm_input_resume(struct sk_buff *skb, int nexthdr) +{ + return xfrm_input(skb, nexthdr, 0, -1); +} +EXPORT_SYMBOL(xfrm_input_resume); + void __init xfrm_input_init(void) { secpath_cachep = kmem_cache_create("secpath_cache", diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index f4bfd6c4565..f4a1047a557 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -12,14 +12,18 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/netdevice.h> +#include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/xfrm.h> +static int xfrm_output2(struct sk_buff *skb); + static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb) { - int nhead = x->props.header_len + LL_RESERVED_SPACE(skb->dst->dev) + struct dst_entry *dst = skb->dst; + int nhead = dst->header_len + LL_RESERVED_SPACE(dst->dev) - skb_headroom(skb); if (nhead > 0) @@ -29,54 +33,63 @@ static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb) -{ - int err = xfrm_state_check_expire(x); - if (err < 0) - goto err; - err = xfrm_state_check_space(x, skb); -err: - return err; -} - -int xfrm_output(struct sk_buff *skb) +static int xfrm_output_one(struct sk_buff *skb, int err) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; - int err; - if (skb->ip_summed == CHECKSUM_PARTIAL) { - err = skb_checksum_help(skb); - if (err) - goto error_nolock; - } + if (err <= 0) + goto resume; do { + err = xfrm_state_check_space(x, skb); + if (err) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTERROR); + goto error_nolock; + } + + err = x->outer_mode->output(x, skb); + if (err) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTSTATEMODEERROR); + goto error_nolock; + } + spin_lock_bh(&x->lock); - err = xfrm_state_check(x, skb); - if (err) + err = xfrm_state_check_expire(x); + if (err) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTSTATEEXPIRED); goto error; + } if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq = ++x->replay.oseq; + if (unlikely(x->replay.oseq == 0)) { + x->replay.oseq--; + xfrm_audit_state_replay_overflow(x, skb); + err = -EOVERFLOW; + goto error; + } if (xfrm_aevent_is_on()) xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } - err = x->outer_mode->output(x, skb); - if (err) - goto error; - x->curlft.bytes += skb->len; x->curlft.packets++; spin_unlock_bh(&x->lock); err = x->type->output(x, skb); - if (err) + if (err == -EINPROGRESS) + goto out_exit; + +resume: + if (err) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTSTATEPROTOERROR); goto error_nolock; + } if (!(skb->dst = dst_pop(dst))) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTERROR); err = -EHOSTUNREACH; goto error_nolock; } @@ -86,10 +99,97 @@ int xfrm_output(struct sk_buff *skb) err = 0; -error_nolock: +out_exit: return err; error: spin_unlock_bh(&x->lock); - goto error_nolock; +error_nolock: + kfree_skb(skb); + goto out_exit; +} + +int xfrm_output_resume(struct sk_buff *skb, int err) +{ + while (likely((err = xfrm_output_one(skb, err)) == 0)) { + struct xfrm_state *x; + + nf_reset(skb); + + err = skb->dst->ops->local_out(skb); + if (unlikely(err != 1)) + goto out; + + x = skb->dst->xfrm; + if (!x) + return dst_output(skb); + + err = nf_hook(x->inner_mode->afinfo->family, + NF_INET_POST_ROUTING, skb, + NULL, skb->dst->dev, xfrm_output2); + if (unlikely(err != 1)) + goto out; + } + + if (err == -EINPROGRESS) + err = 0; + +out: + return err; +} +EXPORT_SYMBOL_GPL(xfrm_output_resume); + +static int xfrm_output2(struct sk_buff *skb) +{ + return xfrm_output_resume(skb, 1); +} + +static int xfrm_output_gso(struct sk_buff *skb) +{ + struct sk_buff *segs; + + segs = skb_gso_segment(skb, 0); + kfree_skb(skb); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = xfrm_output2(segs); + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return err; + } + + segs = nskb; + } while (segs); + + return 0; +} + +int xfrm_output(struct sk_buff *skb) +{ + int err; + + if (skb_is_gso(skb)) + return xfrm_output_gso(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + err = skb_checksum_help(skb); + if (err) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return err; + } + } + + return xfrm_output2(skb); } EXPORT_SYMBOL_GPL(xfrm_output); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 26b846e11bf..47219f98053 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -13,6 +13,7 @@ * */ +#include <linux/err.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/list.h> @@ -23,13 +24,23 @@ #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> +#include <linux/audit.h> +#include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> +#ifdef CONFIG_XFRM_STATISTICS +#include <net/snmp.h> +#endif #include "xfrm_hash.h" int sysctl_xfrm_larval_drop __read_mostly; +#ifdef CONFIG_XFRM_STATISTICS +DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics) __read_mostly; +EXPORT_SYMBOL(xfrm_statistics); +#endif + DEFINE_MUTEX(xfrm_cfg_mutex); EXPORT_SYMBOL(xfrm_cfg_mutex); @@ -49,6 +60,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock); static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); +static void xfrm_init_pmtu(struct dst_entry *dst); static inline int __xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl) @@ -84,23 +96,27 @@ int xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl, return 0; } -int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, - unsigned short family) +static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, + int family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - int err = 0; + xfrm_address_t *saddr = &x->props.saddr; + xfrm_address_t *daddr = &x->id.daddr; + struct xfrm_policy_afinfo *afinfo; + struct dst_entry *dst; + + if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) + saddr = x->coaddr; + if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) + daddr = x->coaddr; + afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) - return -EAFNOSUPPORT; + return ERR_PTR(-EAFNOSUPPORT); - if (likely(afinfo->dst_lookup != NULL)) - err = afinfo->dst_lookup(dst, fl); - else - err = -EINVAL; + dst = afinfo->dst_lookup(tos, saddr, daddr); xfrm_policy_put_afinfo(afinfo); - return err; + return dst; } -EXPORT_SYMBOL(xfrm_dst_lookup); static inline unsigned long make_jiffies(long secs) { @@ -196,9 +212,8 @@ struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp) INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); atomic_set(&policy->refcnt, 1); - init_timer(&policy->timer); - policy->timer.data = (unsigned long)policy; - policy->timer.function = xfrm_policy_timer; + setup_timer(&policy->timer, xfrm_policy_timer, + (unsigned long)policy); } return policy; } @@ -206,7 +221,7 @@ EXPORT_SYMBOL(xfrm_policy_alloc); /* Destroy xfrm_policy: descendant resources must be released to this moment. */ -void __xfrm_policy_destroy(struct xfrm_policy *policy) +void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->dead); @@ -218,7 +233,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy) security_xfrm_policy_free(policy); kfree(policy); } -EXPORT_SYMBOL(__xfrm_policy_destroy); +EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_gc_kill(struct xfrm_policy *policy) { @@ -1230,24 +1245,185 @@ xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short fa return x; } -/* Allocate chain of dst_entry's, attach known xfrm's, calculate - * all the metrics... Shortly, bundle a bundle. - */ +static inline int xfrm_get_tos(struct flowi *fl, int family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + int tos; -static int -xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, - struct flowi *fl, struct dst_entry **dst_p, - unsigned short family) + if (!afinfo) + return -EINVAL; + + tos = afinfo->get_tos(fl); + + xfrm_policy_put_afinfo(afinfo); + + return tos; +} + +static inline struct xfrm_dst *xfrm_alloc_dst(int family) { - int err; struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - if (unlikely(afinfo == NULL)) + struct xfrm_dst *xdst; + + if (!afinfo) + return ERR_PTR(-EINVAL); + + xdst = dst_alloc(afinfo->dst_ops) ?: ERR_PTR(-ENOBUFS); + + xfrm_policy_put_afinfo(afinfo); + + return xdst; +} + +static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) +{ + struct xfrm_policy_afinfo *afinfo = + xfrm_policy_get_afinfo(dst->ops->family); + int err; + + if (!afinfo) return -EINVAL; - err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p); + + err = afinfo->init_path(path, dst, nfheader_len); + xfrm_policy_put_afinfo(afinfo); + return err; } +static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) +{ + struct xfrm_policy_afinfo *afinfo = + xfrm_policy_get_afinfo(xdst->u.dst.ops->family); + int err; + + if (!afinfo) + return -EINVAL; + + err = afinfo->fill_dst(xdst, dev); + + xfrm_policy_put_afinfo(afinfo); + + return err; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. + */ + +static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, + struct xfrm_state **xfrm, int nx, + struct flowi *fl, + struct dst_entry *dst) +{ + unsigned long now = jiffies; + struct net_device *dev; + struct dst_entry *dst_prev = NULL; + struct dst_entry *dst0 = NULL; + int i = 0; + int err; + int header_len = 0; + int nfheader_len = 0; + int trailer_len = 0; + int tos; + int family = policy->selector.family; + + tos = xfrm_get_tos(fl, family); + err = tos; + if (tos < 0) + goto put_states; + + dst_hold(dst); + + for (; i < nx; i++) { + struct xfrm_dst *xdst = xfrm_alloc_dst(family); + struct dst_entry *dst1 = &xdst->u.dst; + + err = PTR_ERR(xdst); + if (IS_ERR(xdst)) { + dst_release(dst); + goto put_states; + } + + if (!dst_prev) + dst0 = dst1; + else { + dst_prev->child = dst_clone(dst1); + dst1->flags |= DST_NOHASH; + } + + xdst->route = dst; + memcpy(&dst1->metrics, &dst->metrics, sizeof(dst->metrics)); + + if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { + family = xfrm[i]->props.family; + dst = xfrm_dst_lookup(xfrm[i], tos, family); + err = PTR_ERR(dst); + if (IS_ERR(dst)) + goto put_states; + } else + dst_hold(dst); + + dst1->xfrm = xfrm[i]; + xdst->genid = xfrm[i]->genid; + + dst1->obsolete = -1; + dst1->flags |= DST_HOST; + dst1->lastuse = now; + + dst1->input = dst_discard; + dst1->output = xfrm[i]->outer_mode->afinfo->output; + + dst1->next = dst_prev; + dst_prev = dst1; + + header_len += xfrm[i]->props.header_len; + if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) + nfheader_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + } + + dst_prev->child = dst; + dst0->path = dst; + + err = -ENODEV; + dev = dst->dev; + if (!dev) + goto free_dst; + + /* Copy neighbout for reachability confirmation */ + dst0->neighbour = neigh_clone(dst->neighbour); + + xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len); + xfrm_init_pmtu(dst_prev); + + for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; + + err = xfrm_fill_dst(xdst, dev); + if (err) + goto free_dst; + + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + header_len -= xdst->u.dst.xfrm->props.header_len; + trailer_len -= xdst->u.dst.xfrm->props.trailer_len; + } + +out: + return dst0; + +put_states: + for (; i < nx; i++) + xfrm_state_put(xfrm[i]); +free_dst: + if (dst0) + dst_free(dst0); + dst0 = ERR_PTR(err); + goto out; +} + static int inline xfrm_dst_alloc_copy(void **target, void *src, int size) { @@ -1319,36 +1495,46 @@ restart: if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); err = PTR_ERR(policy); - if (IS_ERR(policy)) + if (IS_ERR(policy)) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTPOLERROR); goto dropdst; + } } if (!policy) { /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_count[XFRM_POLICY_OUT]) - return 0; + goto nopol; policy = flow_cache_lookup(fl, dst_orig->ops->family, dir, xfrm_policy_lookup); err = PTR_ERR(policy); - if (IS_ERR(policy)) + if (IS_ERR(policy)) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTPOLERROR); goto dropdst; + } } if (!policy) - return 0; + goto nopol; family = dst_orig->ops->family; - policy->curlft.use_time = get_seconds(); pols[0] = policy; npols ++; xfrm_nr += pols[0]->xfrm_nr; + err = -ENOENT; + if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP)) + goto error; + + policy->curlft.use_time = get_seconds(); + switch (policy->action) { default: case XFRM_POLICY_BLOCK: /* Prohibit the flow */ + XFRM_INC_STATS(LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; @@ -1368,6 +1554,7 @@ restart: */ dst = xfrm_find_bundle(fl, policy, family); if (IS_ERR(dst)) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTBUNDLECHECKERROR); err = PTR_ERR(dst); goto error; } @@ -1382,10 +1569,12 @@ restart: XFRM_POLICY_OUT); if (pols[1]) { if (IS_ERR(pols[1])) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTPOLERROR); err = PTR_ERR(pols[1]); goto error; } if (pols[1]->action == XFRM_POLICY_BLOCK) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; } @@ -1416,10 +1605,11 @@ restart: /* EREMOTE tells the caller to generate * a one-shot blackhole route. */ + XFRM_INC_STATS(LINUX_MIB_XFRMOUTNOSTATES); xfrm_pol_put(policy); return -EREMOTE; } - if (err == -EAGAIN && flags) { + if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(&km_waitq, &wait); @@ -1431,6 +1621,7 @@ restart: nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); if (nx == -EAGAIN && signal_pending(current)) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTNOSTATES); err = -ERESTART; goto error; } @@ -1441,8 +1632,10 @@ restart: } err = nx; } - if (err < 0) + if (err < 0) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTNOSTATES); goto error; + } } if (nx == 0) { /* Flow passes not transformed. */ @@ -1450,13 +1643,10 @@ restart: return 0; } - dst = dst_orig; - err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family); - - if (unlikely(err)) { - int i; - for (i=0; i<nx; i++) - xfrm_state_put(xfrm[i]); + dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig); + err = PTR_ERR(dst); + if (IS_ERR(dst)) { + XFRM_INC_STATS(LINUX_MIB_XFRMOUTBUNDLEGENERROR); goto error; } @@ -1477,6 +1667,10 @@ restart: if (dst) dst_free(dst); + if (pol_dead) + XFRM_INC_STATS(LINUX_MIB_XFRMOUTPOLDEAD); + else + XFRM_INC_STATS(LINUX_MIB_XFRMOUTBUNDLECHECKERROR); err = -EHOSTUNREACH; goto error; } @@ -1489,6 +1683,7 @@ restart: write_unlock_bh(&policy->lock); if (dst) dst_free(dst); + XFRM_INC_STATS(LINUX_MIB_XFRMOUTBUNDLECHECKERROR); goto error; } @@ -1508,6 +1703,12 @@ dropdst: dst_release(dst_orig); *dst_p = NULL; return err; + +nopol: + err = -ENOENT; + if (flags & XFRM_LOOKUP_ICMP) + goto dropdst; + return 0; } EXPORT_SYMBOL(__xfrm_lookup); @@ -1591,8 +1792,8 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start, return start; } -int -xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) +int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, + unsigned int family, int reverse) { struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); int err; @@ -1600,12 +1801,12 @@ xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; - afinfo->decode_session(skb, fl); + afinfo->decode_session(skb, fl, reverse); err = security_xfrm_decode_session(skb, &fl->secid); xfrm_policy_put_afinfo(afinfo); return err; } -EXPORT_SYMBOL(xfrm_decode_session); +EXPORT_SYMBOL(__xfrm_decode_session); static inline int secpath_has_nontransport(struct sec_path *sp, int k, int *idxp) { @@ -1627,12 +1828,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int npols = 0; int xfrm_nr; int pi; + int reverse; struct flowi fl; - u8 fl_dir = policy_to_flow_dir(dir); + u8 fl_dir; int xerr_idx = -1; - if (xfrm_decode_session(skb, &fl, family) < 0) + reverse = dir & ~XFRM_POLICY_MASK; + dir &= XFRM_POLICY_MASK; + fl_dir = policy_to_flow_dir(dir); + + if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) { + XFRM_INC_STATS(LINUX_MIB_XFRMINHDRERROR); return 0; + } + nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ @@ -1641,28 +1850,35 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, for (i=skb->sp->len-1; i>=0; i--) { struct xfrm_state *x = skb->sp->xvec[i]; - if (!xfrm_selector_match(&x->sel, &fl, family)) + if (!xfrm_selector_match(&x->sel, &fl, family)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINSTATEMISMATCH); return 0; + } } } pol = NULL; if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl); - if (IS_ERR(pol)) + if (IS_ERR(pol)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINPOLERROR); return 0; + } } if (!pol) pol = flow_cache_lookup(&fl, family, fl_dir, xfrm_policy_lookup); - if (IS_ERR(pol)) + if (IS_ERR(pol)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINPOLERROR); return 0; + } if (!pol) { if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); + XFRM_INC_STATS(LINUX_MIB_XFRMINNOPOLS); return 0; } return 1; @@ -1678,8 +1894,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, &fl, family, XFRM_POLICY_IN); if (pols[1]) { - if (IS_ERR(pols[1])) + if (IS_ERR(pols[1])) { + XFRM_INC_STATS(LINUX_MIB_XFRMINPOLERROR); return 0; + } pols[1]->curlft.use_time = get_seconds(); npols ++; } @@ -1700,10 +1918,14 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, for (pi = 0; pi < npols; pi++) { if (pols[pi] != pol && - pols[pi]->action != XFRM_POLICY_ALLOW) + pols[pi]->action != XFRM_POLICY_ALLOW) { + XFRM_INC_STATS(LINUX_MIB_XFRMINPOLBLOCK); goto reject; - if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) + } + if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { + XFRM_INC_STATS(LINUX_MIB_XFRMINBUFFERERROR); goto reject_error; + } for (i = 0; i < pols[pi]->xfrm_nr; i++) tpp[ti++] = &pols[pi]->xfrm_vec[i]; } @@ -1725,16 +1947,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (k < -1) /* "-2 - errored_index" returned */ xerr_idx = -(2+k); + XFRM_INC_STATS(LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } } - if (secpath_has_nontransport(sp, k, &xerr_idx)) + if (secpath_has_nontransport(sp, k, &xerr_idx)) { + XFRM_INC_STATS(LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; + } xfrm_pols_put(pols, npols); return 1; } + XFRM_INC_STATS(LINUX_MIB_XFRMINPOLBLOCK); reject: xfrm_secpath_reject(xerr_idx, skb, &fl); @@ -1748,8 +1974,11 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct flowi fl; - if (xfrm_decode_session(skb, &fl, family) < 0) + if (xfrm_decode_session(skb, &fl, family) < 0) { + /* XXX: we should have something like FWDHDRERROR here. */ + XFRM_INC_STATS(LINUX_MIB_XFRMINHDRERROR); return 0; + } return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0; } @@ -1793,7 +2022,7 @@ static int stale_bundle(struct dst_entry *dst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { - dst->dev = init_net.loopback_dev; + dst->dev = dev->nd_net->loopback_dev; dev_hold(dst->dev); dev_put(dev); } @@ -1882,7 +2111,7 @@ static int xfrm_flush_bundles(void) return 0; } -void xfrm_init_pmtu(struct dst_entry *dst) +static void xfrm_init_pmtu(struct dst_entry *dst) { do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; @@ -1903,8 +2132,6 @@ void xfrm_init_pmtu(struct dst_entry *dst) } while ((dst = dst->next)); } -EXPORT_SYMBOL(xfrm_init_pmtu); - /* Check that the bundle accepts the flow and its components are * still valid. */ @@ -2082,6 +2309,16 @@ static struct notifier_block xfrm_dev_notifier = { 0 }; +#ifdef CONFIG_XFRM_STATISTICS +static int __init xfrm_statistics_init(void) +{ + if (snmp_mib_init((void **)xfrm_statistics, + sizeof(struct linux_xfrm_mib)) < 0) + return -ENOMEM; + return 0; +} +#endif + static void __init xfrm_policy_init(void) { unsigned int hmask, sz; @@ -2118,71 +2355,81 @@ static void __init xfrm_policy_init(void) void __init xfrm_init(void) { +#ifdef CONFIG_XFRM_STATISTICS + xfrm_statistics_init(); +#endif xfrm_state_init(); xfrm_policy_init(); xfrm_input_init(); +#ifdef CONFIG_XFRM_STATISTICS + xfrm_proc_init(); +#endif } #ifdef CONFIG_AUDITSYSCALL -static inline void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, - struct audit_buffer *audit_buf) +static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, + struct audit_buffer *audit_buf) { - if (xp->security) + struct xfrm_sec_ctx *ctx = xp->security; + struct xfrm_selector *sel = &xp->selector; + + if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", - xp->security->ctx_alg, xp->security->ctx_doi, - xp->security->ctx_str); + ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); - switch(xp->selector.family) { + switch(sel->family) { case AF_INET: - audit_log_format(audit_buf, " src=%u.%u.%u.%u dst=%u.%u.%u.%u", - NIPQUAD(xp->selector.saddr.a4), - NIPQUAD(xp->selector.daddr.a4)); + audit_log_format(audit_buf, " src=" NIPQUAD_FMT, + NIPQUAD(sel->saddr.a4)); + if (sel->prefixlen_s != 32) + audit_log_format(audit_buf, " src_prefixlen=%d", + sel->prefixlen_s); + audit_log_format(audit_buf, " dst=" NIPQUAD_FMT, + NIPQUAD(sel->daddr.a4)); + if (sel->prefixlen_d != 32) + audit_log_format(audit_buf, " dst_prefixlen=%d", + sel->prefixlen_d); break; case AF_INET6: - { - struct in6_addr saddr6, daddr6; - - memcpy(&saddr6, xp->selector.saddr.a6, - sizeof(struct in6_addr)); - memcpy(&daddr6, xp->selector.daddr.a6, - sizeof(struct in6_addr)); - audit_log_format(audit_buf, - " src=" NIP6_FMT " dst=" NIP6_FMT, - NIP6(saddr6), NIP6(daddr6)); - } + audit_log_format(audit_buf, " src=" NIP6_FMT, + NIP6(*(struct in6_addr *)sel->saddr.a6)); + if (sel->prefixlen_s != 128) + audit_log_format(audit_buf, " src_prefixlen=%d", + sel->prefixlen_s); + audit_log_format(audit_buf, " dst=" NIP6_FMT, + NIP6(*(struct in6_addr *)sel->daddr.a6)); + if (sel->prefixlen_d != 128) + audit_log_format(audit_buf, " dst_prefixlen=%d", + sel->prefixlen_d); break; } } -void -xfrm_audit_policy_add(struct xfrm_policy *xp, int result, u32 auid, u32 sid) +void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, + u32 auid, u32 secid) { struct audit_buffer *audit_buf; - extern int audit_enabled; - if (audit_enabled == 0) - return; - audit_buf = xfrm_audit_start(auid, sid); + audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; - audit_log_format(audit_buf, " op=SPD-add res=%u", result); + xfrm_audit_helper_usrinfo(auid, secid, audit_buf); + audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); -void -xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, u32 auid, u32 sid) +void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, + u32 auid, u32 secid) { struct audit_buffer *audit_buf; - extern int audit_enabled; - if (audit_enabled == 0) - return; - audit_buf = xfrm_audit_start(auid, sid); + audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; - audit_log_format(audit_buf, " op=SPD-delete res=%u", result); + xfrm_audit_helper_usrinfo(auid, secid, audit_buf); + audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c new file mode 100644 index 00000000000..31d035415ec --- /dev/null +++ b/net/xfrm/xfrm_proc.c @@ -0,0 +1,96 @@ +/* + * xfrm_proc.c + * + * Copyright (C)2006-2007 USAGI/WIDE Project + * + * Authors: Masahide NAKAMURA <nakam@linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/snmp.h> +#include <net/xfrm.h> + +static struct snmp_mib xfrm_mib_list[] = { + SNMP_MIB_ITEM("XfrmInError", LINUX_MIB_XFRMINERROR), + SNMP_MIB_ITEM("XfrmInBufferError", LINUX_MIB_XFRMINBUFFERERROR), + SNMP_MIB_ITEM("XfrmInHdrError", LINUX_MIB_XFRMINHDRERROR), + SNMP_MIB_ITEM("XfrmInNoStates", LINUX_MIB_XFRMINNOSTATES), + SNMP_MIB_ITEM("XfrmInStateProtoError", LINUX_MIB_XFRMINSTATEPROTOERROR), + SNMP_MIB_ITEM("XfrmInStateModeError", LINUX_MIB_XFRMINSTATEMODEERROR), + SNMP_MIB_ITEM("XfrmInSeqOutOfWindow", LINUX_MIB_XFRMINSEQOUTOFWINDOW), + SNMP_MIB_ITEM("XfrmInStateExpired", LINUX_MIB_XFRMINSTATEEXPIRED), + SNMP_MIB_ITEM("XfrmInStateMismatch", LINUX_MIB_XFRMINSTATEMISMATCH), + SNMP_MIB_ITEM("XfrmInStateInvalid", LINUX_MIB_XFRMINSTATEINVALID), + SNMP_MIB_ITEM("XfrmInTmplMismatch", LINUX_MIB_XFRMINTMPLMISMATCH), + SNMP_MIB_ITEM("XfrmInNoPols", LINUX_MIB_XFRMINNOPOLS), + SNMP_MIB_ITEM("XfrmInPolBlock", LINUX_MIB_XFRMINPOLBLOCK), + SNMP_MIB_ITEM("XfrmInPolError", LINUX_MIB_XFRMINPOLERROR), + SNMP_MIB_ITEM("XfrmOutError", LINUX_MIB_XFRMOUTERROR), + SNMP_MIB_ITEM("XfrmOutBundleGenError", LINUX_MIB_XFRMOUTBUNDLEGENERROR), + SNMP_MIB_ITEM("XfrmOutBundleCheckError", LINUX_MIB_XFRMOUTBUNDLECHECKERROR), + SNMP_MIB_ITEM("XfrmOutNoStates", LINUX_MIB_XFRMOUTNOSTATES), + SNMP_MIB_ITEM("XfrmOutStateProtoError", LINUX_MIB_XFRMOUTSTATEPROTOERROR), + SNMP_MIB_ITEM("XfrmOutStateModeError", LINUX_MIB_XFRMOUTSTATEMODEERROR), + SNMP_MIB_ITEM("XfrmOutStateExpired", LINUX_MIB_XFRMOUTSTATEEXPIRED), + SNMP_MIB_ITEM("XfrmOutPolBlock", LINUX_MIB_XFRMOUTPOLBLOCK), + SNMP_MIB_ITEM("XfrmOutPolDead", LINUX_MIB_XFRMOUTPOLDEAD), + SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR), + SNMP_MIB_SENTINEL +}; + +static unsigned long +fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for_each_possible_cpu(i) { + res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} + +static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) +{ + int i; + for (i=0; xfrm_mib_list[i].name; i++) + seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, + fold_field((void **)xfrm_statistics, + xfrm_mib_list[i].entry)); + return 0; +} + +static int xfrm_statistics_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, xfrm_statistics_seq_show, NULL); +} + +static struct file_operations xfrm_statistics_seq_fops = { + .owner = THIS_MODULE, + .open = xfrm_statistics_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int __init xfrm_proc_init(void) +{ + int rc = 0; + + if (!proc_net_fops_create(&init_net, "xfrm_stat", S_IRUGO, + &xfrm_statistics_seq_fops)) + goto stat_fail; + + out: + return rc; + + stat_fail: + rc = -ENOMEM; + goto out; +} diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f26aaaca1fa..3003503d0c9 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -19,6 +19,7 @@ #include <linux/ipsec.h> #include <linux/module.h> #include <linux/cache.h> +#include <linux/audit.h> #include <asm/uaccess.h> #include "xfrm_hash.h" @@ -60,6 +61,13 @@ static unsigned int xfrm_state_genid; static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); +#ifdef CONFIG_AUDITSYSCALL +static void xfrm_audit_state_replay(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq); +#else +#define xfrm_audit_state_replay(x, s, sq) do { ; } while (0) +#endif /* CONFIG_AUDITSYSCALL */ + static inline unsigned int xfrm_dst_hash(xfrm_address_t *daddr, xfrm_address_t *saddr, u32 reqid, @@ -203,6 +211,7 @@ static struct xfrm_state_afinfo *xfrm_state_lock_afinfo(unsigned int family) } static void xfrm_state_unlock_afinfo(struct xfrm_state_afinfo *afinfo) + __releases(xfrm_state_afinfo_lock) { write_unlock_bh(&xfrm_state_afinfo_lock); } @@ -504,12 +513,9 @@ struct xfrm_state *xfrm_state_alloc(void) INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); - init_timer(&x->timer); - x->timer.function = xfrm_timer_handler; - x->timer.data = (unsigned long)x; - init_timer(&x->rtimer); - x->rtimer.function = xfrm_replay_timer_handler; - x->rtimer.data = (unsigned long)x; + setup_timer(&x->timer, xfrm_timer_handler, (unsigned long)x); + setup_timer(&x->rtimer, xfrm_replay_timer_handler, + (unsigned long)x); x->curlft.add_time = get_seconds(); x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; @@ -759,7 +765,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct xfrm_policy *pol, int *err, unsigned short family) { - unsigned int h = xfrm_dst_hash(daddr, saddr, tmpl->reqid, family); + unsigned int h; struct hlist_node *entry; struct xfrm_state *x, *x0; int acquire_in_progress = 0; @@ -767,6 +773,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct xfrm_state *best = NULL; spin_lock_bh(&xfrm_state_lock); + h = xfrm_dst_hash(daddr, saddr, tmpl->reqid, family); hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == tmpl->reqid && @@ -868,11 +875,12 @@ struct xfrm_state * xfrm_stateonly_find(xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid) { - unsigned int h = xfrm_dst_hash(daddr, saddr, reqid, family); + unsigned int h; struct xfrm_state *rx = NULL, *x = NULL; struct hlist_node *entry; spin_lock(&xfrm_state_lock); + h = xfrm_dst_hash(daddr, saddr, reqid, family); hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && @@ -1092,7 +1100,7 @@ out: EXPORT_SYMBOL(xfrm_state_add); #ifdef CONFIG_XFRM_MIGRATE -struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) +static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) { int err = -ENOMEM; struct xfrm_state *x = xfrm_state_alloc(); @@ -1167,7 +1175,6 @@ struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) kfree(x); return NULL; } -EXPORT_SYMBOL(xfrm_state_clone); /* xfrm_state_lock is held */ struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m) @@ -1609,13 +1616,14 @@ static void xfrm_replay_timer_handler(unsigned long data) spin_unlock(&x->lock); } -int xfrm_replay_check(struct xfrm_state *x, __be32 net_seq) +int xfrm_replay_check(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) { u32 diff; u32 seq = ntohl(net_seq); if (unlikely(seq == 0)) - return -EINVAL; + goto err; if (likely(seq > x->replay.seq)) return 0; @@ -1624,14 +1632,18 @@ int xfrm_replay_check(struct xfrm_state *x, __be32 net_seq) if (diff >= min_t(unsigned int, x->props.replay_window, sizeof(x->replay.bitmap) * 8)) { x->stats.replay_window++; - return -EINVAL; + goto err; } if (x->replay.bitmap & (1U << diff)) { x->stats.replay++; - return -EINVAL; + goto err; } return 0; + +err: + xfrm_audit_state_replay(x, skb, net_seq); + return -EINVAL; } EXPORT_SYMBOL(xfrm_replay_check); @@ -1657,7 +1669,7 @@ void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq) } EXPORT_SYMBOL(xfrm_replay_advance); -static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list); +static LIST_HEAD(xfrm_km_list); static DEFINE_RWLOCK(xfrm_km_lock); void km_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c) @@ -1897,6 +1909,7 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) } static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) + __releases(xfrm_state_afinfo_lock) { read_unlock(&xfrm_state_afinfo_lock); } @@ -1996,73 +2009,172 @@ void __init xfrm_state_init(void) } #ifdef CONFIG_AUDITSYSCALL -static inline void xfrm_audit_common_stateinfo(struct xfrm_state *x, - struct audit_buffer *audit_buf) +static void xfrm_audit_helper_sainfo(struct xfrm_state *x, + struct audit_buffer *audit_buf) { - if (x->security) + struct xfrm_sec_ctx *ctx = x->security; + u32 spi = ntohl(x->id.spi); + + if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", - x->security->ctx_alg, x->security->ctx_doi, - x->security->ctx_str); + ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch(x->props.family) { case AF_INET: - audit_log_format(audit_buf, " src=%u.%u.%u.%u dst=%u.%u.%u.%u", + audit_log_format(audit_buf, + " src=" NIPQUAD_FMT " dst=" NIPQUAD_FMT, NIPQUAD(x->props.saddr.a4), NIPQUAD(x->id.daddr.a4)); break; case AF_INET6: - { - struct in6_addr saddr6, daddr6; - - memcpy(&saddr6, x->props.saddr.a6, - sizeof(struct in6_addr)); - memcpy(&daddr6, x->id.daddr.a6, - sizeof(struct in6_addr)); - audit_log_format(audit_buf, - " src=" NIP6_FMT " dst=" NIP6_FMT, - NIP6(saddr6), NIP6(daddr6)); - } + audit_log_format(audit_buf, + " src=" NIP6_FMT " dst=" NIP6_FMT, + NIP6(*(struct in6_addr *)x->props.saddr.a6), + NIP6(*(struct in6_addr *)x->id.daddr.a6)); break; } + + audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); } -void -xfrm_audit_state_add(struct xfrm_state *x, int result, u32 auid, u32 sid) +static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, + struct audit_buffer *audit_buf) +{ + struct iphdr *iph4; + struct ipv6hdr *iph6; + + switch (family) { + case AF_INET: + iph4 = ip_hdr(skb); + audit_log_format(audit_buf, + " src=" NIPQUAD_FMT " dst=" NIPQUAD_FMT, + NIPQUAD(iph4->saddr), + NIPQUAD(iph4->daddr)); + break; + case AF_INET6: + iph6 = ipv6_hdr(skb); + audit_log_format(audit_buf, + " src=" NIP6_FMT " dst=" NIP6_FMT + " flowlbl=0x%x%x%x", + NIP6(iph6->saddr), + NIP6(iph6->daddr), + iph6->flow_lbl[0] & 0x0f, + iph6->flow_lbl[1], + iph6->flow_lbl[2]); + break; + } +} + +void xfrm_audit_state_add(struct xfrm_state *x, int result, + u32 auid, u32 secid) { struct audit_buffer *audit_buf; - u32 spi; - extern int audit_enabled; - if (audit_enabled == 0) + audit_buf = xfrm_audit_start("SAD-add"); + if (audit_buf == NULL) return; - audit_buf = xfrm_audit_start(auid, sid); + xfrm_audit_helper_usrinfo(auid, secid, audit_buf); + xfrm_audit_helper_sainfo(x, audit_buf); + audit_log_format(audit_buf, " res=%u", result); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_add); + +void xfrm_audit_state_delete(struct xfrm_state *x, int result, + u32 auid, u32 secid) +{ + struct audit_buffer *audit_buf; + + audit_buf = xfrm_audit_start("SAD-delete"); if (audit_buf == NULL) return; - audit_log_format(audit_buf, " op=SAD-add res=%u",result); - xfrm_audit_common_stateinfo(x, audit_buf); + xfrm_audit_helper_usrinfo(auid, secid, audit_buf); + xfrm_audit_helper_sainfo(x, audit_buf); + audit_log_format(audit_buf, " res=%u", result); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_delete); + +void xfrm_audit_state_replay_overflow(struct xfrm_state *x, + struct sk_buff *skb) +{ + struct audit_buffer *audit_buf; + u32 spi; + + audit_buf = xfrm_audit_start("SA-replay-overflow"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); + /* don't record the sequence number because it's inherent in this kind + * of audit message */ spi = ntohl(x->id.spi); audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); audit_log_end(audit_buf); } -EXPORT_SYMBOL_GPL(xfrm_audit_state_add); +EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow); -void -xfrm_audit_state_delete(struct xfrm_state *x, int result, u32 auid, u32 sid) +static void xfrm_audit_state_replay(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) { struct audit_buffer *audit_buf; u32 spi; - extern int audit_enabled; - if (audit_enabled == 0) - return; - audit_buf = xfrm_audit_start(auid, sid); + audit_buf = xfrm_audit_start("SA-replayed-pkt"); if (audit_buf == NULL) return; - audit_log_format(audit_buf, " op=SAD-delete res=%u",result); - xfrm_audit_common_stateinfo(x, audit_buf); + xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); spi = ntohl(x->id.spi); - audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); + audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", + spi, spi, ntohl(net_seq)); audit_log_end(audit_buf); } -EXPORT_SYMBOL_GPL(xfrm_audit_state_delete); + +void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) +{ + struct audit_buffer *audit_buf; + + audit_buf = xfrm_audit_start("SA-notfound"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, family, audit_buf); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple); + +void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, + __be32 net_spi, __be32 net_seq) +{ + struct audit_buffer *audit_buf; + u32 spi; + + audit_buf = xfrm_audit_start("SA-notfound"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, family, audit_buf); + spi = ntohl(net_spi); + audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", + spi, spi, ntohl(net_seq)); + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound); + +void xfrm_audit_state_icvfail(struct xfrm_state *x, + struct sk_buff *skb, u8 proto) +{ + struct audit_buffer *audit_buf; + __be32 net_spi; + __be32 net_seq; + + audit_buf = xfrm_audit_start("SA-icv-failure"); + if (audit_buf == NULL) + return; + xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); + if (xfrm_parse_spi(skb, proto, &net_spi, &net_seq) == 0) { + u32 spi = ntohl(net_spi); + audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", + spi, spi, ntohl(net_seq)); + } + audit_log_end(audit_buf); +} +EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail); #endif /* CONFIG_AUDITSYSCALL */ diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c4f6419b176..e0ccdf26781 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1043,7 +1043,7 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, return xp; error: *errp = err; - kfree(xp); + xfrm_policy_destroy(xp); return NULL; } @@ -1986,8 +1986,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) if (x->coaddr) l += nla_total_size(sizeof(*x->coaddr)); - /* Must count this as this may become non-zero behind our back. */ - l += nla_total_size(sizeof(x->lastused)); + /* Must count x->lastused as it may become non-zero behind our back. */ + l += nla_total_size(sizeof(u64)); return l; } @@ -2420,7 +2420,7 @@ static void __exit xfrm_user_exit(void) xfrm_unregister_km(&netlink_mgr); rcu_assign_pointer(xfrm_nl, NULL); synchronize_rcu(); - sock_release(nlsk->sk_socket); + netlink_kernel_release(nlsk); } module_init(xfrm_user_init); |